# Pandas Extra functions
> Extra pandas functions at import

In [1]:
from typing import Callable
#from forgebox.imports import *
#from .widgets import search_box, paginate

In [2]:
import pandas as pd
from typing import Callable
import numpy as np

def display_df(df): display(df)
    
def list_vc(
    df, colname: str, value: str
) -> pd.DataFrame:
    """
    count the values in a column
        that each cell is a list
    """
    return df[colname].list_vc(value)

def col_list_vc(
    col, value: str
) -> pd.DataFrame:
    """
    count the values in a column
        that each cell is a list
    """
    return pd.DataFrame(
        col.apply(lambda x: value in x).value_counts()
    )

pd.DataFrame.vc = lambda self,col:pd.DataFrame(self[col].value_counts())
pd.Series.list_vc = col_list_vc
pd.DataFrame.list_vc = list_vc

def split(df, valid=0.2, ensure_factor=2):
    """
    df: dataframe
    valid: valid ratio, default 0.1
    ensure_factor, ensuring the row number to be the multiplication of this factor, default 2
    return train_df, valid_df
    """
    split_ = (np.random.rand(len(df)) > valid)
    train_df = df[split_].sample(frac=1.).reset_index().drop("index", axis=1)
    valid_df = df[~split_].sample(frac=1.).reset_index().drop("index", axis=1)

    if ensure_factor:
        train_mod = len(train_df) % ensure_factor
        valid_mod = len(valid_df) % ensure_factor
        if train_mod: train_df = train_df[:-train_mod]
        if valid_mod: valid_df = valid_df[:-valid_mod]
    return train_df, valid_df

pd.DataFrame.split = split


def default_rename_rule(x: str) -> str:
    return x.replace(" ", "_").replace("-", "_").lower()


def rename_by_rule(
    df,
    rule: Callable = default_rename_rule
) -> pd.DataFrame:
    """
    rename the columns by a rule function
    """
    df = df.rename(
        columns=dict((c, rule(c)) for c in df.columns))
    return df

pd.DataFrame.rename_by_rule = rename_by_rule


def column_order(df, *col_names) -> pd.DataFrame:
    """
    df = df.column_order("col1", "col2", "col3")
    will put col1, col2, and col3 as the 1st 3 column
    """
    cols = list(df.columns)

    for col_name in list(col_names)[::-1]:

        # warn if the column exist
        if col_name not in cols:
            print(f"Column:'{col_name}' not in dataframe")
            continue
        cols.insert(0, cols.pop(cols.index(col_name)))
    return df[cols]

pd.DataFrame.column_order = column_order

In [3]:

def search_box(df, columns, manual=False, max_rows=10, callback=display_df):
    """
    create a search box based on dataframe
    df: pandas dataframe
    columns: str, dataframe field name
    manual: bool, search the dataframe on when click the button(manual=True),
        or on keypress reaction to inputbox (manual=False), default False
    max_rows:int, max rows of show result, default 10
    callback: python callable, discribe the action you want to put on
        search result (a filtered dataframe), default is to display the dataframe
    """
    from ipywidgets import interact, interact_manual
    from IPython.display import HTML

    intera = interact_manual if manual else interact

    @intera
    def search(KeyWord="",):
        for col in columns:
            result = df[col].fillna("NaN Value").str.contains(KeyWord)
            if sum(result) > 0:
                with PandasDisplay(max_colwidth=0, max_rows=max_rows):
                    display(
                        HTML(f"<h3>\"{KeyWord}\" matched on column:[{col}]</h3>"))
                    callback(df[result].head(max_rows))
                    return
        print(f"Nothing found on any column on keyword:{KeyWord}")
        return




In [4]:
def paginate(df, page_len=20):
    """
    Paginate dataframe in jupyter notebook interactively
    Like you can flip through the page
    """
    from ipywidgets import interact, interact_manual
    from IPython.display import display, HTML
    pages = len(df)//page_len

    @interact
    def preview(page=(0, pages)):
        display(HTML(f"<h4>page:{page}/{pages}</4>"))
        end = (page+1)*page_len
        display(df.head(end).tail(page_len))

### Testing

In [5]:
from sklearn.datasets import _california_housing
cdata = _california_housing.fetch_california_housing()
df = pd.DataFrame(cdata["data"], columns=cdata["feature_names"])

## Value Counts

In [6]:
df["is_old"] = df.HouseAge>20
df.vc("is_old")

Unnamed: 0,is_old
True,14347
False,6293


## Rename columns

In [7]:
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'is_old'],
      dtype='object')

In [8]:
print(list(df.rename_by_rule().columns))

['medinc', 'houseage', 'averooms', 'avebedrms', 'population', 'aveoccup', 'latitude', 'longitude', 'is_old']


In [9]:
print(list(df.column_order("is_old","AveOccup").columns))

['is_old', 'AveOccup', 'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'Latitude', 'Longitude']


## Paginate

In [10]:
pd.DataFrame.paginate = paginate

In [11]:
df.paginate()

interactive(children=(IntSlider(value=516, description='page', max=1032), Output()), _dom_classes=('widget-int…