In [None]:
# default_exp core

# utilities

> API details.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import pandas as pd
import numpy as np
import re
from fastcore.all import *

In [None]:
#hide
import fastcore
print('Pandas version:', pd.__version__)
print('Numpy version:', np.__version__)
print('Regular expresions version:', re.__version__)
print('Fastcore version:', fastcore.__version__)

Pandas version: 1.1.3
Numpy version: 1.19.2
Regular expresions version: 2.2.1
Fastcore version: 1.1.0


In [None]:
titanic = pd.read_csv('../Datasets/titanic.csv')

In [None]:
#export
def is_default_index(df):
    """Tell whether the index of ``df`` equals the default integer index.

    The index is compared position by position against
    ``RangeIndex(start=0, stop=<number of rows>, step=1)``; the result is
    true only when every position matches.
    """
    n_rows = df.shape[0]
    default_index = pd.RangeIndex(start=0, stop=n_rows, step=1)
    matches = df.index == default_index
    return matches.all()

In [None]:
#export
def is_multiindex_row_df(df):
    """Return True only for a DataFrame whose row index is a MultiIndex."""
    return (
        isinstance(df, pd.core.frame.DataFrame)
        and isinstance(df.index, pd.core.indexes.multi.MultiIndex)
    )

In [None]:
#export
def is_multiindex_col_df(df):
    """Return True only for a DataFrame whose columns are a MultiIndex."""
    return (
        isinstance(df, pd.core.frame.DataFrame)
        and isinstance(df.columns, pd.core.indexes.multi.MultiIndex)
    )

# Pandas Dataframe methods

Approach inspired by:
1. Official pandas guide: https://pandas.pydata.org/pandas-docs/stable/development/extending.html
2. Pandas Flavor library

## Dataframe methods

The following methods enable users to leverage the pandas data analysis library more productively by automating many of the most common tasks. This saves the user from constantly having to search for answers.

### remove_indexes
Converts a multi-index dataframe into a more readable flat dataframe

In [None]:
# export
@pd.api.extensions.register_dataframe_accessor('fdt')
class FastDataDataframeUtilities:
    """DataFrame helpers reachable through the ``fdt`` accessor (``df.fdt``)."""

    def __init__(self, pandas_obj):
        # Keep a handle on the DataFrame the accessor was invoked on.
        self._obj = pandas_obj

    def remove_indexes(self, axis='all'):
        """Return a copy of the DataFrame with its indexes flattened.

        axis:
            'columns' - join every MultiIndex column tuple into one
                        '_'-separated string label.
            'index'   - move the row index into regular columns when it is a
                        MultiIndex or differs from the default integer index.
            'all'     - do both (default).

        The DataFrame the accessor is attached to is not modified.
        """
        flat = self._obj.copy()
        touch_columns = axis in ['columns', 'all']
        touch_rows = axis in ['index', 'all']

        if touch_columns and is_multiindex_col_df(flat):
            flat.columns = flat.columns.map(
                lambda labels: '_'.join([str(label) for label in labels])
            )

        if touch_rows and (is_multiindex_row_df(flat) or not is_default_index(flat)):
            flat = flat.reset_index()

        return flat

  


#### Tests

##### Test remove_indexes on each axis and on all axes

In [None]:
titanic_pivot = titanic.pivot_table(
    aggfunc={"Survived" : ["sum","size"]}, 
    index=["Pclass","Parents/Children Aboard"], 
    columns=["Sex","Siblings/Spouses Aboard"])

In [None]:
titanic_pivot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,size,size,size,size,size,size,size,size,size,size,...,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_2,Sex,female,female,female,female,female,female,female,male,male,male,...,female,female,female,male,male,male,male,male,male,male
Unnamed: 0_level_3,Siblings/Spouses Aboard,0,1,2,3,4,5,8,0,1,2,...,4,5,8,0,1,2,3,4,5,8
Pclass,Parents/Children Aboard,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4
1,0,34.0,29.0,1.0,,,,,75.0,22.0,2.0,...,,,,25.0,10.0,1.0,,,,
1,1,10.0,7.0,,,,,,9.0,5.0,,...,,,,2.0,2.0,,,,,
1,2,5.0,4.0,2.0,2.0,,,,4.0,3.0,,...,,,,2.0,3.0,,0.0,,,
1,4,,,,,,,,,1.0,,...,,,,,0.0,,,,,
2,0,32.0,12.0,,1.0,,,,72.0,14.0,3.0,...,,,,7.0,1.0,0.0,,,,


**Remove row index**

In [None]:
tianic_pivot_flat_rows = titanic_pivot.fdt.remove_indexes(axis='index')

Assert that the row index is flattened correctly, the column MultiIndex is kept, and the original remains unchanged

In [None]:
tianic_pivot_flat_rows.head()

Unnamed: 0_level_0,Pclass,Parents/Children Aboard,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,size,size,size,size,size,size,size,size,...,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,female,female,female,female,female,female,female,male,...,female,female,female,male,male,male,male,male,male,male
Siblings/Spouses Aboard,Unnamed: 1_level_3,Unnamed: 2_level_3,0,1,2,3,4,5,8,0,...,4,5,8,0,1,2,3,4,5,8
0,1,0,34.0,29.0,1.0,,,,,75.0,...,,,,25.0,10.0,1.0,,,,
1,1,1,10.0,7.0,,,,,,9.0,...,,,,2.0,2.0,,,,,
2,1,2,5.0,4.0,2.0,2.0,,,,4.0,...,,,,2.0,3.0,,0.0,,,
3,1,4,,,,,,,,,...,,,,,0.0,,,,,
4,2,0,32.0,12.0,,1.0,,,,72.0,...,,,,7.0,1.0,0.0,,,,


In [None]:
assert is_multiindex_row_df(tianic_pivot_flat_rows) == False

In [None]:
assert is_multiindex_col_df(tianic_pivot_flat_rows) == True

**Remove col index**

In [None]:
tianic_pivot_flat_cols = titanic_pivot.fdt.remove_indexes(axis='columns')

In [None]:
tianic_pivot_flat_cols.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived_size_female_0,Survived_size_female_1,Survived_size_female_2,Survived_size_female_3,Survived_size_female_4,Survived_size_female_5,Survived_size_female_8,Survived_size_male_0,Survived_size_male_1,Survived_size_male_2,...,Survived_sum_female_4,Survived_sum_female_5,Survived_sum_female_8,Survived_sum_male_0,Survived_sum_male_1,Survived_sum_male_2,Survived_sum_male_3,Survived_sum_male_4,Survived_sum_male_5,Survived_sum_male_8
Pclass,Parents/Children Aboard,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,0,34.0,29.0,1.0,,,,,75.0,22.0,2.0,...,,,,25.0,10.0,1.0,,,,
1,1,10.0,7.0,,,,,,9.0,5.0,,...,,,,2.0,2.0,,,,,
1,2,5.0,4.0,2.0,2.0,,,,4.0,3.0,,...,,,,2.0,3.0,,0.0,,,
1,4,,,,,,,,,1.0,,...,,,,,0.0,,,,,
2,0,32.0,12.0,,1.0,,,,72.0,14.0,3.0,...,,,,7.0,1.0,0.0,,,,


In [None]:
assert is_multiindex_row_df(tianic_pivot_flat_cols) == True

In [None]:
assert is_multiindex_col_df(tianic_pivot_flat_cols) == False

**Remove both**

In [None]:
titanic_pivot_flat = titanic_pivot.fdt.remove_indexes(axis='all')

In [None]:
titanic_pivot_flat

Unnamed: 0,Pclass,Parents/Children Aboard,Survived_size_female_0,Survived_size_female_1,Survived_size_female_2,Survived_size_female_3,Survived_size_female_4,Survived_size_female_5,Survived_size_female_8,Survived_size_male_0,...,Survived_sum_female_4,Survived_sum_female_5,Survived_sum_female_8,Survived_sum_male_0,Survived_sum_male_1,Survived_sum_male_2,Survived_sum_male_3,Survived_sum_male_4,Survived_sum_male_5,Survived_sum_male_8
0,1,0,34.0,29.0,1.0,,,,,75.0,...,,,,25.0,10.0,1.0,,,,
1,1,1,10.0,7.0,,,,,,9.0,...,,,,2.0,2.0,,,,,
2,1,2,5.0,4.0,2.0,2.0,,,,4.0,...,,,,2.0,3.0,,0.0,,,
3,1,4,,,,,,,,,...,,,,,0.0,,,,,
4,2,0,32.0,12.0,,1.0,,,,72.0,...,,,,7.0,1.0,0.0,,,,
5,2,1,7.0,9.0,2.0,,,,,1.0,...,,,,0.0,6.0,1.0,,,,
6,2,2,5.0,6.0,,,,,,3.0,...,,,,2.0,0.0,,,,,
7,2,3,,1.0,1.0,,,,,,...,,,,,,,,,,
8,3,0,60.0,22.0,2.0,1.0,,,,260.0,...,,,,32.0,3.0,1.0,,,,
9,3,1,7.0,10.0,3.0,5.0,,,,4.0,...,,,,2.0,6.0,,0.0,0.0,,


In [None]:
assert is_multiindex_row_df(titanic_pivot_flat) == False

In [None]:
assert is_multiindex_col_df(titanic_pivot_flat) == False

### Expand column with several elements
Converts a column with multiple elements separated by a character into several columns

In [None]:
# export
@patch_to(FastDataDataframeUtilities)
def split_list_to_columns(self, column, separator=',', list_marker='na', split_type='unique'):
    """Expand a multi-valued column into several columns.

    Parameters
    ----------
    column : label
        Column whose values are either Python lists or strings holding
        several elements joined by ``separator``.
    separator : str, default ','
        Character(s) separating the elements inside a string value.
    list_marker : {'na', 'square_brackets', 'parentheses'}, default 'na'
        Wrapping characters to strip from string values before splitting.
        'square_brackets' removes ``[``, ``]`` and ``'``; 'parentheses'
        removes ``(`` and ``)``. Ignored when the column already holds lists.
    split_type : {'unique', 'order'}, default 'unique'
        'unique' creates one column per distinct element holding how many
        times it occurs in each row (an empty-string element becomes the
        column 'blank'; missing values are filled with 0).
        'order' creates one column per position in the split string.

    Returns
    -------
    DataFrame
        A new frame without ``column`` and with the generated columns
        appended; every column label is converted to ``str``.

    Raises
    ------
    ValueError
        If ``split_type`` is neither 'unique' nor 'order'.
    """
    if split_type not in ('unique', 'order'):
        raise ValueError(
            "split_type must be 'unique' or 'order', got {!r}".format(split_type)
        )

    df = self._obj.copy()

    # Look the first non-null value up by position: a label lookup with [0]
    # fails whenever that value does not sit at index label 0.
    non_null_values = df[column].dropna()
    holds_lists = len(non_null_values) > 0 and type(non_null_values.iloc[0]) is list

    if not holds_lists:
        # String values: strip the surrounding markers, if requested.
        # regex=True is explicit because the default changed in pandas 2.0.
        if list_marker == 'square_brackets':
            df[column] = df[column].str.replace(r"[\[\]']", "", regex=True)
        elif list_marker == 'parentheses':
            df[column] = df[column].str.replace(r'([()])', '', regex=True)
        # Only 'unique' needs list values; 'order' splits with expand=True below.
        if split_type == 'unique':
            df[column] = df[column].str.split(separator)

    minus_pivoted = df.drop(column, axis=1)

    if split_type == 'unique':
        # One row per (original row, element position), then one indicator
        # column per distinct element, summed back per original row.
        stacked = df[column].apply(pd.Series).stack(dropna=False)
        # groupby(level=0) replaces the removed DataFrame.sum(level=0);
        # sort=False keeps the original row order.
        exploded = pd.get_dummies(stacked).groupby(level=0, sort=False).sum()
        if '' in exploded.columns:
            exploded = exploded.rename(columns={'': 'blank'})
        result = pd.concat([minus_pivoted, exploded], axis=1)
        result = result.fillna(0)
    else:
        exploded = df[column].str.split(separator, expand=True)
        result = pd.concat([minus_pivoted, exploded], axis=1)

    result.columns = result.columns.map(str)
    return result

#### Tests

In [None]:
with_brackets = pd.DataFrame({'A': [[1, 2, 3], [1,8],[], [3, 4]], 'B': [1,2,3,4],'C': [4,3,2,1]})
with_brackets

Unnamed: 0,A,B,C
0,"[1, 2, 3]",1,4
1,"[1, 8]",2,3
2,[],3,2
3,"[3, 4]",4,1


In [None]:
assert with_brackets.fdt.split_list_to_columns(
    separator=",", 
    list_marker="square_brackets", 
    column="A", split_type='unique').columns.tolist() == ['B', 'C', '1.0', '2.0', '3.0', '4.0', '8.0']

In [None]:
without_brackets = pd.DataFrame({'A': ['1,2,3', '1,8' , '', '3,4'], 'B': [1,2,3,4],'C': [4,3,2,1]})
without_brackets

Unnamed: 0,A,B,C
0,123.0,1,4
1,18.0,2,3
2,,3,2
3,34.0,4,1


In [None]:
assert without_brackets.fdt.split_list_to_columns(
    separator=",", 
    list_marker="na", 
    column="A",split_type='unique').columns.tolist() == ['B', 'C', 'blank', '1', '2', '3', '4', '8']

In [None]:
without_brackets.fdt.split_list_to_columns(
    separator=",", 
    list_marker="square_brackets", 
    column="A", split_type='order')

Unnamed: 0,B,C,0,1,2
0,1,4,1.0,2.0,3.0
1,2,3,1.0,8.0,
2,3,2,,,
3,4,1,3.0,4.0,


In [None]:
assert without_brackets.fdt.split_list_to_columns(
    separator=",", 
    list_marker="square_brackets", 
    column="A", split_type='order').iloc[:,4].isna().tolist() == [False, True, True, True]

In [None]:
strings_and_square_brackets = pd.DataFrame({'A': ['[1,2,3]', '[1,8]' , '[]', '[3,4]'], 'B': [1,2,3,4],'C': [4,3,2,1]})
strings_and_square_brackets

Unnamed: 0,A,B,C
0,"[1,2,3]",1,4
1,"[1,8]",2,3
2,[],3,2
3,"[3,4]",4,1


In [None]:
assert strings_and_square_brackets.fdt.split_list_to_columns(
    separator=",", 
    list_marker="square_brackets", 
    column="A", split_type='unique').columns.tolist() == ['B', 'C', 'blank', '1', '2', '3', '4', '8']

In [None]:
strings_and_round_brackets = pd.DataFrame({'A': ['(1,2,3)', '(1,8)' , '()', '(3,4)'], 'B': [1,2,3,4],'C': [4,3,2,1]})
strings_and_round_brackets

Unnamed: 0,A,B,C
0,"(1,2,3)",1,4
1,"(1,8)",2,3
2,(),3,2
3,"(3,4)",4,1


In [None]:
assert strings_and_round_brackets.fdt.split_list_to_columns(
    separator=",", 
    list_marker="parentheses", 
    column="A", split_type='unique').columns.tolist() == ['B', 'C', 'blank', '1', '2', '3', '4', '8']

### Pivot flat
Pivot with the option of flattening

In [None]:
# export
@patch_to(FastDataDataframeUtilities)
def pivot_table(self, index_type, **kwargs):
    """Pivot the DataFrame, optionally flattening the resulting indexes.

    Parameters
    ----------
    index_type : str
        When equal to 'flat', the row and column indexes of the pivoted
        frame are flattened with ``remove_indexes(axis='all')``. Any other
        value returns the pivot table as produced by pandas.
    **kwargs
        Forwarded unchanged to ``pandas.DataFrame.pivot_table``.

    Returns
    -------
    DataFrame
        The pivoted (and possibly flattened) frame.
    """
    pivoted = self._obj.pivot_table(**kwargs)

    if index_type == 'flat':
        # The fdt accessor exposes flattening as `remove_indexes`; it has no
        # `flatten_multiindex` method, so calling that raised AttributeError.
        pivoted = pivoted.fdt.remove_indexes(axis='all')

    return pivoted

## Series Methods

### Find between text
Extracts the text found between a start string and an end string

In [None]:
#export
@pd.api.extensions.register_series_accessor('fdt')
class FastDataSeriesUtilities:
    """Series helpers reachable through the ``fdt`` accessor (``series.fdt``)."""

    def __init__(self, pandas_obj):
        # Keep a handle on the Series the accessor was invoked on.
        self._obj = pandas_obj

    def find_between_text(self, start_string, end_string):
        """Extract the text located between ``start_string`` and ``end_string``.

        Both delimiters are inserted into the regular expression as-is, so
        they are interpreted as regex fragments. The capture is greedy
        (``(.*)``). The result is whatever ``Series.str.extract`` returns for
        that single-group pattern.
        """
        pattern = ''.join([start_string, '(.*)', end_string])
        return self._obj.str.extract(pattern)

  


### extract_number_from_string
Extracts number from string

In [None]:
#export
@patch_to(FastDataSeriesUtilities)
def extract_number_from_string(self, dtype):
    """Extract the first run of digits from each string and cast it.

    Parameters
    ----------
    dtype
        Target dtype handed to ``astype`` for the extracted values.

    Returns
    -------
    The output of ``Series.str.extract`` for the single capture group
    ``(\\d+)``, converted with ``astype(dtype)``.
    """
    series = self._obj
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    series = series.str.extract(r'(\d+)')
    series = series.astype(dtype)
    return series

### bin column
Converts numeric column to bin

In [None]:
#export
@patch_to(FastDataSeriesUtilities)
def bin_column(self, **kwargs):
    series = self._obj
    #Parameters
    p = kwargs
    if p['mode'] == 'size':
        interval_range = pd.interval_range(start=p['start'], freq=p['size'], end=p['end'])
        #print(interval_range)
        series = pd.cut(series, bins=interval_range)
        return series

    if p['mode'] == 'number':
        series = pd.cut(series, bins=p['bin_number'])
        return series

    if p['mode'] == 'quantiles':
        series = pd.qcut(series, q=p['quantiles'])
        return series

    if p['mode'] == 'custom':
        interval_range=pd.IntervalIndex.from_breaks(p['breaks'], closed=p['closed'])
        series = pd.cut(series, bins= interval_range)
        return series

In [None]:
titanic.Age.max()

80.0

In [None]:
titanic.Age.min()

0.42

In [None]:
titanic['age_bin1'] = titanic['Age'].fdt.bin_column(
    mode = 'size',
    size = 20,
    start = 0,
    end = 80 )

In [None]:
assert type(titanic['age_bin1'].cat.categories.values) == pd.core.arrays.interval.IntervalArray

In [None]:
titanic['age_bin2'] = titanic['Age'].fdt.bin_column(
    mode = 'number',
    bin_number = 4)

In [None]:
assert type(titanic['age_bin2'].cat.categories.values) == pd.core.arrays.interval.IntervalArray

In [None]:
titanic['age_bin3'] = titanic['Age'].fdt.bin_column(
    mode = 'quantiles',
    quantiles = 4)

In [None]:
assert type(titanic['age_bin3'].cat.categories.values) == pd.core.arrays.interval.IntervalArray

In [None]:
titanic['age_bin4'] = titanic['Age'].fdt.bin_column(
    mode = 'custom',
    breaks = [5,18,30,50,80],
    closed='right')

In [None]:
assert type(titanic['age_bin4'].cat.categories.values) == pd.core.arrays.interval.IntervalArray

### fill empty value
Fills the missing values of a column using a fill function (ffill, bfill, mean, most frequent) or a fixed value

In [None]:
#export
@patch_to(FastDataSeriesUtilities)
def fill_empty(self, **kwargs):
    series = self._obj
    p = kwargs
    if p['mode'] == 'function':
        if p['function'] == 'ffill':
            series = series.fillna(method='ffill')
        elif p['function'] == 'bfill':
            series = series.fillna(method='bfill')
        elif p['function'] == 'mean':
            series = series.fillna(series.mean())
        elif p['function'] == 'most_frequent':
            series = series.fillna(series.mode()[0])    
    elif p['mode'] == 'value':
        series = series.fillna(p['value'])
        
    return series

In [None]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df_with_holes = pd.DataFrame({'data_with_holes' : [1, 3, 5 , np.nan, 100, np.nan, 5]})
df_with_holes

Unnamed: 0,data_with_holes
0,1.0
1,3.0
2,5.0
3,
4,100.0
5,
6,5.0


In [None]:
assert df_with_holes['data_with_holes'].fdt.fill_empty(mode='function', function='ffill')[3] == 5

In [None]:
assert df_with_holes['data_with_holes'].fdt.fill_empty(mode='function', function='bfill')[3] == 100

In [None]:
assert df_with_holes['data_with_holes'].fdt.fill_empty(mode='function', function='most_frequent')[3] == 5

In [None]:
df_with_holes['data_with_holes'].mean()

22.8

In [None]:
assert df_with_holes['data_with_holes'].fdt.fill_empty(mode='function', function='mean')[3] == 22.8

In [None]:
assert df_with_holes['data_with_holes'].fdt.fill_empty(mode='value', value=13)[3] == 13

### conditional replace
Change a column based on a condition

In [None]:
#export
@patch_to(FastDataSeriesUtilities)
def replace_based_on_condition(self, cond, when, replace_with=np.NaN):
    series = self._obj
    
    if when == True:
        series = series.mask(cond=cond, other=replace_with)
    elif when == False:
        series = series.where(cond=cond, other=replace_with)

    return series

### Tests

In [None]:
FIFA_2020 = pd.read_csv(
    sep=",", 
    decimal=".", 
    filepath_or_buffer="../Datasets/players_20.csv")

In [None]:
FIFA_2020 = FIFA_2020.filter(
    items=["body_type"])

In [None]:
FIFA_2020["is_standard"] = FIFA_2020.eval("(body_type=='Normal' or body_type=='Stocky' or body_type=='Lean')", engine="python")

In [None]:
FIFA_2020["corrected"] = FIFA_2020["body_type"].fdt.replace_based_on_condition(
    when=False, 
    cond=FIFA_2020["is_standard"], 
    replace_with="Normal")

TypeError: replace_based_on_condition() got an unexpected keyword argument 'other'

In [None]:
FIFA_2020["is_not_standard"] = FIFA_2020.eval("~(body_type=='Normal' or body_type=='Stocky' or body_type=='Lean')", engine="python")

In [None]:
FIFA_2020["corrected2"] = FIFA_2020["body_type"].fdt.replace_based_on_condition(
    when=True, 
    cond=FIFA_2020["is_not_standard"], 
    replace_with="Normal")

In [None]:
(FIFA_2020["corrected"] == FIFA_2020["corrected2"]).all()

## Exposing methods directly (not used)
This is another approach at exposing pandas methods

In [None]:
#hide
import pandas as pd
import numpy as np

def flatten_multiindex(self, axis='all'):
    """Return a copy of the DataFrame ``self`` with MultiIndexes flattened.

    axis:
        'columns' - join each MultiIndex column tuple into one
                    '_'-separated string label.
        'index'   - move a MultiIndex row index into regular columns.
        'all'     - do both (default).

    The work is done on a copy, so the caller's DataFrame keeps its original
    columns (assigning to ``self.columns`` directly would overwrite them).
    """
    flat = self.copy()

    if isinstance(flat.columns, pd.MultiIndex) and axis in ('all', 'columns'):
        flat.columns = flat.columns.map(lambda labels: '_'.join([str(i) for i in labels]))

    if isinstance(flat.index, pd.MultiIndex) and axis in ('all', 'index'):
        flat = flat.reset_index()

    return flat


@pd.api.extensions.register_dataframe_accessor('flatten_multiindex')
class AccessorMethod:
    """Callable accessor registered on DataFrames as ``flatten_multiindex``.

    Calling ``df.flatten_multiindex(...)`` forwards the DataFrame and all
    arguments to the module-level ``flatten_multiindex`` function.
    """

    def __init__(self, pandas_obj):
        # DataFrame the accessor was invoked on.
        self._obj = pandas_obj

    def __call__(self, *args, **kwargs):
        frame = self._obj
        return flatten_multiindex(frame, *args, **kwargs)

In [None]:
from nbdev.export import *
notebook2script()