In [None]:
# default_exp core

# fastdata-utilities

> API details.

In [None]:
#hide
from nbdev.showdoc import *
import pandas as pd

# Pandas Dataframe methods

Approach inspired by:
1. Official pandas guide: https://pandas.pydata.org/pandas-docs/stable/development/extending.html
2. Pandas Flavor library

## Dataframe methods

The following methods enable users to leverage the pandas data analysis library more productively by automating many of the most common tasks. This saves the user from constantly having to search for answers.

### flatten_multiindex
Converts a multi-index dataframe into a more readable flat dataframe

In [None]:
#export
import pandas as pd
import numpy as np
import re

@pd.api.extensions.register_dataframe_accessor('fdt')
class FastDataUtilities:
    """DataFrame accessor exposing the helpers as ``df.fdt.<method>()``.

    NOTE: the Series accessor defined later in this file uses the same class
    name; registration happens when the decorator runs, so both accessors
    work, but the module-level name ends up bound to the later class.
    """

    def __init__(self, pandas_obj):
        # pandas hands over the DataFrame the accessor was reached from.
        self._obj = pandas_obj

    def flatten_multiindex(self, axis='all'):
        """Return a copy of the DataFrame with its MultiIndex axes flattened.

        Parameters
        ----------
        axis : str, default 'all'
            ``'columns'`` flattens only the column MultiIndex, ``'index'``
            only the row MultiIndex and ``'all'`` both. Any other value
            leaves both axes as they are and returns an unmodified copy.

        Returns
        -------
        pandas.DataFrame
            A new DataFrame; the DataFrame the accessor was called on is
            not modified. Column level values are joined with ``'_'`` and
            row index levels are turned into ordinary columns.
        """
        df = self._obj.copy()
        flatten_columns = axis in ('all', 'columns')
        flatten_index = axis in ('all', 'index')

        # isinstance against the public pd.MultiIndex replaces matching on
        # the textual repr of the type.
        if flatten_columns and isinstance(df.columns, pd.MultiIndex):
            # str() every level value so non-string labels (e.g. ints) join cleanly.
            df.columns = df.columns.map(
                lambda labels: '_'.join(str(label) for label in labels)
            )
        if flatten_index and isinstance(df.index, pd.MultiIndex):
            df = df.reset_index()
        return df

#### Test the flatten_multiindex

In [None]:
# Titanic passenger data used as the fixture for the checks below (the path is relative).
titanic = pd.read_csv('test_datasets/titanic.csv')

In [None]:
# Two index keys and two column keys, so both axes of the result carry a MultiIndex.
titanic_pivot = titanic.pivot_table(
    aggfunc={"Survived" : ["sum","size"]}, 
    index=["Pclass","Parents/Children Aboard"], 
    columns=["Sex","Siblings/Spouses Aboard"])

In [None]:
titanic_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,size,size,size,size,size,size,size,size,size,size,...,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_2,Sex,female,female,female,female,female,female,female,male,male,male,...,female,female,female,male,male,male,male,male,male,male
Unnamed: 0_level_3,Siblings/Spouses Aboard,0,1,2,3,4,5,8,0,1,2,...,4,5,8,0,1,2,3,4,5,8
Pclass,Parents/Children Aboard,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4
1,0,34.0,29.0,1.0,,,,,75.0,22.0,2.0,...,,,,25.0,10.0,1.0,,,,
1,1,10.0,7.0,,,,,,9.0,5.0,,...,,,,2.0,2.0,,,,,
1,2,5.0,4.0,2.0,2.0,,,,4.0,3.0,,...,,,,2.0,3.0,,0.0,,,
1,4,,,,,,,,,1.0,,...,,,,,0.0,,,,,
2,0,32.0,12.0,,1.0,,,,72.0,14.0,3.0,...,,,,7.0,1.0,0.0,,,,
2,1,7.0,9.0,2.0,,,,,1.0,11.0,2.0,...,,,,0.0,6.0,1.0,,,,
2,2,5.0,6.0,,,,,,3.0,2.0,,...,,,,2.0,0.0,,,,,
2,3,,1.0,1.0,,,,,,,,...,,,,,,,,,,
3,0,60.0,22.0,2.0,1.0,,,,260.0,24.0,8.0,...,,,,32.0,3.0,1.0,,,,
3,1,7.0,10.0,3.0,5.0,,,,4.0,15.0,,...,,,,2.0,6.0,,0.0,0.0,,


In [None]:
# axis='all' flattens the column MultiIndex and the row MultiIndex in one call.
titanic_pivot_flat = titanic_pivot.fdt.flatten_multiindex(axis='all')

In [None]:
titanic_pivot_flat

Unnamed: 0,Pclass,Parents/Children Aboard,Survived_size_female_0,Survived_size_female_1,Survived_size_female_2,Survived_size_female_3,Survived_size_female_4,Survived_size_female_5,Survived_size_female_8,Survived_size_male_0,...,Survived_sum_female_4,Survived_sum_female_5,Survived_sum_female_8,Survived_sum_male_0,Survived_sum_male_1,Survived_sum_male_2,Survived_sum_male_3,Survived_sum_male_4,Survived_sum_male_5,Survived_sum_male_8
0,1,0,34.0,29.0,1.0,,,,,75.0,...,,,,25.0,10.0,1.0,,,,
1,1,1,10.0,7.0,,,,,,9.0,...,,,,2.0,2.0,,,,,
2,1,2,5.0,4.0,2.0,2.0,,,,4.0,...,,,,2.0,3.0,,0.0,,,
3,1,4,,,,,,,,,...,,,,,0.0,,,,,
4,2,0,32.0,12.0,,1.0,,,,72.0,...,,,,7.0,1.0,0.0,,,,
5,2,1,7.0,9.0,2.0,,,,,1.0,...,,,,0.0,6.0,1.0,,,,
6,2,2,5.0,6.0,,,,,,3.0,...,,,,2.0,0.0,,,,,
7,2,3,,1.0,1.0,,,,,,...,,,,,,,,,,
8,3,0,60.0,22.0,2.0,1.0,,,,260.0,...,,,,32.0,3.0,1.0,,,,
9,3,1,7.0,10.0,3.0,5.0,,,,4.0,...,,,,2.0,6.0,,0.0,0.0,,


Assert that the column indexes are flattened correctly and the original remains unchanged

In [None]:
# Use the public pd.MultiIndex with isinstance rather than the private pd.core path.
assert not isinstance(titanic_pivot_flat.columns, pd.MultiIndex)

In [None]:
# The source frame must still carry its column MultiIndex (no in-place mutation).
assert isinstance(titanic_pivot.columns, pd.MultiIndex)

Assert that the row index is flattened

In [None]:
# Check the row index here; the column index is already covered by an earlier assert.
assert not isinstance(titanic_pivot_flat.index, pd.MultiIndex)

In [None]:
# The source frame must still carry its row MultiIndex (no in-place mutation).
assert isinstance(titanic_pivot.index, pd.MultiIndex)

Check that column names are flattened with underscores and that integers do not cause an issue.
The name is flattened as:
1. Column name: Survived
2. Aggregation: size
3. Sex column: female (first item defined in the columns parameter)
4. Siblings/Spouses Aboard: 1 (second item defined in the columns parameter)

In [None]:
# Positions 0-1 hold the former index levels and position 2 is Survived_size_female_0.
assert titanic_pivot_flat.columns[3] == 'Survived_size_female_1'

## Series Methods

### extract_number_from_string
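Series helpers that extract the text between two delimiters (`find_between_text`) or the first run of digits (`extract_number_from_string`) from string values.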

In [None]:
#export
@pd.api.extensions.register_series_accessor('fdt')
class FastDataUtilities:
    """Series accessor exposing string helpers as ``series.fdt.<method>()``."""

    def __init__(self, pandas_obj):
        # pandas hands over the Series the accessor was reached from.
        self._obj = pandas_obj

    def find_between_text(self, start_string, end_string):
        """Extract the text found between two literal delimiters.

        Parameters
        ----------
        start_string : str
            Literal text that precedes the wanted part.
        end_string : str
            Literal text that follows the wanted part.

        Returns
        -------
        pandas.DataFrame
            Result of ``Series.str.extract`` with its default ``expand``
            setting: one column (labelled ``0``) holding the captured text,
            with a missing value where the pattern does not match.

        Notes
        -----
        The capture is greedy, so when ``end_string`` occurs several times
        the match runs up to its last occurrence.
        """
        # Escape the delimiters so characters such as '(' or '[' are matched
        # literally instead of being interpreted as regex syntax (which
        # previously produced extra groups or an invalid pattern).
        search_expr = re.escape(start_string) + '(.*)' + re.escape(end_string)
        return self._obj.str.extract(search_expr)

    def extract_number_from_string(self, dtype):
        """Extract the first run of digits from each string and cast it.

        Parameters
        ----------
        dtype : type or str
            Target dtype handed to ``astype`` (for example ``int`` or
            ``'float'``).

        Returns
        -------
        pandas.DataFrame
            One column (labelled ``0``) with the extracted digits cast to
            ``dtype``.

        Notes
        -----
        Strings without any digit yield a missing value, so casting to a
        plain integer dtype may fail for such input.
        """
        # Raw string: '\d' in a normal string literal is an invalid escape sequence.
        digits = self._obj.str.extract(r'(\d+)')
        return digits.astype(dtype)

## Export code

In [None]:
# export
def generate_function_call_from_form(formReponse, dataframeSelection):
    """Build one line of pandas source code from a submitted form.

    Parameters
    ----------
    formReponse : dict
        Must provide ``formData`` (parameter name -> submitted value) and
        ``schema`` with the keys ``function`` (method name),
        ``callerObject`` (expression the method is called on; the
        placeholders ``DataFrame`` and ``Series`` are substituted) and
        ``properties`` (per-parameter settings, optionally holding a
        ``codegenstyle``).
    dataframeSelection : str
        Name of the selected dataframe variable, or the string ``'None'``
        when no dataframe is selected.

    Returns
    -------
    str
        Code of the form ``<target> = <caller>.<function>(<parameters>)``.
        Each parameter is placed on its own line indented by four spaces.

    Raises
    ------
    KeyError
        If a required key is missing, including a ``formData`` key that has
        no entry in ``schema['properties']``.
    """
    formData = formReponse["formData"]
    schema = formReponse["schema"]
    transformationSelection = schema["function"]
    callerObject = schema["callerObject"]
    series = ''

    if 'DataFrame' in callerObject:
        callerObject = callerObject.replace('DataFrame', dataframeSelection)
    if 'Series' in callerObject:
        # NOTE(review): the 'column' entry is also visited by the loop below
        # and is emitted as a keyword argument unless its schema property has
        # a codegenstyle that is not handled there - confirm this is intended.
        seriesString = '"' + formData['column'] + '"'
        series = '[' + seriesString + ']'
        callerObject = callerObject.replace('Series', seriesString)

    parameterPrefix = '\n    '
    # Parameters are collected and joined at the end; the former approach of
    # always chopping the last two characters damaged the function name when
    # no parameter had been emitted.
    parameters = []
    variable = ''
    for key in formData:
        properties = schema["properties"][key]
        value = formData[key]
        if "codegenstyle" in properties:
            codegenstyle = properties['codegenstyle']
            if codegenstyle in ('variable', 'array'):
                # Emitted without quotation marks; str() of a list already
                # supplies the surrounding brackets.
                parameters.append(parameterPrefix + key + '=' + str(value))
            elif codegenstyle == 'aggregation':
                # Build the aggfunc dict literal; an empty list gives '{}'.
                aggregations = [
                    '"' + entry["column"] + '" : '
                    + str(entry["function"]).replace('"', '')
                    for entry in formData["aggfunc"]
                ]
                parameters.append(
                    parameterPrefix + 'aggfunc={' + ', '.join(aggregations) + '}'
                )
            # Any other codegenstyle emits nothing for this key.
        elif key == 'New table':
            # Not a call argument: it names the assignment target.
            variable = value
        else:
            parameters.append(parameterPrefix + key + '="' + str(value) + '"')

    if dataframeSelection == 'None':
        # Without a selected dataframe the result is always stored in 'data'.
        variable = 'data'
    elif variable == '':
        variable = dataframeSelection

    call = callerObject + '.' + transformationSelection + '(' + ', '.join(parameters) + ')'
    return variable + series + ' = ' + call

## Exposing methods directly (not used)
This is another approach to exposing pandas methods.

In [None]:
#hide
import pandas as pd
import numpy as np

def flatten_multiindex(self, axis='all'):
    """Return a copy of a DataFrame with its MultiIndex axes flattened.

    Parameters
    ----------
    self : pandas.DataFrame
        The DataFrame to flatten; it is not modified.
    axis : str, default 'all'
        ``'columns'`` flattens only the column MultiIndex, ``'index'`` only
        the row MultiIndex and ``'all'`` both. Any other value returns an
        unmodified copy.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with column level values joined by ``'_'`` and row
        index levels turned into ordinary columns.
    """
    # Work on a copy: assigning to `.columns` on the argument itself used to
    # rewrite the caller's DataFrame as a side effect.
    df = self.copy()
    if isinstance(df.columns, pd.MultiIndex) and axis in ('all', 'columns'):
        # str() every level value so non-string labels (e.g. ints) join cleanly.
        df.columns = df.columns.map(
            lambda labels: '_'.join(str(label) for label in labels)
        )
    if isinstance(df.index, pd.MultiIndex) and axis in ('all', 'index'):
        df = df.reset_index()
    return df


@pd.api.extensions.register_dataframe_accessor('flatten_multiindex')
class AccessorMethod:
    """Callable accessor that makes ``df.flatten_multiindex(...)`` available."""

    def __init__(self, pandas_obj):
        # Keep a handle on the DataFrame the accessor was reached from.
        self._obj = pandas_obj

    def __call__(self, *args, **kwargs):
        """Forward every argument to the module-level ``flatten_multiindex``."""
        frame = self._obj
        return flatten_multiindex(frame, *args, **kwargs)

In [None]:
# Import only the name this cell uses instead of the whole nbdev.export namespace.
from nbdev.export import notebook2script

# Convert the notebooks' exported cells into the library modules.
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
