In [None]:
#export
import pandas as pd
import numpy as np

import string

In [None]:
# default_exp core

# core

> Core preprocessing helpers: missing-value percentage, categorical label encoding, and the `Dataset` wrapper.

In [None]:
#hide
from nbdev.showdoc import *

### Basic foundations

In [None]:
# export
def _pct_missing_values(feature:pd.Series)->float:
    """
    Given a feature calculates percentage of missing values
    """
    if not isinstance(feature, pd.Series): feature = pd.Series(feature)
    return feature.isnull().sum() / len(feature) * 100

In [None]:
# plain lists are accepted as well as Series
assert _pct_missing_values([1, 2, 3, 4]) == 0.0
assert _pct_missing_values([1, np.nan, 3, 4]) == 25.0
# Series input
assert _pct_missing_values(pd.Series(data=[1, 2, 3, 4])) == 0.0
assert _pct_missing_values(pd.Series(data=[1, np.nan, 2, 3])) == 25.0

In [None]:
#export
def _preprocess_categorical(cat_feat:pd.Series)->np.ndarray:
    """
    Given a categorical feature, label encode it.
    """
    return pd.Categorical(cat_feat).codes + 1

In [None]:
# two distinct labels, the first one repeated -> codes 1, 2, 1
example_df = pd.DataFrame(data={'c': ['a', 'b', 'a']})

assert type(_preprocess_categorical(example_df['c'])) is np.ndarray
assert (_preprocess_categorical(example_df['c']) == pd.Series([1, 2, 1])).all()

In [None]:
#export
def _ignore_flds(df:pd.DataFrame, ignore_flds:list)->pd.DataFrame:
    """
    Given a dataframe and list of fields to ignore, this method would drop them from the dataframe
    """
    df_cpy = df.copy()
    df_cpy.drop(ignore_flds, axis=1, inplace=True)
    return df_cpy

In [None]:
# three columns; dropping 'a' should leave 'b' and 'c' in their original order
example_df = pd.DataFrame(data={'a': [1, 2, 3], 'b': [3, 4, 1], 'c': ['a', 'x', 'z']})

assert type(_ignore_flds(example_df, ['a'])) is pd.DataFrame
assert _ignore_flds(example_df, ['a']).shape[1] == 2
assert list(_ignore_flds(example_df, ['a']).columns) == ['b', 'c']

In [None]:
#export
class Dataset:
    """
    Holds a dataframe together with the settings used to preprocess it.

    Keyword arguments:
        missing_fld: required; the value exposed through `target`.
        ignore_flds: optional list of fields dropped by `remove_ignore_flds`;
            None (the default) drops nothing.
        cat_cols: optional list of fields to label encode; when None or empty,
            every `object` dtype column is encoded instead.
    """
    def __init__(self, df, **kwargs):
        self.df = df
        self.missing_fld = kwargs['missing_fld']
        # Optional settings: the methods below already treat None as "not provided".
        self.ignore_flds = kwargs.get('ignore_flds')
        self.cat_cols = kwargs.get('cat_cols')

    @property
    def target(self):
        """Return `missing_fld` as passed to the constructor."""
        return self.missing_fld

    def remove_ignore_flds(self)->pd.DataFrame:
        """
        Drop `ignore_flds` (when set) from the stored dataframe and return it.
        """
        if self.ignore_flds is not None:
            # Use the instance attribute; a bare `ignore_flds` is not defined in this scope.
            self.df = _ignore_flds(self.df, self.ignore_flds)
        return self.df

    def preprocess_categorical(self)->pd.DataFrame:
        """
        Label encode the categorical fields of the stored dataframe and return it.

        Encoding is done on a copy, so the dataframe object previously held in
        `self.df` (possibly the caller's) is not modified.
        """
        cats = self.cat_cols if self.cat_cols else self.df.select_dtypes(include=['object']).columns
        encoded = self.df.copy()
        for c in cats:
            # Replace the whole column so it takes the dtype of the codes;
            # `.loc[:, c] = ...` would write into the existing object column.
            encoded[c] = _preprocess_categorical(encoded[c])
        self.df = encoded
        return self.df

    def preprocess(self):
        """
        Run the full pipeline: drop ignored fields, then label encode categoricals.
        """
        self.remove_ignore_flds()
        return self.preprocess_categorical()

In [None]:
# c1: ten random floats, c2: ten random letters from 'a'-'j',
# c3: nine random floats plus one NaN, shuffled
example_df = pd.DataFrame(data={
    'c1': np.random.rand(10),
    'c2': [string.ascii_lowercase[np.random.randint(0, 10)] for _ in range(10)],
    'c3': np.random.permutation([np.nan, *np.random.rand(9)]),
})
example_df

Unnamed: 0,c1,c2,c3
0,0.532222,b,0.549972
1,0.157428,j,
2,0.73207,a,0.512227
3,0.731046,i,0.352248
4,0.913483,h,0.682245
5,0.338139,e,0.934327
6,0.506819,d,0.176334
7,0.314402,b,0.957635
8,0.181144,i,0.644976
9,0.351624,b,0.124792


In [None]:
# wrap the example frame: 'c3' is the field with the missing value, 'c2' is categorical
data = Dataset(
    example_df,
    missing_fld='c3',
    ignore_flds=None,
    cat_cols=['c2'],
)
# new_example_df = data.preprocess(); new_example_df

'c3'