In [None]:
# default_exp core

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#export
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from task_substitution.external_data import *

In [None]:
#export
# NOTE(review): `_all_` (single underscores) looks like nbdev's hook for adding names to the
# generated module's `__all__` -- every helper listed here is underscore-prefixed. Confirm
# against the nbdev version in use before renaming it.
_all_ = ['_pct_missing_values', '_preprocess_categorical', '_ignore_flds', '_split_train_test', '_split_by_null']

# Core

> Basic utility functions used across task substitution.

In [None]:
#hide
from nbdev.showdoc import *

### Basic foundations

In [None]:
# export
def _pct_missing_values(feature:pd.Series)->float:
    """
    Given a feature calculates percentage of missing values
    """
    if not isinstance(feature, pd.Series): feature = pd.Series(feature)
    return feature.isnull().sum() / len(feature) * 100

In [None]:
# pd.Series input: one null out of four values, then no nulls at all
assert _pct_missing_values(pd.Series([1, np.nan, 2, 3])) == 25.0
assert _pct_missing_values(pd.Series([1, 2, 3, 4])) == 0.0
# plain lists are accepted as well
assert _pct_missing_values([1, 2, 3, 4]) == 0.0
assert _pct_missing_values([1, np.nan, 3, 4]) == 25.0

In [None]:
#export
def _preprocess_categorical(cat_feat:pd.Series)->np.ndarray:
    """
    Given a categorical feature, label encode it.
    """
    return pd.Categorical(cat_feat).codes + 1

In [None]:
example_df = pd.DataFrame({'c': ['a', 'b', 'a']})

# the encoder hands back a plain numpy array ...
assert type(_preprocess_categorical(example_df['c'])) == np.ndarray
# ... in which 'a' is encoded as 1 and 'b' as 2
assert all(_preprocess_categorical(example_df['c']) == pd.Series([1, 2, 1]))

In [None]:
#export
def _ignore_flds(df:pd.DataFrame, ignore_flds:list)->pd.DataFrame:
    """
    Given a dataframe and list of fields to ignore, this method would drop them from the dataframe
    """
    df_cpy = df.copy()
    df_cpy.drop(ignore_flds, axis=1, inplace=True)
    return df_cpy

In [None]:
example_df = pd.DataFrame({'a': [1, 2, 3],
                           'b': [3, 4, 1],
                           'c': ['a', 'x', 'z']
                          })

# dropping 'a' still yields a DataFrame, now holding only the two remaining columns
assert type(_ignore_flds(example_df, ['a'])) == pd.DataFrame
assert _ignore_flds(example_df, ['a']).shape[1] == 2
assert _ignore_flds(example_df, ['a']).columns.tolist() == ['b', 'c']

In [None]:
#export
def _split_by_null(df:pd.DataFrame, target_fld:str)->(pd.DataFrame, pd.DataFrame):
    """
    Given a dataframe with target name it would split df into two dataframes
    and shuffle both the dataframes as well, based on presence of value in the target feature or not.
    """
    mask = df[target_fld].notnull()
    train = df.loc[mask, :].sample(frac=1.)
    test  = df.loc[~mask, :].sample(frac=1.)
    
    return train, test

In [None]:
# Render the signature and docstring of `_split_by_null`; the recorded output follows this cell.
show_doc(_split_by_null)

<h4 id="_split_by_null" class="doc_header"><code>_split_by_null</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>_split_by_null</code>(**`df`**:`DataFrame`, **`target_fld`**:`str`)

Given a dataframe with target name it would split df into two dataframes
and shuffle both the dataframes as well, based on presence of value in the target feature or not.

In [None]:
df = get_fake_data_with_missing_values()

train, test = _split_by_null(df, 'f3')
# `test` must contain exactly as many rows as there are null 'f3' values in `df`
assert len(test) == df.loc[df['f3'].isnull()].shape[0]

In [None]:
#export
def _split_train_test(df:pd.DataFrame, split_params:dict)->(pd.DataFrame,pd.DataFrame):
    """
    Split `df` into a train part and a test part with `train_test_split`.

    `split_params` is unpacked as keyword arguments to `train_test_split`.
    """
    train_part, test_part = train_test_split(df, **split_params)
    return train_part, test_part

In [None]:
df = get_fake_numeric_data()

train, test = _split_train_test(df, {'test_size': .2, 'random_state': 41})

assert isinstance(train, pd.DataFrame)
assert isinstance(test, pd.DataFrame)
assert train.shape[1] == test.shape[1]
# every row must land in one of the two parts
assert len(train) + len(test) == len(df)
# `.2 * len(df)` is a float and need not be a whole number of rows, so allow a one-row
# tolerance instead of demanding exact equality with an integer row count
assert abs(len(test) - .2 * len(df)) <= 1
assert abs(len(train) - .8 * len(df)) <= 1

### Export

In [None]:
# Run nbdev's notebook2script; the recorded output below lists the notebooks it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_data.ipynb.
Converted 02_model.ipynb.
Converted 03_recover_missing.ipynb.
Converted 04_external_data.ipynb.
Converted 05_train_test_similarity.ipynb.
Converted index.ipynb.
