In [None]:
# default_exp data

# default_cls_lvl 3

In [None]:
#hide
from nbdev.showdoc import *

# Data

> Functions for pre-processing data frames before feeding them into a decision tree etc.

In [None]:
#export
from decision_tree.imports import *
from decision_tree.core import *

## Dataframe pre-processing functions

This is all "borrowed" from https://github.com/fastai/fastai/blob/master/old/fastai/structured.py

In [None]:
#export
def add_dateparts(df, col):
    """Expand a datetime column of `df` into date-part columns - inplace.

    df:  dataframe that is modified in place.
    col: datetime column of `df` (a Series); its name, minus a trailing
         'date'/'Date', is used as the prefix of the new columns.

    Adds `{prefix}Year`, `Month`, `Week`, `Day`, `Dayofweek`, `Dayofyear`, the
    `Is_*` month/quarter/year boundary flags and `{prefix}Elapsed` (whole seconds
    since 1970-01-01), then drops the original column. Returns None.
    Rows holding NaT get NaN in `Week` and `Elapsed`.
    """
    targ_pre = re.sub('[Dd]ate$', '', col.name)
    attrs = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
             'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
             'Is_year_end', 'Is_year_start']
    for attr in attrs:
        if attr == 'Week' and hasattr(col.dt, 'isocalendar'):
            # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
            # `isocalendar().week` is its replacement.
            week = col.dt.isocalendar().week
            # isocalendar() returns a nullable UInt32 column; convert to a plain numpy
            # dtype (float when NaT rows are present so they can be stored as NaN).
            df[targ_pre + attr] = week.astype(np.float64 if week.isna().any() else np.int64)
        else:
            df[targ_pre + attr] = getattr(col.dt, attr.lower())
    # Subtracting the epoch and floor-dividing by one second works for any datetime
    # resolution, unlike casting to int64 and assuming nanoseconds.
    epoch = pd.Timestamp('1970-01-01', tz='UTC' if col.dt.tz is not None else None)
    df[targ_pre + 'Elapsed'] = (col - epoch) // pd.Timedelta(seconds=1)
    df.drop(col.name, axis=1, inplace=True)

In [None]:
#export
def fix_missing(df, col, na_dict):
    """Impute missing values of a numeric column of `df` in place.

    When `col` has missing values, or its name is already a key of `na_dict`,
    a boolean `{name}_na` column flagging the missing rows is added and the
    gaps are filled with `na_dict[name]`. If the name is not yet in `na_dict`,
    the column median is stored there first.
    """
    name = col.name
    missing = pd.isnull(col)
    # Nothing to do when the column is complete and no fill value was recorded for it.
    if not (missing.sum() or name in na_dict): return
    df[name + '_na'] = missing
    if name not in na_dict: na_dict[name] = col.median()
    df[name] = col.fillna(na_dict[name])

In [None]:
#export
def numericalize(df, col):
    """Replace `df[col.name]` with the 1-based category codes of `col` (missing values become 0)."""
    codes = pd.Categorical(col).codes
    df[col.name] = codes + 1

In [None]:
#export
def proc_df(df, y_name, na_dict=None):
    """Split `df` into numeric features and a target, without modifying `df`.

    df:      input dataframe.
    y_name:  name of the column that holds the dependent variable.
    na_dict: optional {column name: fill value} mapping from an earlier call;
             it is copied, never modified.

    Returns `(x, y, na_dict)`: the processed feature frame, the target column
    (replaced by its category codes when it is not numeric) and the fill values
    that were used.
    """
    df = df.infer_objects() # make a copy and convert cols of object type to more specific types
    if not is_numeric_dtype(df[y_name]): df[y_name] = pd.Categorical(df[y_name]).codes
    y = df[y_name]
    df.drop([y_name], axis=1, inplace=True)
    na_dict = {} if na_dict is None else na_dict.copy()
    # Iterate over a snapshot of the column names: the loop body drops and appends
    # columns, which must not disturb the iteration or cause columns to be skipped.
    for name in list(df.columns):
        col = df[name]
        if pd.isnull(col).all() and name not in na_dict:
            print(f'WARNING: all values for {name} are null. Column will be dropped')
            df.drop(name, axis=1, inplace=True)
        elif is_numeric_dtype(col): fix_missing(df, col, na_dict)
        # np.issubdtype raises TypeError on pandas extension dtypes (category, string,
        # tz-aware datetime); the pandas helper accepts every dtype.
        elif pd.api.types.is_datetime64_any_dtype(col): add_dateparts(df, col)
        else: numericalize(df, col)
    return df, y, na_dict

In [None]:
# Smoke test for proc_df: one int, one string, one date, one partly-null and one all-null column.
dates = pd.date_range('2000-12-14', periods=3, freq='D')
df = pd.DataFrame({'col1':[1,2,3], 'col2':['a','b','a'], 'col3date': dates, 'col4':[1.1,np.nan,None], 'col5':[None,np.nan,None]})
print(df)
test_x, test_y, test_na_dict = proc_df(df, 'col1')
# proc_df works on a copy, so the input frame keeps all of its columns
assert list(df.columns) == ['col1', 'col2', 'col3date', 'col4', 'col5']
# the dependent variable is split off unchanged
assert test_y.tolist() == [1, 2, 3]
# strings become 1-based category codes
assert test_x['col2'].tolist() == [1, 2, 1]
# the date column is replaced by its date parts
assert 'col3date' not in test_x.columns
assert test_x['col3Year'].tolist() == [2000, 2000, 2000]
# missing numeric values are filled with the median and flagged
assert test_x['col4'].tolist() == [1.1, 1.1, 1.1]
assert test_x['col4_na'].tolist() == [False, True, True]
assert test_na_dict == {'col4': 1.1}
# the all-null column is dropped
assert 'col5' not in test_x.columns
# passing the fill values back in reproduces them
assert proc_df(df, 'col1', test_na_dict)[2] == test_na_dict
test_x, test_y, test_na_dict

   col1 col2   col3date  col4  col5
0     1    a 2000-12-14   1.1   NaN
1     2    b 2000-12-15   NaN   NaN
2     3    a 2000-12-16   NaN   NaN


(   col2  col4  col3Year  col3Month  col3Week  col3Day  col3Dayofweek  \
 0     1   1.1      2000         12        50       14              3   
 1     2   1.1      2000         12        50       15              4   
 2     1   1.1      2000         12        50       16              5   
 
    col3Dayofyear  col3Is_month_end  col3Is_month_start  col3Is_quarter_end  \
 0            349             False               False               False   
 1            350             False               False               False   
 2            351             False               False               False   
 
    col3Is_quarter_start  col3Is_year_end  col3Is_year_start  col3Elapsed  \
 0                 False            False              False    976752000   
 1                 False            False              False    976838400   
 2                 False            False              False    976924800   
 
    col4_na  
 0    False  
 1     True  
 2     True  ,
 0    1
 1    2
 2   

In [None]:
#export
class DataWrapper():
    "Wraps the data that could be used for training trees or making predictions"

    @classmethod
    def from_pandas(cls, x, y):
        "Build a wrapper from dataframe `x` and series `y`; values and column names are copied to numpy arrays."
        # TODO:  support more input types
        # `cls` (not a hard-coded DataWrapper) so subclasses get instances of their own type
        return cls(x.to_numpy(copy=True), y.to_numpy(copy=True), x.columns.to_numpy(copy=True), y.name)

    @classmethod
    def from_data_wrapper(cls, data, sample_idxs):
        "Build a wrapper holding the rows of `data` selected by `sample_idxs`; names are carried over."
        return cls(data.x[sample_idxs], data.y[sample_idxs], data.x_names, data.y_name)

    def __init__(self, x, y, x_names, y_name=None):
        """x: 2d array of features, y: targets with one entry per row of x,
        x_names: names of the columns of x, y_name: name of the target ('y' when falsy).
        Raises ValueError when x and y have a different number of rows."""
        self.x, self.y, self.x_names, self.y_name = x, y, x_names, y_name if y_name else 'y'
        self.x_rows, self.x_cols = self.x.shape
        if len(self.y) != self.x_rows:
            raise ValueError(f'x has {self.x_rows} rows but y has {len(self.y)} entries')
        self.all_x_col_idxs = np.arange(self.x_cols)
        self.all_x_row_idxs = np.arange(self.x_rows)

    def get_sample(self, sample_idxs, col_idx=None):
        """sample_idxs: int for single row, array of ints for multiple rows.
        col_idx: optional column(s) of x to return instead of all columns.
        Returns the tuple (selected x, selected y)."""
        if col_idx is None: return self.x[sample_idxs], self.y[sample_idxs]
        return self.x[sample_idxs, col_idx], self.y[sample_idxs]

    def head(self, n_rows):
        "New wrapper with the first `n_rows` rows (all rows when `n_rows` exceeds the length)."
        return type(self).from_data_wrapper(self, slice(n_rows))

    def tail(self, n_rows):
        "New wrapper with the last `n_rows` rows (all rows when `n_rows` exceeds the length)."
        # Clamp at 0: a negative start would wrap around and select the wrong rows.
        start = max(self.x_rows - n_rows, 0)
        return type(self).from_data_wrapper(self, slice(start, self.x_rows))

    def __repr__(self):
        return f'DataWrapper(x:{self.x_names} y:{self.y_name}, len:{len(self.x)})'

In [None]:
# Wrap the frame and target produced by proc_df above.
test_data = DataWrapper.from_pandas(test_x, test_y)

# Shape bookkeeping: 3 rows by 16 feature columns.
assert test_data.x_rows == 3
assert np.array_equal(np.arange(3), test_data.all_x_row_idxs)
assert np.array_equal(np.arange(16), test_data.all_x_col_idxs)

# Rows 1 and 2 of column 0, together with the matching targets.
sample_x, sample_y = test_data.get_sample([1, 2], 0)
assert np.array_equal([2, 1], sample_x)
assert np.array_equal([2, 3], sample_y)

# pass an array into sample_idxs to get a 2d array back - i.e. multiple rows of data
assert test_data.get_sample([0, 1, 2], None)[0].shape == test_data.x.shape
# pass an int into sample_idxs to get a 1d array back - i.e. one row of data
assert test_data.get_sample(1, None)[0].shape[0] == test_data.x.shape[1]

# head / tail each return a wrapper holding the requested number of rows
test_head = test_data.head(2)
test_tail = test_data.tail(2)
assert test_head.x_rows == 2
assert test_tail.x_rows == 2

# Sub-sampling rows 0 and 2 renumbers the row indexes from zero.
test_data = DataWrapper.from_data_wrapper(test_data, [0, 2])
assert test_data.x_rows == 2
assert np.array_equal(np.arange(2), test_data.all_x_row_idxs)
sample_x, sample_y = test_data.get_sample([0, 1], 0)
assert np.array_equal([1, 1], sample_x)
assert np.array_equal([1, 3], sample_y)

In [None]:
#hide
#export
class TestClassThatDoesNothing():
    "Placeholder class with no state and no behavior."
    def __init__(self):
        pass

In [None]:
#hide
from nbdev.export import *
# Run nbdev's notebook export; it prints one "Converted <notebook>" line per notebook it processes.
notebook2script()

Converted 000_target_module.ipynb.
Converted 001_exports_to_target_module.ipynb.
Converted 002_target_module.ipynb.
Converted 00_core.ipynb.
Converted 10_data.ipynb.
Converted 20_models.ipynb.
Converted 21_models-extra.ipynb.
Converted 30_test_flag.ipynb.
Converted 40_test_export.ipynb.
Converted 50_test_doc.ipynb.
Converted 51_test_show_doc.ipynb.
Converted 60_all_test.ipynb.
Converted 61_test_add2__all__.ipynb.
Converted 70_multi_all_test_flag.ipynb.
Converted 71_tensor_patch.ipynb.
Converted 72_if__name__.ipynb.
Converted 73_in_ipython.ipynb.
Converted 80_test_coverage.ipynb.
Converted 81_test_coverage.ipynb.
Converted index.ipynb.
