In [None]:
# default_exp data

In [None]:
#export
import pandas as pd

from   pathlib import Path
from   fastai.tabular.all import *
from   datetime import datetime

In [None]:
#hide
from nbdev.showdoc import *

# Data Preparation

> Module to prepare data for experiments

The functions in this module provide a way to create the train and test datasets, a splitter, and the dataloaders.

In [None]:
#export
# Root directory holding the raw data files. `Path` does not expand `~` on its
# own, so it is resolved here; the result is an absolute path usable by every
# consumer (`open`, `Path.exists`, ...), not only by those that expand `~` themselves.
DATA_DIR = Path('~/data/').expanduser()

### Train Test Split

In [None]:
#export
def read_dataset(nrows:int=14000000):
    "Read the first `nrows` rows of the `train` file located in `DATA_DIR`."
    train_path = DATA_DIR / 'train'
    return pd.read_csv(train_path, nrows=nrows)
def add_time_feat(df):
    """
    Add a `click_hour` column derived from the `hour` column.

    Arguments:
        df: Dataset with an `hour` column

    Returns a new DataFrame with the extra column; `df` itself is not modified.
    """
    # Everything after the first 6 characters of `hour` (the part used as the
    # date prefix elsewhere in this notebook) is kept as the hour of the click.
    # NOTE(review): presumably `hour` is formatted as YYMMDDHH - confirm against the data.
    click_hour = df.hour.astype('str').str.slice(6)
    return df.assign(click_hour=click_hour)
    
def split_train_test(df, crit):
    """
    Partition a dataset ( df ) into a train part and a test part using a filter.

    Arguments:
        df  : Dataset
        crit: criterion/filter selecting the test rows; `None` disables the split

    Returns `(train, test)`; `test` is `None` when no criterion is given.
    """
    # no criterion -> the whole dataset is the train set and there is no test set
    if crit is None:
        return df, None

    selected  = df.loc[crit]     # rows matching the filter form the test set
    remaining = df.loc[~crit]    # every other row stays in the train set
    return remaining, selected

def make_dataset(df, crit=None):
    "Return the `(train, test)` pair produced by `split_train_test` for `df` and `crit`."
    train_part, test_part = split_train_test(df, crit)
    return train_part, test_part

In [None]:
#slow
df     = read_dataset()
df     = add_time_feat(df)

# rows whose `hour` starts with '141023' are held out as the test set
crit   = df.hour.astype('str').str.slice(0, 6) == '141023'
tr, te = make_dataset(df, crit)

# `test_eq` compares its two arguments, so pass the actual and expected values separately
test_eq(len(tr), 10129248)
test_eq(len(te),  3870752)

tr shape: (10129248, 24), te shape: (3870752, 24)


### Splitting Strategy

In [None]:
def DateSplitter(dt='141022', seed=None):
    """
    Create a function that splits `items` between train/valid based on date.

    Arguments:
        dt  : 6-character prefix of `hour` (as a string) selecting the validation rows
        seed: if given, passed to `torch.manual_seed`; the split itself is deterministic

    The returned function takes a DataFrame with an `hour` column and returns
    `(train_idx, valid_idx)` as arrays of row positions in ascending order.
    """
    def _inner(o):
        if seed is not None: torch.manual_seed(seed)
        # Work with row positions directly instead of overwriting `o.index`,
        # so the caller's DataFrame is not modified as a side effect.
        is_valid = o.hour.astype(str).str.slice(0, 6) == dt
        valid    = np.where(is_valid)[0]
        # The complement of the mask replaces a Python-set difference: no large
        # intermediate sets are built and the order is always ascending.
        train    = np.where(~is_valid)[0]

        return train,valid
    return _inner

In [None]:
#slow
tr_splits, va_splits = DateSplitter()(tr)

# `test_eq` compares its two arguments, so pass the actual and expected values separately
test_eq(len(tr_splits), 4792122)
test_eq(len(va_splits), 5337126)

(4792122,) (5337126,)


### DataLoaders

In [None]:
#export
def create_dl(df, bs, target, cat_names, cont_names, procs, splitter):
    """
    Build tabular `DataLoaders` for `df`.

    Arguments:
        df        : Dataset
        bs        : batch size
        target    : name of the dependent variable column
        cat_names : names of the categorical columns
        cont_names: names of the continuous columns
        procs     : preprocessing steps passed to `TabularDataLoaders.from_df`
        splitter  : callable applied to `df` that returns the train/valid splits
    """
    splits = splitter(df)
    # Forward `target` and `bs` instead of hard-coding "click" / 2048, so the
    # values supplied by the caller are actually honoured.
    dls    = TabularDataLoaders.from_df(df,
                                        path='.',
                                        procs=procs,
                                        cat_names=cat_names,
                                        cont_names=cont_names,
                                        y_names=target,
                                        splits=splits,
                                        bs=bs
                                       )
    return dls

In [None]:
#slow
# every feature is passed as categorical; names are grouped by the prefix they share
cat_names  = [
    'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]
cont_names = []   # no continuous features are used

target     = 'click'
bs         = 2048
procs      = [Categorify, FillMissing]
splitter   = DateSplitter(seed=41)

dls        = create_dl(tr, bs, target, cat_names, cont_names, procs, splitter)
dls.show()

Unnamed: 0,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,click
3280536,1010,1,85f751fd,c4e18dd6,50e219e0,826c565e,7801e8d9,0f2161f8,8a7fb045,be23dbfa,f07e20f8,4,0,21790,320,50,2513,3,35,-1,68,0
10013407,1002,0,1ee69d19,ce81106c,50e219e0,ecad2386,7801e8d9,07d7df22,60b5cd1b,e2c233db,507de649,0,0,15908,320,50,1752,3,297,100081,82,1
7790564,1005,0,85f751fd,c4e18dd6,50e219e0,36f58dec,2347f47a,f95efa07,a99f214a,8e787681,d780319b,1,0,21767,320,50,2506,0,35,-1,157,0
6718210,1005,0,85f751fd,c4e18dd6,50e219e0,e9739828,df32afa9,cef3e649,a99f214a,d6c0acb1,e55b4de4,1,0,21768,320,50,2506,0,35,100020,157,0
4512824,1005,0,85f751fd,c4e18dd6,50e219e0,66f5e02e,6f7ca2ba,0f2161f8,a99f214a,fc895806,2891f384,1,0,21678,320,50,2495,2,167,-1,23,0
6886687,1002,0,85f751fd,c4e18dd6,50e219e0,a37bf1e4,7801e8d9,07d7df22,dce843f8,97d296f6,3bcda2fe,0,0,14265,320,50,1526,2,169,100111,35,0
6909109,1005,0,cd58172f,b9c4ab81,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,cfcbdff7,158e4944,1,0,17877,320,50,2036,3,47,-1,156,0
648145,1005,0,23d99ea0,6bdbd889,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,339649fc,6e1e2240,1,0,16208,320,50,1800,3,167,100074,23,0
5713179,1005,0,4e7614cf,c1aa3c04,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,e70f5b2e,8ef046ab,1,0,21764,216,36,2506,0,35,100076,157,0
3903661,1005,0,85f751fd,c4e18dd6,50e219e0,7358e05e,b9528b13,cef3e649,8c2d936a,4b44384d,be6db1d7,1,0,456,320,50,122,3,1319,-1,15,0
