In [44]:
import pandas as pd
import numpy as np
import time
# Use the fully qualified option key: the abbreviation 'max_columns' also
# matches 'styler.render.max_columns' on recent pandas and is rejected as ambiguous.
pd.set_option('display.max_columns', 500)
# `display` / `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Data Cleaning

In [192]:
def date_to_int(row, col, df):
    """Convert the timestamp stored in ``row[col]`` to seconds since the epoch.

    Parameters
    ----------
    row : pandas.Series
        One row, e.g. as passed by ``DataFrame.apply(..., axis=1)``.
    col : str
        Label of the column holding a ``'%Y-%m-%d %H:%M:%S'`` formatted value.
    df : pandas.DataFrame
        Unused. Kept so existing ``df.apply(date_to_int, col=..., df=df, axis=1)``
        calls keep working.

    Returns
    -------
    float
        Result of ``time.mktime`` (the value is interpreted in the machine's
        local time zone), or NaN when the cell is missing.

    Raises
    ------
    ValueError
        If the value is present but does not match the expected format.
    """
    # Look the value up by label: integer keys on a label-indexed Series rely
    # on a deprecated positional fallback and on row/column order matching.
    date = row[col]
    if pd.isna(date):
        return np.nan
    return time.mktime(time.strptime(str(date), '%Y-%m-%d %H:%M:%S'))

In [249]:
def data_type(df):
    """Normalise column dtypes of ``df`` in place.

    For every date column this
      * parses the values and truncates them to whole seconds (stored back as
        datetimes), and
      * adds a ``<col>_int`` column with the local-time epoch seconds as
        ``float32`` (NaN where the date is missing).
    Identifier-like columns are cast to ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``date_cols`` and ``obj_cols`` below.

    Returns
    -------
    None
        All changes are made on ``df`` itself.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'

    def _to_epoch(text):
        # time.mktime interprets the parsed struct in the local time zone
        return time.mktime(time.strptime(str(text), stamp_format))

    # datetime and date to int
    date_cols = ['visit_date', 'prev_visit_date', 'prev_item_move_date',
                 'last_edit_date', 'creation_date']
    for col in date_cols:
        # convert multiple time formats into single string format
        as_text = pd.to_datetime(df[col]).dt.strftime(stamp_format)
        # make time features a specific data type in order to distinguish them from other numeric values.
        # Only this column is mapped (no row-wise apply over the whole frame); missing dates stay NaN.
        # NOTE(review): float32 cannot hold present-day epoch seconds exactly, so these
        # values are coarse -- confirm that is acceptable for the models.
        df['{}_int'.format(col)] = as_text.map(_to_epoch, na_action='ignore').astype(np.float32)
        # convert string format back into datetime
        df[col] = pd.to_datetime(as_text)

    # objects
    obj_cols = ['ship_id', 'address1', 'customer_id', 'sales_rep_id', 'item_id', 'old_item_id',
                'item_UPC', 'old_item_UPC', 'ship_list_pk', 'sales_rep_id_2', 'list_header_id']
    for col in obj_cols:
        df[col] = df[col].astype(object)

In [194]:
def nans(df):
    """Remove, in place, every row of ``df`` that contains a missing value.

    Returns None; the caller's frame is modified directly.
    """
    # caution: crime data cuts total data in ~1/3
    df.dropna(axis='index', how='any', inplace=True)

In [195]:
def zip_code_inc(row, df):
    """Return the 5-digit ZIP part of ``row['postal_code']`` as an int.

    Parameters
    ----------
    row : pandas.Series
        One row containing a ``postal_code`` entry (e.g. ``'02134-1234'``).
    df : pandas.DataFrame
        Unused. Kept so ``df.apply(zip_code_inc, df=df, axis=1)`` keeps working.

    Returns
    -------
    int
        The first five characters of the postal code; leading zeros are lost
        by the integer conversion.

    Raises
    ------
    ValueError
        If the first five characters are not an integer literal.
    """
    # Label lookup instead of integer position; str()/strip() so numeric or
    # padded codes do not break the slice.
    code = str(row['postal_code']).strip()
    return int(code[:5])

In [196]:
def days_between_visits(df):
    out_arr = []
    i = 0
    for index, row in df.iterrows():
        diff = row['visit_date'] - row['prev_visit_date']
        out_arr.append(pd.Timedelta(diff).days)
    return out_arr

In [197]:
# not working
def get_masked_df(df, level_list):
    """Unfinished helper (the notebook cell is marked "not working").

    NOTE(review): the body cannot run on its own. It reads ``first_filter``,
    ``second_filter``, ``third_filter`` and ``mask``, none of which are
    parameters or locals of this function, and it never uses ``level_list``.
    ``foo`` is reassigned on every innermost iteration and is never returned.
    Presumably the intent was to build the combined mask from the column
    names in ``level_list`` -- TODO confirm before relying on this function.
    """
    for ff_val in df[first_filter].unique():
        ff_mask = df[first_filter] == ff_val
        for sf_val in df[ff_mask][second_filter].unique():
            sf_mask = df[second_filter] == sf_val
            for tf_val in df[ ff_mask & sf_mask][third_filter].unique():
                tf_mask = df[third_filter] == tf_val
                # result of the innermost filter is computed but discarded
                foo = df[ ff_mask & sf_mask & tf_mask ].sort_values('visit_date', ascending=False)
    # `mask` is not defined anywhere in this function
    return df[ mask ].sort_values('visit_date', ascending=False)

In [198]:
# not working
def lag3(df, num_periods=1, first_filter='address1', second_filter='item_category', third_filter='item_upc'):
    """Experimental three-level lag builder (the notebook cell is marked "not working").

    Walks every (first_filter, second_filter, third_filter) combination in
    ``df``, sorts the matching rows by ``visit_date`` (newest first) and tries
    to write ``qty_shrink_per_day_lag<p>`` / ``shrink_value_per_day_lag<p>``
    values back into ``df``.

    NOTE(review), known problems visible in the code:
      * period 1 is skipped everywhere, so no ``*_lag1`` column is created or filled;
      * the ``while`` loop never updates ``length`` (that line is commented
        out), so once entered it can only stop through an exception;
      * it calls ``get_masked_df``, which references undefined names;
      * ``DataFrame.set_value`` no longer exists in pandas >= 1.0;
      * the default ``third_filter='item_upc'`` differs in case from the
        ``item_UPC`` column listed in ``data_type`` -- TODO confirm the name.
    """
    # initialize column(s) of nans
    for period in range(1, num_periods + 1):
        if period == 1:
            continue
        df['qty_shrink_per_day_lag{}'.format(period)] = np.nan
        df['shrink_value_per_day_lag{}'.format(period)] = np.nan
        
    j = 0
    for ff_val in df[first_filter].unique():
        j += 1
        # progress message is printed once per 1000 distinct first_filter values
        if j % 1000 == 0:
            print('another 10%...')
        ff_mask = df[first_filter] == ff_val
        for sf_val in df[ff_mask][second_filter].unique():
            sf_mask = df[second_filter] == sf_val
            for tf_val in df[ ff_mask & sf_mask][third_filter].unique():
                tf_mask = df[third_filter] == tf_val
                foo = df[ ff_mask & sf_mask & tf_mask ].sort_values('visit_date', ascending=False)
                #print('foo: ',foo[['visit_date']])
                length = len(foo.visit_date.unique()) # determine number of visits (because multiple item categories can be updated in a single visit)
                for period in range(1, num_periods + 1):
                    if period == 1:
                        continue
                    # skip if there's not enough data to create lag variables
                    filt_list = [first_filter, second_filter, third_filter]
                    end_of_filters = 0
                    # NOTE(review): neither `length` nor `end_of_filters` changes inside
                    # this loop, so the condition stays true once it is entered
                    while (length < period + 1) | end_of_filters:
                        foo = get_masked_df(df, filt_list)
                        # length = len(foo.visit_date.unique())
                        filt_list.pop()
                        continue
                    #print('length: ',length)
                    i = 0
                    # rows shifted up by `period` positions (row-based, not date-based)
                    foo_shifted = foo.shift(-period)
                    #print('fs: ', foo_shifted[['visit_date']])
                    foo_grouped = foo.groupby('visit_date').mean()
                    #print('fg: ', foo_shifted[['visit_date']])
                    for index, row in foo.iterrows():
                        #print(index)
                        date = foo_shifted[ foo_shifted.index == index].visit_date.values[0]
                        qty = foo_grouped[ foo_grouped.index == date].qty_shrink_per_day.values[0]
                        value = foo_grouped[ foo_grouped.index == date].shrink_value_per_day.values[0]
                        df.set_value(index, 'qty_shrink_per_day_lag{}'.format(period), qty)
                        df.set_value(index, 'shrink_value_per_day_lag{}'.format(period), value)
                        i += 1
                        #print(i)
                        if i + period == length:
                            break # back to cat loop

In [219]:
def init_nans(df, num_periods, lag_vars, col_name_suf):
    """Add one NaN-filled ``<lag_var>_lag<period><col_name_suf>`` column per
    (period, lag variable) pair to ``df``, in place.

    Periods run from 1 to ``num_periods`` inclusive; nothing is returned.
    """
    new_columns = [
        '{}_lag{}{}'.format(name, step, col_name_suf)
        for step in range(1, num_periods + 1)
        for name in lag_vars
    ]
    for column in new_columns:
        df[column] = np.nan

In [272]:
def lag_rec(df, num_periods, col_filters, date_col, lag_vars, col_name_suf, mask=True):
    '''
    Recursively loop through hierarchically ordered columns, grouping by date.
    INPUTS:
        df - pandas dataframe
        num_periods - number of periods (previous dates) to go back and attempt to fill lag values for
        col_filters - columns to hierarchically filter down on, with the last column being the one ultimately used
        date_col - date column to use in grouping and lag periods
        lag_vars - names of the numeric columns whose per-date mean is copied into the lag columns
        col_name_suf - suffix to append to newly created columns (helps to distinguish between last filtered column chosen)
        mask - DO NOT CHANGE. Required to be True to maintain dataframe mask between recursive iterations
    Returns:
        Nothing. All dataframe column changes occur in place, in the
        '<lag_var>_lag<period><col_name_suf>' columns.
    '''
    lag_vars = list(lag_vars)
    # begin with mask of all trues
    loop_mask = mask & np.ones(len(df), dtype=bool)
    col_filter = col_filters[0]
    for val in df[ loop_mask ][col_filter].unique():
        comb_mask = loop_mask & (df[col_filter] == val)

        if len(col_filters) > 1:
            # recursively narrow down on the remaining filter columns
            lag_rec(df, num_periods, col_filters[1:], date_col, lag_vars, col_name_suf, mask=comb_mask)
            continue

        group = df[ comb_mask ].sort_values(date_col, ascending=False)
        # determine number of visits (because multiple item categories can be updated in a single visit)
        n_dates = len(group[date_col].unique())
        if n_dates < 2:
            continue  # not enough history for even a one-period lag
        # Per-date means of the lag variables only; averaging every column
        # fails on non-numeric columns in current pandas. Computed once per
        # group because it does not depend on the period.
        date_means = group.groupby(date_col)[lag_vars].mean()
        group_dates = group[date_col]
        for period in range(1, num_periods + 1):
            # skip if there's not enough data to create lag variables
            if n_dates < period + 1:
                continue
            # NOTE(review): the lag is taken `period` ROWS further down the
            # date-sorted group, not `period` distinct dates; with several rows
            # per visit date this can land on the same date -- confirm intent.
            for pos in range(n_dates - period):
                target_index = group.index[pos]
                lag_date = group_dates.iloc[pos + period]
                for lag_var in lag_vars:
                    column = '{}_lag{}{}'.format(lag_var, period, col_name_suf)
                    # DataFrame.set_value was removed in pandas 1.0; .at is its replacement
                    df.at[target_index, column] = date_means.at[lag_date, lag_var]

In [232]:
def lag(df, num_periods, col_filters, date_filter, lag_vars, col_name_suf):
    """Create and fill the lag columns on ``df``; returns the same frame.

    The NaN placeholder columns are created first, then the recursive pass
    over ``col_filters`` fills them, using ``date_filter`` as the date column.
    """
    # step 1: placeholder columns, one per (period, lag variable)
    init_nans(df, num_periods=num_periods, lag_vars=lag_vars, col_name_suf=col_name_suf)
    # step 2: fill them group by group (modifies df in place)
    lag_rec(df, num_periods, col_filters,
            date_col=date_filter, lag_vars=lag_vars, col_name_suf=col_name_suf)
    return df

In [None]:
def lag_old(df, num_periods=1, first_filter='address1', second_filter='item_category', date_filter='visit_date',
        lag_var1='qty_shrink_per_day', lag_var2='shrink_value_per_day', col_name_suf=''):
    """Earlier two-level, two-variable version of the lag builder.

    For every (first_filter, second_filter) group the rows are sorted by
    ``date_filter`` (newest first) and each row receives, per period, the
    per-date mean of ``lag_var1`` / ``lag_var2`` found ``period`` rows further
    down the group.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place; ``<lag_var>_lag<period><col_name_suf>`` columns are
        created (NaN) and filled where enough history exists.
    num_periods : int
        Highest lag period to compute (periods start at 1).
    first_filter, second_filter : str
        Grouping columns, outer and inner.
    date_filter : str
        Date column used for sorting and for the per-date means.
    lag_var1, lag_var2 : str
        Numeric columns to lag.
    col_name_suf : str
        Suffix appended to the new column names.

    Returns
    -------
    None
    """
    lag_vars = [lag_var1, lag_var2]

    # initialize column(s) of nans
    for period in range(1, num_periods + 1):
        for lag_var in lag_vars:
            df['{}_lag{}{}'.format(lag_var, period, col_name_suf)] = np.nan

    for ff_val in df[first_filter].unique():
        ff_mask = df[first_filter] == ff_val
        for sf_val in df[ff_mask][second_filter].unique():
            sf_mask = df[second_filter] == sf_val
            group = df[ ff_mask & sf_mask ].sort_values(date_filter, ascending=False)
            # determine number of visits (because multiple item categories can be updated in a single visit)
            n_dates = len(group[date_filter].unique())
            if n_dates < 2:
                continue  # not enough history for any lag
            # Means of the lag variables only: averaging every column fails on
            # non-numeric columns in current pandas.
            date_means = group.groupby(date_filter)[lag_vars].mean()
            group_dates = group[date_filter]
            for period in range(1, num_periods + 1):
                # skip if there's not enough data to create lag variables
                if n_dates < period + 1:
                    continue
                for pos in range(n_dates - period):
                    target_index = group.index[pos]
                    lag_date = group_dates.iloc[pos + period]
                    for lag_var in lag_vars:
                        column = '{}_lag{}{}'.format(lag_var, period, col_name_suf)
                        # DataFrame.set_value was removed in pandas 1.0; use .at
                        df.at[target_index, column] = date_means.at[lag_date, lag_var]

In [256]:
def create(df, num_periods, col_filters, date_filter, lag_vars, col_name_suf):
    """Add the engineered columns to ``df`` and return it.

    Adds ``zip_code``, the per-day shrink rates and finally the lag columns
    (the remaining arguments are forwarded unchanged to ``lag``).
    """
    df['zip_code'] = df.apply(zip_code_inc, axis=1, df=df)

    # normalize target variables by the number of days since the previous visit
    # NOTE(review): a gap of 0 days produces inf/NaN rates here -- confirm how
    # same-day visits should be treated.
    elapsed_days = days_between_visits(df)
    rate_columns = (
        ('qty_shrink', 'qty_shrink_per_day'),
        ('shrink_value', 'shrink_value_per_day'),
    )
    for raw_name, rate_name in rate_columns:
        df[rate_name] = df[raw_name] / elapsed_days

    # add lag variables -- caution: takes a long time
    return lag(df, num_periods, col_filters, date_filter, lag_vars, col_name_suf)

In [250]:
def drop(df):
    """Delete, in place, the columns that carry no usable information.

    Raises KeyError at the first listed column that is absent from ``df``.
    """
    unwanted = (
        'address3',               # redundant info (same as address 2)
        'postal_code',            # create zip code
        'duration',               # all zero values
        'dist_customer_id',       # all -1 values
        'POG_version_timestamp',  # dup of visit_date
    )
    for column in unwanted:
        del df[column]

In [205]:
def dummy(df):
    """One-hot encode ``item_category`` and ``customer_id``.

    Returns a new frame holding the indicator columns plus the two original
    columns re-attached (as strings) at the end; ``df`` itself is not changed.
    """
    dummy_cols = ['item_category', 'customer_id']
    # keep a string copy of the originals, because get_dummies removes them
    originals_as_text = df[dummy_cols].astype(str)
    encoded = pd.get_dummies(df, columns=dummy_cols)
    encoded[dummy_cols] = originals_as_text
    return encoded

In [229]:
def clean(file, lag_periods, col_filters, date_filter, lag_vars, col_name_suf, remove_nan_rows=True):
    """Load a pickled raw frame and run the full cleaning pipeline on it.

    Steps, in order: dtype normalisation, feature/lag creation, removal of
    unused columns, one-hot encoding and (optionally) removal of rows that
    contain NaN. Returns the cleaned frame.
    """
    raw = pd.read_pickle(file)
    data_type(raw)  # in place
    featured = create(raw, lag_periods, col_filters, date_filter, lag_vars, col_name_suf)
    drop(featured)  # in place
    encoded = dummy(featured)
    if not remove_nan_rows:
        return encoded
    nans(encoded)  # in place
    return encoded

In [240]:
# Load the small raw subset into `df`.
# NOTE(review): `df` is reassigned by the clean(...) call in the following cell,
# so this load looks like a leftover from interactive testing -- confirm.
df = pd.read_pickle('data/SRP/raw_mini_subset.pkl')

In [273]:
# get SRP data and clean: 3 lag periods of both per-day shrink measures,
# grouped by store address only (hence the '_by_store' column suffix)
df = clean(file='data/SRP/raw_subset_300k.pkl', lag_periods=3, col_filters=['address1'], date_filter='visit_date',
        lag_vars=['qty_shrink_per_day', 'shrink_value_per_day'], col_name_suf='_by_store', remove_nan_rows=True)



In [274]:
# persist the cleaned frame (before any public data is joined)
df.to_pickle('data/SRP/clean_data_no_public_lag3_by_store.pkl')

# Add public data

In [279]:
# reload the cleaned frame saved above, so this section can start from the pickle
df = pd.read_pickle('data/SRP/clean_data_no_public_lag3_by_store.pkl')

In [280]:
def add_data(df, include_crime=True, remove_nan_rows=True):
    """Left-join the public data sets onto ``df`` and return the result.

    Food-desert, unemployment and population-density tables are joined on
    ``zip_code``; crime data (optional) is joined on ``state``/``city``.
    Density values of 0 become NaN in the renamed ``dens_sq_mile`` column.
    With ``remove_nan_rows`` every row containing NaN is dropped at the end.
    """
    # load data (all pickles are read up front, crime included)
    zip_tables = [
        pd.read_pickle('data/Food_Deserts/FD_clean.pkl').set_index('Zip Code'),
        pd.read_pickle('data/Unemployment/unemp_clean.pkl').set_index('Zip'),
        # income ('data/Income/income_clean.pkl', indexed by 'ZIPCODE') is currently disabled
        pd.read_pickle('data/Pop_Density/density_clean.pkl').set_index('Zip/ZCTA'),
    ]
    crime = pd.read_pickle('data/Crime/grouped_clean.pkl').set_index(['state', 'city'])

    # join via zip code
    for table in zip_tables:
        df = df.join(table, on=['zip_code'], how='left')

    # rename the density column and treat a density of 0 as missing
    df['dens_sq_mile'] = df.pop('dens/sq_mile').replace(0, np.nan)

    # join via city/state
    if include_crime:
        df = df.join(crime, on=['state', 'city'], how='left')

    # drop all rows that contain nan
    if remove_nan_rows:
        nans(df)

    return df

In [281]:
def zip_code_str(row, df):
    """Return ``row['zip_code']`` as a zero-padded, 5-character string.

    Parameters
    ----------
    row : pandas.Series
        One row containing a ``zip_code`` entry.
    df : pandas.DataFrame
        Unused. Kept so ``df.apply(zip_code_str, df=df, axis=1)`` keeps working.

    Returns
    -------
    str
        ``str(zip_code)`` left-padded with zeros to at least five characters.
    """
    # Label lookup instead of integer position on a label-indexed row.
    code = row['zip_code']
    # A row taken from an all-numeric frame arrives as floats; render an
    # integral float such as 2134.0 as '02134' rather than '2134.0'.
    if isinstance(code, (float, np.floating)) and float(code).is_integer():
        code = int(code)
    return str(code).zfill(5)

In [282]:
# join all public data sets (crime included) and drop rows left with NaN
df = add_data(df, include_crime=True, remove_nan_rows=True)
# turn zip_code back into a zero-padded string
df['zip_code'] = df.apply(zip_code_str, df=df, axis=1)

# TODO: still need to impute NaNs and 0s

In [283]:
# persist the cleaned frame with the public data joined in
df.to_pickle('data/SRP/clean_data_public_all_lag3_by_store.pkl')

# POA
- Create averages:
    - Avg qty shrink/day, shrink_sales/day, etc.
- Engineer lag terms (i.e., last visit, last month, last season)
    - Use these in whatever model I want
    - Use the averaged values