In [2]:
# This version of notebook is compact and for directly generating features, modeling and testing

In [3]:
%config Completer.use_jedi = False # to enable auto complete
import pandas as pd
import logging
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [4]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('Main_Churn_Perdictors')

In [22]:
# data cleaning
# this function is designed to handle the long tailed distributions (heavily found in the data)
# hence it makes sense to test it on heavil tailed distributions ()
from numpy import random
def remove_outliers_quantiles(df,columns,q=0.99):
    # this function remove outliers based on quantiles, columns must be numeric
    n_rows = df.shape[0]
    running_idx = pd.Series(np.repeat(True,n_rows))
    for col in columns:
        if isinstance(df[col][0],(np.float_,np.int_)):
            
            val_q = np.quantile(a=df[col],q=q)
            idx = df[col]<=val_q
            running_idx = np.logical_and(running_idx,idx)

            row_to_remove_count = idx.value_counts().sort_values().values[0]
            logger.info(f"""Removing {row_to_remove_count} based on columns {col} which is
                {np.round(1.0*row_to_remove_count/n_rows,4)*100} % of the rows""")
        else:
            logger.error(f'Cannot remove outliers from column {col} as it is not numeric!')
        filterd_df = df.loc[running_idx,:]
        
    total_rows_removed = running_idx.value_counts().sort_values().values[0]
    logger.info(f"""Total rows remove = {total_rows_removed} which is 
            {np.round(1.0*total_rows_removed/n_rows,4)*100} % of the rows""")
    return filterd_df

def test_remove_outliers_quantiles():
    n = 10e6
    alpha = 0.01
    df = pd.DataFrame({'X':random.exponential(scale=0.1,size=int(n)),\
                       'Y': random.exponential(scale=0.01,size=int(n))})
    
    df_processed = remove_outliers_quantiles(df,['X','Y'],1-alpha)
    n_processed = df_processed.shape[0]
    fraction = (n-n_processed) / n_processed
    # assert the right amount of data is cleaned
    # note because the and condtion in filtering, the lower bound of filerting fraction is alpha
    logger.info(f'clean fraction = {fraction}')
    assert fraction >= alpha , 'cleaned less than exepcted fraction'
test_remove_outliers_quantiles()

INFO:Main_Churn_Perdictors:Removing 100000 based on columns X which is
                1.0 % of the rows
INFO:Main_Churn_Perdictors:Removing 100000 based on columns Y which is
                1.0 % of the rows
INFO:Main_Churn_Perdictors:Total rows remove = 198958 which is 
            1.9900000000000002 % of the rows
INFO:Main_Churn_Perdictors:clean fraction = 0.020299678340323406


In [6]:
# set of Feature generation functions along with their unit testing

In [47]:
def upsample(df,target_col):
    if len(df[target_col].unique())!=2:
        raise ValueError('Numer of target unique values must = 2')
    count = df[target_col].value_counts(normalize=True).sort_values()
    minority_df = df.loc[df[target_col]==count.index[0],:]
    majority_df = df.loc[df[target_col]==count.index[1],:]
    # upsampling
    n_majority = majority_df.shape[0]
    minority_df_upsampled = minority_df.sample(n=n_majority,replace=True)
    balanced_df = majority_df.append(minority_df_upsampled,ignore_index = True)
    count_balanced = balanced_df[target_col].value_counts(normalize=True)
    return balanced_df
def test_upsample():
    logger = logging.getLogger('test_upsample')
    n = 100
    df = pd.DataFrame({'Y':np.repeat(0,100)})
    df.loc[0:10,'Y'] = 1
    #logger.info(f"""Y value counts = \n{df['Y'].value_counts(normalize=True)}""")
    df_upsampled = upsample(df,'Y') 
    #logger.info(f"""Y value counts = \n{df_upsampled['Y'].value_counts(normalize=True)}""")
    upsampled_counts = df_upsampled['Y'].value_counts(normalize=True)
    assert np.abs(1-upsampled_counts.values[0]/upsampled_counts.values[1]) <= 1e-2
    assert df_upsampled.shape[0] > df.shape[0]
test_upsample()

In [66]:
# wrapper to just get a scalar version of the mode
# TODO , add simple unit test
def get_mode_scalar(series):# wrapper to get scalar value, not a series
    return pd.Series(series).mode()[0]

In [67]:
# Feature generation function, focusing on recurring customers
# intuitively, the less the better 
def avg_days_between_orders(dates,default = 365): # need to revisit the default value , set to be 1 year
    # the main intuition is that the higher the number , the less loyal the customer
    # for corner case of only 1 order, setting the default to high, which is in the same direction of less loyal
    # customers
    if len(dates) <2:
        return default
    dates_series = pd.Series(data=pd.to_datetime(dates).values).sort_values()
    dates_series_lag1 = dates_series.shift(1)
    time_delta =  dates_series - dates_series_lag1
    time_delta_day = time_delta.apply(lambda x:x.days)
    return np.nanmean(time_delta_day)
def test_avg_days_between_orders():
    dates = ['2020-01-20','2020-01-23','2020-01-30']
    assert np.abs(avg_days_between_orders(dates)-5) <1e-3
    dates = ['2020-01-20']
    assert np.abs(avg_days_between_orders(dates)-365) <1e-3
test_avg_days_between_orders()

In [68]:
def time_since_last_order(dates,current_date):
    dates = pd.to_datetime(dates)
    max_date = np.nanmax(dates)
    current_date = pd.to_datetime(current_date)
    if current_date < max_date:
        raise ValueError('Current date must be >= max date in the dateset')
    return (current_date-max_date).days
def test_time_since_last_order():
    time_val = time_since_last_order(dates=['2020-01-20','2020-01-23','2020-01-30'],current_date='2020-02-02')
    assert np.abs(time_val-3) <=1e-2
test_time_since_last_order()

In [104]:
# Feature matrix generation function
def generate_feature_matrix(df,target,cache_filepath):
    # TODO : improve the parameterization of the feature_marix function
    logger = logging.getLogger('Feature Matrix Generator')
    if cache_filepath is not None:
        feature_mtx = pd.read_csv(cache_filepath)
        logger.info(f'loading raw feature matrix from cache file {cache_filepath}')
    else:
        logger.info(f'Computing raw feature matrix from cache file {cache_filepath}')
        df['order_date_2'] = df['order_date'] # a quick workaround to be able to generate 2 features from
        # the same column at once
        feature_mtx = df.groupby(by='customer_id',as_index = False)\
                .agg({'customer_order_rank':np.nanmax,'is_failed':np.nanmean,\
                      'delivery_fee':np.nanmean,'amount_paid':np.nanmean,\
                      'payment_id':get_mode_scalar,'platform_id':get_mode_scalar,\
                      'is_returning_customer':get_mode_scalar,
                     'order_date':avg_days_between_orders,\
                      'order_date_2':lambda x:time_since_last_order(x,current_date=max_order_date)})
        feature_mtx.to_csv(cache_filepath,index=False)
    # remove outliers
    logger.info(f'shape of raw feature matrix = {feature_mtx.shape[0]}')
    feature_mtx = remove_outliers_quantiles(df=feature_mtx,columns=['delivery_fee', 'amount_paid'])
    logger.info(f'shape of feature matrix after removing outliers= {feature_mtx.shape[0]}')
    
    # upsampling
    feature_mtx = upsample(df=feature_mtx,target_col=target)
    logger.info(f'shape of feature matrix after upsampling = {feature_mtx.shape[0]}')
    
    # one-hot encoding
    feature_mtx = pd.get_dummies(data=feature_mtx,columns=['payment_id','platform_id'])
    feature_mtx.drop(columns ='customer_id',inplace=True)
    logger.info(f'shape of feature matrix after one-hot encoding = {feature_mtx.shape[0]}')
    return feature_mtx

In [105]:
# Main code snippet 
def main():
    main_logger = logging.getLogger('Main')
    # data loading and merging 
    orders_df = pd.read_csv('../data/machine_learning_challenge_order_data.csv')
    labeled_df = pd.read_csv('../data/machine_learning_challenge_labeled_data.csv')
    
    merged_df = pd.merge(left=orders_df,right=labeled_df,on='customer_id')
    # FIXME , to remove
    main_logger.info(f'Data loaded and merged, with shape {merged_df.shape}')
    #train_df_filtered = remove_outliers_quantiles()
    main_logger.info('Generate feature matrix')
    raw_feature_matrix_cache_file = 'raw_feature_matrix.csv'
    target_col = 'is_returning_customer'
    generate_feature_matrix(df=merged_df,target = target_col,cache_filepath=raw_feature_matrix_cache_file)
    
    # Model Buil
main()

INFO:Main:Data loaded and merged, with shape (786600, 14)
INFO:Main:Generate feature matrix
INFO:Feature Matrix Generator:loading raw feature matrix from cache file raw_feature_matrix.csv
INFO:Feature Matrix Generator:shape of raw feature matrix = 245455
INFO:Main_Churn_Perdictors:Removing 1058 based on columns delivery_fee which is
                0.43 % of the rows
INFO:Main_Churn_Perdictors:Removing 2455 based on columns amount_paid which is
                1.0 % of the rows
INFO:Main_Churn_Perdictors:Total rows remove = 3468 which is 
            1.41 % of the rows
INFO:Feature Matrix Generator:shape of feature matrix after removing outliers= 241987
INFO:Feature Matrix Generator:shape of feature matrix after upsampling = 373728
INFO:Feature Matrix Generator:shape of feature matrix after one-hot encoding = 373728
