In [2]:
# This version of notebook is compact and for directly generating features, modeling and testing

In [3]:
%config Completer.use_jedi = False # to enable auto complete
import pandas as pd
import logging
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [4]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('Main_Churn_Perdictors')

In [22]:
# data cleaning
# this function is designed to handle the long tailed distributions (heavily found in the data)
# hence it makes sense to test it on heavil tailed distributions ()
from numpy import random
def remove_outliers_quantiles(df,columns,q=0.99):
    # this function remove outliers based on quantiles, columns must be numeric
    n_rows = df.shape[0]
    running_idx = pd.Series(np.repeat(True,n_rows))
    for col in columns:
        if isinstance(df[col][0],(np.float_,np.int_)):
            
            val_q = np.quantile(a=df[col],q=q)
            idx = df[col]<=val_q
            running_idx = np.logical_and(running_idx,idx)

            row_to_remove_count = idx.value_counts().sort_values().values[0]
            logger.info(f"""Removing {row_to_remove_count} based on columns {col} which is
                {np.round(1.0*row_to_remove_count/n_rows,4)*100} % of the rows""")
        else:
            logger.error(f'Cannot remove outliers from column {col} as it is not numeric!')
        filterd_df = df.loc[running_idx,:]
        
    total_rows_removed = running_idx.value_counts().sort_values().values[0]
    logger.info(f"""Total rows remove = {total_rows_removed} which is 
            {np.round(1.0*total_rows_removed/n_rows,4)*100} % of the rows""")
    return filterd_df

def test_remove_outliers_quantiles():
    n = 10e6
    alpha = 0.01
    df = pd.DataFrame({'X':random.exponential(scale=0.1,size=int(n)),\
                       'Y': random.exponential(scale=0.01,size=int(n))})
    
    df_processed = remove_outliers_quantiles(df,['X','Y'],1-alpha)
    n_processed = df_processed.shape[0]
    fraction = (n-n_processed) / n_processed
    # assert the right amount of data is cleaned
    # note because the and condtion in filtering, the lower bound of filerting fraction is alpha
    logger.info(f'clean fraction = {fraction}')
    assert fraction >= alpha , 'cleaned less than exepcted fraction'
test_remove_outliers_quantiles()

INFO:Main_Churn_Perdictors:Removing 100000 based on columns X which is
                1.0 % of the rows
INFO:Main_Churn_Perdictors:Removing 100000 based on columns Y which is
                1.0 % of the rows
INFO:Main_Churn_Perdictors:Total rows remove = 198958 which is 
            1.9900000000000002 % of the rows
INFO:Main_Churn_Perdictors:clean fraction = 0.020299678340323406


In [6]:
# set of Feature generation functions along with their unit testing

In [47]:
def upsample(df,target_col):
    if len(df[target_col].unique())!=2:
        raise ValueError('Numer of target unique values must = 2')
    count = df[target_col].value_counts(normalize=True).sort_values()
    minority_df = df.loc[df[target_col]==count.index[0],:]
    majority_df = df.loc[df[target_col]==count.index[1],:]
    # upsampling
    n_majority = majority_df.shape[0]
    minority_df_upsampled = minority_df.sample(n=n_majority,replace=True)
    balanced_df = majority_df.append(minority_df_upsampled,ignore_index = True)
    count_balanced = balanced_df[target_col].value_counts(normalize=True)
    return balanced_df
def test_upsample():
    logger = logging.getLogger('test_upsample')
    n = 100
    df = pd.DataFrame({'Y':np.repeat(0,100)})
    df.loc[0:10,'Y'] = 1
    #logger.info(f"""Y value counts = \n{df['Y'].value_counts(normalize=True)}""")
    df_upsampled = upsample(df,'Y') 
    #logger.info(f"""Y value counts = \n{df_upsampled['Y'].value_counts(normalize=True)}""")
    upsampled_counts = df_upsampled['Y'].value_counts(normalize=True)
    assert np.abs(1-upsampled_counts.values[0]/upsampled_counts.values[1]) <= 1e-2
    assert df_upsampled.shape[0] > df.shape[0]
test_upsample()

In [7]:
# Feature generation function

In [8]:
# Modeling function

In [None]:
# Main code snippet 

In [28]:
def main():
    main_logger = logging.getLogger('Main')
    # data loading and merging 
    orders_df = pd.read_csv('../data/machine_learning_challenge_order_data.csv')
    labeled_df = pd.read_csv('../data/machine_learning_challenge_labeled_data.csv')
    merged_df = pd.merge(left=orders_df,right=labeled_df,on='customer_id')
    # FIXME , to remove
    print(merged_df.columns)
    main_logger.info(f'Data loaded and merged, with shape {merged_df.shape}')
    #train_df_filtered = remove_outliers_quantiles()
main()

INFO:Main:Data loaded and merged, with shape (786600, 14)


Index(['customer_id', 'order_date', 'order_hour', 'customer_order_rank',
       'is_failed', 'voucher_amount', 'delivery_fee', 'amount_paid',
       'restaurant_id', 'city_id', 'payment_id', 'platform_id',
       'transmission_id', 'is_returning_customer'],
      dtype='object')
