In [351]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_curve
import xgboost as xgb

In [352]:
data_path = os.path.join(os.getcwd(), 'ynap_data')
print(data_path)

/nfs/science/shared/ipythonNotebooks/leom/Kaggle/Ynap-master/ynap_data


In [353]:
df_tot = pd.read_csv(os.path.join(data_path, 'df_quarterly.csv'))

In [358]:
def column_finder(df, col, method=2):
    if method == 1:
        # method 1:
        cols_as_string = ' '.join(df.columns.values)
        cols_found = list(re.findall(col + '.*?\ ', cols_as_string))
        cols_found = [x.strip(' ') for x in cols_found]
    elif method == 2:
        # method 2:
        cols_found = []
        for col_elem in df.columns.values:
            if col in col_elem:
                cols_found.append(col_elem)
    else:
        raise ValueError
            
    return cols_found


def column_remover(df, col_list, noprint=False):    
    if type(col_list) != list:
        cl = []
        cl.append(col_list)
        col_list =cl
    
    col_to_remove = []
    for col in col_list:
        #print(col)
        col_to_remove.extend(column_finder(df, col, method=2))
    
    df.drop(col_to_remove, axis=1, inplace=True)
    if not noprint:
        print(col_to_remove)

In [359]:
column_remover(df_tot, ['product_id', 'designer_id', '_1989', '_199001', '_199002'], noprint=True)
# print(df_tot.columns.values)

In [360]:
df_tot.columns.values

array(['customer_id', 'var3', 'var4', 'var5', 'var6', 'lapsed_next_period',
       'order_id_199003', 'order_id_199004', 'product_type_id_199003',
       'product_type_id_199004', 'gross_spend_199003',
       'gross_spend_199004', 'net_spend_199003', 'net_spend_199004',
       'item_bought_199003', 'item_bought_199004', 'item_returned_199003',
       'item_returned_199004', 'quote_spend_returned_199003',
       'quote_spend_returned_199004', 'quote_var1_199003',
       'quote_var1_199004', 'quote_var2_199003', 'quote_var2_199004',
       'ns_per_order_199004', 'gs_per_order_199004', 'ib_per_order_199004',
       'ir_per_order_199004', 'ns_per_ib_199004', 'gs_per_item_199004',
       'ns_per_order_199003', 'gs_per_order_199003', 'ib_per_order_199003',
       'ir_per_order_199003', 'ns_per_ib_199003', 'gs_per_item_199003'], dtype=object)

In [362]:
df_tot.drop('customer_id', axis=1).corr().loc['lapsed_next_period'] # response not correlated with anything...

var3                          -0.049122
var4                           0.028221
var5                          -0.049656
var6                          -0.026682
lapsed_next_period             1.000000
order_id_199003               -0.118498
order_id_199004               -0.125503
product_type_id_199003        -0.147999
product_type_id_199004        -0.191623
gross_spend_199003            -0.076103
gross_spend_199004            -0.091865
net_spend_199003              -0.063114
net_spend_199004              -0.088734
item_bought_199003            -0.085690
item_bought_199004            -0.112901
item_returned_199003          -0.074607
item_returned_199004          -0.076264
quote_spend_returned_199003   -0.113148
quote_spend_returned_199004   -0.131464
quote_var1_199003             -0.012729
quote_var1_199004             -0.032535
quote_var2_199003             -0.069943
quote_var2_199004             -0.098047
ns_per_order_199004           -0.042647
gs_per_order_199004           -0.048669


### Data transformation

In [365]:
def data_transformer(df, unchangeable_variables, method='std'):
    if method == 'std':
        scaler = StandardScaler().fit_transform(df.values)
    elif method == 'minmax':
        scaler = MinMaxScaler().fit_transform(df.values)
    elif method == 'normal':
        scaler = Normalizer().fit_transform(df.values)
    else:
        print('Method not recognized')
        
    df_transformed = pd.DataFrame(scaler, index=df.index, columns=df.columns)
    if len(unchangeable_variables) > 0:
        df_transformed[unchangeable_variables] = df[unchangeable_variables]
    
    return df_transformed

In [368]:
df_minmax = data_transformer(df_tot, ['customer_id','lapsed_next_period'], 'minmax')
# df_minmax.drop('customer_id', axis=1).corr().loc['lapsed_next_period']

In [370]:
df_norm = data_transformer(df_tot, ['customer_id','lapsed_next_period'], 'normal')
# df_norm.drop('customer_id', axis=1).corr().loc['lapsed_next_period']

In [372]:
df_std = data_transformer(df_tot, ['customer_id','lapsed_next_period'], 'std')
# df_std.drop('customer_id', axis=1).corr().loc['lapsed_next_period']

Transformations don't seem to work. One reason could the high sproportion between 0s and 1s in the response variable.
Let's try a balance method.

## Balance
Let's use the standard transformed data

In [373]:
df_for_model = df_std

In [379]:
X = df_for_model.drop(['customer_id', 'lapsed_next_period'], axis=1)
y = df_for_model[['customer_id', 'lapsed_next_period']]

print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

(15695, 34)
(15695, 2)


In [380]:
X_train_1 = X_train.loc[y_train[y_train['lapsed_next_period'] == 1].index]
X_train_0 = X_train.loc[y_train[y_train['lapsed_next_period'] == 0].index]

print(X_train_1.shape)
print(X_train_0.shape)
print(X_train_1.shape[0]/X_train.shape[0])

(657, 34)
(11899, 34)
0.05232558139534884


In [437]:
# try different balance ratios with a couple of easy models

def balance_tester(Xtr0, Xtr1, ytr, Xte, yte, zero_quotas = [0.5, 0.66, 0.75], seeds = [21, 2121, 1212], 
                   method='log', method_params=None):
    
    if (method == 'rf') & (method_params == None):
        # Random Forest parameters
        rf_params_bal = {'n_jobs': -1, 'n_estimators': 500, 'warm_start': True, 
                         'max_depth': 4, 'min_samples_leaf': 2, 'max_features' : 'sqrt',
                         'verbose': 0}
    
    results = pd.DataFrame(columns=['quota', 'accuracy', 'pred_perc'])
    
    for quota in zero_quotas:
        accuracy = []
        pred_perc = []
        
        n_sample = int(Xtr1.shape[0] * quota / (1 - quota) // 1)
        
        for seed in seeds:
            Xtr0_ = Xtr0.sample(n=n_sample, random_state=seed)
            Xtr_ = Xtr0_.append(Xtr1)
            ytr_ = ytr.loc[Xtr_.index]
            
            if method == 'log':
                clf = LogisticRegression(max_iter=100, solver='liblinear')
            elif method == 'rf':
                clf = RandomForestClassifier(**rf_params_bal)
            else:
                return 'Method not recognized'
            
            clf.fit(Xtr_, ytr_)
            clf_pred = clf.predict(Xte)
            
            pred_perc.append(clf_pred.sum()/len(clf_pred))
            accuracy.append(accuracy_score(yte, clf_pred))
            
        # print(np.array(accuracy).mean())
        acc_mean = np.array(accuracy).mean()
        pp_mean = np.array(pred_perc).mean()
        
        results = results.append(pd.Series({'quota': quota, 'accuracy': acc_mean,
                                            'pred_perc': pp_mean}), ignore_index=True)
    
    return results
            
# quotas = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.825, 0.85, 0.9]
quotas = [0.8, 0.805, 0.81, 0.815, 0.82, 0.825, 0.83, 0.835, 0.84, 0.845, 0.85]
balance_res_rf = balance_tester(X_train_0, X_train_1, y_train['lapsed_next_period'], X_test, 
                                y_test['lapsed_next_period'], quotas, method='rf')

balance_res_log = balance_tester(X_train_0, X_train_1, y_train['lapsed_next_period'], X_test, 
                                y_test['lapsed_next_period'], quotas, method='log')

In [435]:
balance_res_log

Unnamed: 0,quota,accuracy,pred_perc
0,0.8,0.89519,0.088669
1,0.805,0.897101,0.085484
2,0.81,0.899225,0.082086
3,0.815,0.902517,0.077732
4,0.82,0.904853,0.074334
5,0.825,0.906977,0.070723
6,0.83,0.917596,0.054157
7,0.835,0.927259,0.039397
8,0.84,0.930339,0.033344
9,0.845,0.931082,0.031751


In [436]:
balance_res_rf

Unnamed: 0,quota,accuracy,pred_perc
0,0.8,0.881916,0.107465
1,0.805,0.885314,0.101943
2,0.81,0.899543,0.077732
3,0.815,0.921525,0.044706
4,0.82,0.925773,0.037698
5,0.825,0.935648,0.021026
6,0.83,0.943931,0.007858
7,0.835,0.945205,0.005522
8,0.84,0.946267,0.002761
9,0.845,0.946692,0.001911


## Models

A proportion of 81.5% for not lapsed values seems to lead to good results. Let's use that for the GridSearch step

In [438]:
quota = 0.815
n_sample = int(X_train_1.shape[0] * quota / (1 - quota) // 1)
X_train_0_ = X_train_0.sample(n=n_sample, random_state=101)

In [439]:
X_train_ = X_train_0_.append(X_train_1)
print(X_train_.shape)

y_train_ = y_train.loc[X_train_.index]
print(y_train_.shape)

(3551, 34)
(3551, 2)


In [443]:
df_ = y_train_.join(X_train_)
y_corr = df_.drop('customer_id', axis=1).corr().loc['lapsed_next_period']

In [441]:
model_cols = list(y_corr[y_corr.apply(lambda x: abs(x) > 0.2)].index.values)
model_cols.remove('lapsed_next_period')

In [444]:
model_cols

['product_type_id_199003',
 'product_type_id_199004',
 'quote_spend_returned_199004']

In [445]:
# Random Forest example
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_[model_cols], y_train_['lapsed_next_period'])

rf_pred = rf.predict(X_test[model_cols])

print(rf_pred.sum()/len(rf_pred))

print(accuracy_score(y_test['lapsed_next_period'], rf_pred))

0.0442816183498
0.91876393756


In [447]:
# Setting multiple hyperparameters for every classifier we are going to implement

# Random Forest
rf_params_gs = {
    'n_jobs': [-1],
    'n_estimators': [200, 300, 350, 400, 500],
    'warm_start': [True], 
     #'max_features': 0.2,
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [2, 3, 4, 5],
    'max_features' : ['sqrt'],
    'verbose': [0]
}

# Extra Trees
et_params_gs = {
    'n_jobs': [-1],
    'n_estimators': [200, 300, 350, 400, 500],
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [2, 3, 4, 5],
    'verbose': [0]
}

# AdaBoost parameters
ada_params_gs = {
    'n_estimators': [200, 300, 400, 500],
    'learning_rate': [0.1, 0.5, 1, 2]
}

# Gradient Boosting parameters
gb_params_gs = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [2, 3, 4, 5],
    'min_samples_leaf': [2, 3, 4],
    'verbose': [0]
}

# SVC parameters
svc_params_gs = {
    'kernel': ['linear', 'rbf'],
    'C': [0.05, 0.1, 0.5, 1, 2]
}

# Logistic regression parameters
log_params_gs = {
    'solver': ['liblinear'],
    'max_iter': [100, 200, 300],
    'verbose': [0]
}

# XGBoosting parameters
xgb_params_gs = {
    'objective':['binary:logistic'],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1, 2], #so called `eta` value
    'max_depth': [4, 5, 6],
    'min_child_weight': [11],
    'silent': [1],
    'subsample': [0.8],
    'colsample_bytree': [0.7],
    'n_estimators': [200, 500, 1000], #number of trees, change it to 1000 for better results
    'missing':[-999]
}

In [448]:
# Istantiate the classifiers
rf = GridSearchCV(RandomForestClassifier(), rf_params_gs, cv=5)
et = GridSearchCV(ExtraTreesClassifier(), et_params_gs, cv=5)
ada = GridSearchCV(AdaBoostClassifier(), ada_params_gs, cv=5)
gb = GridSearchCV(GradientBoostingClassifier(), gb_params_gs, cv=5)
svc = GridSearchCV(SVC(), svc_params_gs, cv=5)

In [449]:
log = GridSearchCV(LogisticRegression(), log_params_gs, cv=5)

In [450]:
xgbm = GridSearchCV(xgb.XGBClassifier(), xgb_params_gs, cv=5)

In [451]:
def classifier_runner(clf, Xtr, ytr, Xte, yte):
    print('-'*40)
    print(clf.estimator)
    print('-'*40)
    clf.fit(Xtr, ytr)
    clf.best_params_
    print(clf.best_params_)
    print()
    
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
#     for mean, std, params in zip(means, stds, clf.cv_results_['params']):
#         print("%0.3f (+/-%0.03f) for %r" % (mean, std*2, params))
        
    print()
    y_true, y_pred = yte, clf.predict(Xte)
    print(classification_report(y_true, y_pred))
    print(accuracy_score(y_true, y_pred))
    #return clf.feature_importances_, y_pred
    
rf_feat, rf_ypred = classifier_runner(rf, X_train_[model_cols], y_train_['lapsed_next_period'],
                                      X_test[model_cols], y_test['lapsed_next_period'])
et_feat, et_ypred = classifier_runner(et, X_train_[model_cols], y_train_['lapsed_next_period'],
                                      X_test[model_cols], y_test['lapsed_next_period'])
ada_feat, ada_ypred = classifier_runner(ada, X_train_[model_cols], y_train_['lapsed_next_period'],
                                        X_test[model_cols], y_test['lapsed_next_period'])

----------------------------------------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
----------------------------------------
{'min_samples_leaf': 2, 'max_depth': 3, 'n_estimators': 200, 'warm_start': True, 'n_jobs': -1, 'max_features': 'sqrt', 'verbose': 0}


             precision    recall  f1-score   support

          0       0.95      1.00      0.97      2975
          1       0.00      0.00      0.00       164

avg / total       0.90      0.95      0.92      3139

0.947754061803


  'precision', 'predicted', average, warn_for)


TypeError: 'NoneType' object is not iterable

In [452]:
gb = GridSearchCV(GradientBoostingClassifier(), gb_params_gs, cv=5)
svc = GridSearchCV(SVC(), svc_params_gs, cv=5)
classifier_runner(gb, X_train_, y_train_['lapsed_next_period'], X_test, y_test['lapsed_next_period'])
classifier_runner(svc, X_train_, y_train_['lapsed_next_period'], X_test, y_test['lapsed_next_period'])

----------------------------------------
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
----------------------------------------
{'min_samples_leaf': 2, 'n_estimators': 400, 'max_depth': 3, 'verbose': 0}


             precision    recall  f1-score   support

          0       0.96      0.91      0.93      2975
          1       0.16      0.30      0.20       164

avg / total       0.92      0.88      0.90      3139

0.878305192737
----------------------------------------
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3,

In [453]:
classifier_runner(log, X_train_[model_cols], y_train_['lapsed_next_period'], X_test[model_cols], y_test['lapsed_next_period'])

----------------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
----------------------------------------
{'solver': 'liblinear', 'max_iter': 100, 'verbose': 0}


             precision    recall  f1-score   support

          0       0.95      1.00      0.97      2975
          1       0.00      0.00      0.00       164

avg / total       0.90      0.95      0.92      3139

0.947754061803


  'precision', 'predicted', average, warn_for)


In [454]:
classifier_runner(xgbm, X_train_[model_cols], y_train_['lapsed_next_period'], X_test[model_cols], y_test['lapsed_next_period'])

----------------------------------------
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
----------------------------------------
{'learning_rate': 2, 'objective': 'binary:logistic', 'silent': 1, 'min_child_weight': 11, 'n_estimators': 200, 'missing': -999, 'colsample_bytree': 0.7, 'subsample': 0.8, 'max_depth': 4}


             precision    recall  f1-score   support

          0       0.95      1.00      0.97      2975
          1       0.00      0.00      0.00       164

avg / total       0.90      0.95      0.92      3139

0.947754061803


  'precision', 'predicted', average, warn_for)


In [134]:
# Feature importance
rf.

