In [86]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from catboost import CatBoostClassifier
from scipy.stats import chisquare, chi2_contingency
from sklearn.metrics import mean_squared_error, roc_auc_score
import os
import gc
import shap

%matplotlib inline

In [49]:
datapath= 'UCI_Credit_Card.csv'
def get_type(datapath):
    extension = datapath.split('.')[1]
    assert datapath.endswith(tuple(['xls', 'xlsx', 'csv'])), 'Our system currently only accepts csv, xls or xlsx extensions, your input was {}'.format(extension)
    if 'csv' in datapath:
        seplist = [',', '|', ';', '\t']
        return seplist
    elif 'xls'in datapath or 'xlsx' in datapath:
        xl = pd.ExcelFile(datapath)
        return xl.sheet_names
    else:
        print('Our system currently only accepts csv, xls or xlsx extensions')

def read_data(datapath, select):
    extension = datapath.split('.')[1]
    assert datapath.endswith(tuple(['xls', 'xlsx', 'csv'])), 'Our system currently only accepts csv, xls or xlsx extensions, your input was {}'.format(extension)
    if 'csv' in datapath:
        return pd.read_csv(datapath, sep=select)
    elif 'xls'in datapath or 'xlsx' in datapath:
        return pd.read_excel(datapath, sheet=select)
    else:
        print('Our system currently only accepts csv, xls or xlsx extensions')
    

In [50]:
data = read_data(datapath, get_type(datapath)[0])

In [51]:
columns = list(data)
target = 'default.payment.next.month'

In [52]:
## Remove IDs and single values
a = data.shape[1]
for col in list(data):
    if data[col].nunique() ==1:
        data.drop(columns=[col], inplace=True)
    elif data[col].nunique() == len(data):
        data.drop(columns=[col], inplace=True)
    else:
        None

b = data.shape[1]
print("data was dropped from {} to {}".format(a, b))

data was dropped from 25 to 24


In [53]:
addition = [np.nan] * (data.shape[1]-1)
additional = pd.concat([pd.DataFrame(addition), pd.DataFrame([0])], ignore_index=True).T
additional.columns = data.columns.values

data = pd.concat([data, additional], ignore_index=True)


In [54]:
data

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1.0
1,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,0.0,0.0,0.0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1.0
2,90000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0.0
3,50000.0,2.0,2.0,1.0,37.0,0.0,0.0,0.0,0.0,0.0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0.0
4,50000.0,1.0,2.0,1.0,57.0,-1.0,0.0,-1.0,0.0,0.0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29996,150000.0,1.0,3.0,2.0,43.0,-1.0,-1.0,-1.0,-1.0,0.0,...,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0.0
29997,30000.0,1.0,2.0,2.0,37.0,4.0,3.0,2.0,-1.0,0.0,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1.0
29998,80000.0,1.0,3.0,1.0,41.0,1.0,-1.0,0.0,0.0,0.0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1.0
29999,50000.0,1.0,2.0,1.0,46.0,0.0,0.0,0.0,0.0,0.0,...,36535.0,32428.0,15313.0,2078.0,1800.0,1430.0,1000.0,1000.0,1000.0,1.0


In [55]:
data = data.dropna(axis=1, thresh=int(np.ceil(0.2*len(data))))
y = data[target]
x = data.drop([target], axis=1)

In [56]:
initial_use = list(x)

In [57]:
def get_num_ob(x):
    x_num = x.select_dtypes(exclude='object')
    x_ob = x.select_dtypes(include='object')
    return x_num, x_ob
def imput_fit_transform(x):
    imput = SimpleImputer(strategy='median')
    x_numeric_imp = pd.DataFrame(imput.fit_transform(x), columns = x.columns, index = x.index)
    return imput, x_numeric_imp

def imput_transform(x, imput):
    x_numeric_imp = pd.DataFrame(imput.transform(x), columns = x.columns, index = x.index)
    return x_numeric_imp
    

In [58]:

xnum, xobj = get_num_ob(x)

In [59]:
categories = list(xobj)

In [60]:
categories

In [61]:
from collections import defaultdict
d = defaultdict(LabelEncoder)

xobj.fillna('Unknown', inplace=True)

## get labeled
def le_fit_transform(df):
    le = LabelEncoder()
    fit = df.apply(lambda x: d[x.name].fit_transform(x))
    return fit, d
def le_transform(df, le):
    x_transformed = df.apply(lambda x: le[x.name].transform(x))
    return x_transformed

x_obj_le, le = le_fit_transform(xobj)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [62]:
x_con = pd.concat([xnum, x_obj_le], axis=1)

In [63]:
x_con = x_con.dropna(axis=1, thresh=int(np.ceil(0.2*len(x_con))))

In [64]:
droppage = len(x_con)-1
x_con.drop(droppage, inplace=True)
y.drop(droppage, inplace=True)

In [66]:
x_train, x_test, y_train, y_test = train_test_split(x_con, y, test_size=0.2, random_state=42)

In [67]:
x_train_obj = x_train[categories]
x_train_num = x_train[set(list(x_train)) - set(categories)]

In [68]:
def multicol_filter(df, min_v, max_v):
    columnss = np.full((df.shape[0],), True, dtype=bool)
    for i in range(df.shape[0]):
        for j in range(i+1, df.shape[0]):
            if df.iloc[i,j] >=max_v or df.iloc[i,j] <=min_v:
                if columnss[j]:
                    columnss[j] = False

    ss = df.head(1)
    selected_columnss = ss.columns[columnss]
    ts = list(ss[selected_columnss])
    return ts

In [69]:

ze = x_train_num.corr()
t = multicol_filter(ze, -0.8, 0.8)

xnum_clean = x_train_num[t]
print(x_train_num.shape, xnum_clean.shape)

(24000, 23) (24000, 17)


In [70]:
def cramers_V(var1,var2) :
    crosstab =np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building
    stat = chi2_contingency(crosstab)[0] # Keeping of the test statistic of the Chi2 test
    obs = np.sum(crosstab) # Number of observations
    mini = min(crosstab.shape)-1 # Take the minimum value between the columns and the rows of the cross table
    return (stat/(obs*mini))

def chi_test(data, categories):
    drop_cols = []
    for j in range(len(categories)-1):
            for k in range(j+1, len(categories)):

                pvalue = chi2_contingency(pd.crosstab(data[categories[j]],data[categories[k]]))[1]
                if pvalue < 0.05:
                    if categories[k] in drop_cols:
                        None
                    else:
                        drop_cols.append(categories[k])
                else:
                    None
    return set(categories) - set(np.unique(drop_cols))

def cramer_test(data, max_v):
    rows= []
    for var1 in data:
        col = []
        for var2 in data :
            cramers =cramers_V(data[var1], data[var2]) # Cramer's V test
            col.append(round(cramers,2)) # Keeping of the rounded value of the Cramer's V  
        rows.append(col)

    cramers_results = np.array(rows)
    df = pd.DataFrame(cramers_results, columns = data.columns, index =data.columns)
    return multicol_filter(df, -max_v, max_v)
def categorical_filter(data, max_v, method = 'chi2'):
    '''
    filter the categorical features using either cramers v, chi squared or intersection of both
    data = dataframe to add
    max_v = used for the common benchmark of colinearity value
    method = string input accepting either 'chi2', 'cramer', or 'both'
    '''
    assert method in ['chi2', 'cramer', 'both'], 'method not understandable, please use either chi2, cramer or both'
    categories = list(data)
    
    if method == 'chi2':
        keep_cols = chi_test(data, categories)
        
    elif method == 'cramer':
        keep_cols = cramer_test(data, max_v)
    
    elif method == 'both':
        keep_cols_chi = chi_test(data, categories)
        keep_cols_cv = cramer_test(data, max_v)
        del_chi = set(categories) - set(keep_cols_chi)
        del_cv = set(categories) - set(keep_cols_cv)
        del_both = set(del_chi).intersection(set(del_cv))
        keep_cols = set(categories) -  set(del_both)
    else:
        print("error with method")
    
    return keep_cols


In [71]:
try:
    filterer = categorical_filter(x_train_obj, 0.8, method = 'both')
except:
    filterer = []

In [74]:
x_obj_clean = x_train_obj[filterer]

In [75]:
# Concat back to x_train
x_train_clean = pd.concat([xnum_clean, x_obj_clean], axis=1)
x_test = x_test[x_train_clean.columns]

In [77]:
imputer, x_train_imputed = imput_fit_transform(x_train_clean)
x_test_imputed = imput_transform(x_test, imputer)

### Machine Learning

In [83]:
sm = SMOTE(random_state=108)
tl = TomekLinks()
x_train_ov, y_train_ov = sm.fit_resample(x_train_imputed, y_train)
x_train_un, y_train_un = tl.fit_resample(x_train_imputed, y_train)

In [84]:
dt = DecisionTreeClassifier(random_state=108)
rf = RandomForestClassifier(random_state=108)
gb = GradientBoostingClassifier(random_state=108)
cb = CatBoostClassifier(random_state=108, verbose=False)
dt_param = {'max_depth':[1, 3, 5, 10], 'min_samples_split':[2,4,8,16], 'min_samples_leaf':[1,2,4,6,8,10]}


n_estimators = [10, 25, 50, 100]
max_features = ['auto', 'sqrt']
max_depth = [3, 5, 10, 12, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
random_strength = [0.0001, 0.001, 0.1, 1]
border_count = [1, 5, 10, 25, 50, 100, 255]
l2_leaf_reg = [1, 2, 3, 4, 5, 6, 10, 15, 30]
bagging_temperature = [0, 1, 2, 3, 4, 5]

rf_param = {'n_estimators': n_estimators, 'max_features':max_features, 'max_depth':max_depth, 'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf}

learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
gb_param = {'learning_rate':learning_rates, 'n_estimators': n_estimators, 'max_depth':max_depth, 'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf, 'max_features':max_features}
cb_param = {'learning_rate':learning_rates, 'iterations': n_estimators, 'depth':max_depth, 'random_strength':random_strength,'border_count':border_count, 'l2_leaf_reg':l2_leaf_reg, 'bagging_temperature':bagging_temperature}

In [125]:

name = []
k = []
tr_auc = []
te_auc = []
method = []
features = []
trans = dict()
for data_used in [[x_train_ov, y_train_ov, 'oversampling'], [x_train_un, y_train_un, 'undersampling']]:
    x_use = data_used[0]
    y_use = data_used[1]
    gdt = RandomizedSearchCV(dt, dt_param, n_jobs=-1, scoring='roc_auc', n_iter=10, random_state=108)
    grf = RandomizedSearchCV(rf, rf_param, n_jobs=-1, scoring='roc_auc', n_iter=10, random_state=108)
    ggb = RandomizedSearchCV(gb, gb_param, n_jobs=-1, scoring='roc_auc', n_iter=10, random_state=108)
    gcb = RandomizedSearchCV(cb, cb_param, n_jobs=-1, scoring='roc_auc', n_iter=20, random_state=108)
    new_dt = DecisionTreeClassifier(**gdt.fit(x_use, y_use).best_params_, random_state=108)
    
    new_rf = RandomForestClassifier(**grf.fit(x_use, y_use).best_params_, random_state=108)
    
    new_gb = GradientBoostingClassifier(**ggb.fit(x_use, y_use).best_params_, random_state=108)
    
    new_cb = CatBoostClassifier(**gcb.fit(x_use, y_use).best_params_, random_state=108, verbose=False)


    for algo in [[new_dt, 'dt'], [new_rf, 'rf'], [new_gb, 'gb'], [new_cb, 'cb']]:
        algo[0].fit(x_use, y_use)
        current = 0
        num = x_train_imputed.shape[1]
        used_feature = list(x_use)
        sampling = 'normal'
        usee = pd.DataFrame({'params':x_use.columns, 'importances':algo[0].feature_importances_}).sort_values('importances', ascending=False)
        for kbest in [5, 10, 15, 25, 50]:
            uses = usee.head(kbest)['params']
            

            x_tr_try= x_use[uses]
            
            hold = np.mean(cross_val_score(estimator=algo[0], X=x_tr_try, y=y_use, cv = 5, scoring = 'roc_auc'))
            if hold > current:
                current = hold
                num = kbest       
                sampling = data_used[2]
                used_feature = list(uses)
            else:
                None

        x_tr_fin = x_use[usee.head(num)['params']]
        x_te_fin = x_test_imputed[usee.head(num)['params']]
        
        y_pred = algo[0].fit(x_tr_fin, y_use).predict_proba(x_te_fin)
        store = roc_auc_score(y_test, y_pred[:,1])
        
        name.append(algo[1])
        k.append(num)
        tr_auc.append(current)
        te_auc.append(store)
        method.append(sampling)
        features.append(used_feature)

result = pd.DataFrame({'algo':name, 'n_features':k, 'train_auc':tr_auc, 'test_auc':te_auc, 'method':method, 'features':features}).sort_values('test_auc', ascending=False)
result.sort_values('test_auc', ascending=False).head(1)



Unnamed: 0,algo,n_features,train_auc,test_auc,method,features
6,gb,25,0.783913,0.773701,undersampling,"[PAY_0, PAY_2, PAY_3, BILL_AMT4, PAY_4, LIMIT_..."


In [126]:
result

Unnamed: 0,algo,n_features,train_auc,test_auc,method,features
6,gb,25,0.783913,0.773701,undersampling,"[PAY_0, PAY_2, PAY_3, BILL_AMT4, PAY_4, LIMIT_..."
5,rf,15,0.786314,0.772733,undersampling,"[PAY_0, PAY_2, PAY_3, BILL_AMT4, LIMIT_BAL, PA..."
7,cb,25,0.783223,0.771076,undersampling,"[PAY_0, LIMIT_BAL, PAY_AMT1, PAY_3, PAY_2, PAY..."
1,rf,25,0.938444,0.760515,oversampling,"[PAY_0, MARRIAGE, PAY_2, PAY_3, EDUCATION, SEX..."
3,cb,15,0.928123,0.758531,oversampling,"[SEX, EDUCATION, MARRIAGE, PAY_0, PAY_6, LIMIT..."
4,dt,5,0.759917,0.751347,undersampling,"[PAY_0, PAY_2, PAY_AMT3, LIMIT_BAL, PAY_6]"
0,dt,15,0.864892,0.746982,oversampling,"[PAY_0, PAY_2, PAY_4, PAY_AMT1, PAY_6, LIMIT_B..."
2,gb,15,0.938421,0.740443,oversampling,"[PAY_0, PAY_2, MARRIAGE, SEX, PAY_3, BILL_AMT4..."


In [127]:
algo_used = result['algo'].iloc[0]
features_used = result['features'].iloc[0]
sampling_used = result['method'].iloc[0]

In [128]:
if algo_used == 'dt':
    do_train = new_dt
elif algo_used == 'gb':
    do_train = new_gb
elif algo_used == 'rf':
    do_train = new_rf
elif algo_used == 'cb':
    do_train = new_cb
    
if sampling_used == 'undersampling':
    do_sampling = TomekLinks()
elif sampling_used == 'oversampling':
    do_sampling = SMOTE(random_state=108)

## Prepare to retrain using all dataset

In [129]:
# we happened to already do our part in the x_con, so we will reuse x_con as our main retraining dataset.
# Since we already do label encoding, we no longer need to label encode it again
imputer, x_imputed = imput_fit_transform(x_con)


In [130]:
# use only the best features from train_test_split
x_imputed_use = x_imputed[features_used]

In [131]:
x_imputed_use.shape

(30000, 17)

In [132]:
x_sampled_use, y_sampled = do_sampling.fit_resample(x_imputed_use, y)

In [133]:
do_train.fit(x_sampled_use, y_sampled)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.25, loss='deviance', max_depth=5,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=4, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=108, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [138]:
roc_auc_score(y,  do_train.predict_proba(x_imputed_use)[:,1])

0.8281781641106357

# Prepare data to predict

In [139]:
pred_datapath= 'train.csv'
pred_data = read_data(datapath, get_type(datapath)[0])
pred_data = pred_data[list(x_con)]

In [140]:
pred_data_obj = pred_data[categories]
pred_data_obj.fillna('Unknown', inplace=True)
pred_data_num = pred_data[set(pred_data) - set(categories)]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [141]:
pred_data_obj_le = le_transform(pred_data_obj, le)

In [142]:
pred_data_con = pd.concat([pred_data_num, pred_data_obj_le], axis=1)

In [143]:
pred_data_con = pred_data_con[list(x_con)]
pred_con_imputed = imput_transform(pred_data_con, imputer)

In [144]:
pred_con_use = pred_con_imputed[features_used]

In [145]:
pred_data['prediction_result'] = do_train.predict_proba(pred_con_use)[:,1]