## Introduction

### Loading libraries

In [232]:
import pandas as pd
import pandas_profiling
import numpy as np

import seaborn as sns

import xgboost as xgb

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

In [236]:
# from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble

In [237]:
from sklearn.model_selection import cross_val_score as cv

In [238]:
from sklearn import tree
from sklearn import naive_bayes 

In [239]:
import time

### Loading data

In [202]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

### Quick overview of our data:

In [203]:
print("Number of satisfied customers: ", len(df_train[df_train['TARGET'] == 0]))
print("Number of unsatisfied customers: ", len(df_train[df_train['TARGET'] == 1]))
print("% of dissatisfied customers: {:.2f}%".format(100*len(df_train[df_train['TARGET'] == 1])/len(df_train[df_train['TARGET'] == 0])))

Number of satisfied customers:  73012
Number of unsatisfied customers:  3008
% of dissatisfied customers: 4.12%


In [204]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB


In [205]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75818 entries, 0 to 75817
Columns: 370 entries, ID to var38
dtypes: float64(110), int64(260)
memory usage: 214.0 MB


## Data Processing

### Removing constant columns

In [206]:
train_cols_to_remove = [col for col in df_train.columns if df_train[col].std() == 0]
test_cols_to_remove = [col for col in df_test.columns if df_test[col].std() == 0]

df_train.drop(train_cols_to_remove, axis=1, inplace=True) 
df_test.drop(test_cols_to_remove, axis=1, inplace=True) 

print("Removed {} constant columns in the train dataframe".format(len(train_cols_to_remove)))
print("Removed {} constant columns in the test dataframe".format(len(test_cols_to_remove)))

Removed 34 constant columns in the train dataframe
Removed 45 constant columns in the test dataframe


### Removing duplicate columns

In [207]:
def rm_dup_cols(df, cols):

    cols_to_remove = []
    
    for i in range(len(cols)-1):
        for j in range(i+1,len(cols)):
            if np.array_equal(df[cols[i]].values, df[cols[j]].values):
                cols_to_remove.append(cols[j])
    return cols_to_remove

In [210]:
train_cols_to_remove = rm_dup_cols(df_train, df_train.columns)
test_cols_to_remove = rm_dup_cols(df_test, df_test.columns)

df_train.drop(train_cols_to_remove, axis=1, inplace=True)
df_test.drop(test_cols_to_remove, axis=1, inplace=True)

print("Removed {} duplicate columns in the train dataframe".format(len(train_cols_to_remove)))
print("Removed {} duplicate columns in the test dataframe".format(len(test_cols_to_remove)))

Removed 0 duplicate columns in the train dataframe
Removed 0 duplicate columns in the test dataframe


### Splitting into the train and test sets

In [213]:
Y = df_train['TARGET'].values
del df_train['TARGET']

# remove the redundant ID column
del df_train['ID']

In [214]:
X_train, X_test, y_train, y_test = train_test_split(df_train.values, Y, test_size = 0.2, random_state = 42)

### Modelling

In [None]:
# Define scoring function (performance metrics)
def score_model(clf):
    print ("\nClassifier: {}...".format(clf.__class__.__name__))
    start = time.time()
    # use 3-fold CV
    scores = cv(clf, X_train, y_train, scoring='roc_auc', cv=3) 
    end = time.time()
    print("time (secs): {:.3f}".format(end - start))
    print("roc_auc: {:.3f}".format(scores.mean()))
    return scores.mean()

# Compare different algrithem
scores = {}
# Decision Tree
scores['tree'] = score_model(tree.DecisionTreeClassifier()) 
# naive bayes
scores['gaussian'] = score_model(naive_bayes.GaussianNB())
# logistic regression
scores['logistic_regression'] = score_model(LogisticRegression()) 

# ensemble methors
# AdaBoost
scores['ada_boost'] = score_model(ensemble.AdaBoostClassifier()) 
# Random Forest
scores['random_forest'] = score_model(ensemble.RandomForestClassifier()) 
# bagging
scores['bagging'] = score_model(ensemble.BaggingClassifier()) 
# gradient boosting
scores['gradient_boosting'] = score_model(ensemble.GradientBoostingClassifier())


Classifier: DecisionTreeClassifier...
time (secs): 28.787
roc_auc: 0.549

Classifier: GaussianNB...
time (secs): 5.046
roc_auc: 0.514

Classifier: LogisticRegression...




time (secs): 13.765
roc_auc: 0.593

Classifier: AdaBoostClassifier...


In [192]:
df_tr.shape

(75817, 341)

In [None]:
df_tr.T.drop_duplicates().T

In [181]:
df_tr.shape

(75817, 370)

#### Make a copy of dataframes (for dev purposes only)

In [194]:
df_tr = df_train.copy()
df_te = df_test.copy()

In [55]:
df_test.shape

(75818, 370)

In [56]:
df_tr.columns

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var33_hace2', 'saldo_medio_var33_hace3',
       'saldo_medio_var33_ult1', 'saldo_medio_var33_ult3',
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38', 'TARGET'],
      dtype='object', length=371)

In [57]:
target_col = df_train['TARGET'].values
del df_train['TARGET']

In [79]:
df_tr['var38'].describe()

count    7.602000e+04
mean     1.172358e+05
std      1.826646e+05
min      5.163750e+03
25%      6.787061e+04
50%      1.064092e+05
75%      1.187563e+05
max      2.203474e+07
Name: var38, dtype: float64

### Checking the dataframes for duplicate features

In [120]:
print(df_tr.shape)
print(df_te.shape)

(76020, 371)
(75818, 370)


In [121]:
df_tr.drop_duplicates()
df_te.drop_duplicates()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,2,32,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,40532.100000
1,5,2,35,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,45486.720000
2,6,2,23,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,46993.950000
3,7,2,24,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,187898.610000
4,9,2,23,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,73649.730000
5,11,2,43,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,53250.870000
6,12,2,39,495.0,2334.42,4815.42,0.0,0.0,0.0,0.0,...,0.0,7077.51,0.0,7599.0,7338.27,0.0,0.0,0.0,0.0,58316.640000
7,15,2,29,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,46898.490000
8,16,2,53,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,110356.980000
9,17,2,37,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,41366.490000


### Removing constant features

In [122]:
df_tr.loc[:, (df_tr != df_tr.iloc[0]).any()]
df_te.loc[:, (df_te != df_te.iloc[0]).any()]

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,2,32,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,40532.100000
1,5,2,35,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,45486.720000
2,6,2,23,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,46993.950000
3,7,2,24,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,187898.610000
4,9,2,23,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,73649.730000
5,11,2,43,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,53250.870000
6,12,2,39,495.0,2334.42,4815.42,0.0,0.0,0.0,0.0,...,0.0,7077.51,0.0,7599.0,7338.27,0.0,0.0,0.0,0.0,58316.640000
7,15,2,29,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,46898.490000
8,16,2,53,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,110356.980000
9,17,2,37,0.0,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,41366.490000


### Identifying and removing outliers

In [123]:
def identify_outliers(data):
    # identify outliers
    cut_off = data.std() * 3
    lower, upper = data.mean() - cut_off, data.mean() + cut_off
    # identify outliers
    outliers = [x for x in data if x < lower or x > upper]
    print('Identified outliers: %d' % len(outliers))
#     print(outliers)
    return outliers

In [124]:
df_tr_out = identify_outliers(df_tr["var38"])
df_te_out = identify_outliers(df_te["var38"])

Identified outliers: 306
Identified outliers: 170


In [152]:
for outlier in df_tr_out:
    df_tr = df_tr[df_tr.var38 != outlier]

In [153]:
for outlier in df_te_out:
    df_tr = df_te[df_te.var38 != outlier]

In [127]:
print(df_tr.shape)
print(df_te.shape)

(76020, 371)
(75818, 370)


In [128]:
target_col = df_tr['TARGET'].values
del df_tr['TARGET']

In [129]:
X_train, X_test, y_train, y_test = train_test_split(df_tr.values, target_col, test_size=0.2,random_state=42)

### Logisitic regression

In [130]:
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [131]:
train_acc_1 = model.score(X_train, y_train)
print("Train accuracy: ", train_acc_1)

Train accuracy:  0.9605202578268877


In [132]:
test_acc_1 = metrics.accuracy_score(y_test, model.predict(X_test))
print("Test accuracy: ", test_acc_1)

Test accuracy:  0.9600762957116549


### XGBoost

In [142]:
xgtrain = xgb.DMatrix(df_tr, target_col)
xgtest = xgb.DMatrix(df_te)

In [151]:
parameters = {
    "objective" : "binary:logistic", 
    "booster" : "gbtree",
    "eval_metric" : "auc",
    "eta" : 0.0202048,
    "max_depth" : 5,
    "subsample" : 0.6815,
    "colsample_bytree" : 0.701
}

xg_reg = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(X_train,y_train)

preds = xg_reg.predict(X_test)

In [150]:
print(preds)

[0 0 0 ... 0 0 0]


In [161]:
"age" in df_tr.columns

False

In [162]:
df_tr.columns

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var29_ult3', 'saldo_medio_var33_hace2',
       'saldo_medio_var33_hace3', 'saldo_medio_var33_ult1',
       'saldo_medio_var33_ult3', 'saldo_medio_var44_hace2',
       'saldo_medio_var44_hace3', 'saldo_medio_var44_ult1',
       'saldo_medio_var44_ult3', 'var38'],
      dtype='object', length=370)

In [26]:
def print_evaluation_metrics(trained_model,trained_model_name,X_test,y_test):
    print ('--------- For Model : ', trained_model_name)
    predicted_values = trained_model.predict(X_test)
    print (metrics.classification_report(y_test,predicted_values))
    print ("Accuracy Score : ",metrics.accuracy_score(y_test,predicted_values))
    print ("---------------------------------------\n")
