# Loading libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,Ridge,Lasso
from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV,train_test_split
from sklearn.metrics import f1_score,accuracy_score,classification_report,confusion_matrix,roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from mypipes import *

import warnings
warnings.filterwarnings('ignore') # ignore the warnings.


import matplotlib.pyplot as plt
import seaborn as sb
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

def Threshold_Finder(Y_test,Y_predict):
    I = []
    roc_ = []
    for i in np.linspace(0.01,1,1000):
        Y_predict = Y_predict_prob[:,1] > i
        roc = roc_auc_score(Y_test,Y_predict)
        I.append(i)
        roc_.append(roc)
    Temp = I[roc_.index(max(roc_))]
    return Temp

#  Loading the datasets

In [2]:
datafile_train=r'Consumer_Complaints_train.csv'
datafile_test=r'Consumer_Complaints_test_share.csv'
cd_train=pd.read_csv(datafile_train)
cd_test=pd.read_csv(datafile_test)

In [3]:
cd_train.dtypes

Date received                   object
Product                         object
Sub-product                     object
Issue                           object
Sub-issue                       object
Consumer complaint narrative    object
Company public response         object
Company                         object
State                           object
ZIP code                        object
Tags                            object
Consumer consent provided?      object
Submitted via                   object
Date sent to company            object
Company response to consumer    object
Timely response?                object
Consumer disputed?              object
Complaint ID                     int64
dtype: object

In [4]:
for col in ['Date received','Date sent to company']:
    cd_train[col]=pd.to_datetime(cd_train[col],infer_datetime_format=True)
    cd_test[col]=pd.to_datetime(cd_test[col],infer_datetime_format=True)

In [5]:
cd_train['day_diff']=pd.to_numeric(cd_train['Date sent to company']-cd_train['Date received'])
cd_test['day_diff']=pd.to_numeric(cd_test['Date sent to company']-cd_test['Date received'])

In [6]:
for col in ['Date received','Date sent to company']:
    cd_train.drop([col],1,inplace=True)
    cd_test.drop([col],1,inplace=True)

In [7]:
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())

Product : 12
Sub-product : 47
Issue : 95
Sub-issue : 69
Consumer complaint narrative : 74019
Company public response : 10
Company : 3276
State : 62
ZIP code : 25942
Tags : 3
Consumer consent provided? : 4
Submitted via : 6
Company response to consumer : 7
Timely response? : 2
Consumer disputed? : 2


In [8]:
cd_train.isnull().sum()

Product                              0
Sub-product                     138473
Issue                                0
Sub-issue                       292624
Consumer complaint narrative    403327
Company public response         388029
Company                              0
State                             3839
ZIP code                          3848
Tags                            411215
Consumer consent provided?      342934
Submitted via                        0
Company response to consumer         0
Timely response?                     0
Consumer disputed?                   0
Complaint ID                         0
day_diff                             0
dtype: int64

In [9]:
len(pd.isnull(cd_train['Tags']))
len(cd_train)

478421

In [10]:
cd_train.head(4)

Unnamed: 0,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,day_diff
0,Credit card,,Billing statement,,,,Wells Fargo & Company,MI,48342,Older American,,Web,Closed with explanation,Yes,No,856103,86400000000000
1,Bank account or service,(CD) Certificate of deposit,"Making/receiving payments, sending money",,,,Santander Bank US,PA,18042,,,Referral,Closed,Yes,No,1034666,518400000000000
2,Credit reporting,,Incorrect information on credit report,Account status,,,Equifax,CA,92427,,,Referral,Closed with non-monetary relief,Yes,No,756363,1814400000000000
3,Credit card,,Billing statement,,"My credit card statement from US Bank, XXXX. X...",Company chooses not to provide a public response,U.S. Bancorp,GA,305XX,Older American,Consent provided,Web,Closed with monetary relief,Yes,No,1474177,0


In [11]:
for col in ['Sub-product','Sub-issue','Consumer complaint narrative',
            'Company public response','Tags','Consumer consent provided?']:
    varname=col.replace('-','_').replace('?','').replace(" ",'_')+'_isNan'
    cd_train[varname]=np.where(pd.isnull(cd_train[col]),1,0)
    cd_train.drop([col],1,inplace=True)
    cd_test[varname]=np.where(pd.isnull(cd_test[col]),1,0)
    cd_test.drop([col],1,inplace=True)
    

In [12]:
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())

Product : 12
Issue : 95
Company : 3276
State : 62
ZIP code : 25942
Submitted via : 6
Company response to consumer : 7
Timely response? : 2
Consumer disputed? : 2


In [13]:
for col in ['ZIP code','Company']:
    cd_train.drop([col],1,inplace=True)
    cd_test.drop([col],1,inplace=True)

In [14]:
cd_train['Consumer disputed?']=np.where(cd_train['Consumer disputed?']=="Yes",1,0)

In [15]:
k=cd_train['Issue'].value_counts()
for val in k.axes[0][0:10]:
    varname='Issue_'+val.replace(',','_').replace(' ','_')
    cd_train[varname]=np.where(cd_train['Issue']==val,1,0)
    cd_test[varname]=np.where(cd_test['Issue']==val,1,0)
del cd_train['Issue']
del cd_test['Issue']

In [16]:
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())

Product : 12
State : 62
Submitted via : 6
Company response to consumer : 7
Timely response? : 2


In [17]:
k=cd_train['State'].value_counts()
for val in k.axes[0][0:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    cd_train[varname]=np.where(cd_train['State']==val,1,0)
    cd_test[varname]=np.where(cd_test['State']==val,1,0)
del cd_train['State']
del cd_test['State']

In [18]:
for col in ['Product','Submitted via','Company response to consumer','Timely response?']:
    
    temp=pd.get_dummies(cd_train[col],prefix=col,drop_first=True)
    cd_train=pd.concat([temp,cd_train],1)
    cd_train.drop([col],1,inplace=True)
    
    temp=pd.get_dummies(cd_test[col],prefix=col,drop_first=True)
    cd_test=pd.concat([temp,cd_test],1)
    cd_test.drop([col],1,inplace=True)


In [19]:
x=cd_train.drop(['Consumer disputed?','Complaint ID'],1)
y=cd_train['Consumer disputed?']

# Create train_test_spit (for 1 time validation)


In [20]:
X_train,X_test,Y_train,Y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Logistic Regression

##  Fine a general model (default hyper parameter setting)


In [21]:
LG_1 = LogisticRegression()
LG_1.fit(X_train,Y_train)

LogisticRegression()

In [22]:
LG_1.coef_

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.]])

In [23]:
LG_1.intercept_

array([0.])

In [24]:
Y_predict = LG_1.predict(X_test)
Y_predict

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
confusion_matrix(Y_test,Y_predict)

array([[75453,     0],
       [20232,     0]], dtype=int64)

In [26]:
accuracy_score(Y_test,Y_predict)

0.7885562000313528

In [27]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.79      1.00      0.88     75453
           1       0.00      0.00      0.00     20232

    accuracy                           0.79     95685
   macro avg       0.39      0.50      0.44     95685
weighted avg       0.62      0.79      0.70     95685



In [28]:
f1_score(Y_test,Y_predict)

0.0

In [29]:
Y_predict_prob = LG_1.predict_proba(X_test)
Y_predict_prob

array([[0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5],
       ...,
       [0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5]])

In [32]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     75453
           1       0.21      1.00      0.35     20232

    accuracy                           0.21     95685
   macro avg       0.11      0.50      0.17     95685
weighted avg       0.04      0.21      0.07     95685



##  Hyper parameter with Cross validation (select best hyper parameter settings with multifold validations)


In [33]:
param_grid = {'penalty':['l1', 'l2', 'elasticnet', None],
              'C':np.linspace(0.01,6,200),
              'class_weight':['balanced',None],
              'solver' : ['liblinear','saga']}

In [34]:
Log_Reg_model = LogisticRegression()

Log_Reg_Random_Search = RandomizedSearchCV(Log_Reg_model,
                                          param_distributions = param_grid,
                                          cv = 5,
                                          n_iter = 25,
                                          scoring = 'roc_auc')



In [None]:
Log_Reg_Random_Search.fit(X_train,Y_train)

In [None]:
Log_Reg_Random_Search.best_estimator_

In [None]:
param_grid = {'penalty':['l1', 'l2', 'elasticnet', None],
              'C':np.linspace(0.1,1,25),
              'class_weight':['balanced',None],
              'solver' : ['liblinear','saga']}
Log_Reg_model = LogisticRegression()

Log_Reg_Grid_Search = GridSearchCV(Log_Reg_model,
                             param_grid = param_grid,
                             cv = 5,
                             scoring = 'roc_auc')
Log_Reg_Grid_Search.fit(X_train,Y_train)

In [None]:
Log_Reg_Grid_Search.best_estimator_

In [None]:
Log_Reg_model = Log_Reg_Grid_Search.best_estimator_

In [None]:
Log_Reg_model.fit(X_train,Y_train)
Y_predict = Log_Reg_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3421,  320],
       [ 527, 1732]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      3741
           1       0.84      0.77      0.80      2259

    accuracy                           0.86      6000
   macro avg       0.86      0.84      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8035258640686614

In [None]:
Y_predict_prob = Log_Reg_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict)

0.42522522522522527

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      3741
           1       0.80      0.84      0.82      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.85      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
dtree_model = DecisionTreeClassifier()
dtree_model_cv_scores = cross_val_score(dtree_model,X_train,Y_train,cv=5,scoring='roc_auc')
dtree_model_cv_scores.mean() #uncontrolled decision tree's validation performance in auc is less than logistic

0.7694991694046608

In [None]:
dtree_model.fit(X_train,Y_train)
Y_predict = dtree_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3089,  652],
       [ 685, 1574]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      3741
           1       0.71      0.70      0.70      2259

    accuracy                           0.78      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.78      0.78      0.78      6000



In [None]:
f1_score(Y_test,Y_predict)

0.7018952062430324

In [None]:
Y_predict_prob = dtree_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict)

0.01

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      3741
           1       0.71      0.70      0.70      2259

    accuracy                           0.78      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.78      0.78      0.78      6000



In [None]:
%%time
rf_model = RandomForestClassifier()
rf_model_cv_scores = cross_val_score(rf_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
rf_model_cv_scores.mean()
#slower processing compared to logistic, dtree
#better validation result

CPU times: user 409 ms, sys: 152 ms, total: 561 ms
Wall time: 37.1 s


0.884275076264538

In [None]:
rf_model.fit(X_train,Y_train)
Y_predict = rf_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3407,  334],
       [ 509, 1750]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      3741
           1       0.84      0.77      0.81      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.84      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8058945429426665

In [None]:
Y_predict_prob = rf_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.40045045045045047

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      3741
           1       0.80      0.83      0.82      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.85      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
%%time
et_model = ExtraTreesClassifier()
et_model_cv_scores = cross_val_score(et_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
et_model_cv_scores.mean()
#faster processing compared to randomforest
#better validation result than logistic, dtree

CPU times: user 200 ms, sys: 55.6 ms, total: 255 ms
Wall time: 18.9 s


0.8613057313753556

In [None]:
et_model.fit(X_train,Y_train)
Y_predict = et_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3431,  310],
       [ 864, 1395]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.80      0.92      0.85      3741
           1       0.82      0.62      0.70      2259

    accuracy                           0.80      6000
   macro avg       0.81      0.77      0.78      6000
weighted avg       0.81      0.80      0.80      6000



In [None]:
f1_score(Y_test,Y_predict)

0.7038345105953582

In [None]:
Y_predict_prob = et_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.4202702702702703

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      3741
           1       0.74      0.76      0.75      2259

    accuracy                           0.81      6000
   macro avg       0.80      0.80      0.80      6000
weighted avg       0.81      0.81      0.81      6000



In [None]:
%%time
ada_model = AdaBoostClassifier()
ada_model_cv_scores = cross_val_score(ada_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
ada_model_cv_scores.mean()

CPU times: user 160 ms, sys: 36.5 ms, total: 196 ms
Wall time: 14.5 s


0.8810854317302965

In [None]:
ada_model.fit(X_train,Y_train)
Y_predict = ada_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3398,  343],
       [ 588, 1671]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88      3741
           1       0.83      0.74      0.78      2259

    accuracy                           0.84      6000
   macro avg       0.84      0.82      0.83      6000
weighted avg       0.84      0.84      0.84      6000



In [None]:
f1_score(Y_test,Y_predict)

0.7821202901942429

In [None]:
Y_predict_prob = ada_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.4975675675675676

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.90      0.85      0.87      3741
           1       0.77      0.84      0.80      2259

    accuracy                           0.84      6000
   macro avg       0.83      0.84      0.83      6000
weighted avg       0.85      0.84      0.84      6000



In [None]:
%%time
gb_model = GradientBoostingClassifier()
gb_model_cv_scores = cross_val_score(gb_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
gb_model_cv_scores.mean()

CPU times: user 614 ms, sys: 125 ms, total: 739 ms
Wall time: 1min 6s


0.8889860046791135

In [None]:
gb_model.fit(X_train,Y_train)
Y_predict = gb_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3405,  336],
       [ 462, 1797]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90      3741
           1       0.84      0.80      0.82      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.85      0.86      6000
weighted avg       0.87      0.87      0.87      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8183060109289617

In [None]:
Y_predict_prob = gb_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.4242342342342343

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89      3741
           1       0.82      0.84      0.83      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.86      0.86      6000
weighted avg       0.87      0.87      0.87      6000



In [None]:
%%time
xgb_model = XGBClassifier()
xgb_model_cv_scores = cross_val_score(xgb_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
xgb_model_cv_scores.mean()

CPU times: user 108 ms, sys: 21.2 ms, total: 130 ms
Wall time: 5.64 s


0.8829256553207896

In [None]:
xgb_model.fit(X_train,Y_train)
Y_predict = xgb_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3379,  362],
       [ 433, 1826]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.89      0.90      0.89      3741
           1       0.83      0.81      0.82      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.86      0.86      6000
weighted avg       0.87      0.87      0.87      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8212277940184395

In [None]:
Y_predict_prob = xgb_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.42225225225225227

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89      3741
           1       0.82      0.83      0.82      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.86      0.86      6000
weighted avg       0.87      0.87      0.87      6000



In [None]:
%%time
lgb_model = LGBMClassifier()
lgb_model_cv_scores = cross_val_score(lgb_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
lgb_model_cv_scores.mean()

CPU times: user 120 ms, sys: 15 ms, total: 135 ms
Wall time: 8.55 s


0.8891084378389884

In [None]:
lgb_model.fit(X_train,Y_train)
Y_predict = lgb_model.predict(X_test)

[LightGBM] [Info] Number of positive: 8986, number of negative: 15014
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5184
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374417 -> initscore=-0.513315
[LightGBM] [Info] Start training from score -0.513315


In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3411,  330],
       [ 416, 1843]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      3741
           1       0.85      0.82      0.83      2259

    accuracy                           0.88      6000
   macro avg       0.87      0.86      0.87      6000
weighted avg       0.88      0.88      0.88      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8316787003610107

In [None]:
Y_predict_prob = lgb_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.35981981981981986

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      3741
           1       0.82      0.85      0.84      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.87      0.87      6000
weighted avg       0.88      0.87      0.87      6000



In [None]:
%%time
cb_model = CatBoostClassifier()
cb_model_cv_scores = cross_val_score(cb_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
cb_model_cv_scores.mean()

CPU times: user 910 ms, sys: 148 ms, total: 1.06 s
Wall time: 1min 33s


0.8909371352020707

In [None]:
cb_model.fit(X_train,Y_train)
Y_predict = cb_model.predict(X_test)

Learning rate set to 0.040021
0:	learn: 0.6675296	total: 64.9ms	remaining: 1m 4s
1:	learn: 0.6452212	total: 78.3ms	remaining: 39s
2:	learn: 0.6254495	total: 91ms	remaining: 30.3s
3:	learn: 0.6059213	total: 104ms	remaining: 26s
4:	learn: 0.5880209	total: 118ms	remaining: 23.5s
5:	learn: 0.5718522	total: 133ms	remaining: 22.1s
6:	learn: 0.5597428	total: 147ms	remaining: 20.9s
7:	learn: 0.5461153	total: 161ms	remaining: 20s
8:	learn: 0.5334705	total: 177ms	remaining: 19.5s
9:	learn: 0.5231118	total: 190ms	remaining: 18.9s
10:	learn: 0.5126780	total: 207ms	remaining: 18.6s
11:	learn: 0.5032010	total: 221ms	remaining: 18.2s
12:	learn: 0.4956384	total: 237ms	remaining: 18s
13:	learn: 0.4877502	total: 251ms	remaining: 17.7s
14:	learn: 0.4801944	total: 264ms	remaining: 17.4s
15:	learn: 0.4732024	total: 278ms	remaining: 17.1s
16:	learn: 0.4675784	total: 291ms	remaining: 16.8s
17:	learn: 0.4622923	total: 305ms	remaining: 16.7s
18:	learn: 0.4569810	total: 319ms	remaining: 16.5s
19:	learn: 0.45215

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3422,  319],
       [ 428, 1831]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      3741
           1       0.85      0.81      0.83      2259

    accuracy                           0.88      6000
   macro avg       0.87      0.86      0.87      6000
weighted avg       0.87      0.88      0.87      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8305738262644592

In [None]:
Y_predict_prob = cb_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.39054054054054055

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      3741
           1       0.82      0.85      0.84      2259

    accuracy                           0.87      6000
   macro avg       0.87      0.87      0.87      6000
weighted avg       0.88      0.87      0.88      6000



In [None]:
roc_auc_score(Y_test,Y_predict_prob[:,1])

0.8837418747002546

In [None]:
accuracy_score(Y_test,Y_predict)

0.8746666666666667

In [None]:
%%time
dtree_clf_model = DecisionTreeClassifier()

dtree_clf_params = {
    'class_weight':[None,'balanced'],
    'criterion':['entropy','gini'],
    'max_depth':[None,5,10,15,20,30,50,70],
    'min_samples_leaf':[1,2,5,10,15,20],
    'min_samples_split':[2,5,10,15,20]
}

dtree_clf_random_search = RandomizedSearchCV(dtree_clf_model,param_distributions=dtree_clf_params,
                                             cv = 5, n_iter=10,scoring='roc_auc',n_jobs=-1)

dtree_clf_random_search.fit(X_train,Y_train)
dtree_clf_random_search.best_score_

CPU times: user 1.61 s, sys: 86.9 ms, total: 1.7 s
Wall time: 37.2 s


0.8724810232909943

In [None]:
dtree_clf_model = dtree_clf_random_search.best_estimator_

In [None]:
dtree_clf_model.fit(X_train,Y_train)
Y_predict = dtree_clf_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3361,  380],
       [ 475, 1784]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      3741
           1       0.82      0.79      0.81      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.84      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8066922903007009

In [None]:
Y_predict_prob = dtree_clf_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.4450450450450451

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.89      0.89      3741
           1       0.82      0.80      0.81      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.85      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
%%time
rf_clf_model = RandomForestClassifier()

rf_clf_params = {
    'n_estimators':[100,200,300,500,700,1000], #number of individual decision trees to be created
    'max_features': [5,10,20,25,30,35], #how many features would be available at a split
    'bootstrap': [True, False], #should different data subsets go in or not
    'class_weight':[None,'balanced'],
    'criterion':['entropy','gini'],
    'max_depth':[None,5,10,15,20,30,50,70],
    'min_samples_leaf':[1,2,5,10,15,20],
    'min_samples_split':[2,5,10,15,20]
}

rf_clf_random_search = RandomizedSearchCV(rf_clf_model,param_distributions=rf_clf_params,
                                             cv = 5, n_jobs=-1, n_iter=10,scoring='roc_auc')

rf_clf_random_search.fit(X_train,Y_train)
rf_clf_random_search.best_score_

CPU times: user 1min 39s, sys: 7.47 s, total: 1min 46s
Wall time: 1h 13min 26s


0.889465651526206

In [None]:
rf_clf_model = dtree_clf_random_search.best_estimator_

In [None]:
rf_clf_model.fit(X_train,Y_train)
Y_predict = rf_clf_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3363,  378],
       [ 475, 1784]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      3741
           1       0.83      0.79      0.81      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.84      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8070572268717485

In [None]:
Y_predict_prob = rf_clf_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.4450450450450451

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.89      0.89      3741
           1       0.82      0.80      0.81      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.85      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
%%time
cb_clf_model = CatBoostClassifier()

cb_clf_params = {
   'n_estimators':[50,100,200,500,700],
   'learning_rate': [0.01,.05,0.1,0.4,0.8,1],
   'max_depth':[1,2,3,4,5,6],
   'subsample':[0.5,0.8,1],
   'colsample_bylevel':[i/10.0 for i in range(5,10)],
   'l2_leaf_reg':[1e-5, 1e-2, 0.1, 1, 100]
}

cb_clf_random_search = RandomizedSearchCV(cb_clf_model,param_distributions=cb_clf_params,
                                             cv = 5, n_iter=2,scoring='roc_auc',n_jobs=-1)

cb_clf_random_search.fit(X_train,Y_train)
cb_clf_random_search.best_score_

0:	learn: 0.4910770	total: 18.2ms	remaining: 12.7s
1:	learn: 0.4631929	total: 38.5ms	remaining: 13.4s
2:	learn: 0.4439984	total: 49.3ms	remaining: 11.5s
3:	learn: 0.4326397	total: 63ms	remaining: 11s
4:	learn: 0.4267872	total: 73.5ms	remaining: 10.2s
5:	learn: 0.4221899	total: 83.2ms	remaining: 9.62s
6:	learn: 0.4192316	total: 98.6ms	remaining: 9.76s
7:	learn: 0.4183684	total: 112ms	remaining: 9.64s
8:	learn: 0.4124583	total: 127ms	remaining: 9.78s
9:	learn: 0.4108027	total: 140ms	remaining: 9.63s
10:	learn: 0.4096621	total: 146ms	remaining: 9.12s
11:	learn: 0.4084841	total: 151ms	remaining: 8.64s
12:	learn: 0.4064864	total: 156ms	remaining: 8.23s
13:	learn: 0.4056487	total: 161ms	remaining: 7.91s
14:	learn: 0.4043958	total: 167ms	remaining: 7.63s
15:	learn: 0.4037841	total: 173ms	remaining: 7.38s
16:	learn: 0.4033970	total: 179ms	remaining: 7.18s
17:	learn: 0.4029414	total: 184ms	remaining: 6.99s
18:	learn: 0.4021635	total: 190ms	remaining: 6.82s
19:	learn: 0.4018177	total: 196ms	rema

0.8647770375760011

In [None]:
cb_clf_model = cb_clf_random_search.best_estimator_

In [None]:
cb_clf_model.fit(X_train,Y_train)
Y_predict = cb_clf_model.predict(X_test)

0:	learn: 0.4910770	total: 13.1ms	remaining: 9.16s
1:	learn: 0.4631929	total: 19ms	remaining: 6.64s
2:	learn: 0.4439984	total: 26.6ms	remaining: 6.18s
3:	learn: 0.4326397	total: 32.9ms	remaining: 5.72s
4:	learn: 0.4267872	total: 38.5ms	remaining: 5.35s
5:	learn: 0.4221899	total: 43.9ms	remaining: 5.08s
6:	learn: 0.4192316	total: 49.5ms	remaining: 4.9s
7:	learn: 0.4183684	total: 54.7ms	remaining: 4.73s
8:	learn: 0.4124583	total: 60.2ms	remaining: 4.62s
9:	learn: 0.4108027	total: 65.6ms	remaining: 4.53s
10:	learn: 0.4096621	total: 71.4ms	remaining: 4.47s
11:	learn: 0.4084841	total: 76.6ms	remaining: 4.39s
12:	learn: 0.4064864	total: 84.3ms	remaining: 4.46s
13:	learn: 0.4056487	total: 90.1ms	remaining: 4.42s
14:	learn: 0.4043958	total: 95.7ms	remaining: 4.37s
15:	learn: 0.4037841	total: 102ms	remaining: 4.35s
16:	learn: 0.4033970	total: 107ms	remaining: 4.32s
17:	learn: 0.4029414	total: 113ms	remaining: 4.28s
18:	learn: 0.4021635	total: 119ms	remaining: 4.26s
19:	learn: 0.4018177	total: 1

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3303,  438],
       [ 573, 1686]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87      3741
           1       0.79      0.75      0.77      2259

    accuracy                           0.83      6000
   macro avg       0.82      0.81      0.82      6000
weighted avg       0.83      0.83      0.83      6000



In [None]:
f1_score(Y_test,Y_predict)

0.7693360711841204

In [None]:
Y_predict_prob = cb_clf_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.39351351351351355

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      3741
           1       0.76      0.79      0.78      2259

    accuracy                           0.83      6000
   macro avg       0.82      0.82      0.82      6000
weighted avg       0.83      0.83      0.83      6000



In [None]:
temp = X_train.columns

In [None]:
scalar = StandardScaler()

X_train = scalar.fit_transform(X_train)
X_train = pd.DataFrame(data = X_train, columns = temp)
X_train.head()

Unnamed: 0,var3,var4,var5,var6,var7,var8,var12,var14,var15,var16,...,var23_tf,var23_ub,var23_qu,var23_ri,var23_fe,var23_da,var23_cz,var23_sy,var23_yv,var29_ev
0,1.795857,-0.92554,0.212231,-1.845218,2.16426,0.107453,0.164472,0.874381,0.814711,1.301922,...,1.690716,-0.517781,-0.444259,-0.386983,-0.315444,-0.265377,-0.210141,-0.15186,-0.10363,-1.995584
1,0.994893,-0.617507,-0.646956,0.710405,1.021448,-0.450844,-0.078446,-0.263682,-1.304482,-0.196447,...,-0.591465,1.93132,-0.444259,-0.386983,-0.315444,-0.265377,-0.210141,-0.15186,-0.10363,0.501107
2,-0.341154,0.928364,0.358946,1.471601,0.108987,1.240615,-1.172287,-0.980612,-0.757384,2.261316,...,-0.591465,-0.517781,-0.444259,2.584094,-0.315444,-0.265377,-0.210141,-0.15186,-0.10363,0.501107
3,-0.111833,-0.886323,-1.113731,-0.344039,0.197326,1.275405,0.40739,-0.925464,0.023932,0.517626,...,-0.591465,-0.517781,-0.444259,-0.386983,3.17013,-0.265377,-0.210141,-0.15186,-0.10363,0.501107
4,-0.18495,-1.549451,-0.526871,1.278182,0.636783,-0.097974,1.59783,-0.594573,-0.8532,-2.03407,...,-0.591465,-0.517781,2.250941,-0.386983,-0.315444,-0.265377,-0.210141,-0.15186,-0.10363,0.501107


In [None]:
knn_model = KNeighborsClassifier(10)
knn_cv_scores= cross_val_score(knn_model,X_train.values,Y_train,cv=5,scoring='roc_auc')
knn_cv_scores.mean()

0.7112238648031439

In [None]:
svm_model = SVC()
svm_cv_scores= cross_val_score(svm_model,X_train,Y_train,cv=5,scoring='roc_auc')
svm_cv_scores.mean()

0.8814048549704323

In [None]:
%%time
svm_model = SVC()

svm_params = {
   'C':[1,5,10], #to control regularization
   'kernel': ['linear','poly','rbf','sigmoid'],
   'degree':[3,4,5],
   'gamma':['scale','auto'],
   'class_weight' :['balanced',None]
}

svm_random_search = RandomizedSearchCV(svm_model,param_distributions=svm_params,cv = 5, n_iter=10,scoring='roc_auc',
                                         n_jobs=-1)

svm_random_search.fit(X_train,Y_train)
svm_random_search.best_score_

CPU times: user 21min 32s, sys: 14.6 s, total: 21min 47s
Wall time: 2h 24min 2s


0.8853397990828054

## Based on the above cb_model and lgb_model are two model we can consider for classification of Consumer Complaints Resolution. 