# Predictions using different models


In [43]:
import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import seaborn as sns
from sklearn.metrics import recall_score



In [2]:
%store -r X_train
%store -r y_train

%store -r X_test
%store -r y_test

%store -r X_val
%store -r y_val

In [3]:
def custom_recall(y_true, y_pred):
    return recall_score(y_true, y_pred, average='weighted')

In [4]:
print(X_train.shape)
X_train.head()

(360642, 38)


Unnamed: 0,FLAG_OWN_CAR,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,EXT_SOURCE_2,FLAG_DOCUMENT_3,ANNUITY_INCOME_RATIO,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,NAME_INCOME_TYPE_Unemployed,NAME_EDUCATION_TYPE_Academic degree,...,avg_DAYS_DECISION_iqr_Q2,avg_AMT_DOWN_PAYMENT_iqr_Q2,ENTRANCES_MEDI_iqr_missing,EXT_SOURCE_1_iqr_Q1,EXT_SOURCE_1_iqr_Q3,EXT_SOURCE_1_iqr_missing,max_num_days_of_payment_late_iqr_Q2,COMMONAREA_AVG_iqr_missing,max_DAYS_ENDDATE_FACT_iqr_missing,OWN_CAR_AGE_iqr_Q3
0,1.0,1.0,0.0,0.568746,1.0,0.640187,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,0.0,1.0,0.0,0.583768,1.0,0.737624,0,1,0,0,...,1,1,1,0,0,1,1,1,1,0
2,0.0,0.5,0.0,0.683768,0.0,0.442641,1,1,0,0,...,1,0,0,0,0,1,0,0,0,0
3,0.0,0.5,0.0,0.450003,0.0,0.564583,1,1,0,0,...,0,1,0,1,0,0,1,1,1,0
4,0.0,0.5,1.0,0.733906,1.0,0.68634,0,1,0,0,...,1,1,0,0,0,1,1,0,0,0


In [5]:
X_train.dtypes

FLAG_OWN_CAR                                    float64
REGION_RATING_CLIENT_W_CITY                     float64
REG_CITY_NOT_LIVE_CITY                          float64
EXT_SOURCE_2                                    float64
FLAG_DOCUMENT_3                                 float64
ANNUITY_INCOME_RATIO                            float64
NAME_CONTRACT_TYPE_Revolving loans                uint8
CODE_GENDER_F                                     uint8
NAME_INCOME_TYPE_Unemployed                       uint8
NAME_EDUCATION_TYPE_Academic degree               uint8
NAME_EDUCATION_TYPE_Lower secondary               uint8
NAME_FAMILY_STATUS_Civil marriage                 uint8
count_NAME_PORTFOLIO_POS_0.0                      uint8
CNT_CHILDREN_CAT_Other                            uint8
count_CREDIT_CURRENCY_currency2_CAT_Other         uint8
count_CREDIT_TYPE_Car_loan_CAT_Other              uint8
avg_SK_DPD_DEF_CAT_Other                          uint8
avg_AMT_CREDIT_SUM_DEBT_iqr_Q1                  

In [6]:
# initiate the model's selection dataframe
res = pd.DataFrame()

# Linear Regression (Ridge)

In [7]:
mod1 = LogisticRegression(penalty='l2')
mod1.fit(X_train,y_train)

mod1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
pred1 = mod1.predict(X_val)

In [9]:
res = res.append({'model':'Logistic Regression (Ridge)','custom_recall':custom_recall(y_val,pred1)},ignore_index=True)

  res = res.append({'model':'Logistic Regression (Ridge)','custom_recall':custom_recall(y_val,pred1)},ignore_index=True)


In [10]:
res

Unnamed: 0,model,custom_recall
0,Logistic Regression (Ridge),0.66388


# LASSO

In [11]:
mod3 = LogisticRegression(penalty='l1',solver= 'liblinear', random_state=10)
mod3.fit(X_train,y_train)
mod3

In [12]:
pred3 = mod3.predict(X_val)

In [13]:
res = res.append({'model':'LASSO','custom_recall':custom_recall(y_val,pred3)},ignore_index=True)

  res = res.append({'model':'LASSO','custom_recall':custom_recall(y_val,pred3)},ignore_index=True)


## Decision Trees

In [14]:
mod5 = DecisionTreeClassifier(random_state=2)
mod5.fit(X_train,y_train)

In [15]:
pred5 = mod5.predict(X_val)

In [16]:
res = res.append({'model':'Decision Tree','custom_recall':custom_recall(y_val,pred5)},ignore_index=True)

  res = res.append({'model':'Decision Tree','custom_recall':custom_recall(y_val,pred5)},ignore_index=True)


In [17]:
res

Unnamed: 0,model,custom_recall
0,Logistic Regression (Ridge),0.66388
1,LASSO,0.663697
2,Decision Tree,0.86021


## Random Forest

In [18]:
mod6 = RandomForestClassifier(random_state=3)

mod6.fit(X_train,y_train)

In [19]:
pred6 = mod6.predict(X_val)

In [20]:
res = res.append({'model':'Random Forest','custom_recall':custom_recall(y_val,pred6)},ignore_index=True)

  res = res.append({'model':'Random Forest','custom_recall':custom_recall(y_val,pred6)},ignore_index=True)


In [21]:
res

Unnamed: 0,model,custom_recall
0,Logistic Regression (Ridge),0.66388
1,LASSO,0.663697
2,Decision Tree,0.86021
3,Random Forest,0.915078


## Adaptive Boosting (ADABoost)

In [22]:
mod7 = AdaBoostClassifier(random_state=1)

mod7.fit(X_train,y_train)

In [23]:
pred7 = mod7.predict(X_val)

In [24]:
res = res.append({'model':'AdaBoost','custom_recall':custom_recall(y_val,pred7)},ignore_index=True)

  res = res.append({'model':'AdaBoost','custom_recall':custom_recall(y_val,pred7)},ignore_index=True)


In [25]:
res

Unnamed: 0,model,custom_recall
0,Logistic Regression (Ridge),0.66388
1,LASSO,0.663697
2,Decision Tree,0.86021
3,Random Forest,0.915078
4,AdaBoost,0.66176


## Gradient Boosting Machine

In [26]:
mod8 = GradientBoostingClassifier(random_state=2)

mod8.fit(X_train,y_train)

In [27]:
pred8 = mod8.predict(X_val)

In [28]:
res = res.append({'model':'Gradient Boosting','custom_recall':custom_recall(y_val,pred8)},ignore_index=True)

  res = res.append({'model':'Gradient Boosting','custom_recall':custom_recall(y_val,pred8)},ignore_index=True)


In [29]:
res

Unnamed: 0,model,custom_recall
0,Logistic Regression (Ridge),0.66388
1,LASSO,0.663697
2,Decision Tree,0.86021
3,Random Forest,0.915078
4,AdaBoost,0.66176
5,Gradient Boosting,0.664512


# kNN


In [30]:
mod9 = KNeighborsClassifier()


In [31]:
mod9.fit(X_train,y_train)

In [38]:
pred9 = mod9.predict(X_val.values)



In [39]:
res = res.append({'model':'kNN','custom_recall':custom_recall(y_val,pred9)},ignore_index=True)

  res = res.append({'model':'kNN','custom_recall':custom_recall(y_val,pred9)},ignore_index=True)


In [40]:
res

Unnamed: 0,model,custom_recall
0,AdaBoost,0.66176
1,LASSO,0.663697
2,Logistic Regression (Ridge),0.66388
3,Gradient Boosting,0.664512
4,SVC,0.689489
5,Decision Tree,0.86021
6,Random Forest,0.915078
7,kNN,0.763095


# SVC



In [34]:
mod10 = SVC()
mod10.fit(X_train,y_train)

In [35]:
pred10 = mod10.predict(X_val)

In [36]:
res = res.append({'model':'SVC','custom_recall':custom_recall(y_val,pred10)},ignore_index=True)

  res = res.append({'model':'SVC','custom_recall':custom_recall(y_val,pred10)},ignore_index=True)


# Model Selection

In [41]:
res = res.sort_values('custom_recall')
print(res)

                         model  custom_recall
0                     AdaBoost       0.661760
1                        LASSO       0.663697
2  Logistic Regression (Ridge)       0.663880
3            Gradient Boosting       0.664512
4                          SVC       0.689489
7                          kNN       0.763095
5                Decision Tree       0.860210
6                Random Forest       0.915078


Based on the custom_recall score, selecting the "Random Forest" model is a reasonable choice as it's likely to provide better recall performance for our specific use case.