In [65]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score , classification_report , precision_recall_fscore_support ,f1_score
import warnings
warnings.filterwarnings('ignore')
import os
import time

In [66]:
print('Program is running......')
print()
start_time = time.time()

Program is running......



In [67]:
# Load the Dataset
a1 = pd.read_excel("E:\Data Science\PythonForDS\Credit Risk Modelling\Credit Modeling\case_study1.xlsx")
a2 = pd.read_excel("E:\Data Science\PythonForDS\Credit Risk Modelling\Credit Modeling\case_study2.xlsx")

df1 = a1.copy()
df2 = a2.copy()


In [68]:
df1.shape

(51336, 26)

In [69]:
df2.shape

(51336, 62)

## DF1

Note :
 `we have -99999 values in the dataset df1 which are basically the null values as some softwares doesnt take null values as input so these numbers are assigned in place of them`
- Columns with these -99999 values are 'Age_Oldest_TL' and 'Age_Newest_TL' and in both the column these values shares the same entries.
- Upon investigation out of 51337 entries only 40 entries have such values we will proceed to drop these rows to increase quality of data.

In [70]:
# Remove null values
df1 = df1.loc[df1['Age_Oldest_TL'] != -99999]

selected only those rows where Age_Oldest_TL doesnt have value -99999

In [71]:
df1.shape

(51296, 26)

## DF2


Note : `we have many -99999 values in the dataset df_encoded2 spread accross different columns`
- We have to understand imputing the missing values in the dataset is technically making an assumption that the missing value may be equal to average/median/mode the feature. WE CANT BE SURE as we are dealing the delinquencies in loans and we can bee completey wrong.
- IN REAL WORLD ALWAYS PLAY IT SAFE
- Therefore we will not impute the missing values in the dataset and we will try to keep as much original data as possible.

OUR DATASET HAS 51337 entries:
- We will check for the missing values in each feature if its more than 10000 that is almost 20% of the dataset we will drop that feature.
- If the missing value is less than 10000 we will drop the rows.

Doing so we will only work it the original data but we have to make sure after droping these rows or feature we should retain our at least 70-80% of the original data. if not it will be a huge data loss.




In [72]:
# checking the columns to drop that has -99999 values more than 10000
columns_to_be_removed = []

for col in df2.columns:
    if df2.loc[df2[col]== -99999].shape[0] > 10000:

        columns_to_be_removed.append(col)

columns_to_be_removed        

['time_since_first_deliquency',
 'time_since_recent_deliquency',
 'max_delinquency_level',
 'max_deliq_6mts',
 'max_deliq_12mts',
 'CC_utilization',
 'PL_utilization',
 'max_unsec_exposure_inPct']

- Droping these Columns

In [73]:
df2 = df2.drop(columns_to_be_removed,axis=1)
df2.shape

(51336, 54)

- The remaining feature will contain -99999 value which would be less than 10000 in count
- We will not include those rows in our final df2

In [74]:
for col in df2.columns:
    df2 = df2.loc[df2[col] != -99999] 

df2.shape    

(42066, 54)

- We have almost 81% of the original data df2 intact which is good and acceptable.


In [75]:
#Just cross checking if there any -99999 value present in df2 
df2.eq(-99999).any().any()

False

In [76]:
# checking for the null values as well
df2.isna().sum().sum()

0

In [77]:
df1.isna().sum().sum()

0

- Null values have been handled completely

#### Next Step would be to merge these two dataset and work on a single final dataset

In [78]:
# checking for any common column in both dataset
for i in list(df1.columns):
    if i in list(df2.columns):
        print(i)

PROSPECTID


- we will use PROSPECTID column to merge two datasets

In [79]:
# using inner join to avoid null values
df = pd.merge(df1,df2, how='inner',left_on = ['PROSPECTID'],right_on=['PROSPECTID'])
df

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,5,4,1,0,0,0.000,0.00,0.200,0.800,...,0.0,0.0,0.000,0.0,1,0,PL,PL,696,P2
1,2,1,0,1,0,0,0.000,0.00,1.000,0.000,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,8,0,8,1,0,0.125,0.00,1.000,0.000,...,0.0,0.0,0.000,0.0,1,0,ConsumerLoan,others,693,P2
3,5,3,2,1,0,0,0.000,0.00,0.333,0.667,...,0.0,0.0,0.000,0.0,0,0,AL,AL,753,P1
4,6,6,5,1,0,0,0.000,0.00,0.167,0.833,...,1.0,0.0,0.429,0.0,1,0,ConsumerLoan,PL,668,P3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42059,51332,3,0,3,1,0,0.333,0.00,1.000,0.000,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,ConsumerLoan,650,P4
42060,51333,4,2,2,0,1,0.000,0.25,0.500,0.500,...,0.0,0.0,0.000,0.0,0,0,others,others,702,P1
42061,51334,2,1,1,1,1,0.500,0.50,0.500,0.500,...,1.0,0.0,1.000,0.0,0,0,ConsumerLoan,others,661,P3
42062,51335,2,1,1,0,0,0.000,0.00,0.500,0.500,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,others,686,P2


In [80]:
df.isna().sum().sum()

0

NOote : Dividing the features into categorical and Numerical and treating them seperately to get a clear understanding of them.

## Catergorical Features

In [81]:
# checking for categorical columns
for col in df.columns:
    if df[col].dtype == 'object':
        print(col)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2
Approved_Flag


In [82]:
df['MARITALSTATUS'].value_counts()

MARITALSTATUS
Married    30886
Single     11178
Name: count, dtype: int64

In [83]:
df['EDUCATION'].value_counts()

EDUCATION
GRADUATE          14140
12TH              11703
SSC                7241
UNDER GRADUATE     4572
OTHERS             2291
POST-GRADUATE      1898
PROFESSIONAL        219
Name: count, dtype: int64

In [84]:
df['GENDER'].value_counts()

GENDER
M    37345
F     4719
Name: count, dtype: int64

In [85]:
df['last_prod_enq2'].value_counts()

last_prod_enq2
ConsumerLoan    16480
others          13653
PL               7553
CC               2195
AL               1353
HL                830
Name: count, dtype: int64

In [86]:
df['first_prod_enq2'].value_counts()

first_prod_enq2
others          20640
ConsumerLoan    11075
PL               4431
AL               2641
CC               1988
HL               1289
Name: count, dtype: int64

In [87]:
df['Approved_Flag'].value_counts()

Approved_Flag
P2    25452
P3     6440
P4     5264
P1     4908
Name: count, dtype: int64

- Checking which of these categorical feature is actually important in predicting the target variable Approved_Flag.
- We can do CHI-SQUARE TEST on the each categorical feature opposed to Approved_Flag. 



In [88]:
# Chi-square Test
for i in ['MARITALSTATUS','EDUCATION','GENDER','last_prod_enq2','first_prod_enq2','Approved_Flag']:
    chi2 , pval,_,_ = chi2_contingency(pd.crosstab(df[i],df['Approved_Flag']))
    print(i, '---',pval)

MARITALSTATUS --- 3.578180861038862e-233
EDUCATION --- 2.6942265249737532e-30
GENDER --- 1.907936100186563e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 7.84997610555419e-287
Approved_Flag --- 0.0


- Since all the categorical feature has p-value <=0.05 , we will accept all of them

## Numerical Features

In [89]:
# checking all the features that are numeric
numeric_column = []
for i in df.columns:
    if df[i].dtype != 'object' and i not in ['PROSPECTID','Approved_Flag']:
        numeric_column.append(i)

numeric_column        

['Total_TL',
 'Tot_Closed_TL',
 'Tot_Active_TL',
 'Total_TL_opened_L6M',
 'Tot_TL_closed_L6M',
 'pct_tl_open_L6M',
 'pct_tl_closed_L6M',
 'pct_active_tl',
 'pct_closed_tl',
 'Total_TL_opened_L12M',
 'Tot_TL_closed_L12M',
 'pct_tl_open_L12M',
 'pct_tl_closed_L12M',
 'Tot_Missed_Pmnt',
 'Auto_TL',
 'CC_TL',
 'Consumer_TL',
 'Gold_TL',
 'Home_TL',
 'PL_TL',
 'Secured_TL',
 'Unsecured_TL',
 'Other_TL',
 'Age_Oldest_TL',
 'Age_Newest_TL',
 'time_since_recent_payment',
 'num_times_delinquent',
 'max_recent_level_of_deliq',
 'num_deliq_6mts',
 'num_deliq_12mts',
 'num_deliq_6_12mts',
 'num_times_30p_dpd',
 'num_times_60p_dpd',
 'num_std',
 'num_std_6mts',
 'num_std_12mts',
 'num_sub',
 'num_sub_6mts',
 'num_sub_12mts',
 'num_dbt',
 'num_dbt_6mts',
 'num_dbt_12mts',
 'num_lss',
 'num_lss_6mts',
 'num_lss_12mts',
 'recent_level_of_deliq',
 'tot_enq',
 'CC_enq',
 'CC_enq_L6m',
 'CC_enq_L12m',
 'PL_enq',
 'PL_enq_L6m',
 'PL_enq_L12m',
 'time_since_recent_enq',
 'enq_L12m',
 'enq_L6m',
 'enq_L3m',

In [90]:
len(numeric_column)

72

- There are 72 features which are numeric and we will check if each of them is associated with target variable 'Approved_Flag' which is categorical in nature
- So the test would be NUM VS CAT every single time
- As we have 4 categories in our target variable we will apply ANOVA Test. 

- But before checking the association to target variable we have to check whether there are any features among these 72 which are associated to each other "MULTICOLLINEARITY".
- If yes we have to remove the multicolinearity.

In [91]:
# Checking VIF for every numeric coumn
vif_data = df[numeric_column]
total_columns = vif_data.shape[1]
columns_to_be_kept = []
column_index = 0

for i in range (0,total_columns):
    vif_value = variance_inflation_factor(vif_data,column_index)
    print(column_index ,"---",vif_value)

    if vif_value < 6:   # Typically kept 6 in banking sector but also depend on the project
        columns_to_be_kept.append(numeric_column[i])
        column_index = column_index + 1

    else:
        vif_data = vif_data.drop([numeric_column[i]], axis=1 )    # droping the column with VIF value >= 6 side by side that way in every iteration there is less and less column



0 --- inf
0 --- inf
0 --- 11.320180023967996
0 --- 8.363698035000336
0 --- 6.520647877790928
0 --- 5.149501618212625
1 --- 2.611111040579735
2 --- inf
2 --- 1788.7926256209232
2 --- 8.601028256477228
2 --- 3.8328007921530785
3 --- 6.099653381646727
3 --- 5.5813520096427585
4 --- 1.985584353098778
5 --- inf
5 --- 4.809538302819343
6 --- 23.270628983464636
6 --- 30.595522588100053
6 --- 4.384346405965583
7 --- 3.064658415523423
8 --- 2.898639771299253
9 --- 4.377876915347324
10 --- 2.2078535836958433
11 --- 4.916914200506864
12 --- 5.214702030064725


13 --- 3.3861625024231476
14 --- 7.840583309478997
14 --- 5.255034641721438
15 --- inf
15 --- 7.380634506427232
15 --- 1.4210050015175733
16 --- 8.083255010190323
16 --- 1.624122752404011
17 --- 7.257811920140003
17 --- 15.59624383268298
17 --- 1.825857047132431
18 --- 1.5080839450032664
19 --- 2.172088834824577
20 --- 2.623397553527229
21 --- 2.2959970812106167
22 --- 7.360578319196439
22 --- 2.160238777310255
23 --- 2.8686288267891467
24 --- 6.458218003637277
24 --- 2.8474118865638265
25 --- 4.753198156284083
26 --- 16.22735475594825
26 --- 6.424377256363877
26 --- 8.887080381808687
26 --- 2.3804746142952653
27 --- 8.609513476514548
27 --- 13.06755093547673
27 --- 3.5000400566546555
28 --- 1.9087955874813773
29 --- 17.006562234161628
29 --- 10.730485153719197
29 --- 2.3538497522950275
30 --- 22.104855915136433
30 --- 2.7971639638512906
31 --- 3.424171203217696
32 --- 10.175021454450922
32 --- 6.408710354561301
32 --- 1.001151196262562
33 --- 3.069197305397274
34 --- 2.809126160064372

In [92]:
len(columns_to_be_kept)

39

In [93]:
vif_data.shape

(42064, 39)

NOte : Now as we have 39 numeric columns let test each one against our target variable to see whether or not they are good associated predictors
- We will achieve this using ANOVA test : NUM vs CAT 

## ANOVA Test

In [94]:
from scipy.stats import f_oneway
columns_to_be_kept_numerical = []

for i in columns_to_be_kept:
    a = list(df[i])
    b = list(df['Approved_Flag'])

    group_P1 = [value  for value, group in zip(a,b) if group =='P1']
    group_P2 = [value  for value, group in zip(a,b) if group =='P2']
    group_P3 = [value  for value, group in zip(a,b) if group =='P3']
    group_P4 = [value  for value, group in zip(a,b) if group =='P4']

    f_statistic,p_value =f_oneway(group_P1,group_P2,group_P3,group_P4)

    if p_value<=0.05:
        columns_to_be_kept_numerical.append(i)

In [95]:
len(columns_to_be_kept_numerical)

37

- We are down to 37 numeric columns from 39 

## Feature Engineering for both categorical and numerical variables


In [96]:
# all the essential features
features = columns_to_be_kept_numerical + ['MARITALSTATUS','EDUCATION','GENDER','last_prod_enq2','first_prod_enq2']

df = df[features +  ['Approved_Flag']]

In [97]:
df['EDUCATION'].unique()

array(['12TH', 'GRADUATE', 'SSC', 'POST-GRADUATE', 'UNDER GRADUATE',
       'OTHERS', 'PROFESSIONAL'], dtype=object)

NOte : 
- OTHERS IN THIS CASE WAS DUE TO NON SUBMISSION OF PROPER DOCUMENT HENCE IT WOULD BE BETTER TO ENCODE TO 1 . BUT IT WILL DEFFERENT IN DIFFERENT CASES WHICH NEEDS TO BE VERIFIED BEFORE ENCODING IT.
- GRADUATE IS SAME AS UNDER-GRADUATE HENCE ASSIGNING IT THE SAME CODE. 

In [98]:
# Encoding Education column
df.loc[df['EDUCATION'] == 'SSC',['EDUCATION']] = 1
df.loc[df['EDUCATION'] == '12TH',['EDUCATION']] = 2
df.loc[df['EDUCATION'] == 'GRADUATE',['EDUCATION']] = 3
df.loc[df['EDUCATION'] == 'UNDER GRADUATE',['EDUCATION']] = 3
df.loc[df['EDUCATION'] == 'POST-GRADUATE',['EDUCATION']] = 4
df.loc[df['EDUCATION'] == 'OTHERS',['EDUCATION']] = 1
df.loc[df['EDUCATION'] == 'PROFESSIONAL',['EDUCATION']] = 3

df['EDUCATION'].value_counts()
df['EDUCATION'] = df['EDUCATION'].astype(int)
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            42064 non-null  float64
 1   pct_tl_closed_L6M          42064 non-null  float64
 2   Tot_TL_closed_L12M         42064 non-null  int64  
 3   pct_tl_closed_L12M         42064 non-null  float64
 4   Tot_Missed_Pmnt            42064 non-null  int64  
 5   CC_TL                      42064 non-null  int64  
 6   Home_TL                    42064 non-null  int64  
 7   PL_TL                      42064 non-null  int64  
 8   Secured_TL                 42064 non-null  int64  
 9   Unsecured_TL               42064 non-null  int64  
 10  Other_TL                   42064 non-null  int64  
 11  Age_Oldest_TL              42064 non-null  int64  
 12  Age_Newest_TL              42064 non-null  int64  
 13  time_since_recent_payment  42064 non-null  int

In [99]:
# On Hot encoding the rest of categorical variables
df_encoded = pd.get_dummies(df,columns=['MARITALSTATUS','GENDER','last_prod_enq2','first_prod_enq2'],drop_first=True)
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 51 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               42064 non-null  float64
 1   pct_tl_closed_L6M             42064 non-null  float64
 2   Tot_TL_closed_L12M            42064 non-null  int64  
 3   pct_tl_closed_L12M            42064 non-null  float64
 4   Tot_Missed_Pmnt               42064 non-null  int64  
 5   CC_TL                         42064 non-null  int64  
 6   Home_TL                       42064 non-null  int64  
 7   PL_TL                         42064 non-null  int64  
 8   Secured_TL                    42064 non-null  int64  
 9   Unsecured_TL                  42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  Age_Oldest_TL                 42064 non-null  int64  
 12  Age_Newest_TL                 42064 non-null  int64  
 13  t

# Mdel Fitting

#### Data Processing

In [100]:
# 1. Random Forest

x = df_encoded.drop(['Approved_Flag'],axis =1)
y = df_encoded['Approved_Flag']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

rf_classifier = RandomForestClassifier(n_estimators = 200 , random_state = 42)

rf_classifier.fit(x_train,y_train)

y_pred = rf_classifier.predict(x_test)

accuracy = accuracy_score(y_test , y_pred)
print()
print(f"Accuracy : {accuracy}")
print()

precision , recall , f1_score , _ = precision_recall_fscore_support(y_test , y_pred)

for i, v in enumerate (['p1','p2','p3','p4']):
    print(f"Class {v}:")
    print(f"Precision : {precision[i]}")
    print(f"Recall : {recall[i]}")
    print(f"F1 Score {f1_score[i]}")
    print()



Accuracy : 0.7622726732437893

Class p1:
Precision : 0.8368544600938967
Recall : 0.703155818540434
F1 Score 0.7642015005359056

Class p2:
Precision : 0.7959252971137522
Recall : 0.9292368681863231
F1 Score 0.8574302697759487

Class p3:
Precision : 0.42121684867394693
Recall : 0.2037735849056604
F1 Score 0.2746693794506613

Class p4:
Precision : 0.7203883495145631
Recall : 0.7210884353741497
F1 Score 0.7207382224380767



In [101]:
# 2. xgboost

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',  num_class=4)



y = df_encoded['Approved_Flag']
x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )


label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)




xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print ()
print(f'Accuracy: {accuracy:.2f}')
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()







Accuracy: 0.78

Class p1:
Precision: 0.825346112886049
Recall: 0.7642998027613412
F1 Score: 0.7936507936507936

Class p2:
Precision: 0.8238249013275923
Recall: 0.9102081268582756
F1 Score: 0.8648648648648648

Class p3:
Precision: 0.46108490566037735
Recall: 0.29509433962264153
F1 Score: 0.3598711458812701

Class p4:
Precision: 0.7262357414448669
Recall: 0.7424684159378037
F1 Score: 0.7342623738587217



In [102]:

# 3. Decision Tree
from sklearn.tree import DecisionTreeClassifier


y = df_encoded['Approved_Flag']
x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print ()
print(f"Accuracy: {accuracy:.2f}")
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()



Accuracy: 0.71

Class p1:
Precision: 0.7198443579766537
Recall: 0.7297830374753451
F1 Score: 0.7247796278158668

Class p2:
Precision: 0.8102843786521231
Recall: 0.8245787908820614
F1 Score: 0.8173690932311621

Class p3:
Precision: 0.34716679968076614
Recall: 0.3283018867924528
F1 Score: 0.3374709076803724

Class p4:
Precision: 0.6432865731462926
Recall: 0.6239067055393586
F1 Score: 0.6334484459792796



- As XGBoost is realtively better performing we will try to hypertune the parameters and try doing further feature enginerring , see some graphs, scale the data and more. and try to better the model performance.

In [103]:
df_encoded['Approved_Flag'].value_counts()


Approved_Flag
P2    25452
P3     6440
P4     5264
P1     4908
Name: count, dtype: int64

- By looking at the class distribution P2 seems to be the hightest and rest P1,P3 and P4 are distributed a balance fashion.
- We cant call it a highly imbalanced data or skewed data hence we can use accuracy as a evaluation/loss metric. 

In [104]:
df_encoded['Approved_Flag'].value_counts()/len(df_encoded)

Approved_Flag
P2    0.605078
P3    0.153100
P4    0.125143
P1    0.116679
Name: count, dtype: float64

## HyperParameter Tuning

- Mannual Tuning the XGBOost

In [106]:
# Defining the hyperparameter grid
param_grid = {
    'colsample_bytree' : [0.1,0.3,0.5,0.7,0.9],
    'learning_rate': [0.001,0.01,0.1,1],
    'max_depth' :[3,5,8,10],
    'alpha' : [1,10,100],
    'n_estimators' : [10,50,100]
    }

index = 0

answers_grid = {
    'combination' : [],
    'train_accuracy' :[],
    'test_accuracy' : [],
    'colsample_bytree' :[],
    'learning_rate' :[],
    'max_depth' : [],
    'alpha' : [],
    'n_estimators' : []
}

# Looping through each combination of hyperparameters
for colsample_bytree in param_grid['colsample_bytree']:
    for learning_rate in param_grid['learning_rate']:
        for max_depth in param_grid['max_depth']:
            for alpha in param_grid['alpha']:
                for n_estimators in param_grid['n_estimators']:
                    

                    index = index + 1

                    # define and train the XGboost MOdel
                    model = xgb.XGBClassifier(objective='multi:softmax',
                                               num_class =4,
                                               colsample_bytree =colsample_bytree,
                                               learning_rate = learning_rate,
                                               max_depth = max_depth,
                                               alpha = alpha,
                                               n_estimators = n_estimators
                                               )

                    y = df_encoded['Approved_Flag']
                    x = df_encoded.drop(['Approved_Flag'] , axis=1)

                    label_encoder = LabelEncoder()
                    y_encoded = label_encoder.fit_transform(y)

                    x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)
                    model.fit(x_train , y_train)


                    # Predict on training and testing sets
                    y_pred_train = model.predict(x_train)
                    y_pred_test = model.predict(x_test)


                    # CAlculate train and test results
                    train_accuracy = accuracy_score(y_train, y_pred_train) 
                    test_accuracy =  accuracy_score(y_test, y_pred_test)


                    # Include into the list

                    answers_grid['combination'].append(index)
                    answers_grid['colsample_bytree'].append(colsample_bytree)
                    answers_grid['learning_rate'].append(learning_rate)
                    answers_grid['train_accuracy'].append(train_accuracy)
                    answers_grid['test_accuracy'].append(test_accuracy)
                    answers_grid['max_depth'].append(max_depth)
                    answers_grid['alpha'].append(alpha)
                    answers_grid['n_estimators'].append(n_estimators)

                    # Print results for this combination

                    print(f"Combination {index}")
                    print(f"colsample_bytree: {colsample_bytree}",
                          f'learning_rate: {learning_rate}',
                          f'max_depth : {max_depth}',
                          f'alpha : {alpha}',
                          f'n_estimators : {n_estimators}')
                    print(f'train_accuracy : {train_accuracy : .2f}')
                    print(f'test_accuracy : {test_accuracy : .2f}')
                    print("-"*30)





Combination 1
colsample_bytree: 0.1 learning_rate: 0.001 max_depth : 3 alpha : 1 n_estimators : 10
train_accuracy :  0.61
test_accuracy :  0.60
------------------------------
Combination 2
colsample_bytree: 0.1 learning_rate: 0.001 max_depth : 3 alpha : 1 n_estimators : 50
train_accuracy :  0.61
test_accuracy :  0.60
------------------------------
Combination 3
colsample_bytree: 0.1 learning_rate: 0.001 max_depth : 3 alpha : 1 n_estimators : 100
train_accuracy :  0.61
test_accuracy :  0.60
------------------------------
Combination 4
colsample_bytree: 0.1 learning_rate: 0.001 max_depth : 3 alpha : 10 n_estimators : 10
train_accuracy :  0.61
test_accuracy :  0.60
------------------------------
Combination 5
colsample_bytree: 0.1 learning_rate: 0.001 max_depth : 3 alpha : 10 n_estimators : 50
train_accuracy :  0.61
test_accuracy :  0.60
------------------------------
Combination 6
colsample_bytree: 0.1 learning_rate: 0.001 max_depth : 3 alpha : 10 n_estimators : 100
train_accuracy :  0.6

In [108]:
# a dataframe to see which combination of hyperparameters are given us the best result.
xg_hp_df = pd.DataFrame(answers_grid)
xg_hp_df.to_excel('xg_hp_df.xlsx')

## Hyper Parameter tuning using GridSearchCV

In [123]:
# from sklearn.model_selection import GridSearchCV
# x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

# # Define the XGBClassifier with the initial set of hyperparameters
# xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

# # Define the parameter grid for hyperparameter tuning

# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.1, 0.2],
# }

# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
# grid_search.fit(x_train, y_train)

# # Print the best hyperparameters
# print("Best Hyperparameters:", grid_search.best_params_)

# # Evaluate the model with the best hyperparameters on the test set
# best_model = grid_search.best_estimator_
# accuracy = best_model.score(x_test, y_test)
# print("Test Accuracy:", accuracy)


Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
Test Accuracy: 0.7818851777011767


In [109]:
# Predicting on the unseen data
a3 = pd.read_excel(r"E:\Data Science\PythonForDS\Credit Risk Modelling\Credit Modeling\Unseen_Dataset.xlsx")


In [110]:
# checking the columns in our training data 
cols_in_df = list(df.columns)
cols_in_df.pop(42)  # Removing the Approved_Flag

# Unseen DataFrame
df_unseen = a3[cols_in_df]

In [113]:
df_unseen['MARITALSTATUS'].unique()


array(['Married', 'Single'], dtype=object)

In [114]:
df_unseen['EDUCATION'].unique()

array(['12TH', 'GRADUATE', 'SSC', 'POST-GRADUATE', 'UNDER GRADUATE',
       'OTHERS'], dtype=object)

In [115]:
df_unseen['GENDER'].unique()

array(['M', 'F'], dtype=object)

In [116]:
df_unseen['last_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'AL', 'CC', 'others', 'HL'], dtype=object)

In [117]:
df_unseen['first_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'others', 'AL', 'HL', 'CC'], dtype=object)

In [118]:
# Encoding Education like in training data
df_unseen.loc[df['EDUCATION'] == 'SSC',['EDUCATION']] = 1
df_unseen.loc[df['EDUCATION'] == '12TH',['EDUCATION']] = 2
df_unseen.loc[df['EDUCATION'] == 'GRADUATE',['EDUCATION']] = 3
df_unseen.loc[df['EDUCATION'] == 'UNDER GRADUATE',['EDUCATION']] = 3
df_unseen.loc[df['EDUCATION'] == 'POST-GRADUATE',['EDUCATION']] = 4
df_unseen.loc[df['EDUCATION'] == 'OTHERS',['EDUCATION']] = 1
df_unseen.loc[df['EDUCATION'] == 'PROFESSIONAL',['EDUCATION']] = 3


df_unseen['EDUCATION'].value_counts()


EDUCATION
GRADUATE          32
12TH              28
SSC               23
UNDER GRADUATE     9
POST-GRADUATE      5
OTHERS             3
Name: count, dtype: int64

In [119]:
df_unseen['EDUCATION'] = df['EDUCATION'].astype(int)
df_unseen.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 42 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            100 non-null    float64
 1   pct_tl_closed_L6M          100 non-null    float64
 2   Tot_TL_closed_L12M         100 non-null    int64  
 3   pct_tl_closed_L12M         100 non-null    float64
 4   Tot_Missed_Pmnt            100 non-null    int64  
 5   CC_TL                      100 non-null    int64  
 6   Home_TL                    100 non-null    int64  
 7   PL_TL                      100 non-null    int64  
 8   Secured_TL                 100 non-null    int64  
 9   Unsecured_TL               100 non-null    int64  
 10  Other_TL                   100 non-null    int64  
 11  Age_Oldest_TL              100 non-null    int64  
 12  Age_Newest_TL              100 non-null    int64  
 13  time_since_recent_payment  100 non-null    int64  


In [120]:
# On Hot encoding the rest of categorical variables
df_encoded_unseen = pd.get_dummies(df_unseen,columns=['MARITALSTATUS','GENDER','last_prod_enq2','first_prod_enq2'],drop_first=True)
df_encoded_unseen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 50 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               100 non-null    float64
 1   pct_tl_closed_L6M             100 non-null    float64
 2   Tot_TL_closed_L12M            100 non-null    int64  
 3   pct_tl_closed_L12M            100 non-null    float64
 4   Tot_Missed_Pmnt               100 non-null    int64  
 5   CC_TL                         100 non-null    int64  
 6   Home_TL                       100 non-null    int64  
 7   PL_TL                         100 non-null    int64  
 8   Secured_TL                    100 non-null    int64  
 9   Unsecured_TL                  100 non-null    int64  
 10  Other_TL                      100 non-null    int64  
 11  Age_Oldest_TL                 100 non-null    int64  
 12  Age_Newest_TL                 100 non-null    int64  
 13  time_s

In [121]:
df_encoded_unseen.describe()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,enq_L3m,NETMONTHLYINCOME,Time_With_Curr_Empr,CC_Flag,PL_Flag,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,EDUCATION
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.19123,0.11621,0.71,0.16822,0.41,0.09,0.04,0.41,2.33,2.3,...,1.09,25764.3,108.33,0.08,0.23,0.15461,0.09333,0.16,0.03,2.25
std,0.294655,0.253891,1.13079,0.27274,0.792579,0.320826,0.242878,1.005992,4.811718,2.544553,...,1.83179,13707.266015,65.996029,0.27266,0.422953,0.335204,0.279607,0.368453,0.171447,0.903137
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,15875.0,60.75,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.5,...,0.0,24000.0,101.5,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,0.3435,0.091,1.0,0.25,0.25,0.0,0.0,0.0,2.0,3.0,...,1.0,30000.0,130.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,1.0,1.0,5.0,1.0,3.0,2.0,2.0,7.0,35.0,10.0,...,10.0,65000.0,306.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0


## Hyper Paramter Tuned Model predicting on unseen data

In [126]:
# XGboost

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

model = xgb.XGBClassifier(objective='multi:softmax',  num_class=4 ,
                                   colsample_bytree = 0.9,
                                   learning_rate = 1,
                                   max_depth = 3,
                                   alpha = 10,
                                   n_estimators = 100)


model.fit(x_train,y_train)


In [132]:
y_pred_unseen = model.predict(df_encoded_unseen)
a3['Target Variable'] = y_pred_unseen

a3.to_excel('E:\Data Science\PythonForDS\Credit Risk Modelling\Credit Modeling\Final_Predictions.xlsx')

In [137]:
pd.DataFrame(y_pred_unseen).value_counts()


0
1    70
3    13
2     9
0     8
Name: count, dtype: int64