In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,precision_recall_fscore_support
import warnings
import os

In [2]:
a1 = pd.read_excel(r"C:\Users\carna\Desktop\Credit Score Classifier\case_study1.xlsx")
a2 = pd.read_excel(r"C:\Users\carna\Desktop\Credit Score Classifier\case_study2.xlsx")

In [3]:
df1 = a1.copy()
df2 = a2.copy()

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PROSPECTID            51336 non-null  int64  
 1   Total_TL              51336 non-null  int64  
 2   Tot_Closed_TL         51336 non-null  int64  
 3   Tot_Active_TL         51336 non-null  int64  
 4   Total_TL_opened_L6M   51336 non-null  int64  
 5   Tot_TL_closed_L6M     51336 non-null  int64  
 6   pct_tl_open_L6M       51336 non-null  float64
 7   pct_tl_closed_L6M     51336 non-null  float64
 8   pct_active_tl         51336 non-null  float64
 9   pct_closed_tl         51336 non-null  float64
 10  Total_TL_opened_L12M  51336 non-null  int64  
 11  Tot_TL_closed_L12M    51336 non-null  int64  
 12  pct_tl_open_L12M      51336 non-null  float64
 13  pct_tl_closed_L12M    51336 non-null  float64
 14  Tot_Missed_Pmnt       51336 non-null  int64  
 15  Auto_TL            

In [5]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 62 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   PROSPECTID                    51336 non-null  int64  
 1   time_since_recent_payment     51336 non-null  int64  
 2   time_since_first_deliquency   51336 non-null  int64  
 3   time_since_recent_deliquency  51336 non-null  int64  
 4   num_times_delinquent          51336 non-null  int64  
 5   max_delinquency_level         51336 non-null  int64  
 6   max_recent_level_of_deliq     51336 non-null  int64  
 7   num_deliq_6mts                51336 non-null  int64  
 8   num_deliq_12mts               51336 non-null  int64  
 9   num_deliq_6_12mts             51336 non-null  int64  
 10  max_deliq_6mts                51336 non-null  int64  
 11  max_deliq_12mts               51336 non-null  int64  
 12  num_times_30p_dpd             51336 non-null  int64  
 13  n

In [6]:
df1 = df1.loc[df1['Age_Oldest_TL'] != -99999]

In [7]:
df1.shape

(51296, 26)

In [8]:
columns_to_be_removed = []
for i in df2.columns:
    if(df2.loc[df2[i] == -99999].shape[0] > 10000):
        columns_to_be_removed.append(i)

In [9]:
columns_to_be_removed

['time_since_first_deliquency',
 'time_since_recent_deliquency',
 'max_delinquency_level',
 'max_deliq_6mts',
 'max_deliq_12mts',
 'CC_utilization',
 'PL_utilization',
 'max_unsec_exposure_inPct']

In [10]:
df2 = df2.drop(columns_to_be_removed,axis=1)

In [11]:
df2.shape

(51336, 54)

In [12]:
for i in df2.columns:
    df2 = df2.loc[df2[i] != -99999]

In [13]:
df2.isna().sum()

PROSPECTID                    0
time_since_recent_payment     0
num_times_delinquent          0
max_recent_level_of_deliq     0
num_deliq_6mts                0
num_deliq_12mts               0
num_deliq_6_12mts             0
num_times_30p_dpd             0
num_times_60p_dpd             0
num_std                       0
num_std_6mts                  0
num_std_12mts                 0
num_sub                       0
num_sub_6mts                  0
num_sub_12mts                 0
num_dbt                       0
num_dbt_6mts                  0
num_dbt_12mts                 0
num_lss                       0
num_lss_6mts                  0
num_lss_12mts                 0
recent_level_of_deliq         0
tot_enq                       0
CC_enq                        0
CC_enq_L6m                    0
CC_enq_L12m                   0
PL_enq                        0
PL_enq_L6m                    0
PL_enq_L12m                   0
time_since_recent_enq         0
enq_L12m                      0
enq_L6m 

In [14]:
df1.isna().sum()

PROSPECTID              0
Total_TL                0
Tot_Closed_TL           0
Tot_Active_TL           0
Total_TL_opened_L6M     0
Tot_TL_closed_L6M       0
pct_tl_open_L6M         0
pct_tl_closed_L6M       0
pct_active_tl           0
pct_closed_tl           0
Total_TL_opened_L12M    0
Tot_TL_closed_L12M      0
pct_tl_open_L12M        0
pct_tl_closed_L12M      0
Tot_Missed_Pmnt         0
Auto_TL                 0
CC_TL                   0
Consumer_TL             0
Gold_TL                 0
Home_TL                 0
PL_TL                   0
Secured_TL              0
Unsecured_TL            0
Other_TL                0
Age_Oldest_TL           0
Age_Newest_TL           0
dtype: int64

In [15]:
for i in list(df1.columns):
    if i in list(df2.columns):
        print(i)

PROSPECTID


In [16]:
#Merging the two dataframes

df = pd.merge(df1, df2, how = 'inner', left_on = ['PROSPECTID'], right_on =['PROSPECTID'])

In [17]:
df.shape

(42064, 79)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PROSPECTID                  42064 non-null  int64  
 1   Total_TL                    42064 non-null  int64  
 2   Tot_Closed_TL               42064 non-null  int64  
 3   Tot_Active_TL               42064 non-null  int64  
 4   Total_TL_opened_L6M         42064 non-null  int64  
 5   Tot_TL_closed_L6M           42064 non-null  int64  
 6   pct_tl_open_L6M             42064 non-null  float64
 7   pct_tl_closed_L6M           42064 non-null  float64
 8   pct_active_tl               42064 non-null  float64
 9   pct_closed_tl               42064 non-null  float64
 10  Total_TL_opened_L12M        42064 non-null  int64  
 11  Tot_TL_closed_L12M          42064 non-null  int64  
 12  pct_tl_open_L12M            42064 non-null  float64
 13  pct_tl_closed_L12M          420

In [19]:
df.isna().sum().sum()

0

In [20]:
#checking categorical columns

for i in df.columns:
    if df[i].dtypes == 'object':
        print(i)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2
Approved_Flag


In [21]:
df['MARITALSTATUS'].value_counts()

MARITALSTATUS
Married    30886
Single     11178
Name: count, dtype: int64

In [22]:
df['EDUCATION'].value_counts()

EDUCATION
GRADUATE          14140
12TH              11703
SSC                7241
UNDER GRADUATE     4572
OTHERS             2291
POST-GRADUATE      1898
PROFESSIONAL        219
Name: count, dtype: int64

In [23]:
for i in ['MARITALSTATUS' ,'EDUCATION' ,'GENDER' ,'last_prod_enq2' ,'first_prod_enq2']:
    chi2, pval , _, _ = chi2_contingency(pd.crosstab(df[i] , df['Approved_Flag']))
    print(i,'---',pval)

MARITALSTATUS --- 3.578180861038862e-233
EDUCATION --- 2.6942265249737532e-30
GENDER --- 1.907936100186563e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 7.84997610555419e-287


In [24]:
#numerical columns

numeric_columns = []
for i in df.columns:
    if df[i].dtypes != 'object' and i not in ['PROSPECTID','Approved_Flag']:
        numeric_columns.append(i)

In [25]:
len(numeric_columns)

72

In [26]:
vif_data = df[numeric_columns]
total_columns = vif_data.shape[1]
columns_to_be_kept = []
column_index = 0


for i in range (0,total_columns):
    vif_value = variance_inflation_factor(vif_data,column_index)
    print(column_index,'----',vif_value)

    if vif_value <= 6 :
        columns_to_be_kept.append(numeric_columns[i])
        column_index = column_index+1

    else:
        vif_data = vif_data.drop([numeric_columns[i]],axis=1)

  vif = 1. / (1. - r_squared_i)


0 ---- inf


  vif = 1. / (1. - r_squared_i)


0 ---- inf
0 ---- 11.320180023967996
0 ---- 8.363698035000336
0 ---- 6.520647877790928
0 ---- 5.149501618212625
1 ---- 2.611111040579735


  vif = 1. / (1. - r_squared_i)


2 ---- inf
2 ---- 1788.7926256209232
2 ---- 8.601028256477228
2 ---- 3.8328007921530785
3 ---- 6.099653381646739
3 ---- 5.581352009642762
4 ---- 1.985584353098778


  vif = 1. / (1. - r_squared_i)


5 ---- inf
5 ---- 4.809538302819343
6 ---- 23.270628983464636
6 ---- 30.595522588100053
6 ---- 4.3843464059655854
7 ---- 3.064658415523423
8 ---- 2.898639771299253
9 ---- 4.377876915347324
10 ---- 2.207853583695844
11 ---- 4.916914200506864
12 ---- 5.214702030064725
13 ---- 3.3861625024231476
14 ---- 7.840583309478997
14 ---- 5.255034641721438


  vif = 1. / (1. - r_squared_i)


15 ---- inf
15 ---- 7.380634506427232
15 ---- 1.4210050015175733
16 ---- 8.083255010190316
16 ---- 1.6241227524040112
17 ---- 7.257811920140003
17 ---- 15.59624383268298
17 ---- 1.825857047132431
18 ---- 1.5080839450032664
19 ---- 2.172088834824577
20 ---- 2.62339755352723
21 ---- 2.2959970812106176
22 ---- 7.360578319196439
22 ---- 2.1602387773102554
23 ---- 2.8686288267891458
24 ---- 6.458218003637277
24 ---- 2.8474118865638265
25 ---- 4.753198156284083
26 ---- 16.22735475594825
26 ---- 6.424377256363877
26 ---- 8.887080381808687
26 ---- 2.3804746142952653
27 ---- 8.609513476514548
27 ---- 13.06755093547673
27 ---- 3.500040056654654
28 ---- 1.9087955874813773
29 ---- 17.006562234161628
29 ---- 10.730485153719197
29 ---- 2.3538497522950275
30 ---- 22.104855915136433
30 ---- 2.7971639638512906
31 ---- 3.4241712032176985
32 ---- 10.175021454450935
32 ---- 6.408710354561301
32 ---- 1.001151196262561
33 ---- 3.069197305397274
34 ---- 2.8091261600643715
35 ---- 20.249538381980678
35 ---- 1

In [27]:
len(columns_to_be_kept)

39

In [28]:
len(columns_to_be_removed)

8

In [29]:
from scipy.stats import f_oneway

In [30]:
columns_to_be_kept_numerical = []

for i in columns_to_be_kept:
    a = list(df[i])
    b = list(df['Approved_Flag'])

    group_P1 = [value for value , group in zip(a,b) if group == 'P1']
    group_P2 = [value for value , group in zip(a,b) if group == 'P2']
    group_P3 = [value for value , group in zip(a,b) if group == 'P3']
    group_P4 = [value for value , group in zip(a,b) if group == 'P4']

    f_statistics, p_value = f_oneway(group_P1,group_P2,group_P3,group_P4)

    if p_value <= 0.05 :
        columns_to_be_kept_numerical.append(i)

In [31]:
len(columns_to_be_kept_numerical)

37

In [32]:
features = columns_to_be_kept_numerical + ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df = df[features + ['Approved_Flag']]

In [33]:
df['MARITALSTATUS'].unique()

array(['Married', 'Single'], dtype=object)

In [34]:
df['EDUCATION'].unique()

array(['12TH', 'GRADUATE', 'SSC', 'POST-GRADUATE', 'UNDER GRADUATE',
       'OTHERS', 'PROFESSIONAL'], dtype=object)

In [35]:
df['GENDER'].unique()

array(['M', 'F'], dtype=object)

In [36]:
df['last_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'AL', 'CC', 'others', 'HL'], dtype=object)

In [37]:
df['first_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'others', 'AL', 'HL', 'CC'], dtype=object)

In [38]:
df.loc[df['EDUCATION'] == 'SSC',['EDUCATION']]  = 1
df.loc[df['EDUCATION'] == '12TH',['EDUCATION']]  = 2
df.loc[df['EDUCATION'] == 'GRADUATE',['EDUCATION']]  = 3
df.loc[df['EDUCATION'] == 'UNDER GRADUATE',['EDUCATION']]  = 3
df.loc[df['EDUCATION'] == 'POST-GRADUATE',['EDUCATION']]  = 4
df.loc[df['EDUCATION'] == 'OTHERS',['EDUCATION']]  = 1
df.loc[df['EDUCATION'] == 'PROFESSIONAL',['EDUCATION']]  = 3

In [39]:
df['EDUCATION'].value_counts()

EDUCATION
3    18931
2    11703
1     9532
4     1898
Name: count, dtype: int64

In [40]:
df['EDUCATION'] = df['EDUCATION'].astype(int)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            42064 non-null  float64
 1   pct_tl_closed_L6M          42064 non-null  float64
 2   Tot_TL_closed_L12M         42064 non-null  int64  
 3   pct_tl_closed_L12M         42064 non-null  float64
 4   Tot_Missed_Pmnt            42064 non-null  int64  
 5   CC_TL                      42064 non-null  int64  
 6   Home_TL                    42064 non-null  int64  
 7   PL_TL                      42064 non-null  int64  
 8   Secured_TL                 42064 non-null  int64  
 9   Unsecured_TL               42064 non-null  int64  
 10  Other_TL                   42064 non-null  int64  
 11  Age_Oldest_TL              42064 non-null  int64  
 12  Age_Newest_TL              42064 non-null  int64  
 13  time_since_recent_payment  42064 non-null  int

In [42]:
df_encoded = pd.get_dummies(df, columns=['MARITALSTATUS','GENDER', 'last_prod_enq2' ,'first_prod_enq2'] , dtype='uint8')

In [43]:
df_encoded.head()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,0,0,1,0,0,0,0,0,1,0
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,1,0,0,0,0,0,0,0,0,1
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,1,0,0,0,0,0,0,0,1,0


In [44]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               42064 non-null  float64
 1   pct_tl_closed_L6M             42064 non-null  float64
 2   Tot_TL_closed_L12M            42064 non-null  int64  
 3   pct_tl_closed_L12M            42064 non-null  float64
 4   Tot_Missed_Pmnt               42064 non-null  int64  
 5   CC_TL                         42064 non-null  int64  
 6   Home_TL                       42064 non-null  int64  
 7   PL_TL                         42064 non-null  int64  
 8   Secured_TL                    42064 non-null  int64  
 9   Unsecured_TL                  42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  Age_Oldest_TL                 42064 non-null  int64  
 12  Age_Newest_TL                 42064 non-null  int64  
 13  t

In [45]:
y = df_encoded['Approved_Flag']
x = df_encoded.drop(['Approved_Flag'],axis=1)

In [46]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [47]:
rf = RandomForestClassifier(n_estimators = 200 , random_state = 42)

In [48]:
rf.fit(X_train,y_train)

In [49]:
y_pred = rf.predict(X_test)

In [50]:
accuracy = accuracy_score(y_test,y_pred)
print('Accuracy :' , accuracy)

Accuracy : 0.7636990372043266


In [51]:
precision,recall,f1_score,_ = precision_recall_fscore_support(y_test,y_pred)

In [52]:
for i,v in enumerate(['p1','p2','p3','p4']):
    print(f"Class {v} :")
    print("Precision :",precision[i])
    print("Recall :",recall[i])
    print("f1_score :",f1_score[i])
    print('\n')

Class p1 :
Precision : 0.8370457209847597
Recall : 0.7041420118343196
f1_score : 0.7648634172469202


Class p2 :
Precision : 0.7957519116397621
Recall : 0.9282457879088206
f1_score : 0.856907593778591


Class p3 :
Precision : 0.4423380726698262
Recall : 0.21132075471698114
f1_score : 0.28600612870275793


Class p4 :
Precision : 0.7178502879078695
Recall : 0.7269193391642371
f1_score : 0.7223563495895703




In [53]:
import xgboost as xg
from sklearn.preprocessing import LabelEncoder

In [54]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [55]:
xgb_classifier = xg.XGBClassifier(objective = 'multi:softmax' , num_class = 4)

In [56]:
X_train,X_test,y_train,y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

In [57]:
xgb_classifier.fit(X_train,y_train)

In [58]:
y_pred = xgb_classifier.predict(X_test)

In [59]:
accuracy = accuracy_score(y_test,y_pred)
print('Accuracy :' , accuracy)

Accuracy : 0.7783192677998336


In [60]:
precision,recall,f1_score,_ = precision_recall_fscore_support(y_test,y_pred)

In [61]:
for i,v in enumerate(['p1','p2','p3','p4']):
    print(f"Class {v} :")
    print("Precision :",precision[i])
    print("Recall :",recall[i])
    print("f1_score :",f1_score[i])
    print('\n')

Class p1 :
Precision : 0.823906083244397
Recall : 0.7613412228796844
f1_score : 0.7913890312660175


Class p2 :
Precision : 0.8255418233924413
Recall : 0.913577799801784
f1_score : 0.8673315769665035


Class p3 :
Precision : 0.4756380510440835
Recall : 0.30943396226415093
f1_score : 0.37494284407864653


Class p4 :
Precision : 0.7342386032977691
Recall : 0.7356656948493683
f1_score : 0.7349514563106796




In [62]:
from sklearn.tree import DecisionTreeClassifier

In [63]:
dt = DecisionTreeClassifier(max_depth = 20 , min_samples_split = 10)
dt.fit(X_train,y_train)

In [64]:
y_pred = dt.predict(X_test)

In [65]:
accuracy = accuracy_score(y_test,y_pred)
print('Accuracy :' , accuracy)

Accuracy : 0.7084274337335077


In [66]:
precision,recall,f1_score,_ = precision_recall_fscore_support(y_test,y_pred)

In [67]:
for i,v in enumerate(['p1','p2','p3','p4']):
    print(f"Class {v} :")
    print("Precision :",precision[i])
    print("Recall :",recall[i])
    print("f1_score :",f1_score[i])
    print('\n')

Class p1 :
Precision : 0.7159647404505387
Recall : 0.7209072978303748
f1_score : 0.7184275184275184


Class p2 :
Precision : 0.808531359563693
Recall : 0.822794846382557
f1_score : 0.8156007466352294


Class p3 :
Precision : 0.3417322834645669
Recall : 0.32754716981132076
f1_score : 0.33448940269749516


Class p4 :
Precision : 0.6518218623481782
Recall : 0.6258503401360545
f1_score : 0.6385721368368865




Used Scaling to Scale the datas

In [68]:
from sklearn.preprocessing import StandardScaler

In [69]:
columns_to_be_scaled = ['Age_Oldest_TL','Age_Newest_TL','time_since_recent_payment',
'max_recent_level_of_deliq','recent_level_of_deliq',
'time_since_recent_enq','NETMONTHLYINCOME','Time_With_Curr_Empr']

In [70]:
for i in columns_to_be_scaled:
    column_data = df_encoded[i].values.reshape(-1, 1)
    scaler = StandardScaler()
    scaled_column = scaler.fit_transform(column_data)
    df_encoded[i] = scaled_column

In [71]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [72]:
xgb_classifier = xg.XGBClassifier(objective = 'multi:softmax' , num_class = 4)

In [73]:
X_train,X_test,y_train,y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

In [74]:
xgb_classifier.fit(X_train,y_train)

In [75]:
y_pred = xgb_classifier.predict(X_test)

In [76]:
accuracy = accuracy_score(y_test,y_pred)
print('Accuracy :' , accuracy)
print('\n')
for i,v in enumerate(['p1','p2','p3','p4']):
    print(f"Class {v} :")
    print("Precision :",precision[i])
    print("Recall :",recall[i])
    print("f1_score :",f1_score[i])
    print('\n')

Accuracy : 0.7783192677998336


Class p1 :
Precision : 0.7159647404505387
Recall : 0.7209072978303748
f1_score : 0.7184275184275184


Class p2 :
Precision : 0.808531359563693
Recall : 0.822794846382557
f1_score : 0.8156007466352294


Class p3 :
Precision : 0.3417322834645669
Recall : 0.32754716981132076
f1_score : 0.33448940269749516


Class p4 :
Precision : 0.6518218623481782
Recall : 0.6258503401360545
f1_score : 0.6385721368368865




Hyper Parameter tuning on XgBoost as it gives the best accuracy among the three

In [78]:
from sklearn.model_selection import GridSearchCV

In [88]:
param_grid = {
    'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],
    'learning_rate'   : [0.001, 0.01, 0.1, 1],
    'max_depth'       : [3, 5, 8, 10],
    'alpha'           : [1, 10, 100],
    'n_estimators'    : [10,50,100]
}

In [89]:
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

In [90]:
grid_search.fit(X_train,y_train)

In [91]:
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'alpha': 1, 'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [92]:
best_model = grid_search.best_estimator_

In [94]:
accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.7757042672055152


In [96]:
a3 = pd.read_excel(r"C:\Users\carna\Desktop\Credit Score Classifier\Unseen_Dataset.xlsx")

In [98]:
cols_in_df = list(df.columns)
cols_in_df.pop(42)

'Approved_Flag'

In [120]:
df_unseen = a3[cols_in_df]

In [121]:
df_unseen['MARITALSTATUS'].unique()

array(['Married', 'Single'], dtype=object)

In [122]:
df_unseen['EDUCATION'].unique()

array(['12TH', 'GRADUATE', 'SSC', 'POST-GRADUATE', 'UNDER GRADUATE',
       'OTHERS'], dtype=object)

In [123]:
df_unseen['GENDER'].unique()

array(['M', 'F'], dtype=object)

In [124]:
df_unseen['last_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'AL', 'CC', 'others', 'HL'], dtype=object)

In [125]:
df_unseen['first_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'others', 'AL', 'HL', 'CC'], dtype=object)

In [135]:
df_unseen.loc[df_unseen['EDUCATION'] == 'SSC',['EDUCATION']]  = 1
df_unseen.loc[df_unseen['EDUCATION'] == '12TH',['EDUCATION']]  = 2
df_unseen.loc[df_unseen['EDUCATION'] == 'GRADUATE',['EDUCATION']]  = 3
df_unseen.loc[df_unseen['EDUCATION'] == 'UNDER GRADUATE',['EDUCATION']]  = 3
df_unseen.loc[df_unseen['EDUCATION'] == 'POST-GRADUATE',['EDUCATION']]  = 4
df_unseen.loc[df_unseen['EDUCATION'] == 'OTHERS',['EDUCATION']]  = 1
df_unseen.loc[df_unseen['EDUCATION'] == 'PROFESSIONAL',['EDUCATION']]  = 3

In [136]:
df_unseen['EDUCATION'].value_counts()

EDUCATION
3    41
2    28
1    26
4     5
Name: count, dtype: int64

In [137]:
df_unseen['EDUCATION'] = df_unseen['EDUCATION'].astype(int)

In [138]:
df_encoded_unseen = pd.get_dummies(df_unseen, columns=['MARITALSTATUS','GENDER', 'last_prod_enq2' ,'first_prod_enq2'] , dtype='uint8')

In [139]:
df_encoded_unseen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               100 non-null    float64
 1   pct_tl_closed_L6M             100 non-null    float64
 2   Tot_TL_closed_L12M            100 non-null    int64  
 3   pct_tl_closed_L12M            100 non-null    float64
 4   Tot_Missed_Pmnt               100 non-null    int64  
 5   CC_TL                         100 non-null    int64  
 6   Home_TL                       100 non-null    int64  
 7   PL_TL                         100 non-null    int64  
 8   Secured_TL                    100 non-null    int64  
 9   Unsecured_TL                  100 non-null    int64  
 10  Other_TL                      100 non-null    int64  
 11  Age_Oldest_TL                 100 non-null    int64  
 12  Age_Newest_TL                 100 non-null    int64  
 13  time_s

In [146]:
model = xg.XGBClassifier(objectice = 'multi:softmax',
                       alpha= 1, colsample_bytree= 0.9,
                        learning_rate= 0.1, max_depth= 5,
                        n_estimators= 100)

In [147]:
model.fit(X_train,y_train)

Parameters: { "objectice" } are not used.



In [148]:
y_pred_unseen = model.predict(df_encoded_unseen)

In [149]:
y_pred_unseen

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 3, 1, 0, 2, 1, 1, 0, 3, 1, 0, 1,
       1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 1, 3, 1, 3, 1, 1, 1, 3, 1, 2,
       3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 0,
       1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1], dtype=int64)

In [150]:
a3['Target_Variables'] = y_pred_unseen

In [156]:
a3.to_excel("C:\\Users\\carna\\Desktop\\Credit Score Classifier\\Unseen_Dataset.xlsx",index=False)