# Credit Risk Modelling

In [1]:
from itertools import product

import pandas as pd
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
from sklearn.metrics import accuracy_score,precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor

### 1. Data Preparation

Reading Data

In [3]:
# Raw strings keep the backslash path separator literal: in a normal string
# '\c' is an invalid escape sequence, which newer Python versions warn about.
df1 = pd.read_excel(r'data\case_study1.xlsx')
df2 = pd.read_excel(r'data\case_study2.xlsx')

In [4]:
# (rows, columns) of each raw frame
print(df1.shape,df2.shape)

(51336, 26) (51336, 62)


1.1 Cleaning Data

In [5]:
def cols_with_missing_vals(data, missing_value=-99999):
    """Report the non-object columns that contain the missing-value sentinel.

    Each affected column is printed as ``<column> : <count>``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan.
    missing_value : scalar, default -99999
        Value that marks a missing entry.

    Returns
    -------
    list
        Names of the affected columns, in frame order.
    """
    cols = []
    for col in data.columns:
        # object-dtype columns are not scanned for the sentinel
        if data[col].dtype == 'object':
            continue
        # count matches directly instead of materialising a filtered frame
        missing_vals = int((data[col] == missing_value).sum())
        if missing_vals:
            print(col,':',missing_vals)
            cols.append(col)
    return cols

def drop_missing_from_cols(data, cols, missing_value=-99999):
    """Drop every row that holds the missing-value sentinel in any of ``cols``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    cols : iterable of column labels
        Columns to inspect. When empty, ``data`` is returned unchanged.
    missing_value : scalar, default -99999
        Value that marks a missing entry.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, keeping their original index labels.
    """
    cols = list(cols)
    if not cols:
        return data
    # a single boolean mask replaces one full-frame filter per column
    keep = (data[cols] != missing_value).all(axis=1)
    return data[keep]

def cols_to_be_removed(data, missing_value=-99999, max_missing_frac=0.2):
    """List the non-object columns whose share of missing values is too high.

    A column is flagged when strictly more than ``max_missing_frac`` of its
    rows equal ``missing_value``. The limit is relative to the number of rows
    in ``data`` rather than a fixed row count, so it stays at the stated share
    for frames of any size.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan.
    missing_value : scalar, default -99999
        Value that marks a missing entry.
    max_missing_frac : float, default 0.2
        Largest tolerated fraction of missing rows per column.

    Returns
    -------
    list
        Names of the flagged columns, in frame order. The list is also
        printed when it is not empty.
    """
    limit = max_missing_frac * len(data)
    cols = [
        col for col in data.columns
        if data[col].dtype != 'object'
        and (data[col] == missing_value).sum() > limit
    ]
    if cols: print('Columns to be removed: ',cols)
    return cols

In [6]:
# Columns of df1 that contain the -99999 missing-value sentinel
# (the helper prints "<column> : <count>" for each one)
df1_missing_cols = cols_with_missing_vals(df1)

Age_Oldest_TL : 40
Age_Newest_TL : 40


In [7]:
# Dropping the rows that hold the -99999 sentinel in any flagged column
print('df1 shape: ',df1.shape)
df1 = drop_missing_from_cols(df1,df1_missing_cols)
print('df1 shape after dropping rows:',df1.shape)
# Re-scan: an empty list means no sentinel values are left in df1
cols_with_missing_vals(df1)

df1 shape:  (51336, 26)
df1 shape after dropping rows: (51296, 26)


[]

In [8]:
# Columns of df2 that contain the -99999 missing-value sentinel
# (trailing ';' hides the returned list; only the per-column counts are printed)
cols_with_missing_vals(df2);

time_since_recent_payment : 4291
time_since_first_deliquency : 35949
time_since_recent_deliquency : 35949
max_delinquency_level : 35949
max_deliq_6mts : 12890
max_deliq_12mts : 10832
tot_enq : 6321
CC_enq : 6321
CC_enq_L6m : 6321
CC_enq_L12m : 6321
PL_enq : 6321
PL_enq_L6m : 6321
PL_enq_L12m : 6321
time_since_recent_enq : 6321
enq_L12m : 6321
enq_L6m : 6321
enq_L3m : 6321
pct_currentBal_all_TL : 72
CC_utilization : 47636
PL_utilization : 44435
max_unsec_exposure_inPct : 23178


In [9]:
# Columns of df2 where the -99999 sentinel is too frequent to keep
# (the cut-off is defined inside cols_to_be_removed)
df2_cols_to_be_removed = cols_to_be_removed(df2)

Columns to be removed:  ['time_since_first_deliquency', 'time_since_recent_deliquency', 'max_delinquency_level', 'max_deliq_6mts', 'max_deliq_12mts', 'CC_utilization', 'PL_utilization', 'max_unsec_exposure_inPct']


In [10]:
# Remove the flagged columns and report the shape before and after
print('df2 shape: ',df2.shape)
df2 = df2.drop(columns=df2_cols_to_be_removed)
print('df2.shape after dropping columns:',df2.shape)

df2 shape:  (51336, 62)
df2.shape after dropping columns: (51336, 54)


In [11]:
# For the remaining columns, drop the rows that still hold the -99999 sentinel
df2_missing_cols = cols_with_missing_vals(df2)
df2 = drop_missing_from_cols(df2,df2_missing_cols)
print(df2.shape)

time_since_recent_payment : 4291
tot_enq : 6321
CC_enq : 6321
CC_enq_L6m : 6321
CC_enq_L12m : 6321
PL_enq : 6321
PL_enq_L6m : 6321
PL_enq_L12m : 6321
time_since_recent_enq : 6321
enq_L12m : 6321
enq_L6m : 6321
enq_L3m : 6321
pct_currentBal_all_TL : 72
(42066, 54)


In [12]:
# Re-scan: an empty list means no sentinel values are left in df2
cols_with_missing_vals(df2)

[]

In [13]:
# Total number of NaN cells in (df2, df1)
df2.isna().sum().sum(),df1.isna().sum().sum()

(0, 0)

1.2 Merging Data Frames

In [14]:
# List the column names present in both frames (candidate join keys)
shared_columns = [name for name in df1.columns if name in df2.columns]
for name in shared_columns:
    print(name)

PROSPECTID


In [15]:
print(f'df1 shape: {df1.shape} \ndf2 shape: {df2.shape}')
# Inner join on the shared id, then discard the id because it is not a feature
df = df1.merge(df2, how='inner', on='PROSPECTID').drop(columns=['PROSPECTID'])
print('df shape: ',df.shape)

df1 shape: (51296, 26) 
df2 shape: (42066, 54)
df shape:  (42064, 78)


### 2. Feature Selection

2.1 Chi-squared test for Categorical features

In [16]:
# Object-dtype columns, excluding the target, are treated as categorical features
categorical_columns = [
    name for name in df.columns
    if name != 'Approved_Flag' and df[name].dtype == 'object'
]
print(categorical_columns)

['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']


In [17]:
# Chi-squared test of independence between each categorical feature and the target
for feature in categorical_columns:
    contingency = pd.crosstab(df[feature], df['Approved_Flag'])
    p_val = chi2_contingency(contingency)[1]  # element 1 of the result is the p-value
    print(feature,'---',p_val)

MARITALSTATUS --- 3.578180861038862e-233
EDUCATION --- 2.6942265249737532e-30
GENDER --- 1.907936100186563e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 7.84997610555419e-287


In [18]:
# Every categorical feature has a p-value < 0.05, i.e. each one is
# significantly associated with Approved_Flag, so we keep all of them

2.2 Checking for Multicollinearity (Variance Inflation Factor) of Numerical features

In [19]:
# Every column whose dtype is not object counts as numeric
numeric_columns = [name for name, dtype in df.dtypes.items() if dtype != 'object']
print(len(numeric_columns))

72


In [20]:
# VIF threshold --> 6
VIF_THRESHOLD = 6

# Work on an explicit copy: df[numeric_columns] is a slice of df, and dropping
# columns from that slice in place is what raises SettingWithCopyWarning.
vif_data = df[numeric_columns].copy()
total_cols = len(numeric_columns)
columns_to_be_kept = [] # numerical columns after VIF
# Position, inside vif_data, of the column currently being tested. It only
# advances when a column is kept; after a drop the next candidate slides into
# the same position.
column_index = 0

# NOTE(review): no intercept column is added to vif_data before computing the
# VIF -- confirm against the statsmodels documentation whether one is expected.
for candidate in numeric_columns:
    vif_value = variance_inflation_factor(vif_data, column_index)
    if vif_value <= VIF_THRESHOLD:
        columns_to_be_kept.append(candidate)
        column_index += 1
    else:
        # inf / NaN scores also land here because they fail the comparison
        vif_data = vif_data.drop(columns=[candidate])
        df.drop(columns=[candidate], inplace=True)

  vif = 1. / (1. - r_squared_i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vif_data.drop([numeric_columns[i]],axis=1,inplace=True)
  vif = 1. / (1. - r_squared_i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vif_data.drop([numeric_columns[i]],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vif_data.drop([numeric_columns[i]],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/s

In [21]:
# Column counts after the VIF filter: numeric columns kept, and all columns left in df
print('Numerical Columns: ',vif_data.shape[1])
print('Total Columns: ',df.shape[1])

Numerical Columns:  39
Total Columns:  45


2.3 ANOVA Test for Numerical Features

In [22]:
# One-way ANOVA per surviving numeric feature: the feature is kept only when
# its values differ across the four Approved_Flag classes at the 5% level.
approval_classes = ('P1', 'P2', 'P3', 'P4')
class_masks = [df['Approved_Flag'] == label for label in approval_classes]

columns_to_be_kept_numerical = []
for col in columns_to_be_kept:
    samples = [df.loc[mask, col].tolist() for mask in class_masks]
    f_statistic, p_value = f_oneway(*samples)

    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(col)
        continue
    # not significant (or p-value undefined): remove the feature from df
    df.drop([col],axis=1,inplace=True)

In [23]:
# Column counts after the ANOVA filter: numeric columns kept, and all columns left in df
print('Numerical Columns: ',len(columns_to_be_kept_numerical))
print('Total Columns: ',df.shape[1])

Numerical Columns:  37
Total Columns:  43


### 3. Encoding and Scaling

In [24]:
# Categorical features that still need encoding
categorical_columns

['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']

In [25]:
# Distinct categories of one of the features to be one-hot encoded
df['last_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'AL', 'CC', 'others', 'HL'], dtype=object)

3.1 Label Encoding for 'EDUCATION' feature
  
  * SSC                :    1
  * 12TH               :    2
  * GRADUATE           :    3
  * UNDER GRADUATE     :    3
  * POST-GRADUATE      :    4
  * OTHERS             :    1 (may be uneducated or lack documents; this should be verified with the business team)
  * PROFESSIONAL       :    3

 Reverse ordering can also be done as long as order is maintained

In [26]:
# Ordinal codes for EDUCATION; levels that share a code are grouped together.
education = {
    'SSC': 1, 'OTHERS': 1,
    '12TH': 2,
    'GRADUATE': 3, 'UNDER GRADUATE': 3, 'PROFESSIONAL': 3,
    'POST-GRADUATE': 4,
}
# A level missing from the mapping becomes NaN and makes the int cast fail.
education_codes = df['EDUCATION'].map(education)
df['EDUCATION'] = education_codes.astype(int)

In [27]:
# Number of rows per EDUCATION code after the mapping
df['EDUCATION'].value_counts()

EDUCATION
3    18931
2    11703
1     9532
4     1898
Name: count, dtype: int64

3.2 One Hot Encoding for other categorical columns

In [28]:
# EDUCATION is already ordinal-encoded; the remaining categoricals get one column per category
one_hot_columns = ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df_encoded = pd.get_dummies(df, columns=one_hot_columns)
df_encoded.shape

(42064, 55)

In [29]:
# Summary statistics of the encoded frame
df_encoded.describe()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,enq_L3m,EDUCATION,NETMONTHLYINCOME,Time_With_Curr_Empr,CC_Flag,PL_Flag,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag
count,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,...,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0,42064.0
mean,0.179032,0.097783,0.825504,0.160365,0.525746,0.145921,0.076241,0.328,2.921334,2.341646,...,1.230458,2.313689,26929.9,110.345783,0.102962,0.193063,0.195497,0.064186,0.252235,0.05658
std,0.278043,0.210957,1.537208,0.258831,1.106442,0.549314,0.358582,0.916368,6.379764,3.405397,...,2.069461,0.87107,20843.0,75.629967,0.303913,0.394707,0.367414,0.225989,0.4343,0.231042
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,18000.0,61.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,2.0,24000.0,92.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.333,0.1,1.0,0.25,1.0,0.0,0.0,0.0,3.0,3.0,...,2.0,3.0,31000.0,131.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,33.0,1.0,34.0,27.0,10.0,29.0,235.0,55.0,...,42.0,4.0,2500000.0,1020.0,1.0,1.0,1.0,1.0,1.0,1.0


3.3 Standard Scaling

In [30]:
columns_to_be_scaled = ['Age_Oldest_TL', 'Age_Newest_TL', 'time_since_recent_payment',
                        'max_recent_level_of_deliq', 'recent_level_of_deliq',
                        'time_since_recent_enq', 'NETMONTHLYINCOME', 'Time_With_Curr_Empr']

# One fitted scaler per column, kept in `scalers` so the same transform can be
# applied to other data later.
# NOTE(review): the scalers are fitted on df_encoded before it is split into
# train/test -- confirm whether fitting on the training rows only is preferred.
scalers = {}
for col in columns_to_be_scaled:
    column_values = df_encoded[col].values.reshape(-1,1)
    scalers[col] = StandardScaler().fit(column_values)
    df_encoded[col] = scalers[col].transform(column_values)

### 4. Modelling

Data Splitting

In [31]:
# Features are every column except the target
x = df_encoded.drop(columns=['Approved_Flag'])
y = df_encoded['Approved_Flag']

# 80/20 split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(len(x_train), len(x_test))

33651 8413


In [32]:
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [37]:
# Candidate classifiers to compare; every estimator that accepts a seed gets the same one
RANDOM_STATE = 42
models = {
    'DecisionTree': DecisionTreeClassifier(max_depth=20, min_samples_split=10, random_state=RANDOM_STATE),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(n_estimators=200, random_state=RANDOM_STATE),
}

In [38]:
# Fit every candidate on the training split and report accuracy plus
# per-class precision / recall / F1 on the test split.
class_labels = ['P1', 'P2', 'P3', 'P4']
for model_name, model in models.items():
    y_pred = model.fit(x_train, y_train).predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        y_test, y_pred, labels=class_labels
    )

    print(f'Model: {model_name}')
    print(f'Accuracy: {accuracy}')
    for label, class_precision, class_recall, class_f1 in zip(class_labels, precision, recall, f1_score):
        print(f'Class {label}')
        print(f'Precision: {class_precision}')
        print(f'Recall: {class_recall}')
        print(f'F1 Score: {class_f1}')
    print()

Model: DecisionTree
Accuracy: 0.7125876619517414
Class P1
Precision: 0.7254901960784313
Recall: 0.7297830374753451
F1 Score: 0.727630285152409
Class P2
Precision: 0.8125609518236786
Recall: 0.8257680872150645
F1 Score: 0.8191112858828156
Class P3
Precision: 0.3494837172359015
Recall: 0.3320754716981132
F1 Score: 0.34055727554179566
Class P4
Precision: 0.6444885799404171
Recall: 0.630709426627794
F1 Score: 0.637524557956778

Model: RandomForest
Accuracy: 0.7665517651254011
Class P1
Precision: 0.8433014354066986
Recall: 0.6952662721893491
F1 Score: 0.7621621621621621
Class P2
Precision: 0.7952356817029904
Recall: 0.9330029732408325
F1 Score: 0.8586282378693907
Class P3
Precision: 0.4608
Recall: 0.21735849056603773
F1 Score: 0.2953846153846154
Class P4
Precision: 0.7250726040658277
Recall: 0.7278911564625851
F1 Score: 0.7264791464597479

Model: Logistic Regression
Accuracy: 0.749435397598954
Class P1
Precision: 0.8329466357308585
Recall: 0.7080867850098619
F1 Score: 0.7654584221748401
Cla



Model: AdaBoost
Accuracy: 0.7487222156186853
Class P1
Precision: 0.8006012024048096
Recall: 0.7879684418145957
F1 Score: 0.794234592445328
Class P2
Precision: 0.7719816272965879
Recall: 0.932804757185332
F1 Score: 0.8448074679113186
Class P3
Precision: 0.21428571428571427
Recall: 0.0022641509433962265
F1 Score: 0.004480955937266617
Class P4
Precision: 0.6061302681992338
Recall: 0.7687074829931972
F1 Score: 0.6778063410454156

Model: Extra Trees
Accuracy: 0.7355283489837157
Class P1
Precision: 0.8386648122392212
Recall: 0.5946745562130178
F1 Score: 0.6959030582804385
Class P2
Precision: 0.7576632964211202
Recall: 0.9357779980178395
F1 Score: 0.8373536715147215
Class P3
Precision: 0.4107485604606526
Recall: 0.16150943396226414
F1 Score: 0.23185265438786565
Class P4
Precision: 0.6900212314225053
Recall: 0.6316812439261419
F1 Score: 0.6595636732623034



4.1 Random Forest

In [37]:
# Random forest with 200 trees, evaluated on the test split
random_forest_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
random_forest_classifier.fit(x_train, y_train)
y_pred = random_forest_classifier.predict(x_test)

accuracy = accuracy_score(y_test,y_pred)
print(f'Accuracy: {accuracy}')
precision,recall,f1_score,_ = precision_recall_fscore_support(y_test,y_pred)

# No `labels` argument is passed, so the display names are paired with the
# metric arrays purely by position.
for name, class_precision, class_recall, class_f1 in zip(['p1','p2','p3','p4'], precision, recall, f1_score):
    print()
    print(f'Class {name}')
    print(f'Precision: {class_precision}')
    print(f'Recall: {class_recall}')
    print(f'f1 Score: {class_f1}')

Accuracy: 0.7650065375014858

Class p1
Precision: 0.8358913813459268
Recall: 0.6982248520710059
f1 Score: 0.7608812466415905

Class p2
Precision: 0.7965698760400747
Recall: 0.9298315163528246
f1 Score: 0.8580574355222242

Class p3
Precision: 0.44976816074188564
Recall: 0.21962264150943397
f1 Score: 0.295131845841785

Class p4
Precision: 0.7242718446601941
Recall: 0.7249757045675413
f1 Score: 0.7246236036911122


4.2 XGBoost

In [34]:
# This model is trained on integer-encoded targets.
# NOTE: this cell rebinds x_train / x_test / y_train / y_test, so from here on
# y_train and y_test hold the encoded integers, not the original labels.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test,y_pred)
print(f'Accuracy: {accuracy}')
precision,recall,f1_score,_ = precision_recall_fscore_support(y_test,y_pred)

# Display names are paired with the metric arrays purely by position
for name, class_precision, class_recall, class_f1 in zip(['p1','p2','p3','p4'], precision, recall, f1_score):
    print()
    print(f'Class {name}')
    print(f'Precision: {class_precision}')
    print(f'Recall: {class_recall}')
    print(f'f1 Score: {class_f1}')

Accuracy: 0.7783192677998336

Class p1
Precision: 0.823906083244397
Recall: 0.7613412228796844
f1 Score: 0.7913890312660175

Class p2
Precision: 0.8255418233924413
Recall: 0.913577799801784
f1 Score: 0.8673315769665035

Class p3
Precision: 0.4756380510440835
Recall: 0.30943396226415093
f1 Score: 0.37494284407864653

Class p4
Precision: 0.7342386032977691
Recall: 0.7356656948493683
f1 Score: 0.7349514563106796


Random Forest and XGBoost are our two best-performing models.

### 5. Hyperparameter Tuning

In [46]:
# Hyperparameter values to try exhaustively
param_grid = {
    'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 8, 10],
    'alpha': [1, 10, 100],
    'n_estimators': [50, 100, 200],
}
# One result column per bookkeeping field, followed by one per hyperparameter
answers_grid = {
    key: []
    for key in ['combination', 'train_accuracy', 'test_accuracy', *param_grid]
}

In [47]:
combination = 0
x_train, x_test, y_train, y_test = train_test_split(x,y_encoded,test_size=0.2,random_state=42)

# NOTE(review): every combination is scored on the test split, and that score
# is what gets recorded for choosing the winner. Consider a separate
# validation split or cross-validation so the test split stays untouched.

# product() walks the grid in the same order as five nested loops would
# (last axis varies fastest).
grid_axes = (
    param_grid['colsample_bytree'],
    param_grid['learning_rate'],
    param_grid['max_depth'],
    param_grid['alpha'],
    param_grid['n_estimators'],
)
for colsample_bytree, learning_rate, max_depth, alpha, n_estimators in product(*grid_axes):
    model = xgb.XGBClassifier(objective='multi:softmax', num_class=4,
                              colsample_bytree=colsample_bytree,
                              learning_rate=learning_rate, max_depth=max_depth,
                              alpha=alpha, n_estimators=n_estimators)
    model.fit(x_train, y_train)
    train_accuracy = accuracy_score(y_train, model.predict(x_train))
    test_accuracy = accuracy_score(y_test, model.predict(x_test))

    # The stored id is 0-based; the printed counter below is 1-based.
    answers_grid['combination'].append(combination)
    answers_grid['train_accuracy'].append(train_accuracy)
    answers_grid['test_accuracy'].append(test_accuracy)
    answers_grid['colsample_bytree'].append(colsample_bytree)
    answers_grid['learning_rate'].append(learning_rate)
    answers_grid['max_depth'].append(max_depth)
    answers_grid['alpha'].append(alpha)
    answers_grid['n_estimators'].append(n_estimators)
    combination += 1

    print(f'Combination {combination}')
    print(f'colsample_bytree: {colsample_bytree}, learning_rate: {learning_rate}, max_depth: {max_depth}, alpha: {alpha}, n_estimators: {n_estimators}')
    print(f'Train Accuracy: {train_accuracy:.2f}')
    print(f'Test Accuracy: {test_accuracy:.2f}')
    print('.'*30)

Combination 1
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 3, alpha: 1, n_estimators: 50
Train Accuracy: 0.61
Test Accuracy: 0.60
..............................
Combination 2
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 3, alpha: 1, n_estimators: 100
Train Accuracy: 0.61
Test Accuracy: 0.60
..............................
Combination 3
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 3, alpha: 1, n_estimators: 200
Train Accuracy: 0.62
Test Accuracy: 0.62
..............................
Combination 4
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 3, alpha: 10, n_estimators: 50
Train Accuracy: 0.61
Test Accuracy: 0.60
..............................
Combination 5
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 3, alpha: 10, n_estimators: 100
Train Accuracy: 0.61
Test Accuracy: 0.60
..............................
Combination 6
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 3, alpha: 10, n_estimators: 200
Train Accuracy: 0.62
Test Accuracy: 0.6

In [48]:
# One row per evaluated combination; saved to disk so the grid search does not
# have to be re-run just to inspect the scores
results = pd.DataFrame(answers_grid)
results.to_excel('tuning_results.xlsx',index=False)
results.head()

Unnamed: 0,combination,train_accuracy,test_accuracy,colsample_bytree,learning_rate,max_depth,alpha,n_estimators
0,0,0.611156,0.603471,0.1,0.01,3,1,50
1,1,0.611839,0.604184,0.1,0.01,3,1,100
2,2,0.623072,0.617497,0.1,0.01,3,1,200
3,3,0.610383,0.602995,0.1,0.01,3,10,50
4,4,0.610829,0.603471,0.1,0.01,3,10,100


### 6. Training best model from Tuning results

In [35]:
# Hyperparameters are hard-coded here -- presumably the combination picked
# from tuning_results.xlsx; verify against that file.
model = xgb.XGBClassifier(objective='multi:softmax', num_class=4,
                          colsample_bytree=0.5, learning_rate=0.2,
                          max_depth=3, alpha=10, n_estimators=200)
# Trains on whatever x_train / y_train currently hold in the kernel; the
# classifier is given integer-encoded targets elsewhere in this notebook.
model.fit(x_train,y_train)

### 7. Predicting On Unseen Data

In [40]:
# Raw feature columns to read from the unseen file: every column still in df
# except the target. The target is excluded by name, so this does not depend
# on 'Approved_Flag' happening to be the last column.
columns = [name for name in df.columns if name != 'Approved_Flag']

'Approved_Flag'

In [41]:
df3 = pd.read_excel(r'data\Unseen_Dataset.xlsx')

# .copy() so the assignments below write to an independent frame rather than
# to a slice of df3 (which raises SettingWithCopyWarning).
df_unseen = df3[columns].copy()
# Same preprocessing as the training data: ordinal EDUCATION codes, one-hot
# encoding, then the scalers that were fitted on the training frame.
df_unseen['EDUCATION'] = df_unseen['EDUCATION'].map(education).astype('int')
df_unseen = pd.get_dummies(df_unseen,columns=['MARITALSTATUS','GENDER','last_prod_enq2','first_prod_enq2'])
for col in columns_to_be_scaled:
    df_unseen[col] = scalers[col].transform(df_unseen[col].values.reshape(-1,1))

# get_dummies only creates columns for the categories present in this file.
# Align to the training feature matrix so the model sees the same columns in
# the same order: a category absent here becomes an all-zero column, and a
# category never seen in training is dropped.
df_unseen = df_unseen.reindex(columns=x.columns, fill_value=0)

y_pred_unseen = model.predict(df_unseen)
# Reuse the encoder fitted for training to turn class indices back into labels
df3['Target_variable'] = encoder.inverse_transform(y_pred_unseen)
df3.to_excel(r'final_predictions.xlsx',index=False)