In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import warnings
import os


In [5]:
a1 = pd.read_excel('/content/case_study1.xlsx')
a2 = pd.read_excel('/content/case_study2.xlsx')

In [6]:
df1 = a1.copy()
df2 = a2.copy()


In [7]:
columns_to_be_removed = []
for i in df2.columns:
    if df2.loc[df2[i] == -99999].shape[0] > 10000:
        columns_to_be_removed.append(i)

In [8]:
df2 = df2.drop(columns_to_be_removed, axis=1)

In [9]:
for i in df2.columns:
    df2 = df2.loc[df2[i] != -99999]

In [10]:
for i in list(df1.columns):
    if i in list(df2.columns):
        print(i)

PROSPECTID


In [11]:
df = pd.merge(df1, df2, how='inner', left_on=['PROSPECTID'], right_on=['PROSPECTID'])


In [12]:
for i in df.columns:
    if df[i].dtype == 'object':
        print(i)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2
Approved_Flag


In [13]:
for i in ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']:
    chi2, pval, _, _ = chi2_contingency(pd.crosstab(df[i], df['Approved_Flag']))
    print(i, '---', pval)


MARITALSTATUS --- 3.6401547776371343e-233
EDUCATION --- 2.8651007585199264e-30
GENDER --- 1.8610038357046495e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 8.375872889007583e-287


In [14]:
numeric_columns = []
for i in df.columns:
    if df[i].dtype != 'object' and i not in ['PROSPECTID', 'Approved_Flag']:
        numeric_columns.append(i)

In [15]:
vif_data = df[numeric_columns]
total_columns = vif_data.shape[1]
columns_to_be_kept = []
column_index = 0
for i in range(total_columns):
    vif_value = variance_inflation_factor(vif_data, column_index)
    print(column_index, '---', vif_value)
    if vif_value <= 6:
        columns_to_be_kept.append(numeric_columns[i])
        column_index += 1
    else:
        vif_data = vif_data.drop([numeric_columns[i]], axis=1)


  vif = 1. / (1. - r_squared_i)


0 --- inf


  vif = 1. / (1. - r_squared_i)


0 --- inf
0 --- 11.319677860911183
0 --- 8.291829772566755
0 --- 6.520138234382173
0 --- 5.059265701555735
1 --- 2.6064779388021027


  vif = 1. / (1. - r_squared_i)


2 --- inf
2 --- 1779.2530019989363
2 --- 8.57605846135607
2 --- 3.8295656471637
3 --- 5.4692110955705875
4 --- 5.503957502932511
5 --- 1.973712154119323


  vif = 1. / (1. - r_squared_i)


6 --- inf
6 --- 4.811919298953734
7 --- 23.146961044026003
7 --- 30.634993848319926
7 --- 4.384922735753697
8 --- 3.064866354116124
9 --- 2.8932461784347443
10 --- 4.369654303665843
11 --- 2.208341258584075
12 --- 566.1909055769335
12 --- 1.0006593766842498
13 --- 1.9695830645742571
14 --- 7.841218783259259
14 --- 5.247703415624756


  vif = 1. / (1. - r_squared_i)


15 --- inf
15 --- 7.377956585088553
15 --- 1.425793366814442
16 --- 8.084822288044927
16 --- 1.6226699218393807
17 --- 7.238615905249871
17 --- 15.645211505952776
17 --- 1.8184084209088587
18 --- 1.5055024528703032
19 --- 2.1720129343912165
20 --- 2.6235392775473527
21 --- 2.292927625739703
22 --- 7.358811901616798
22 --- 2.1583860086471645
23 --- 2.865198364020101
24 --- 6.457951397444763
24 --- 2.8464061660262128
25 --- 4.751462405159283
26 --- 16.664353050297613
26 --- 6.433862464128575
26 --- 8.905695279309262
26 --- 2.3948438776920553
27 --- 8.625154493499016
27 --- 13.097583499300075
27 --- 3.5102775257188883
28 --- 1.84612473749155
29 --- 18.35043652870675
29 --- 10.708249343242189
29 --- 2.3460067597985903
30 --- 21.54248963193387
30 --- 2.796240194688435
31 --- 3.373862552634249
32 --- 9.973017265651695
32 --- 6.092683218698291
32 --- 1.0011742340538416
33 --- 3.0642351806835473
34 --- 2.807411739205449
35 --- 20.28118581063136
35 --- 15.881630139889586
35 --- 1.83280523444833

In [16]:
from scipy.stats import f_oneway

columns_to_be_kept_numerical = []
for i in columns_to_be_kept:
    a = list(df[i])
    b = list(df['Approved_Flag'])

    group_P1 = [value for value, group in zip(a, b) if group == 'P1']
    group_P2 = [value for value, group in zip(a, b) if group == 'P2']
    group_P3 = [value for value, group in zip(a, b) if group == 'P3']
    group_P4 = [value for value, group in zip(a, b) if group == 'P4']

    f_statistic, p_value = f_oneway(group_P1, group_P2, group_P3, group_P4)
    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(i)

In [17]:
features = columns_to_be_kept_numerical + ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df = df[features + ['Approved_Flag']]

# Encoding categorical features
df['EDUCATION'] = df['EDUCATION'].replace({
    'SSC': 1, '12TH': 2, 'GRADUATE': 3, 'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4, 'OTHERS': 1, 'PROFESSIONAL': 3
}).astype(int)

  df['EDUCATION'] = df['EDUCATION'].replace({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EDUCATION'] = df['EDUCATION'].replace({


In [19]:
df_encoded = pd.get_dummies(df, columns=['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2'])

In [20]:
y = df_encoded['Approved_Flag']
x = df_encoded.drop(['Approved_Flag'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [21]:
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
rf_classifier.fit(x_train, y_train)
y_pred = rf_classifier.predict(x_test)

In [22]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)


Accuracy: 0.74


In [23]:
for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}: Precision: {precision[i]}, Recall: {recall[i]}, F1 Score: {f1_score[i]}")

Class p1: Precision: 0.7637271214642263, Recall: 0.4727085478887745, F1 Score: 0.583969465648855
Class p2: Precision: 0.764026931708881, Recall: 0.9272373540856031, F1 Score: 0.8377570750571278
Class p3: Precision: 0.4408014571948998, Recall: 0.19626926196269262, F1 Score: 0.2716049382716049
Class p4: Precision: 0.7261208576998051, Recall: 0.6962616822429907, F1 Score: 0.7108778625954199


In [24]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

# Evaluate XGBoost Model
accuracy = accuracy_score(y_test, y_pred)
print(f'XGBoost Accuracy: {accuracy:.2f}')
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

# Display performance for each class in XGBoost
for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}: Precision: {precision[i]}, Recall: {recall[i]}, F1 Score: {f1_score[i]}")


XGBoost Accuracy: 0.74
Class p1: Precision: 0.7340720221606648, Recall: 0.5458290422245108, F1 Score: 0.6261075014766686
Class p2: Precision: 0.7852225020990764, Recall: 0.909727626459144, F1 Score: 0.8429022082018928
Class p3: Precision: 0.43411927877947293, Recall: 0.25385239253852393, F1 Score: 0.3203684749232344
Class p4: Precision: 0.7283464566929134, Recall: 0.6915887850467289, F1 Score: 0.7094918504314478


In [25]:

# Decision Tree Classifier training
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)

# Evaluate Decision Tree Model
accuracy = accuracy_score(y_test, y_pred)
print(f'Decision Tree Accuracy: {accuracy:.2f}')
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

# Display Decision Tree performance
for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}: Precision: {precision[i]}, Recall: {recall[i]}, F1 Score: {f1_score[i]}")

Decision Tree Accuracy: 0.67
Class p1: Precision: 0.5424274973147154, Recall: 0.5200823892893924, F1 Score: 0.5310199789695058
Class p2: Precision: 0.7662217278457546, Recall: 0.8040856031128405, F1 Score: 0.7846971710651225
Class p3: Precision: 0.3156934306569343, Recall: 0.28061638280616386, F1 Score: 0.2971232288535852
Class p4: Precision: 0.6253776435045317, Recall: 0.5803738317757009, F1 Score: 0.6020358700920989
