In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Disable pandas' column/row truncation so wide or long frames render in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Location of the modelling dataset.
# NOTE(review): this is a machine-specific absolute path; point it at a
# project-relative data directory before sharing the notebook.
DATA_PATH = "C:/Users/15694/Desktop/Cross selling/code/Michelle/variable_without_pca2.csv"

# Candidate encodings, tried in order until one decodes the file.
# ('latin-1' and 'ISO-8859-1' name the same codec.)
encodings_to_try = ['utf-8-sig', 'latin-1', 'ISO-8859-1']

for encoding in encodings_to_try:
    try:
        df = pd.read_csv(DATA_PATH, encoding=encoding)
        break
    except UnicodeDecodeError:
        # Wrong guess for this file: fall through to the next candidate.
        continue
else:
    # No candidate worked: fail here with a clear message instead of leaving
    # `df` undefined and hitting a NameError in a later cell.
    raise ValueError(
        f"Could not decode {DATA_PATH!r} with any of: {encodings_to_try}"
    )

In [4]:
# Preview the first rows of the loaded frame as a sanity check.
df.head()

In [5]:
# List the column names available in the loaded frame.
df.columns

In [6]:
# (rows, columns) of the loaded frame.
df.shape

In [7]:
# Count of missing values in each column.
df.isna().sum()

In [8]:
# Keep only the four balance/profitability columns plus LOAN_CAT1 for modelling.
model_columns = [
    "TOTAL_AVG_BAL",
    "SIX_MONTH_BAL_OS_FD",
    "SIX_MONTH_BAL_OS_SAVINGS",
    "CUSTOMER_PROFITABILITY",
    "LOAN_CAT1",
]
df1 = df[model_columns]

In [9]:
# Separate the LOAN_CAT1 label (Y) from the remaining predictor columns (X).
target_column = "LOAN_CAT1"
Y = df1[target_column]
X = df1.drop(columns=[target_column])

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score, f1_score,  classification_report, confusion_matrix

In [11]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
# NOTE(review): the split is not stratified; consider stratify=Y if the
# LOAN_CAT1 classes are imbalanced. TODO confirm class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [12]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from collections import Counter

# Oversample the training split only (the test split stays untouched).
# The seed is fixed so the synthetic samples, and therefore every metric
# computed downstream, are reproducible across kernel restarts.
sm = SMOTE(random_state=0)
X_sm, Y_sm = sm.fit_resample(X_train, y_train)

# Show the per-class row counts after resampling.
print('Dataset after resampling:')
print(sorted(Counter(Y_sm).items()))

### Tree-based models

#### Decision tree

In [13]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [14]:
# Baseline: an unconstrained tree fitted on the resampled training data,
# then used to predict the untouched test split.
decision_tree = DecisionTreeClassifier(random_state=0).fit(X_sm, Y_sm)
y_pred_DT = decision_tree.predict(X_test)

In [15]:
# Baseline tree, test split: accuracy plus macro-averaged precision/recall/F1.
acc = accuracy_score(y_test, y_pred_DT)
prec_macro, recall_macro, f1_macro = (
    metric(y_test, y_pred_DT, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
classification_rep = classification_report(y_test, y_pred_DT)

for caption, score in (
    ("Accuracy Score:", acc),
    ("Precision (Macro):", prec_macro),
    ("Recall (Macro):", recall_macro),
    ("F1 Score (Macro):", f1_macro),
):
    print(caption, score)

# Confusion matrix followed by the per-class report
cf_matrix = confusion_matrix(y_test, y_pred_DT)
print("Confusion Matrix:")
print(cf_matrix)
print("\nClassification Report:\n", classification_rep)

In [16]:

# Predictions on the (resampled) training data, to compare against test performance.
y_pred_train_DT = decision_tree.predict(X_sm)

In [17]:
# Baseline tree, resampled training data: same metrics as for the test split.
acc = accuracy_score(Y_sm, y_pred_train_DT)
prec_macro, recall_macro, f1_macro = (
    metric(Y_sm, y_pred_train_DT, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
classification_rep_DT = classification_report(Y_sm, y_pred_train_DT)

for caption, score in (
    ("Accuracy Score:", acc),
    ("Precision (Macro):", prec_macro),
    ("Recall (Macro):", recall_macro),
    ("F1 Score (Macro):", f1_macro),
):
    print(caption, score)

# Confusion matrix followed by the per-class report
cf_matrix_DT = confusion_matrix(Y_sm, y_pred_train_DT)
print("Confusion Matrix:")
print(cf_matrix_DT)
print("\nClassification Report:\n", classification_rep_DT)

#### Pre-pruning

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
# Search space for pre-pruning the decision tree.
# min_samples_split starts at 2: an integer value of 1 is not a valid setting
# for DecisionTreeClassifier, so including it only produces failed fits.
param_grid = {
    'max_depth': range(2, 50, 1),
    'min_samples_split': range(2, 5, 1),
    'min_samples_leaf': range(2, 5, 1),
}

In [20]:
# Exhaustive 5-fold cross-validated search over `param_grid`, run on the
# resampled training data.
# NOTE(review): the folds are drawn from already-oversampled data, so synthetic
# neighbours of a row may sit in both train and validation folds. TODO confirm
# this is acceptable.
dtree = tree.DecisionTreeClassifier(random_state=0)
grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=5)
grid_search.fit(X_sm, Y_sm)

In [21]:
# Hyperparameter combination that scored best in the grid search.
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

In [22]:
# Pre-pruned tree built from whatever the grid search actually selected, rather
# than hand-copied values that go stale whenever the data or grid changes.
# random_state matches the other trees in this notebook so results are repeatable.
best_dtree_pp = DecisionTreeClassifier(random_state=0, **grid_search.best_params_)
best_dtree_pp.fit(X_sm, Y_sm)

# Predictions on the untouched test split.
y_pred_pre = best_dtree_pp.predict(X_test)

In [23]:
# Pre-pruned tree, test split: accuracy plus macro-averaged precision/recall/F1.
acc = accuracy_score(y_test, y_pred_pre)
prec_macro, recall_macro, f1_macro = (
    metric(y_test, y_pred_pre, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
classification_rep_pre = classification_report(y_test, y_pred_pre)

for caption, score in (
    ("Accuracy Score:", acc),
    ("Precision (Macro):", prec_macro),
    ("Recall (Macro):", recall_macro),
    ("F1 Score (Macro):", f1_macro),
):
    print(caption, score)

# Confusion matrix followed by the per-class report
cf_matrix_pre = confusion_matrix(y_test, y_pred_pre)
print("Confusion Matrix:")
print(cf_matrix_pre)
print("\n Test Classification Report:\n", classification_rep_pre)

In [24]:
# Pre-pruned tree's predictions on the (resampled) training data.
y_pred_pre_train = best_dtree_pp.predict(X_sm)

In [25]:
# Pre-pruned tree, resampled training data: same metrics as for the test split.
acc = accuracy_score(Y_sm, y_pred_pre_train)
prec_macro, recall_macro, f1_macro = (
    metric(Y_sm, y_pred_pre_train, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
classification_rep_pre_train = classification_report(Y_sm, y_pred_pre_train)

for caption, score in (
    ("Accuracy Score:", acc),
    ("Precision (Macro):", prec_macro),
    ("Recall (Macro):", recall_macro),
    ("F1 Score (Macro):", f1_macro),
):
    print(caption, score)

# Confusion matrix followed by the per-class report
cf_matrix_DT_pre = confusion_matrix(Y_sm, y_pred_pre_train)
print("Confusion Matrix:")
print(cf_matrix_DT_pre)
print("\n Train Classification Report:\n", classification_rep_pre_train)

#### Post-pruning

In [26]:
# Fresh, unfitted tree (rebinding `decision_tree`) used as the starting point for post-pruning.
decision_tree = DecisionTreeClassifier(random_state=0)

In [27]:
# Cost-complexity pruning path on the resampled training data: the candidate
# alphas and the impurities reported alongside them.
path = decision_tree.cost_complexity_pruning_path(X_sm, Y_sm)
ccp_alphas = path.ccp_alphas
impurities = path.impurities
print(ccp_alphas)

In [28]:
# Fit one tree per candidate alpha and collect them in path order.
decision_trees = []
for ccp_alpha in ccp_alphas:
    decision_tree = tree.DecisionTreeClassifier(
        random_state=0, ccp_alpha=ccp_alpha
    ).fit(X_sm, Y_sm)
    decision_trees.append(decision_tree)

# Report the size of the tree fitted with the final alpha in the path.
print(
    "Number of nodes in the last tree is :{} with ccp_alpha:{}".format(
        decision_trees[-1].tree_.node_count,
        ccp_alphas[-1],
    )
)

In [29]:
# Accuracy of every pruned tree on the resampled training data and on the test split.
# NOTE(review): picking alpha from the test curve would leak test information
# into model selection. TODO confirm how alpha is chosen.
train_scores = [clf.score(X_sm, Y_sm) for clf in decision_trees]
test_scores = [clf.score(X_test, y_test) for clf in decision_trees]

fig, ax = plt.subplots()
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing data sets",
)
# Draw both curves with identical styling; only the data and legend label differ.
for series_label, scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, scores, marker='o', label=series_label, drawstyle="steps-post")

ax.legend()
plt.show()

In [30]:
# Post-pruned tree at ccp_alpha=0.001, fitted on the resampled training data.
# NOTE(review): 0.001 is hard-coded; presumably read off the accuracy-vs-alpha
# plot. TODO confirm.
decision_tree = tree.DecisionTreeClassifier(
    random_state=0,
    ccp_alpha=0.001,
).fit(X_sm, Y_sm)

# Predictions for both the training data and the test split.
y_train_pred_po = decision_tree.predict(X_sm)
y_test_pred_po = decision_tree.predict(X_test)

In [31]:
# Post-pruned tree, test split: accuracy plus macro-averaged precision/recall/F1.
acc = accuracy_score(y_test, y_test_pred_po)
prec_macro, recall_macro, f1_macro = (
    metric(y_test, y_test_pred_po, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
classification_rep_po = classification_report(y_test, y_test_pred_po)

for caption, score in (
    ("Accuracy Score:", acc),
    ("Precision (Macro):", prec_macro),
    ("Recall (Macro):", recall_macro),
    ("F1 Score (Macro):", f1_macro),
):
    print(caption, score)

# Confusion matrix followed by the per-class report
cf_matrix_po = confusion_matrix(y_test, y_test_pred_po)
print("Confusion Matrix:")
print(cf_matrix_po)
print("\n Test Classification Report:\n", classification_rep_po)

In [32]:
# Post-pruned tree, resampled training data: same metrics as for the test split.
acc = accuracy_score(Y_sm, y_train_pred_po)
prec_macro, recall_macro, f1_macro = (
    metric(Y_sm, y_train_pred_po, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
classification_rep_DT_po = classification_report(Y_sm, y_train_pred_po)

for caption, score in (
    ("Accuracy Score:", acc),
    ("Precision (Macro):", prec_macro),
    ("Recall (Macro):", recall_macro),
    ("F1 Score (Macro):", f1_macro),
):
    print(caption, score)

# Confusion matrix followed by the per-class report.
# Note: this rebinds `cf_matrix_po`, replacing the test-split matrix.
cf_matrix_po = confusion_matrix(Y_sm, y_train_pred_po)
print("Confusion Matrix:")
print(cf_matrix_po)
print("\n Train Classification Report:\n", classification_rep_DT_po)

In [33]:
from sklearn.metrics import roc_curve,auc

In [34]:
# Predicted class probabilities for the test split from the current `decision_tree`.
y_prob=decision_tree.predict_proba(X_test)

In [35]:
# One-vs-rest ROC curve per class.
# Iterate over the classifier's own `classes_`: column i of `y_prob` belongs to
# classes_[i], so this stays correct when the labels are not exactly 0..k-1
# (e.g. 1-based codes or strings) or when a class is absent from the training data.
for i, class_label in enumerate(decision_tree.classes_):
    fpr, tpr, _ = roc_curve(y_test == class_label, y_prob[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'Class {class_label} Vs Rest (AUC ={roc_auc:.2f})')

# Diagonal reference line for a classifier with no discriminative power.
plt.plot([0,1],[0,1],linestyle='--',color='r',label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Multi-class Classification')
plt.legend()
plt.show()

In [36]:
from sklearn.metrics import roc_auc_score

# Single-number summary: one-vs-rest ROC AUC on the test split.
roc_auc_score(y_true=y_test, y_score=y_prob, multi_class='ovr')