# **CUSTOMER CHURN IN BANKING** - NOTEBOOK 2: ML

In [1]:
#libraries import and dataset load
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder


df = pd.read_csv('../data/raw/dataset1.csv', sep = ',')

### Preparation of dataset

In [2]:
#head
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


In [3]:
#Dropping columns I will not use in the model
df = df.drop(['RowNumber', 'CustomerId', 'Surname','Complain'], axis = 1)


In [4]:
#One-hot Encoder to get categorical data formatted
#hot = pd.get_dummies(df[['Geography', 'Gender', 'Card Type']])
#df = pd.concat([df, hot], axis = 1)
#df = df.drop(['Geography', 'Gender', 'Card Type'], axis = 1)

In [5]:
# checking
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Satisfaction Score,Card Type,Point Earned
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1,2,DIAMOND,464
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,3,DIAMOND,456
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,3,DIAMOND,377
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0,5,GOLD,350
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,5,GOLD,425


### Preparation of train & test

In [6]:
# Separating features and target
X = df.drop('Exited', axis = 1)
y = df['Exited']

In [7]:
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Satisfaction Score,Card Type,Point Earned
0,619,France,Female,42,2,0.0,1,1,1,101348.88,2,DIAMOND,464
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,3,DIAMOND,456
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,3,DIAMOND,377
3,699,France,Female,39,1,0.0,2,0,0,93826.63,5,GOLD,350
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,5,GOLD,425


In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CreditScore         10000 non-null  int64  
 1   Geography           10000 non-null  object 
 2   Gender              10000 non-null  object 
 3   Age                 10000 non-null  int64  
 4   Tenure              10000 non-null  int64  
 5   Balance             10000 non-null  float64
 6   NumOfProducts       10000 non-null  int64  
 7   HasCrCard           10000 non-null  int64  
 8   IsActiveMember      10000 non-null  int64  
 9   EstimatedSalary     10000 non-null  float64
 10  Satisfaction Score  10000 non-null  int64  
 11  Card Type           10000 non-null  object 
 12  Point Earned        10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB


In [9]:
# Transforming into train & test (30% of data to test the models)
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [10]:
X_train_num = X_train.select_dtypes('number')
X_test_num  = X_test.select_dtypes('number')

X_train_cat = X_train.select_dtypes('object')
X_test_cat  = X_test.select_dtypes('object')

In [11]:
X_train_num.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Satisfaction Score,Point Earned
7681,641,33,2,146193.6,2,1,1,55796.83,1,307
9031,541,39,7,0.0,2,1,0,19823.02,5,781
3691,590,76,5,160979.68,1,0,1,13848.58,4,924
202,516,50,5,0.0,1,0,1,146145.93,5,509
5625,508,60,7,143262.04,1,1,1,129562.74,3,727


In [12]:
# Std scaler to standarize the scale
import pickle

scaler = StandardScaler()
scaler.fit(X_train_num)

filename = "standard_scaler.pkl"
with open("../scalers/"+filename, "wb") as file:
    pickle.dump(scaler, file)

X_train_num_scaled_np = scaler.transform(X_train_num)
X_test_num_scaled_np  = scaler.transform(X_test_num)

X_train_num_scaled_df = pd.DataFrame(X_train_num_scaled_np, columns=X_train_num.columns, index=X_train_num.index)
X_test_num_scaled_df  = pd.DataFrame(X_test_num_scaled_np, columns=X_test_num.columns, index=X_test_num.index)

In [13]:
X_train_cat.head()

Unnamed: 0,Geography,Gender,Card Type
7681,France,Male,DIAMOND
9031,France,Male,PLATINUM
3691,France,Female,PLATINUM
202,Spain,Male,GOLD
5625,France,Female,GOLD


In [16]:
X.select_dtypes('object').head()

Unnamed: 0,Geography,Gender,Card Type
0,France,Female,DIAMOND
1,Spain,Female,DIAMOND
2,France,Female,DIAMOND
3,France,Female,GOLD
4,Spain,Female,GOLD


In [20]:
X_train_cat.head()

Unnamed: 0,Geography,Gender,Card Type
7681,France,Male,DIAMOND
9031,France,Male,PLATINUM
3691,France,Female,PLATINUM
202,Spain,Male,GOLD
5625,France,Female,GOLD


In [21]:
categorical_columns = list(X.select_dtypes('object').columns)

# Posible split de categoricas en nominales y ordinales. En este caso no hay ordinales.
categories = [ list(X[col].unique())  for col in categorical_columns ]

encoder = OneHotEncoder(categories=categories, handle_unknown="infrequent_if_exist", drop="first", sparse_output=False)
encoder.fit(X_train_cat)

filename = "encoder.pkl"
with open("../encoders/"+filename, "wb") as file:
    pickle.dump(encoder, file)

X_train_cat_encoded_np = encoder.transform(X_train_cat)
X_test_cat_encoded_np  = encoder.transform(X_test_cat)

X_train_cat_encoded_df = pd.DataFrame(X_train_cat_encoded_np, columns=X_train_cat.columns, index=X_train_cat.index)
X_test_cat_encoded_df  = pd.DataFrame(X_test_cat_encoded_np, columns=X_test_cat.columns, index=X_test_cat.index)

ValueError: Shape of passed values is (7000, 6), indices imply (7000, 3)

In [None]:
X_train_proc = pd.concat([X_train_num_scaled_df, X_train_cat_encoded_df], axis=1)
X_test_proc = pd.concat([X_test_num_scaled_df, X_test_cat_encoded_df], axis=1)

In [None]:
# Oversampling
ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(X_train_num_scaled_df, y_train)

## ML models

#### Naive-Bayes

In [None]:
# Assuming X_train, y_train, X_test, y_test are already defined
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
previsoes = naive_bayes.predict(X_test)

# Compute confusion matrix
cm = confusion_matrix(y_test, previsoes)

# Display confusion matrix using seaborn heatmap
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Compute and print accuracy
accuracy = accuracy_score(y_test, previsoes)
print(f'Accuracy: {accuracy:.2f}')



In [None]:
# Predicting probabilities
y_probs = naive_bayes.predict_proba(X_test)[:, 1]

# Computing ROC curve and ROC AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = roc_auc_score(y_test, y_probs)

# Plotting ROC curve
plt.figure(figsize=(10, 7))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
print(classification_report(y_test, previsoes))

### Desition Tree

### Random Forest

In [None]:
parameters = {'max_depth': [3, 4, 5, 6, 7, 9, 11],
              'min_samples_split': [2, 3, 4, 5, 6, 7],
              'criterion': ['entropy', 'gini']
             }

model = RandomForestClassifier()
gridRandomForest = RandomizedSearchCV(model, parameters, cv = 5, n_jobs = -1)
gridRandomForest.fit(X_train, y_train)

print('Algorithm: ', gridRandomForest.best_estimator_.criterion)
print('Score: ', gridRandomForest.best_score_)
print('Mín Split: ', gridRandomForest.best_estimator_.min_samples_split)
print('Max Nvl: ', gridRandomForest.best_estimator_.max_depth)

In [None]:
# Define the random forest classifier
random_forest = RandomForestClassifier(n_estimators=100, min_samples_split=7, max_depth=11, criterion='entropy', random_state=0)

# Fit the model
random_forest.fit(X_train, y_train)

# Make predictions
previsoes = random_forest.predict(X_test)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, previsoes)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, previsoes)

print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)