In [43]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, balanced_accuracy_score, confusion_matrix, accuracy_score

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

Download the dataset file (saved locally as `churn.csv`) from:

https://www.kaggle.com/datasets/mathchi/churn-for-bank-customers 

In [5]:
# Load the bank-customer churn dataset from the working directory.
data = pd.read_csv(filepath_or_buffer="churn.csv")

In [6]:
# Re-read the same CSV; this rebinds `data` to a freshly loaded frame.
data = pd.read_csv(filepath_or_buffer="churn.csv")

In [7]:
# Preview the first ten rows of the loaded frame.
data.iloc[:10]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [9]:
# One encoder per categorical column (Geography, Gender); they are used to
# convert those columns to numerical codes.
label_geography, label_gender = LabelEncoder(), LabelEncoder()

In [10]:
# Fit each encoder on its column and overwrite the column with the codes.
for column, encoder in (('Geography', label_geography), ('Gender', label_gender)):
    data[column] = encoder.fit_transform(data[column])

data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,0,0,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,0,0,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,0,0,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,2,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,0,1,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,0,1,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,0,0,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,1,1,42,3,75075.31,2,1,0,92888.52,1


In [12]:
# Column names for the feature matrix (X) and the target (y); used when
# building the train and test sets.
X_columns = [
    "CreditScore",
    "Geography",
    "Gender",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
    "EstimatedSalary",
]

Y_columns = ["Exited"]

In [14]:
# Build independent feature and target frames from the selected columns,
# then print their shapes (one per line).
X = data[X_columns].copy()
y = data[Y_columns].copy()
print(X.shape, y.shape, sep="\n")

(10000, 9)
(10000, 1)


In [16]:
# Hold out 20% of the rows as a test set, stratifying on the target and
# fixing the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [17]:
# Show the shape of every part of the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((8000, 9), (2000, 9), (8000, 1), (2000, 1))


    Train the model with:
    1. Random Forest
    2. No PCA
    3. No clustering
    4. Precision as the scoring metric


In [18]:
# Random forest classifier to be tuned by the grid search.
# A fixed random_state makes the forest construction repeatable, so the
# cross-validation scores and the selected parameters do not change from one
# run of the notebook to the next (same seed as the other seeded steps).
model_rf_class = RandomForestClassifier(random_state=42)

# Hyper-parameter grid: 3 forest sizes x 3 depth limits x 2 split criteria
# = 18 candidate combinations.
param_grid_class = {
    'n_estimators': [200, 300, 500],
    'max_depth': [3, 5, 10],
    'criterion': ['gini', 'entropy']
}

In [19]:
# Wrap precision_score as a scorer object usable by GridSearchCV.
precision_scorer = make_scorer(score_func=precision_score)

In [20]:
# 5-fold grid search over the random-forest grid, ranked by precision.
grid_search = GridSearchCV(
    estimator=model_rf_class,
    param_grid=param_grid_class,
    scoring=precision_scorer,
    cv=5,
    verbose=1,
)

In [21]:
# Run the search; the single-column target frame is flattened to a 1-D array.
grid_search.fit(X_train, y_train.to_numpy().ravel())

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [23]:
# Record the winning parameter combination and its score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

In [24]:
# Report the selected parameters and their precision, one per line.
print("\n".join(
    f"{caption} {value}"
    for caption, value in (
        ("Best Parameters:", best_params),
        ("Best Precision Score:", best_score),
    )
))

Best Parameters: {'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 500}
Best Precision Score: 0.9058704390847246


In [25]:
# Keep the best estimator found by the grid search; its feature importances
# are inspected in the following cells.
best_rf_model = grid_search.best_estimator_

In [27]:
# Pair every feature name with the importance the best forest assigned to it.
feature_importances = best_rf_model.feature_importances_
features_df = pd.DataFrame(
    dict(Feature=X_columns, Importance=feature_importances)
)

In [29]:
# Order the features from most to least important and display the table.
features_df = features_df.sort_values('Importance', ascending=False)
features_df

Unnamed: 0,Feature,Importance
5,NumOfProducts,0.572967
7,IsActiveMember,0.152505
4,Balance,0.113359
1,Geography,0.072026
2,Gender,0.052097
0,CreditScore,0.024083
8,EstimatedSalary,0.008895
3,Tenure,0.002888
6,HasCrCard,0.001179


Train with:
1. Support Vector Machine
2. StandardScaler feature scaling
3. Balanced accuracy as the scoring metric

In [38]:
# Encode the categorical columns only while they still hold raw labels.
# `data` may already have been encoded by an earlier cell. Fitting a fresh
# LabelEncoder on the integer codes would leave the values unchanged but make
# `classes_` hold the codes instead of the original category names, so the
# mapping back to the names would be lost. In that case the existing fitted
# encoders are kept untouched.
if not pd.api.types.is_numeric_dtype(data['Geography']):
    label_geography = LabelEncoder()
    data['Geography'] = label_geography.fit_transform(data['Geography'])
if not pd.api.types.is_numeric_dtype(data['Gender']):
    label_gender = LabelEncoder()
    data['Gender'] = label_gender.fit_transform(data['Gender'])

# Feature columns and target column used for the SVM experiment.
X_columns = ["CreditScore", "Geography", "Gender", "Tenure", "Balance", "NumOfProducts",
             "HasCrCard", "IsActiveMember", "EstimatedSalary"]
Y_columns = ["Exited"]


In [39]:
# Feature frame and target for the SVM run; the target is flattened to a
# 1-D array.
X = data.loc[:, X_columns]
y = data[Y_columns].to_numpy().ravel()

In [40]:
# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [41]:
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both parts of the split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [45]:
# Support vector classifier using a radial basis function (RBF) kernel.
model_svm = SVC(
    gamma='auto',
    kernel='rbf',
)

In [46]:
# Candidate values of C to be tried by the grid search.
param_grid_svm = dict(C=[0.001, 0.01, 1, 10, 100, 1000])


In [47]:
# Wrap balanced_accuracy_score as a scorer object usable by GridSearchCV.
balanced_accuracy_scorer = make_scorer(score_func=balanced_accuracy_score)

In [48]:
# 5-fold grid search over C for the SVM, ranked by balanced accuracy.
grid_search_svm = GridSearchCV(
    estimator=model_svm,
    param_grid=param_grid_svm,
    scoring=balanced_accuracy_scorer,
    cv=5,
    verbose=1,
)


In [49]:
# Run the search on the scaled training data, then record the winning
# parameters and their score.
grid_search_svm.fit(X_train_scaled, y_train)
best_params_svm, best_score_svm = (
    grid_search_svm.best_params_,
    grid_search_svm.best_score_,
)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [50]:
# Report the selected C and its balanced accuracy, one per line.
print("\n".join(
    f"{caption} {value}"
    for caption, value in (
        ("Best Parameters:", best_params_svm),
        ("Best Balanced Accuracy Score:", best_score_svm),
    )
))

Best Parameters: {'C': 1000}
Best Balanced Accuracy Score: 0.6258415116872611


Random Forest

In [51]:
# Feature matrix: every column except the row/customer identifiers, the
# surname and the target. Target: the Exited column.
X = data.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Exited'])
y = data.loc[:, 'Exited']

In [52]:
# Standardise the features. NOTE(review): the scaler is fitted on every row
# of X, i.e. on the full feature matrix rather than a training subset.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [54]:
# Split into training and test sets, then scale.
# The split is made on the raw features and the scaler is fitted on the
# training rows only. Splitting a matrix that was standardised with the
# statistics of all rows would let information about the test rows (their
# mean and variance) leak into training and bias the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [55]:
# Create a seeded 100-tree random forest and fit it on the training split.
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf_classifier.fit(X_train, y_train)

In [56]:
# Predict class labels for the test split with the fitted forest.
rf_predictions = rf_classifier.predict(X=X_test)

In [57]:
# Display the random forest's predicted labels for the test split.
rf_predictions

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

SVM

In [58]:
# Create a seeded support vector classifier and fit it on the training split.
svm_classifier = SVC(
    random_state=42,
)
svm_classifier.fit(X_train, y_train)

In [59]:
# Predict class labels for the test split with the fitted SVM and show them.
svm_predictions = svm_classifier.predict(X=X_test)
svm_predictions

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [60]:
# For each model, print the confusion matrix, accuracy and balanced accuracy
# of its predictions against the test labels.
for model_name, predictions in zip(
    ('Random Forest', 'SVM'),
    (rf_predictions, svm_predictions),
):
    print(f"{model_name} Performance:")
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Balanced Accuracy:", balanced_accuracy_score(y_test, predictions), "\n")

Random Forest Performance:
Confusion Matrix:
 [[1545   62]
 [ 210  183]]
Accuracy: 0.864
Balanced Accuracy: 0.7135338238717064 

SVM Performance:
Confusion Matrix:
 [[1571   36]
 [ 249  144]]
Accuracy: 0.8575
Balanced Accuracy: 0.6720051112261718 

