In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn import metrics

In [55]:
data = pd.read_csv("BankChurn.csv", sep = ',')
data.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42.0,2.0,,1.0,1.0,1.0,101348.88,1
1,15647311,608,Spain,Female,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,0
2,15619304,502,France,Female,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57,1
3,15701354,699,France,Female,39.0,1.0,,2.0,0.0,0.0,93826.63,0
4,15737888,850,Spain,Female,43.0,2.0,125510.82,1.0,1.0,1.0,79084.1,0


In [56]:
data.isnull().sum()
clean_me = data.dropna()

In [57]:
clean_me.isnull().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [58]:
clean_me = clean_me.drop(["customer_id"], axis = 1)
clean_me.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6377 entries, 1 to 9999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   credit_score      6377 non-null   int64  
 1   country           6377 non-null   object 
 2   gender            6377 non-null   object 
 3   age               6377 non-null   float64
 4   tenure            6377 non-null   float64
 5   balance           6377 non-null   float64
 6   products_number   6377 non-null   float64
 7   credit_card       6377 non-null   float64
 8   active_member     6377 non-null   float64
 9   estimated_salary  6377 non-null   float64
 10  churn             6377 non-null   int64  
dtypes: float64(7), int64(2), object(2)
memory usage: 597.8+ KB


In [59]:
clean_me = pd.get_dummies(clean_me, drop_first= True, dtype = int)
clean_me.head()

Unnamed: 0,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_Germany,country_Spain,gender_Male
1,608,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,0,0,1,0
2,502,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57,1,0,0,0
4,850,43.0,2.0,125510.82,1.0,1.0,1.0,79084.1,0,0,1,0
5,645,44.0,8.0,113755.78,2.0,1.0,0.0,149756.71,1,0,1,1
7,376,29.0,4.0,115046.74,4.0,1.0,0.0,119346.88,1,1,0,0


In [60]:
y = np.array(clean_me["churn"])
X = clean_me.drop(['churn'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [61]:
# Create a Decision Tree Classifier using Gini index
clf_giniME = DecisionTreeClassifier(criterion='gini', random_state=42)

# Train the model
clf_giniME.fit(X_train, y_train)

# Predict the responses for test dataset
y_pred_dt = clf_giniME.predict(X_test)

# Calculate accuracy
accuracy = metrics.accuracy_score(y_test, y_pred_dt)

# Calculate confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_dt)

# Print accuracy, and confusion matrix
print("\nAccuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)


Accuracy: 0.7507836990595611

Confusion Matrix:
 [[1181  269]
 [ 208  256]]


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc=RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 150, 200, 300, 400],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [2,3, 4]
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 4)
CV_rfc.fit(X_train, y_train)
CV_rfc.best_params_

{'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 400}

In [62]:
# Create a Random Forest Classifier
rf1ME=RandomForestClassifier(n_estimators=400, max_depth=4, max_features='sqrt',random_state=42)

# Train the model using the training set
rf1ME.fit(X_train,y_train)

# Predict obs in test set
y_pred_rf1=rf1ME.predict(X_test)
y_pred_train_rf1=rf1ME.predict(X_train)
# Model Accuracy, how often is the classifier correct?
print("Accuracy Test:",metrics.accuracy_score(y_test, y_pred_rf1))
# Model Accuracy, how often is the classifier correct?
print("Accuracy Train:",metrics.accuracy_score(y_train, y_pred_train_rf1))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_rf1)
print(conf_matrix)

Accuracy Test: 0.8166144200626959
Accuracy Train: 0.8225408917768318
[[1430   20]
 [ 331  133]]


In [37]:
# Different Parameters
rfc=RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [300, 400, 500, 700],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [3, 4, 5, 6]
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 4)
CV_rfc.fit(X_train, y_train)
CV_rfc.best_params_

{'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 500}

In [63]:
# Create a Random Forest Classifier
rf2ME=RandomForestClassifier(n_estimators=500, max_depth=6, max_features='sqrt',random_state=42)

# Train the model using the training set
rf2ME.fit(X_train,y_train)

# Predict obs in test set
y_pred_rf2=rf2ME.predict(X_test)
y_pred_train_rf2=rf2ME.predict(X_train)
# Model Accuracy, how often is the classifier correct?
print("Accuracy Test:",metrics.accuracy_score(y_test, y_pred_rf2))
# Model Accuracy, how often is the classifier correct?
print("Accuracy Train:",metrics.accuracy_score(y_train, y_pred_train_rf2))

conf_matrix2 = metrics.confusion_matrix(y_test, y_pred_rf2)
print(conf_matrix2)

Accuracy Test: 0.8281086729362591
Accuracy Train: 0.8469639256105759
[[1410   40]
 [ 289  175]]


In [64]:
# Train Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=500, max_depth=6,  max_features='sqrt', random_state=42)
rf_classifier.fit(X, y)

# Optimal parameter settings
optimal_params = rf_classifier.get_params()
print("Optimal Parameter Settings:", optimal_params)

# Feature importance
feature_importances = rf_classifier.feature_importances_

# List the variables in decreasing order of importance
importance_sorted_indices = feature_importances.argsort()[::-1]
sorted_feature_names = X.columns[importance_sorted_indices]

print("\nVariables in Decreasing Order of Importance:")
for feature_name in sorted_feature_names:
    print(feature_name)

Optimal Parameter Settings: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

Variables in Decreasing Order of Importance:
age
products_number
active_member
country_Germany
balance
credit_score
estimated_salary
gender_Male
tenure
country_Spain
credit_card


In [65]:
best_params = {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 500, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

# Create a Random Forest Classifier
final_rfME=RandomForestClassifier(**best_params)

# Train the model using the training set
final_rfME.fit(X_train,y_train)

# Predict obs in test set
y_pred_final_rf=final_rfME.predict(X_test)
y_pred_train_final_rf=final_rfME.predict(X_train)
# Model Accuracy, how often is the classifier correct?
print("Accuracy Test:",metrics.accuracy_score(y_test, y_pred_final_rf))
# Model Accuracy, how often is the classifier correct?
print("Accuracy Train:",metrics.accuracy_score(y_train, y_pred_train_final_rf))

conf_matrix_final = metrics.confusion_matrix(y_test, y_pred_final_rf)
print(conf_matrix_final)

Accuracy Test: 0.8281086729362591
Accuracy Train: 0.8469639256105759
[[1410   40]
 [ 289  175]]


In [50]:
#SVM
# Initialize SVM classifier with a linear kernel
svm_classifierME = SVC(kernel='linear', random_state=42)

# Train the model on the training set
svm_classifierME.fit(X_train, y_train)

# Predict the responses for the test set
y_pred_svm = svm_classifierME.predict(X_test)

# Calculate accuracy
accuracy = metrics.accuracy_score(y_test, y_pred_svm)

# Calculate confusion matrix
conf_matrix_svm1ME = metrics.confusion_matrix(y_test, y_pred_svm)

print("\nAccuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix_svm1ME)


Accuracy: 0.7466039707419018

Confusion Matrix:
 [[1402   48]
 [ 437   27]]


In [67]:
# Initialize SVM classifier with a Radial kernel
svm_classifierRADME = SVC(kernel='rbf', random_state=42)

# Train the model on the training set
svm_classifierRADME.fit(X_train, y_train)

# Predict the responses for the test set
y_pred_svm = svm_classifierRADME.predict(X_test)

# Calculate accuracy
accuracy = metrics.accuracy_score(y_test, y_pred_svm)

# Calculate confusion matrix
conf_matrix_svm2ME = metrics.confusion_matrix(y_test, y_pred_svm)

print("\nAccuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix_svm2ME)



Accuracy: 0.7575757575757576

Confusion Matrix:
 [[1450    0]
 [ 464    0]]


In [53]:
params_svm = {
    'C':[.00001, .0001, .001, 0.01],
    'gamma': [.00001, .0001, .001, .01], 
    'kernel': ['rbf','linear']
}
SVM = SVC(random_state=10)
grid = GridSearchCV(estimator= SVM,
                    param_grid=params_svm,
                    verbose=3, cv=3,
                    refit=True)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_estimator_)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV 1/3] END ..C=1e-05, gamma=1e-05, kernel=rbf;, score=0.759 total time=   0.5s
[CV 2/3] END ..C=1e-05, gamma=1e-05, kernel=rbf;, score=0.759 total time=   0.4s
[CV 3/3] END ..C=1e-05, gamma=1e-05, kernel=rbf;, score=0.760 total time=   0.5s
[CV 1/3] END C=1e-05, gamma=1e-05, kernel=linear;, score=0.759 total time= 2.7min
[CV 2/3] END C=1e-05, gamma=1e-05, kernel=linear;, score=0.759 total time= 2.7min
[CV 3/3] END C=1e-05, gamma=1e-05, kernel=linear;, score=0.760 total time= 2.6min
[CV 1/3] END .C=1e-05, gamma=0.0001, kernel=rbf;, score=0.759 total time=   0.4s
[CV 2/3] END .C=1e-05, gamma=0.0001, kernel=rbf;, score=0.759 total time=   0.4s
[CV 3/3] END .C=1e-05, gamma=0.0001, kernel=rbf;, score=0.760 total time=   0.5s
[CV 1/3] END C=1e-05, gamma=0.0001, kernel=linear;, score=0.759 total time= 3.2min
[CV 2/3] END C=1e-05, gamma=0.0001, kernel=linear;, score=0.759 total time= 3.0min
[CV 3/3] END C=1e-05, gamma=0.0001, kerne

In [66]:
print(grid.best_estimator_)
print(grid.best_params_)

SVC(C=1e-05, gamma=1e-05, random_state=10)
{'C': 1e-05, 'gamma': 1e-05, 'kernel': 'rbf'}


In [54]:
final_svmME = grid.best_estimator_
y_pred_svmf = final_svmME.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred_svmf)
conf_matrix_svm2 = metrics.confusion_matrix(y_test, y_pred_svmf)

print("\nAccuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix_svm2)




Accuracy: 0.7575757575757576

Confusion Matrix:
 [[1450    0]
 [ 464    0]]
