In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Importing the dataset
df = pd.read_csv('/Users/madisonchristiansen/Desktop/Customer-Churn-Records.csv')   
df.tail()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0,0,1,DIAMOND,300
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0,0,5,PLATINUM,771
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1,1,3,SILVER,564
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1,1,2,GOLD,339
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0,0,3,DIAMOND,911


In [4]:
# Prepping data - checking dataset for missing values 
df.isnull().sum().sort_values(ascending=False)

RowNumber             0
CustomerId            0
Card Type             0
Satisfaction Score    0
Complain              0
Exited                0
EstimatedSalary       0
IsActiveMember        0
HasCrCard             0
NumOfProducts         0
Balance               0
Tenure                0
Age                   0
Gender                0
Geography             0
CreditScore           0
Surname               0
Point Earned          0
dtype: int64

In [23]:
# Prepping data - dropping unimportant columns
df2 = df.drop(columns=['Surname', 'RowNumber', 'Geography', 'Gender', 'CustomerId','Age']) 
df2.tail()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
9995,771,5,0.0,2,1,0,96270.64,0,0,1,DIAMOND,300
9996,516,10,57369.61,1,1,1,101699.77,0,0,5,PLATINUM,771
9997,709,7,0.0,1,0,1,42085.58,1,1,3,SILVER,564
9998,772,3,75075.31,2,1,0,92888.52,1,1,2,GOLD,339
9999,792,4,130142.79,1,1,0,38190.78,0,0,3,DIAMOND,911


In [24]:
# Prepping data - dummy variables for card type
df3 = pd.get_dummies(df2, columns=['Card Type'])
df3.head()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Point Earned,Card Type_DIAMOND,Card Type_GOLD,Card Type_PLATINUM,Card Type_SILVER
0,619,2,0.0,1,1,1,101348.88,1,1,2,464,1,0,0,0
1,608,1,83807.86,1,0,1,112542.58,0,1,3,456,1,0,0,0
2,502,8,159660.8,3,1,0,113931.57,1,1,3,377,1,0,0,0
3,699,1,0.0,2,0,0,93826.63,0,0,5,350,0,1,0,0
4,850,2,125510.82,1,1,1,79084.1,0,0,5,425,0,1,0,0


In [25]:
# Model - random forest classifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


X = df3.drop('Exited', axis=1) ## Churn as the target column 
y = df3['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y)
rf_classifier = RandomForestClassifier()

# Train classifer 
rf_classifier.fit(X_train, y_train)
# Predict on the test set
y_pred = rf_classifier.predict(X_test)

In [26]:
# Evalutaion Metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")

recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")

f1 = f1_score(y_test, y_pred)
print(f"F1 score: {f1}")

Accuracy: 0.9984
Precision: 0.9959839357429718
Recall: 0.9959839357429718
F1 score: 0.9959839357429718


### The Random Forest Model is showing good evaluation metrics. Although, I want to test a Logistic Regression Model as well to see if it is better for the data. 

In [27]:
# Model - logistic regression 
from sklearn.linear_model import LogisticRegression

logreg_model = LogisticRegression() 

# Train classifer 
logreg_model.fit(X_train, y_train)  

# Predict on the test set
y_pred2 = logreg_model.predict(X_test)

# Evalutaion Metrics
accuracy2 = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy2}")

precision2 = precision_score(y_test, y_pred2)
print(f"Precision: {precision2}")

recall2 = recall_score(y_test, y_pred2)
print(f"Recall: {recall2}")

f12 = f1_score(y_test, y_pred2)
print(f"F1 score: {f12}")

Accuracy: 0.8008
Precision: 0.0
Recall: 0.0
F1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


### The Logistic Regression Model is decently accurate but the RFM is more accuarate. 

In [28]:
feature_import = rf_classifier.feature_importances_

importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_import})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)


               Feature  Importance
7             Complain    0.885599
3        NumOfProducts    0.048424
2              Balance    0.015419
0          CreditScore    0.011029
5       IsActiveMember    0.009988
9         Point Earned    0.008898
6      EstimatedSalary    0.008727
1               Tenure    0.004784
8   Satisfaction Score    0.002829
4            HasCrCard    0.001022
10   Card Type_DIAMOND    0.000905
13    Card Type_SILVER    0.000849
11      Card Type_GOLD    0.000835
12  Card Type_PLATINUM    0.000692
