In [1]:
#Importing the necessary libraries
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

In [3]:
df = pd.read_csv('/content/Churn_Modelling.csv')
#Displaying the top 5 rows of the dataset for reference
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
#Finding the number of rows
df.shape[0]

10000

In [5]:
#Finding the number of columns
df.shape[1]

14

In [6]:
#Checking for null values
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [7]:
#Getting the overall statistics of the data
df.describe(include='all')

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000,10000.0,10000,10000,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
unique,,,2932,,3,2,,,,,,,,
top,,,Smith,,France,Male,,,,,,,,
freq,,,32,,5014,5457,,,,,,,,
mean,5000.5,15690940.0,,650.5288,,,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,,96.653299,,,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,,350.0,,,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,,584.0,,,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,,652.0,,,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,,718.0,,,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0


In [8]:
#Dropping irrelevant features
df = df.drop(['CustomerId', 'Surname', 'RowNumber'], axis=1)

In [9]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [10]:
#Encoding categorical data
df['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [12]:
df = pd.get_dummies(df,  drop_first = True)

In [13]:
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [14]:
#Checking for balanced or imbalanced data
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [15]:
#From the above reference the data 'Exited' is imbalanced, therefore handling it
#Step 1 - Initialise the independent and dependent variables
X = df.drop('Exited', axis = 1)
Y = df['Exited']

#Step 2 - Handle the imbalanced data
X_bal, Y_bal = SMOTE().fit_resample(X,Y)
Y_bal.value_counts()

1    7963
0    7963
Name: Exited, dtype: int64

In [16]:
#Splitting the dataset into training data and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X_bal, Y_bal, test_size=0.2, random_state = 50)

In [17]:
#Feature Scaling
SC = StandardScaler()
X_train = SC.fit_transform(X_train)
X_test = SC.transform(X_test)

In [18]:
#Training the random forest classifier
RF = RandomForestClassifier()
RF.fit(X_train, Y_train)

In [19]:
#Prediction
Y_pred = RF.predict(X_test)

In [20]:
#Checking for accuracy score
accuracy_score(Y_test, Y_pred)

0.8518518518518519

In [21]:
#Checking for precision score
precision_score(Y_test, Y_pred)

0.8575096277278562

In [22]:
#Checking for recall score
recall_score(Y_test, Y_pred)

0.8423707440100883

In [23]:
#Checking for f1 score
f1_score(Y_test, Y_pred)

0.8498727735368957

In [24]:
#Training the model on entire dataset
X_bal = SC.fit_transform(X_bal)
RF.fit(X_bal, Y_bal)

In [25]:
joblib.dump(RF, 'Customer Churn Prediction')

['Customer Churn Prediction']

In [26]:
Model = joblib.load('Customer Churn Prediction')

In [27]:
df.columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'Geography_Germany',
       'Geography_Spain', 'Gender_Male'],
      dtype='object')

1 - Customer leaving the bank

0 - Customer isn't leaving the bank

In [29]:
#Testing the model using random values
Model.predict([[619, 42, 2, 0.0, 0, 0, 0, 101348.88, 0, 0, 0]])

array([1])