In [1]:
# Loading Data

import pandas as pd

# Read the churn dataset from the working directory; the first CSV column
# (position 0) becomes the DataFrame index instead of a regular column.
df = pd.read_csv(
    'Churn_Modelling.csv',
    index_col=0,
)

In [2]:
from sklearn.model_selection import train_test_split                        # For splitting data into training set and test set
from sklearn.preprocessing import LabelEncoder                              # For encoding string categories as integer labels
from sklearn.linear_model import LogisticRegression                                   # Importing Logistic Regression Model
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report   # Importing evaluation metrics

In [3]:
# Remove the columns that are not wanted for modelling.
# `df` is re-bound to the result (rather than mutated with inplace=True) and
# errors='ignore' is passed so that this cell can be executed more than once:
# on a second run the columns are already gone and a strict drop would raise
# KeyError.
df = df.drop(columns=['CustomerId', 'Surname'], errors='ignore')

In [4]:
# The estimator used below needs numeric inputs, so each string-valued column
# is replaced by integer codes: every distinct label is mapped to one integer
# (e.g. two labels such as 'female' / 'male' become 0 / 1).
# A fresh LabelEncoder is fitted per column, so the codes of one column are
# independent of the other.
# NOTE(review): integer codes imply an ordering; 'Geography' has more than two
# values, so one-hot encoding may suit a linear model better -- left as-is here.
for column in ('Geography', 'Gender'):
    df[column] = LabelEncoder().fit_transform(df[column])

In [5]:
# Preview the first five rows: the unwanted columns have been eliminated and
# the 'Geography' and 'Gender' columns are now label encoded.
df.head(n=5)

Unnamed: 0_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,619,0,0,42,2,0.0,1,1,1,101348.88,1
2,608,2,0,41,1,83807.86,1,0,1,112542.58,0
3,502,0,0,42,8,159660.8,3,1,0,113931.57,1
4,699,0,0,39,1,0.0,2,0,0,93826.63,0
5,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Separate the target from the predictors and convert both to NumPy arrays.
y = df['Exited'].to_numpy()                 # target: the 'Exited' column
X = df.drop(columns='Exited').to_numpy()    # features: every remaining column

In [8]:
# Split the data into a training set and a test set.
# test_size=0.2 holds out 20% of the rows for testing and leaves 80% for training.
# random_state is fixed so that the same split is produced every time.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Build a logistic-regression classifier with default settings and fit it to
# the training set; fit() returns the fitted estimator, so both steps are chained.
# NOTE(review): the features appear to be passed in unscaled -- confirm whether
# standardising them first is wanted.
model = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the held-out test set.
predictions = model.predict(X_test)

In [10]:
# Confusion matrix on the test set: rows are the true classes, columns the
# predicted classes.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[1573,   34],
       [ 365,   28]], dtype=int64)

In [11]:
# Fraction of test rows classified correctly. The classes are imbalanced, so
# the accuracy score should not be given too much weight on its own.
accuracy_score(y_true=y_test, y_pred=predictions)

0.8005

In [12]:
# Per-class precision, recall and F1 on the test set. The report is a
# pre-formatted string, so it is printed rather than displayed as a repr.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.98      0.89      1607
           1       0.45      0.07      0.12       393

    accuracy                           0.80      2000
   macro avg       0.63      0.53      0.51      2000
weighted avg       0.74      0.80      0.74      2000

