Banking Customer Churn Prediction

In [16]:
# import warnings
# warnings.filterwarnings('ignore')

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

df = pd.read_csv('Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      9997 non-null   object 
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  9999 non-null   float64
 13  Exited           9965 non-null   object 
dtypes: float64(2), int64(7), object(5)
memory usage: 1.1+ MB



**data Cleaning & Preprocessing**
---

In [19]:
df.isnull().sum()

RowNumber           0
CustomerId          0
Surname             0
CreditScore         3
Geography           0
Gender              0
Age                 0
Tenure              0
Balance             0
NumOfProducts       0
HasCrCard           0
IsActiveMember      0
EstimatedSalary     1
Exited             35
dtype: int64

In [20]:
df.dropna(subset='Exited', inplace=True)

In [21]:
df[df['CreditScore'].isnull()]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
433,434,15595039,Manna,,Germany,Female,37,8,114754.08,1,1,0,136050.44,1
435,436,15581197,Ricci,,France,Female,51,3,99286.98,1,0,1,85578.63,0


In [22]:
df[df['EstimatedSalary'].isnull()]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
458,459,15707362,Yin,514,Germany,Male,43,1,95556.31,1,0,1,,1


In [23]:
features = [
    'CreditScore', 
    'EstimatedSalary',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember'
]

imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df[features])

df_imputed = pd.DataFrame(imputed_values, columns=features, index=df.index)
df[features] = df_imputed

In [24]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

***Analysis***
---

In [25]:
# Understanding the target
df['Exited'].value_counts()

Exited
0      7884
1      2021
Nan      60
Name: count, dtype: int64

In [26]:
df['HasCrCard'].value_counts()

HasCrCard
1.0    7030
0.0    2935
Name: count, dtype: int64

In [27]:
df['IsActiveMember'].value_counts()

IsActiveMember
1.0    5130
0.0    4835
Name: count, dtype: int64

In [28]:
df['NumOfProducts'].value_counts()

NumOfProducts
1.0    5063
2.0    4576
3.0     266
4.0      60
Name: count, dtype: int64

In [29]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RowNumber,9965.0,5012.051,2883.170284,1.0,2517.0,5017.0,7509.0,10000.0
CustomerId,9965.0,15690950.0,71918.916875,15565701.0,15628558.0,15690743.0,15753215.0,15815690.0
CreditScore,9965.0,650.5809,96.577095,350.0,584.0,652.0,717.0,850.0
Age,9965.0,38.91239,10.477952,18.0,32.0,37.0,44.0,92.0
Tenure,9965.0,5.011239,2.891535,0.0,3.0,5.0,7.0,10.0
Balance,9965.0,76509.96,62415.04009,0.0,0.0,97257.41,127660.46,250898.09
NumOfProducts,9965.0,1.530657,0.5819,1.0,1.0,1.0,2.0,4.0
HasCrCard,9965.0,0.7054691,0.455854,0.0,0.0,1.0,1.0,1.0
IsActiveMember,9965.0,0.5148018,0.499806,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,9965.0,99980.64,57493.759771,11.58,50881.51,100075.1,149195.44,199992.48


In [30]:
df['EstimatedSalary'].head()

0    101348.88
1    112542.58
2    113931.57
3     93826.63
4     79084.10
Name: EstimatedSalary, dtype: float64