Banking Customer Churn Prediction

In [92]:
# import warnings
# warnings.filterwarnings('ignore')

In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

# Read the churn dataset (path is relative to the notebook's working directory)
# and preview the first five rows.
DATA_FILE = 'Churn_Modelling.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619.0,France,Female,42,2,0.0,1,1,1,101348.88,1.0
1,2,15647311,Hill,608.0,Spain,Female,41,1,83807.86,1,0,1,112542.58,0.0
2,3,15619304,Onio,502.0,France,Female,42,8,159660.8,3,1,0,113931.57,1.0
3,4,15701354,Boni,699.0,France,Female,39,1,0.0,2,0,0,93826.63,0.0
4,5,15737888,Mitchell,850.0,Spain,Female,43,2,125510.82,1,1,1,79084.1,0.0


In [94]:
# Print each column's dtype and non-null count; columns whose non-null count is
# below the number of rows contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      9995 non-null   float64
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  9999 non-null   float64
 13  Exited           9904 non-null   float64
dtypes: float64(4), int64(7), object(3)
memory usage: 1.1+ MB



**Data Cleaning & Preprocessing**
---

In [95]:
# Number of missing values in every column.
missing_per_column = df.isna().sum()
missing_per_column

RowNumber           0
CustomerId          0
Surname             0
CreditScore         5
Geography           0
Gender              0
Age                 0
Tenure              0
Balance             0
NumOfProducts       0
HasCrCard           0
IsActiveMember      0
EstimatedSalary     1
Exited             96
dtype: int64

In [96]:
# Remove, in place, every row whose 'Exited' value is missing.
df.dropna(subset=['Exited'], inplace=True)

In [97]:
# Show the rows in which CreditScore is missing.
credit_score_missing = df['CreditScore'].isna()
df.loc[credit_score_missing]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
433,434,15595039,Manna,,Germany,Female,37,8,114754.08,1,1,0,136050.44,1.0
435,436,15581197,Ricci,,France,Female,51,3,99286.98,1,0,1,85578.63,0.0


In [98]:
# Show the rows in which EstimatedSalary is missing.
salary_missing = df['EstimatedSalary'].isna()
df.loc[salary_missing]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
458,459,15707362,Yin,514.0,Germany,Male,43,1,95556.31,1,0,1,,1.0


In [99]:
# Numeric columns that form the feature space for the KNN neighbour search.
features = [
    'CreditScore',
    'EstimatedSalary',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember'
]

# Only columns that actually contain NaN need to be written back; the others
# take part in the neighbour search purely as context.
columns_to_fill = [col for col in features if df[col].isnull().any()]

# NOTE(review): the features are passed to KNNImputer unscaled, so the
# large-magnitude columns (Balance, EstimatedSalary) presumably dominate the
# neighbour distance — consider standardising before imputing.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df[features])

df_imputed = pd.DataFrame(imputed_values, columns=features, index=df.index)

# The imputer returns float64 for every column. Assigning the whole frame back
# (df[features] = df_imputed) would cast the integer columns (Age, Tenure,
# NumOfProducts, HasCrCard, IsActiveMember) to float, so only the columns that
# had gaps are replaced. The loop is a no-op when nothing is missing, which
# keeps the cell safe to re-run.
for col in columns_to_fill:
    df[col] = df_imputed[col]

In [100]:
# Re-count missing values per column to confirm none remain.
remaining_missing = df.isna().sum()
remaining_missing

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

***Analysis***
---

In [101]:
# Understanding the target: how many rows fall into each 'Exited' class.
exited_counts = df['Exited'].value_counts()
exited_counts

Exited
0.0    7883
1.0    2021
Name: count, dtype: int64

In [102]:
# Frequency of each distinct HasCrCard value.
has_card_counts = df['HasCrCard'].value_counts()
has_card_counts

HasCrCard
1.0    6983
0.0    2921
Name: count, dtype: int64

In [103]:
# Frequency of each distinct IsActiveMember value.
active_member_counts = df['IsActiveMember'].value_counts()
active_member_counts

IsActiveMember
1.0    5096
0.0    4808
Name: count, dtype: int64

In [104]:
# Frequency of each distinct NumOfProducts value.
product_counts = df['NumOfProducts'].value_counts()
product_counts

NumOfProducts
1.0    5041
2.0    4541
3.0     262
4.0      60
Name: count, dtype: int64

In [105]:
# Summary statistics for the numeric columns, transposed so that each column
# of the dataset becomes one row of the table.
summary_stats = df.describe()
summary_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RowNumber,9904.0,5001.587,2888.258558,1.0,2501.75,4986.5,7523.25,9999.0
CustomerId,9904.0,15690970.0,71958.601237,15565701.0,15628530.0,15690780.0,15753333.25,15815690.0
CreditScore,9904.0,650.6042,96.599248,350.0,584.0,652.0,717.0,850.0
Age,9904.0,38.92023,10.476533,18.0,32.0,37.0,44.0,92.0
Tenure,9904.0,5.013833,2.89217,0.0,3.0,5.0,8.0,10.0
Balance,9904.0,76564.19,62393.975288,0.0,0.0,97267.1,127639.05,250898.09
NumOfProducts,9904.0,1.529584,0.581735,1.0,1.0,1.0,2.0,4.0
HasCrCard,9904.0,0.7050687,0.456035,0.0,0.0,1.0,1.0,1.0
IsActiveMember,9904.0,0.5145396,0.499814,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,9904.0,100049.5,57465.622329,11.58,51014.84,100134.3,149216.32,199992.48


In [106]:
# Peek at the first five EstimatedSalary values.
estimated_salary = df['EstimatedSalary']
estimated_salary.head(5)

0    101348.88
1    112542.58
2    113931.57
3     93826.63
4     79084.10
Name: EstimatedSalary, dtype: float64