# Predictive Analytics: **Bank Customer Churn Prediction**

Predictive Analytics with *Bank Customer Churn Prediction Datasets*

Naufal Mu'afi<br>
naufalmuafi@mail.ugm.ac.id

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

## 1. Data Loading/Data Wrangling
---

Dataset Information:

Item | Description
--- | ---
Source | [Kaggle Dataset : Bank Customer Churn Prediction](https://www.kaggle.com/datasets/shantanudhakadd/bank-customer-churn-prediction/data)
License | Other
Category | Finance
Usability Rating | 9.71
File Type and Size | CSV (268 kb)

In [None]:
# Load the raw churn dataset from the local data directory into a DataFrame.
churn = pd.read_csv("./data/Churn_Modelling.csv")
# Preview the first rows (rendered as the cell output).
churn.head()

## 2. Exploratory Data Analysis (EDA)
---

### 2.1 Assessing and Cleaning Data

1. What are the types of variables in the dataset?
2. How are the variables distributed in the dataset?
3. Are there any missing values?
4. Are there any redundant features?
5. How strongly are the features correlated with the target?

#### 2.1.1 Variable Description

In [None]:
# Unpack the frame's dimensions once and report them in a readable sentence.
n_rows, n_cols = churn.shape
print(f"The dataset has {n_rows} rows, and {n_cols} columns")

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
churn.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
churn.describe()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
n_duplicates = churn.duplicated().sum()
print(f"\nNumber of duplications: {n_duplicates}")

#### 2.1.2 Variable Distribution Classification

Before that, we can drop the identifier columns (`RowNumber`, `CustomerId`, `Surname`), which carry no predictive information.

In [None]:
# Remove the per-customer identifier columns; they are not used as features.
identifier_columns = ['RowNumber', 'CustomerId', 'Surname']
churn = churn.drop(columns=identifier_columns)

# Only the last expression of a cell is rendered, so show the new shape here.
churn.shape

Then, we can classify each feature as numerical or categorical based on its number of unique values.

In [None]:
# Split the columns of `churn` into categorical and numerical features by
# cardinality: a column with at most CATEGORICAL_MAX_UNIQUE distinct values is
# treated as categorical, everything else as numerical.
CATEGORICAL_MAX_UNIQUE = 25

# nunique() scans the whole frame, so compute it once instead of once per column.
feature_uniqueness = churn.nunique()
nfeatures = len(feature_uniqueness)

numerical_features = []
categorical_features = []

# Iterate (column name, distinct-value count) pairs in column order so both
# lists keep the original column ordering.
for column_name, n_unique in feature_uniqueness.items():
    if n_unique <= CATEGORICAL_MAX_UNIQUE:
        categorical_features.append(column_name)
    else:
        numerical_features.append(column_name)

print(f"Numerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")

#### 2.1.3 Handle Missing Value

In [None]:
# Number of missing (NaN) entries in each column.
churn.isna().sum()

#### 2.1.4. Handle The Outliers

In [None]:
# One boxplot per numerical feature on a 2x2 grid, to inspect outliers
# before any rows are removed.
nrows, ncols = 2, 2
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 15))

# ax.flat walks the grid row by row, which matches the order of
# numerical_features (position = row * ncols + col).
for position, axis in enumerate(ax.flat):
    column = numerical_features[position]
    sns.boxplot(x=churn[column], ax=axis)

In [None]:
# Remove rows containing outliers using the 1.5 * IQR rule on the numerical features.
numeric_view = churn[numerical_features]

Q1 = numeric_view.quantile(0.25)
Q3 = numeric_view.quantile(0.75)
IQR = Q3 - Q1

# Per-column fences; any value outside [lower_fence, upper_fence] is an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is flagged as soon as one of its numerical values falls outside the fences.
too_low = numeric_view < lower_fence
too_high = numeric_view > upper_fence
outlier_mask = (too_low | too_high).any(axis=1)

# Keep only the rows that were not flagged.
churn = churn[~outlier_mask]

churn.shape

In [None]:
# Same 2x2 boxplot grid as before, redrawn on the filtered frame to check
# the effect of the outlier removal.
nrows, ncols = 2, 2
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 15))

# Row-major traversal of the grid keeps the feature order unchanged.
for position, axis in enumerate(ax.flat):
    column = numerical_features[position]
    sns.boxplot(x=churn[column], ax=axis)

### 2.2 Univariate Analysis for Categorical Features

#### 2.2.1 Geography Feature

In [None]:
# Frequency table and bar chart for the categorical feature at position 0
# (expected to be 'Geography' per the section heading; depends on column order).
index = 0
feature = categorical_features[index]

# Absolute counts, and the same counts as a percentage of all rows.
count = churn[feature].value_counts()
percent = churn[feature].value_counts(normalize=True) * 100
df = pd.DataFrame({'sample total': count, 'percentage': percent.round(1)})
print(df)

# Bar chart of the counts; the explicit title below replaces the initial one.
ax = count.plot(kind='bar', title=feature)
ax.set_xlabel(feature)
ax.set_ylabel('Count')
ax.set_title(f'{feature} Distribution')
plt.xticks(rotation=45)

# Annotate each bar with its percentage, placed just above the bar top.
for position, (height, share) in enumerate(zip(count, percent)):
    ax.text(position, height + 0.5, f'{share:.1f}%', ha='center', va='bottom')

plt.show()

#### 2.2.2 Gender Feature

In [None]:
# Frequency table and bar chart for the categorical feature at position 1
# (expected to be 'Gender' per the section heading; depends on column order).
index = 1
feature = categorical_features[index]

# Absolute counts, and the same counts as a percentage of all rows.
count = churn[feature].value_counts()
percent = churn[feature].value_counts(normalize=True) * 100
df = pd.DataFrame({'sample total': count, 'percentage': percent.round(1)})
print(df)

# Bar chart of the counts; the explicit title below replaces the initial one.
ax = count.plot(kind='bar', title=feature)
ax.set_xlabel(feature)
ax.set_ylabel('Count')
ax.set_title(f'{feature} Distribution')
plt.xticks(rotation=45)

# Annotate each bar with its percentage, placed just above the bar top.
for position, (height, share) in enumerate(zip(count, percent)):
    ax.text(position, height + 0.5, f'{share:.1f}%', ha='center', va='bottom')

plt.show()

#### 2.2.3 Tenure Feature

In [None]:
# Frequency table and bar chart for the categorical feature at position 2
# (expected to be 'Tenure' per the section heading; depends on column order).
index = 2
feature = categorical_features[index]

# Absolute counts, and the same counts as a percentage of all rows.
count = churn[feature].value_counts()
percent = churn[feature].value_counts(normalize=True) * 100
df = pd.DataFrame({'sample total': count, 'percentage': percent.round(1)})
print(df)

# Bar chart of the counts; the explicit title below replaces the initial one.
ax = count.plot(kind='bar', title=feature)
ax.set_xlabel(feature)
ax.set_ylabel('Count')
ax.set_title(f'{feature} Distribution')
plt.xticks(rotation=45)

# Annotate each bar with its percentage, placed just above the bar top.
for position, (height, share) in enumerate(zip(count, percent)):
    ax.text(position, height + 0.5, f'{share:.1f}%', ha='center', va='bottom')

plt.show()

#### 2.2.4 Num of Product Feature

In [None]:
# Frequency table and bar chart for the categorical feature at position 3
# (expected to be 'NumOfProducts' per the section heading; depends on column order).
index = 3
feature = categorical_features[index]

# Absolute counts, and the same counts as a percentage of all rows.
count = churn[feature].value_counts()
percent = churn[feature].value_counts(normalize=True) * 100
df = pd.DataFrame({'sample total': count, 'percentage': percent.round(1)})
print(df)

# Bar chart of the counts; the explicit title below replaces the initial one.
ax = count.plot(kind='bar', title=feature)
ax.set_xlabel(feature)
ax.set_ylabel('Count')
ax.set_title(f'{feature} Distribution')
plt.xticks(rotation=45)

# Annotate each bar with its percentage, placed just above the bar top.
for position, (height, share) in enumerate(zip(count, percent)):
    ax.text(position, height + 0.5, f'{share:.1f}%', ha='center', va='bottom')

plt.show()

#### 2.2.5 Has Credit Card Feature

In [None]:
# Frequency table and bar chart for the categorical feature at position 4
# (expected to be 'HasCrCard' per the section heading; depends on column order).
index = 4
feature = categorical_features[index]

# Absolute counts, and the same counts as a percentage of all rows.
count = churn[feature].value_counts()
percent = churn[feature].value_counts(normalize=True) * 100
df = pd.DataFrame({'sample total': count, 'percentage': percent.round(1)})
print(df)

# Bar chart of the counts; the explicit title below replaces the initial one.
ax = count.plot(kind='bar', title=feature)
ax.set_xlabel(feature)
ax.set_ylabel('Count')
ax.set_title(f'{feature} Distribution')
plt.xticks(rotation=45)

# Annotate each bar with its percentage, placed just above the bar top.
for position, (height, share) in enumerate(zip(count, percent)):
    ax.text(position, height + 0.5, f'{share:.1f}%', ha='center', va='bottom')

plt.show()

#### 2.2.6 Is Active Member Feature

In [None]:
# Frequency table and bar chart for the categorical feature at position 5
# (expected to be 'IsActiveMember' per the section heading; depends on column order).
index = 5
feature = categorical_features[index]

# Absolute counts, and the same counts as a percentage of all rows.
count = churn[feature].value_counts()
percent = churn[feature].value_counts(normalize=True) * 100
df = pd.DataFrame({'sample total': count, 'percentage': percent.round(1)})
print(df)

# Bar chart of the counts; the explicit title below replaces the initial one.
ax = count.plot(kind='bar', title=feature)
ax.set_xlabel(feature)
ax.set_ylabel('Count')
ax.set_title(f'{feature} Distribution')
plt.xticks(rotation=45)

# Annotate each bar with its percentage, placed just above the bar top.
for position, (height, share) in enumerate(zip(count, percent)):
    ax.text(position, height + 0.5, f'{share:.1f}%', ha='center', va='bottom')

plt.show()

#### 2.2.7 Exited Feature

In [None]:
# Frequency table and bar chart for the categorical feature at position 6
# (expected to be the target 'Exited' per the section heading; depends on column order).
index = 6
feature = categorical_features[index]

# Absolute counts, and the same counts as a percentage of all rows.
count = churn[feature].value_counts()
percent = churn[feature].value_counts(normalize=True) * 100
df = pd.DataFrame({'sample total': count, 'percentage': percent.round(1)})
print(df)

# Bar chart of the counts; the explicit title below replaces the initial one.
ax = count.plot(kind='bar', title=feature)
ax.set_xlabel(feature)
ax.set_ylabel('Count')
ax.set_title(f'{feature} Distribution')
plt.xticks(rotation=45)

# Annotate each bar with its percentage, placed just above the bar top.
for position, (height, share) in enumerate(zip(count, percent)):
    ax.text(position, height + 0.5, f'{share:.1f}%', ha='center', va='bottom')

plt.show()

### 2.3 Univariate Analysis for Numerical Features

In [None]:
# Histogram (50 bins) of every numerical feature, drawn on one large figure.
numeric_view = churn[numerical_features]
numeric_view.hist(bins=50, figsize=(20, 15))
plt.show()

### 2.4 Multivariate Analysis

#### 2.4.1 Categorical Features

In [None]:
# Work on a copy so the target can be removed without touching categorical_features.
cat_features = list(categorical_features)
cat_features.remove('Exited')

# For each remaining categorical feature, plot the mean of 'Exited' per category.
for cat_col in cat_features:
    sns.catplot(
        data=churn,
        x=cat_col,
        y='Exited',
        kind='bar',
        dodge=False,
        height=4,
        aspect=3,
        palette="Set3",
    )
    plt.title(f"Average 'Exited' Relative to {cat_col}")

#### 2.4.2 Numerical Features

##### Pair Plot

In [None]:
# Pairwise relationship plots for the `churn` frame, with a kernel density
# estimate on the diagonal (diag_kind='kde').
sns.pairplot(churn, diag_kind = 'kde')

##### Target Relative to Each Numerical Feature

In [None]:
# 2x2 grid of bar plots: each numerical feature aggregated per 'Exited' class.
nrows, ncols = 2, 2
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 15))

# Row-major traversal of the grid matches the order of numerical_features.
for position, axis in enumerate(ax.flat):
    column = numerical_features[position]
    sns.barplot(x=churn['Exited'], y=churn[column], ax=axis, palette='Set2')

## 3. Data Preparation
---

### 3.1 Category Feature Encoding

In [None]:
# One-hot encode the two text-valued columns. drop_first=True drops one level
# per column to avoid a redundant indicator; indicators are stored as int8.
nominal_columns = ['Geography', 'Gender']
churn = pd.get_dummies(
    churn,
    columns=nominal_columns,
    drop_first=True,
    dtype=np.int8,
)
churn.head()

### 3.2 Correlation Analysis

In [None]:
# Pairwise correlation between all columns of the encoded frame.
corr_matrix = churn.corr()
axis_labels = corr_matrix.columns

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(
    data=corr_matrix,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)

plt.title('Correlation Matrix of The Dataset', fontsize=20)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

### 3.3 Train Test Split

In [None]:
# Separate the target column from the feature matrix.
y = churn['Exited']
X = churn.drop(columns=['Exited'])

# 80/20 shuffled split, stratified on the target so both parts keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=123,
)

print(f'Total # of sample in whole dataset: {len(X)}')
print(f'Total # of sample in train dataset: {len(X_train)}')
print(f'Total # of sample in test dataset: {len(X_test)}')

### 3.4 Feature Scaling

In [None]:
# Standardise every feature column (zero mean, unit variance).
#
# The scaler is fitted ONCE on the whole training frame. StandardScaler treats
# each column independently, so the scaled values match a per-column fit, but
# the fitted `scaler` now holds the statistics of all columns instead of only
# the last one and can be reused to transform new samples.
scaler = StandardScaler()
features = list(X_train.columns)

# Fit on the training split only, so no test-set statistics leak into training.
# Fresh DataFrames are built (keeping index and column names) rather than
# assigning column by column into the frames returned by train_test_split.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train[features]),
    columns=features,
    index=X_train.index,
)

# Apply the training statistics to the test split.
X_test = pd.DataFrame(
    scaler.transform(X_test[features]),
    columns=features,
    index=X_test.index,
)

In [None]:
# Summary statistics of the numerical training features, rounded to 4 decimals,
# to inspect the result of the scaling step.
X_train[numerical_features].describe().round(4)

In [None]:
# Preview the first rows of the training features.
X_train.head()

## 4. Model Development
---

In [None]:
# Empty results table: one column per algorithm, one row per accuracy metric.
# Cells start as NaN and are filled in as each model is trained.
model_names = ['KNN', 'LogisticRegression', 'SVC']
metric_names = ['train_acc', 'test_acc']
models = pd.DataFrame(index=metric_names, columns=model_names)

### 4.1 K-Nearest Neighbors Algorithm

In [None]:
# Train a k-nearest-neighbours classifier that votes over the 10 closest samples.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

# Record the training accuracy. The column label must be 'KNN', exactly as the
# `models` frame declares it: with a differently-cased label, .loc would append
# a new column and leave the 'KNN' column empty.
models.loc['train_acc', 'KNN'] = accuracy_score(y_true=y_train, y_pred=knn.predict(X_train))

### 4.2 Logistic Regression Algorithm

In [None]:
# Train a logistic regression classifier with a fixed seed (fit returns the estimator).
log_reg = LogisticRegression(random_state=123).fit(X_train, y_train)

# Store its accuracy on the training split in the results table.
log_reg_train_pred = log_reg.predict(X_train)
models.loc['train_acc', 'LogisticRegression'] = accuracy_score(
    y_true=y_train,
    y_pred=log_reg_train_pred,
)

### 4.3 Support Vector Classifier Algorithm

In [None]:
# Train a support vector classifier with an RBF kernel and C=2.0
# (fit returns the estimator).
svc = SVC(kernel='rbf', C=2.0).fit(X_train, y_train)

# Store its accuracy on the training split in the results table.
svc_train_pred = svc.predict(X_train)
models.loc['train_acc', 'SVC'] = accuracy_score(
    y_true=y_train,
    y_pred=svc_train_pred,
)

## 5. Model Evaluation
---

In [None]:
# X_test was already standardised in section 3.4 with the scaler fitted on the
# training split, so it must NOT be transformed again here: a second pass would
# double-scale the test features, and `scaler` was not fitted on the
# `numerical_features` subset, so transforming that subset raises a ValueError.
# Inspect the already-scaled numerical test features instead.
X_test[numerical_features].describe().round(4)

In [None]:
# Fitted estimators keyed by display name; the key order defines the table rows.
model_dict = {'KNN': knn, 'LogisticRegression': log_reg, 'SVC': svc}
acc = pd.DataFrame(index=list(model_dict), columns=['train', 'test'])

# Compute the accuracy of each algorithm on the train and test datasets.
for name, model in model_dict.items():
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    acc.loc[name, 'train'] = accuracy_score(y_true=y_train, y_pred=train_pred)
    acc.loc[name, 'test'] = accuracy_score(y_true=y_test, y_pred=test_pred)

acc

In [None]:
# Horizontal bar chart of train/test accuracy, models ordered by test accuracy.
fig, ax = plt.subplots()
ranked_acc = acc.sort_values(by='test', ascending=True)
ranked_acc.plot(kind='barh', ax=ax, zorder=3)

# Lower zorder keeps the grid lines behind the bars.
ax.grid(zorder=0)

In [None]:
# Compare the true labels of the first five test samples with each model's prediction.
prediction = X_test.iloc[:5].copy()
pred_dict = {'y_true': y_test[:5]}

# Add one 'prediction_<model name>' column per fitted model.
pred_dict.update({
    'prediction_' + model_name: estimator.predict(prediction).round(1)
    for model_name, estimator in model_dict.items()
})

pd.DataFrame(pred_dict)