## Importing the libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the raw churn CSV, given relative to the notebook's working directory.
file = r"../data/raw/Bank Customer Churn Prediction.csv"

In [3]:
# Read the raw CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(10000, 12)

## Selecting the feature and target set

The "customer_id" column is only a row identifier and carries no predictive information, so it is excluded from the feature set. The last column, "churn", is the target.

In [12]:
# Select features and target by column name rather than by position, so a
# reordered or extended CSV raises a KeyError instead of silently shifting
# which columns end up in the model.
# "customer_id" is an identifier, not a feature; "churn" is the label.
X = df.drop(columns=["customer_id", "churn"])
Y = df["churn"]

In [13]:
# Preview the first five rows of the feature frame.
X.head()

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [26]:
# Dimensions of the feature frame as (rows, columns).
X.shape

(10000, 10)

In [14]:
# Preview the first five values of the target series.
Y.head()

0    1
1    0
2    1
3    0
4    0
Name: churn, dtype: int64

## Checking for null values

In [15]:
# Count missing values per feature, then list only the features that have any.
null_cols = X.isna().sum().to_frame(name="sum")
null_cols.loc[null_cols["sum"].ne(0)]

Unnamed: 0,sum


## Train-Test Split

In [16]:
from sklearn.model_selection import train_test_split

# Share of the full dataset held out for validation and for testing.
VAL_FRACTION = 0.1
TEST_FRACTION = 0.1
SEED = 42

# First carve the validation set off the full data ...
x_rest, xval, y_rest, yval = train_test_split(
    X, Y, test_size=VAL_FRACTION, random_state=SEED
)
# ... then take the test set from what is left. The fraction is rescaled
# because it now applies to the remaining (1 - VAL_FRACTION) share of the rows.
xtrain, xtest, ytrain, ytest = train_test_split(
    x_rest,
    y_rest,
    test_size=TEST_FRACTION / (1 - VAL_FRACTION),
    random_state=SEED,
)

## Pre-processing the Data

Get all the numeric columns to standardise the values

In [18]:
# Dtypes treated as numeric when choosing the columns to standardise.
numerics = [
    "int16", "int32", "int64",
    "float16", "float32", "float64",
]
numeric_X = X.select_dtypes(include=numerics)
numeric_cols = numeric_X.columns.tolist()
numeric_cols

['credit_score',
 'age',
 'tenure',
 'balance',
 'products_number',
 'credit_card',
 'active_member',
 'estimated_salary']

In [22]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to every split.
scaler = StandardScaler().fit(xtrain[numeric_cols])

xtrain_scl1, xval_scl1, xtest_scl1 = (
    scaler.transform(split[numeric_cols]) for split in (xtrain, xval, xtest)
)

Get all categorical columns and one-hot encode the values

In [21]:
# Every column that is not numeric is treated as categorical. The list is
# built by walking X.columns in order rather than via a set difference: the
# iteration order of a set of strings can change between Python processes,
# which would reshuffle the one-hot encoded columns from one kernel session
# to the next.
features = X.columns
categorical_cols = [col for col in features if col not in numeric_cols]
categorical_cols

['country', 'gender']

In [23]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training split only; the validation and test splits
# are encoded with that same fitted encoder.
encoder = OneHotEncoder()
encoder.fit(xtrain[categorical_cols])

xtrain_scl2 = encoder.transform(xtrain[categorical_cols])
xval_scl2 = encoder.transform(xval[categorical_cols])
xtest_scl2 = encoder.transform(xtest[categorical_cols])

In [24]:
# Join the scaled numeric block and the one-hot block column-wise.
# .toarray() yields a plain ndarray. .todense() would yield np.matrix, which
# makes np.hstack return np.matrix as well: a legacy type that NumPy
# discourages and that scikit-learn's input validation does not accept in
# recent releases.
xtrain_scl = np.hstack([xtrain_scl1, xtrain_scl2.toarray()])
xval_scl = np.hstack([xval_scl1, xval_scl2.toarray()])
xtest_scl = np.hstack([xtest_scl1, xtest_scl2.toarray()])

In [25]:
# Dimensions of the assembled training matrix as (rows, scaled numeric + one-hot columns).
xtrain_scl.shape

(7999, 13)