# Data Preprocessing

The dataset (`Churn_Modelling.csv`) was downloaded from Kaggle: https://www.kaggle.com/datasets/adammaus/predicting-churn-for-bank-customers

In [1]:
#Import required dependencies
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw churn dataset from the CSV file in the working directory
csv_path = 'Churn_Modelling.csv'
data = pd.read_csv(csv_path)

# Preview the first five rows of the raw frame
data.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Discard every row that contains at least one missing value
data = data.dropna()

# Remove the RowNumber column; the result is kept under a new name
data0 = data.drop(columns=['RowNumber'])

In [4]:
# Remove the Surname column
data1 = data0.drop(columns=['Surname'])

In [5]:
# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own rather than wrapped in print(), which would
# add a stray "None" line after the report.
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   CreditScore      10000 non-null  int64  
 2   Geography        10000 non-null  object 
 3   Gender           10000 non-null  object 
 4   Age              10000 non-null  int64  
 5   Tenure           10000 non-null  int64  
 6   Balance          10000 non-null  float64
 7   NumOfProducts    10000 non-null  int64  
 8   HasCrCard        10000 non-null  int64  
 9   IsActiveMember   10000 non-null  int64  
 10  EstimatedSalary  10000 non-null  float64
 11  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 1015.6+ KB
None


In [6]:
# One-hot encode the two object-typed columns (Geography, Gender).
# dtype is pinned to uint8 so the indicator columns are 0/1 integers on every
# pandas version: pandas 2.0 changed the get_dummies default from uint8 to
# bool, which would put True/False into the frame (and any CSV written from it).
data2 = pd.get_dummies(data1, columns=['Geography', 'Gender'], dtype='uint8')

In [7]:
# Preview the encoded frame (first five rows)
data2.head(n=5)

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,15634602,619,42,2,0.0,1,1,1,101348.88,1,1,0,0,1,0
1,15647311,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
2,15619304,502,42,8,159660.8,3,1,0,113931.57,1,1,0,0,1,0
3,15701354,699,39,1,0.0,2,0,0,93826.63,0,1,0,0,1,0
4,15737888,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1,1,0


In [8]:
# Numeric features to standardise; every other column is left untouched
cols_to_normalize = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]

# Build the scaler, learn the per-column statistics and apply the scaling in
# one step (fit_transform is fit followed by transform on the same data).
# NOTE(review): the statistics are computed over every row of data2 — if a
# train/test split happens downstream, confirm this is intended.
scaler = StandardScaler()
data2[cols_to_normalize] = scaler.fit_transform(data2[cols_to_normalize])

In [9]:
# Preview the frame after scaling (first five rows)
data2.head(n=5)

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,15634602,-0.326221,0.293517,-1.04176,-1.225848,-0.911583,1,1,0.021886,1,1,0,0,1,0
1,15647311,-0.440036,0.198164,-1.387538,0.11735,-0.911583,0,1,0.216534,0,0,0,1,1,0
2,15619304,-1.536794,0.293517,1.032908,1.333053,2.527057,1,0,0.240687,1,1,0,0,1,0
3,15701354,0.501521,0.007457,-1.387538,-1.225848,0.807737,0,0,-0.108918,0,1,0,0,1,0
4,15737888,2.063884,0.388871,-1.04176,0.785728,-0.911583,1,1,-0.365276,0,0,0,1,1,0


In [10]:
# Write the processed frame to disk, leaving the index out of the file
output_csv = 'processed_data.csv'
data2.to_csv(output_csv, index=False)

In [11]:
# Serialise the fitted scaler object to scaler.pkl.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code embedded in the file.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)