# Setup & Import Libraries

In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
os.chdir("/content/drive/MyDrive/datascience_projects/customer_churn_analysis")
%ls

[0m[01;34mdata[0m/  [01;34mnotebook[0m/


# Load Churn Dataset

In [None]:
# Read the churn CSV, using its RowNumber column as the DataFrame index.
csv_path = "data/data.csv"
df = pd.read_csv(csv_path, index_col="RowNumber")

# Preview the first rows.
df.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Split the independent features from the dependent variable (response)

In [None]:
# Keep every column from position 2 up to (but excluding) the last one,
# i.e. drop CustomerId, Surname and the Exited response column.
feature_columns = df.columns[2:-1]
X = df.loc[:, feature_columns]

X.head()

Unnamed: 0_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,619,France,Female,42,2,0.0,1,1,1,101348.88
2,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
3,502,France,Female,42,8,159660.8,3,1,0,113931.57
4,699,France,Female,39,1,0.0,2,0,0,93826.63
5,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [None]:
# Dimensions of the feature frame as (rows, columns).
X.shape

(10000, 10)

In [None]:
# The last column of the frame (Exited) is the response.
target_column = df.columns[-1]
y = df[target_column]

y.head()

RowNumber
1    1
2    0
3    1
4    0
5    0
Name: Exited, dtype: int64

In [None]:
# Number of response values; should match the row count of X.
y.shape

(10000,)

In [None]:
# Build the NumPy feature matrix directly from `df` rather than from `X`.
# Reading `X.iloc` only works while X is still a DataFrame, so a cell written
# that way raises AttributeError on a second run; slicing `df` keeps the cell
# re-runnable and always starts from the raw, un-encoded features.
X = df.iloc[:, 2:-1].to_numpy()

type(X)

numpy.ndarray

In [None]:
# The array conversion should leave the dimensions unchanged.
X.shape

(10000, 10)

In [None]:
# Derive the response array from the last column of `df` so the cell can be
# re-run: calling `.iloc` on an already-converted ndarray raises AttributeError.
y = df.iloc[:, -1].to_numpy()

# Handle categorical variables

In [None]:
# Show each column's dtype and non-null count; the object-typed columns are
# the categorical ones that need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1.1+ MB


**Encode the Gender feature with a label encoder, because this variable takes only two values**

In [None]:
# Encoder instance used to map the Gender strings to integer codes.
label_encoder = LabelEncoder()

In [None]:
# First five values of column 2 (Gender) before encoding.
X[:,2][:5]

array(['Female', 'Female', 'Female', 'Female', 'Female'], dtype=object)

In [None]:
# Shape of the feature matrix before encoding.
X.shape

(10000, 10)

In [None]:
# Replace the Gender strings in column 2 with integer codes, in place.
gender_idx = 2
X[:, gender_idx] = label_encoder.fit_transform(X[:, gender_idx])

In [None]:
# First five values of column 2 after encoding: integer codes instead of strings.
X[:,2][:5]

array([0, 0, 0, 0, 0], dtype=object)

In [None]:
# The encoding above overwrote one column in place, so the shape is expected to be unchanged.
X.shape

(10000, 10)

**Transform the Geography feature with OneHotEncoder**

In [None]:
# First five values of column 1 (Geography) before one-hot encoding.
X[:, 1][:5]

array(['France', 'Spain', 'France', 'France', 'Spain'], dtype=object)

In [None]:
# One-hot encode the column at index `id_feature` (Geography); every other
# column is passed through unchanged.
id_feature = 1
geography_encoder = ("encoder", OneHotEncoder(), [id_feature])
column_transform = ColumnTransformer(
    transformers=[geography_encoder],
    remainder="passthrough",
)

In [None]:
# Fit the transformer and rebind X to its output.
# NOTE(review): this overwrites X, so re-running this cell on its own would
# encode the already-transformed matrix again; re-run from the cell that
# builds X instead.
X = column_transform.fit_transform(X)

In [None]:
# First five values of column 2 after the transform; per the recorded output
# these are 0.0/1.0 indicator values rather than the Gender codes seen earlier.
X[:, 2][:5]

array([0.0, 1.0, 0.0, 0.0, 1.0], dtype=object)

In [None]:
# First five rows of the transformed feature matrix.
X[:5]

array([[1.0, 0.0, 0.0, 619, 0, 42, 2, 0.0, 1, 1, 1, 101348.88],
       [0.0, 0.0, 1.0, 608, 0, 41, 1, 83807.86, 1, 0, 1, 112542.58],
       [1.0, 0.0, 0.0, 502, 0, 42, 8, 159660.8, 3, 1, 0, 113931.57],
       [1.0, 0.0, 0.0, 699, 0, 39, 1, 0.0, 2, 0, 0, 93826.63],
       [0.0, 0.0, 1.0, 850, 0, 43, 2, 125510.82, 1, 1, 1, 79084.1]],
      dtype=object)

# Split the Data into Train, Validation and Test

In [None]:
# Two-stage split: hold out 20% for test, then 20% of the remainder for
# validation (64% / 16% / 20% overall). The fixed seed keeps both splits
# reproducible.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

print("Size of Train, Validation and Test set")
print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

Size of Train, Validation and Test set
Train: (6400, 12), Validation: (1600, 12), Test: (2000, 12)


# Data Normalization

In [None]:
# Value range of the unscaled training features.
# The labels now match the values: .min() is reported as "Min", .max() as "Max".
print(f"Min: {round(X_train.min(), 4)}, Max: {round(X_train.max(), 4)}")

Max: 0.0, Min: 238387.56


In [None]:
# Learn the scaling statistics from the training split only, then standardize it.
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)
X_train = standard_scaler.transform(X_train)

In [None]:
# Value range of the standardized training features.
# The labels now match the values: .min() is reported as "Min", .max() as "Max".
print(f"Min: {round(X_train.min(), 4)}, Max: {round(X_train.max(), 4)}")

Max: -3.1294, Min: 4.2463


In [None]:
# Apply the already-fitted scaler to the held-out splits (no re-fitting).
X_val, X_test = (standard_scaler.transform(split) for split in (X_val, X_test))

In [None]:
# Value range of the standardized validation features.
# The labels now match the values: .min() is reported as "Min", .max() as "Max".
print(f"Min: {round(X_val.min(), 4)}, Max: {round(X_val.max(), 4)}")

Max: -2.8595, Min: 5.0581


In [None]:
# Value range of the standardized test features.
# The labels now match the values: .min() is reported as "Min", .max() as "Max".
print(f"Min: {round(X_test.min(), 4)}, Max: {round(X_test.max(), 4)}")

Max: -3.1294, Min: 5.0581


# Save the compressed data for fitting the Machine Learning model

In [None]:
# List the current working directory before writing the output files.
%ls

[0m[01;34mdata[0m/  [01;34mnotebook[0m/


In [None]:
# Destination stems for the three splits, all under the data directory.
data_dir = "data"
train_data_path = f"{data_dir}/train_binary_data"
validation_data_path = f"{data_dir}/validation_binary_data"
test_data_path = f"{data_dir}/test_binary_data"

In [None]:
# Write each split as a compressed .npz archive, with features under "x" and
# labels under "y". np.savez stores the arrays uncompressed; np.savez_compressed
# keeps the same archive layout and is read back with np.load in the same way.
np.savez_compressed(train_data_path, x=X_train, y=y_train)
np.savez_compressed(validation_data_path, x=X_val, y=y_val)
np.savez_compressed(test_data_path, x=X_test, y=y_test)