# Setup & Import Libraries

In [22]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
os.chdir("/content/drive/MyDrive/datascience_projects/customer_churn_analysis")
%ls

[0m[01;34mdata[0m/  [01;34mnotebook[0m/


# Load Churn Dataset

In [3]:
# Read the churn table; the CSV's RowNumber column becomes the index
# instead of being kept as a data column.
df = pd.read_csv(filepath_or_buffer="data/data.csv", index_col="RowNumber")

# Preview the first five rows.
df.head(n=5)

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Split the independent features (predictors) from the dependent variable (response)

In [31]:
# Feature frame: every column except CustomerId, Surname and the Exited response.
# Dropping by name (rather than the positional slice 2:-1) states explicitly
# which columns are excluded; the remaining columns keep their original order,
# which the positional indexing in later cells relies on.
X = df.drop(columns=["CustomerId", "Surname", "Exited"])

X.head()

Unnamed: 0_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,619,France,Female,42,2,0.0,1,1,1,101348.88
2,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
3,502,France,Female,42,8,159660.8,3,1,0,113931.57
4,699,France,Female,39,1,0.0,2,0,0,93826.63
5,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [32]:
# Sanity check: (number of rows, number of feature columns).
X.shape

(10000, 10)

In [33]:
# Response vector: the Exited column, selected by name so the choice does not
# depend on it happening to be the last column of the CSV.
y = df["Exited"]

y.head()

RowNumber
1    1
2    0
3    1
4    0
5    0
Name: Exited, dtype: int64

In [34]:
# Sanity check: one response value per row.
y.shape

(10000,)

In [35]:
# Convert the feature DataFrame to a NumPy array so later cells can edit
# columns by position. np.asarray returns an existing ndarray unchanged, so
# re-running this cell is safe (X.iloc[:].values raised AttributeError on a
# second run because an ndarray has no .iloc).
X = np.asarray(X)

type(X)

numpy.ndarray

In [36]:
# Convert the response Series to a NumPy array. np.asarray leaves an existing
# ndarray untouched, so the cell can be re-run without an AttributeError.
y = np.asarray(y)

# Handle categorical variables

In [17]:
# List column dtypes and non-null counts; the object-dtype columns are the
# string-valued ones that need encoding before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1.1+ MB


**Encode the Gender feature with LabelEncoder, because this variable takes only two values**

In [37]:
# Encoder instance used in a later cell to turn the Gender strings into integer codes.
label_encoder = LabelEncoder()

In [38]:
# First five values of column 2 (Gender) before encoding.
X[:5, 2]

array(['Female', 'Female', 'Female', 'Female', 'Female'], dtype=object)

In [39]:
# Overwrite the Gender strings in column 2 with integer codes, in place.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values (y); OrdinalEncoder is the feature-side equivalent — consider switching.
gender_col = 2
X[:, gender_col] = label_encoder.fit_transform(X[:, gender_col])

In [40]:
X[:,2][:5]

array([0, 0, 0, 0, 0], dtype=object)

**Transform the Geography feature with OneHotEncoder**

In [41]:
# First five values of column 1 (Geography) before one-hot encoding.
X[:5, 1]

array(['France', 'Spain', 'France', 'France', 'Spain'], dtype=object)

In [42]:
# Positional index of the column to one-hot encode (Geography).
id_feature = 1

# One-hot encode only that column; remainder="passthrough" keeps every other
# column in the output instead of dropping it.
column_transform = ColumnTransformer(
    transformers=[
        ("encoder", OneHotEncoder(), [id_feature]),
    ],
    remainder="passthrough",
)

In [43]:
# Fit the transformer and replace X with the encoded array.
# NOTE(review): X is both the input and the output here, so this cell must run
# exactly once per fresh X — re-running feeds the already-expanded array back in
# and encodes whatever now sits in column 1.
X = column_transform.fit_transform(X)

In [45]:
# First five values of column 2 after the transform. It now holds a 0/1
# indicator; in these rows it is 1.0 exactly where Geography was 'Spain'.
X[:5, 2]

array([0.0, 1.0, 0.0, 0.0, 1.0], dtype=object)