In [9]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the CSV at data/processed/ChurnModelling_Binning_Applied.csv into `df`
# and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer='data/processed/ChurnModelling_Binning_Applied.csv'
)
df.head(n=5)

Unnamed: 0,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScore_Binned
0,France,Female,42.0,2,0.0,1,1,1,101348.88,1,fair
1,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0,fair
2,France,Female,42.0,8,159660.8,3,1,0,113931.57,1,poor
3,France,Female,38.91,1,0.0,2,0,0,93826.63,0,good
4,Spain,Female,43.0,2,125510.82,1,1,1,79084.1,0,excellent


| Variable Type | Preferred Encoding | Why? |
|--------------|-------------------|------|
| Nominal | One-Hot Encoding | No inherent order → avoids implying false ordinal relationships |
| Ordinal | Label (Ordinal) Encoding | Preserves order → small integers, assigned through an explicit mapping, represent increasing levels |

###  Nominal variables

In [11]:
# Nominal (unordered) categorical columns to one-hot encode.
# NOTE: the misspelled name `norminal_variables` is kept on purpose so that any
# other cell referring to it keeps working.
norminal_variables = ['Geography', 'Gender']

# Build one block of indicator columns per listed column, named
# `<column>_<category>`. Driving this from the list means that adding a column
# to `norminal_variables` is enough to have it encoded.
dummies_by_column = {
    column: pd.get_dummies(df[column], prefix=column)
    for column in norminal_variables
}

# Kept as separate names for backward compatibility with the original cell.
geography_dummies = dummies_by_column['Geography']
gender_dummies = dummies_by_column['Gender']

# Drop the raw categorical columns and append the indicator blocks in list
# order with a single concat. `df` is left untouched, so re-running this cell
# always rebuilds `df_encoded` from the same input.
df_encoded = pd.concat(
    [df.drop(columns=norminal_variables), *dummies_by_column.values()],
    axis=1,
)

df_encoded

Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScore_Binned,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,42.00,2,0.00,1,1,1,101348.88,1,fair,True,False,False,True,False
1,41.00,1,83807.86,1,0,1,112542.58,0,fair,False,False,True,True,False
2,42.00,8,159660.80,3,1,0,113931.57,1,poor,True,False,False,True,False
3,38.91,1,0.00,2,0,0,93826.63,0,good,True,False,False,True,False
4,43.00,2,125510.82,1,1,1,79084.10,0,excellent,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,39.00,5,0.00,2,1,0,96270.64,0,very good,True,False,False,False,True
9996,35.00,10,57369.61,1,1,1,101699.77,0,poor,True,False,False,False,True
9997,36.00,7,0.00,1,0,1,42085.58,1,good,True,False,False,True,False
9998,42.00,3,75075.31,2,1,0,92888.52,1,very good,False,True,False,False,True


### Ordinal variables

In [12]:
# Explicit integer code for each credit-score band, from 'poor' (0) up to
# 'excellent' (4).
encode_dict_creditscore = {
    'poor': 0,
    'fair': 1,
    'good': 2,
    'very good': 3,
    'excellent': 4,
}

# Read the labels from the untouched `df` rather than from `df_encoded` itself:
# mapping `df_encoded['CreditScore_Binned']` onto itself means a second run of
# this cell feeds already-encoded integers into `.map`, and the whole column
# becomes NaN.
credit_labels = df['CreditScore_Binned']

# `.map` silently produces NaN for any label missing from the dict; fail loudly
# instead so a new or misspelled band cannot slip through unnoticed.
unmapped_labels = set(credit_labels.dropna().unique()) - set(encode_dict_creditscore)
if unmapped_labels:
    raise ValueError(
        f"Unmapped CreditScore_Binned labels: {sorted(map(str, unmapped_labels))}"
    )

# Assignment aligns on the row index, which `df_encoded` inherited from `df`.
df_encoded['CreditScore_Binned'] = credit_labels.map(encode_dict_creditscore)
df_encoded.head(10)

Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScore_Binned,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,42.0,2,0.0,1,1,1,101348.88,1,1,True,False,False,True,False
1,41.0,1,83807.86,1,0,1,112542.58,0,1,False,False,True,True,False
2,42.0,8,159660.8,3,1,0,113931.57,1,0,True,False,False,True,False
3,38.91,1,0.0,2,0,0,93826.63,0,2,True,False,False,True,False
4,43.0,2,125510.82,1,1,1,79084.1,0,4,False,False,True,True,False
5,44.0,8,113755.78,2,1,0,149756.71,1,1,False,False,True,False,True
6,50.0,7,0.0,2,1,1,10062.8,0,4,True,False,False,False,True
7,29.0,4,115046.74,4,1,0,119346.88,1,0,False,True,False,True,False
8,44.0,4,142051.07,2,0,1,74940.5,0,0,True,False,False,False,True
9,27.0,2,134603.88,1,1,1,71725.73,0,2,True,False,False,False,True


In [13]:
# Persist the encoded frame; index=False keeps the row index out of the CSV.
df_encoded.to_csv(
    path_or_buf='data/processed/ChurnModelling_Encoded.csv',
    index=False,
)

### Encoding with scikit-learn

In [14]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [15]:
# Reload the binned data so this section starts again from the un-encoded
# columns. (The original mid-cell `df.head(10)` was removed: only the last
# expression of a cell is displayed, so it showed nothing.)
df = pd.read_csv('data/processed/ChurnModelling_Binning_Applied.csv')

# One encoder per nominal column.
ohe_geography = OneHotEncoder()
ohe_gender = OneHotEncoder()

# Created here but not fitted in this cell.
# NOTE(review): LabelEncoder assigns codes by sorted label value, which would
# not follow the poor -> excellent order of the manual mapping used earlier -
# confirm before applying it to CreditScore_Binned.
le_credit_score = LabelEncoder()

# The encoders are fitted on a 2-D (n_samples, 1) array. reshape(-1, 1) infers
# the row count from the data instead of hard-coding 10000, so the cell keeps
# working if the dataset size changes.
ohe_geography.fit(df['Geography'].values.reshape(-1, 1))
ohe_gender.fit(df['Gender'].values.reshape(-1, 1))

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [16]:
# One-hot encode Geography with the already-fitted encoder.
# reshape(-1, 1) infers the row count instead of hard-coding 10000.
geography_ohe = ohe_geography.transform(df['Geography'].values.reshape(-1, 1))
# The encoder returns a sparse matrix; convert it to a dense NumPy array.
geography_ohe = geography_ohe.toarray()
geography_ohe

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], shape=(10000, 3))

In [17]:
# One-hot encode Gender with the already-fitted encoder.
# reshape(-1, 1) infers the row count instead of hard-coding 10000.
gender_ohe = ohe_gender.transform(df['Gender'].values.reshape(-1, 1))
# The encoder returns a sparse matrix; convert it to a dense NumPy array.
gender_ohe = gender_ohe.toarray()
gender_ohe

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], shape=(10000, 2))