### 1. Import Dependencies

In [1]:
import os
import numpy as np
import pandas as pd # alias
import seaborn as sns
import matplotlib.pyplot as plt

| Variable Type | Preferred Encoding | Why? |
|--------------|-------------------|------|
| Nominal | One-Hot Encoding | No inherent order → avoids implying false ordinal relationships |
| Ordinal | Label Encoding | Preserves order → small integers represent increasing levels |

Gender <Male, Female>

        Gender_Male, Gender_Female
Male    [1            0]
Female  [0            1]

*One hot Vector*
**One hot Vector**

If Gender were (incorrectly) treated as ordinal, label encoding would assign:
    Male -> 1
    Female -> 0

gender -> Nominal
InternetService, Contract, PaymentMethod -> Nominal
tenureBins -> Ordinal

In [2]:
# Read the binning-applied churn CSV into `df`; the bare name on the last
# line renders the frame as the cell output.
binned_csv_path = 'data/processed/ChurnModelling_Binning_Applied.csv'
df = pd.read_csv(binned_csv_path)
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenureBins
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,New customers
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No,Long-term
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,New customers
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No,Long-term
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes,New customers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No,Long-term
7039,Female,0,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No,Very loyal
7040,Female,0,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No,New customers
7041,Male,1,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes,New customers


### 2. Encode Nominal Variables

In [3]:

# Columns treated as nominal (no inherent order) and therefore one-hot encoded.
nominal_variables = ['gender', 'InternetService', 'Contract', 'PaymentMethod']

# One-hot encode every listed column in a single call. For each listed column,
# `get_dummies` removes the original column and appends its
# `<column>_<category>` indicator columns after the untouched columns, in the
# order the columns are listed. Driving the call from `nominal_variables`
# means that adding a column to the list is enough to have it encoded.
df_encoded = pd.get_dummies(df, columns=nominal_variables)

df_encoded

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,Yes,No,No,No phone service,No,Yes,No,No,No,...,True,False,False,True,False,False,False,False,True,False
1,0,No,No,Yes,No,Yes,No,Yes,No,No,...,True,False,False,False,True,False,False,False,False,True
2,0,No,No,Yes,No,Yes,Yes,No,No,No,...,True,False,False,True,False,False,False,False,False,True
3,0,No,No,No,No phone service,Yes,No,Yes,Yes,No,...,True,False,False,False,True,False,True,False,False,False
4,0,No,No,Yes,No,No,No,No,No,No,...,False,True,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,...,True,False,False,False,True,False,False,False,False,True
7039,0,Yes,Yes,Yes,Yes,No,Yes,Yes,No,Yes,...,False,True,False,False,True,False,False,True,False,False
7040,0,Yes,Yes,No,No phone service,Yes,No,No,No,No,...,True,False,False,True,False,False,False,False,True,False
7041,1,Yes,No,Yes,Yes,No,No,No,No,No,...,False,True,False,True,False,False,False,False,False,True


In [4]:
# Per-column count of missing values after one-hot encoding.
null_counts_after_one_hot = df_encoded.isnull().sum()
null_counts_after_one_hot

SeniorCitizen                              0
Partner                                    0
Dependents                                 0
PhoneService                               0
MultipleLines                              0
OnlineSecurity                             0
OnlineBackup                               0
DeviceProtection                           0
TechSupport                                0
StreamingTV                                0
StreamingMovies                            0
PaperlessBilling                           0
MonthlyCharges                             0
TotalCharges                               0
Churn                                      0
tenureBins                                 0
gender_Female                              0
gender_Male                                0
InternetService_DSL                        0
InternetService_Fiber optic                0
InternetService_No                         0
Contract_Month-to-month                    0
Contract_O

### 3. Encode Ordinal Variables

In [5]:
# Integer codes for the tenure bins, presumably ordered from shortest to
# longest tenure.
# NOTE(review): 'Good' sits between 'Medium-term' and 'Long-term' — confirm it
# matches a label the binning step actually emits.
encode_dict_tenure = {
    'New customers': 0,
    'Medium-term': 1,
    'Good': 2,
    'Long-term': 3,
    'Very loyal': 4,
}

# Map from the untouched `df` column rather than from `df_encoded` itself:
# re-running this cell then gives the same result, instead of pushing the
# already-encoded integers (which are not keys of the dict) through the
# mapping and turning the whole column into NaN.
df_encoded['tenureBins'] = df['tenureBins'].map(encode_dict_tenure)

# `Series.map` turns any label that is missing from the dict into NaN without
# raising, so fail loudly if the encoding introduced new missing values.
assert df_encoded['tenureBins'].isna().sum() == df['tenureBins'].isna().sum(), \
    "tenureBins contains labels that are missing from encode_dict_tenure"

df_encoded.head(10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,Yes,No,No,No phone service,No,Yes,No,No,No,...,True,False,False,True,False,False,False,False,True,False
1,0,No,No,Yes,No,Yes,No,Yes,No,No,...,True,False,False,False,True,False,False,False,False,True
2,0,No,No,Yes,No,Yes,Yes,No,No,No,...,True,False,False,True,False,False,False,False,False,True
3,0,No,No,No,No phone service,Yes,No,Yes,Yes,No,...,True,False,False,False,True,False,True,False,False,False
4,0,No,No,Yes,No,No,No,No,No,No,...,False,True,False,True,False,False,False,False,True,False
5,0,No,No,Yes,Yes,No,No,Yes,No,Yes,...,False,True,False,True,False,False,False,False,True,False
6,0,No,Yes,Yes,Yes,No,Yes,No,No,Yes,...,False,True,False,True,False,False,False,True,False,False
7,0,No,No,No,No phone service,Yes,No,No,No,No,...,True,False,False,True,False,False,False,False,False,True
8,0,Yes,No,Yes,Yes,No,No,Yes,Yes,Yes,...,False,True,False,True,False,False,False,False,True,False
9,0,No,Yes,Yes,No,Yes,Yes,No,No,No,...,True,False,False,False,True,False,True,False,False,False


In [6]:
# Binary flag: 'No' -> 0, 'Yes' -> 1.
encode_dict_Partner = {
    'No': 0,
    'Yes': 1,
}

# Read from the untouched `df` column so the cell is safe to re-run: mapping
# the already-encoded 0/1 values through the dict would turn them into NaN.
df_encoded['Partner'] = df['Partner'].map(encode_dict_Partner)
df_encoded.head(10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,No,No,No phone service,No,Yes,No,No,No,...,True,False,False,True,False,False,False,False,True,False
1,0,0,No,Yes,No,Yes,No,Yes,No,No,...,True,False,False,False,True,False,False,False,False,True
2,0,0,No,Yes,No,Yes,Yes,No,No,No,...,True,False,False,True,False,False,False,False,False,True
3,0,0,No,No,No phone service,Yes,No,Yes,Yes,No,...,True,False,False,False,True,False,True,False,False,False
4,0,0,No,Yes,No,No,No,No,No,No,...,False,True,False,True,False,False,False,False,True,False
5,0,0,No,Yes,Yes,No,No,Yes,No,Yes,...,False,True,False,True,False,False,False,False,True,False
6,0,0,Yes,Yes,Yes,No,Yes,No,No,Yes,...,False,True,False,True,False,False,False,True,False,False
7,0,0,No,No,No phone service,Yes,No,No,No,No,...,True,False,False,True,False,False,False,False,False,True
8,0,1,No,Yes,Yes,No,No,Yes,Yes,Yes,...,False,True,False,True,False,False,False,False,True,False
9,0,0,Yes,Yes,No,Yes,Yes,No,No,No,...,True,False,False,False,True,False,True,False,False,False


In [7]:
# Binary flag: 'No' -> 0, 'Yes' -> 1.
encode_dict_Dependents = {
    'No': 0,
    'Yes': 1,
}

# Read from the untouched `df` column so the cell is safe to re-run: mapping
# the already-encoded 0/1 values through the dict would turn them into NaN.
df_encoded['Dependents'] = df['Dependents'].map(encode_dict_Dependents)
df_encoded.head(10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,No,No phone service,No,Yes,No,No,No,...,True,False,False,True,False,False,False,False,True,False
1,0,0,0,Yes,No,Yes,No,Yes,No,No,...,True,False,False,False,True,False,False,False,False,True
2,0,0,0,Yes,No,Yes,Yes,No,No,No,...,True,False,False,True,False,False,False,False,False,True
3,0,0,0,No,No phone service,Yes,No,Yes,Yes,No,...,True,False,False,False,True,False,True,False,False,False
4,0,0,0,Yes,No,No,No,No,No,No,...,False,True,False,True,False,False,False,False,True,False
5,0,0,0,Yes,Yes,No,No,Yes,No,Yes,...,False,True,False,True,False,False,False,False,True,False
6,0,0,1,Yes,Yes,No,Yes,No,No,Yes,...,False,True,False,True,False,False,False,True,False,False
7,0,0,0,No,No phone service,Yes,No,No,No,No,...,True,False,False,True,False,False,False,False,False,True
8,0,1,0,Yes,Yes,No,No,Yes,Yes,Yes,...,False,True,False,True,False,False,False,False,True,False
9,0,0,1,Yes,No,Yes,Yes,No,No,No,...,True,False,False,False,True,False,True,False,False,False


In [8]:
# Binary flag: 'No' -> 0, 'Yes' -> 1.
encode_dict_PaperlessBilling = {
    'No': 0,
    'Yes': 1,
}

# Clean values before mapping (remove surrounding spaces, normalise case).
# The source is the untouched `df` column so the cell is safe to re-run:
# once `df_encoded['PaperlessBilling']` holds integers, the `.str` accessor
# can no longer be applied to it.
df_encoded['PaperlessBilling'] = (
    df['PaperlessBilling']
    .str.strip()
    .str.title()
    .map(encode_dict_PaperlessBilling)
)

df_encoded.head(10)


Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,No,No phone service,No,Yes,No,No,No,...,True,False,False,True,False,False,False,False,True,False
1,0,0,0,Yes,No,Yes,No,Yes,No,No,...,True,False,False,False,True,False,False,False,False,True
2,0,0,0,Yes,No,Yes,Yes,No,No,No,...,True,False,False,True,False,False,False,False,False,True
3,0,0,0,No,No phone service,Yes,No,Yes,Yes,No,...,True,False,False,False,True,False,True,False,False,False
4,0,0,0,Yes,No,No,No,No,No,No,...,False,True,False,True,False,False,False,False,True,False
5,0,0,0,Yes,Yes,No,No,Yes,No,Yes,...,False,True,False,True,False,False,False,False,True,False
6,0,0,1,Yes,Yes,No,Yes,No,No,Yes,...,False,True,False,True,False,False,False,True,False,False
7,0,0,0,No,No phone service,Yes,No,No,No,No,...,True,False,False,True,False,False,False,False,False,True
8,0,1,0,Yes,Yes,No,No,Yes,Yes,Yes,...,False,True,False,True,False,False,False,False,True,False
9,0,0,1,Yes,No,Yes,Yes,No,No,No,...,True,False,False,False,True,False,True,False,False,False


In [9]:
# Re-check missing values after the mapping cells: a label that is absent
# from a mapping dict would show up here as NaN.
null_counts_after_mapping = df_encoded.isnull().sum()
null_counts_after_mapping

SeniorCitizen                              0
Partner                                    0
Dependents                                 0
PhoneService                               0
MultipleLines                              0
OnlineSecurity                             0
OnlineBackup                               0
DeviceProtection                           0
TechSupport                                0
StreamingTV                                0
StreamingMovies                            0
PaperlessBilling                           0
MonthlyCharges                             0
TotalCharges                               0
Churn                                      0
tenureBins                                 0
gender_Female                              0
gender_Male                                0
InternetService_DSL                        0
InternetService_Fiber optic                0
InternetService_No                         0
Contract_Month-to-month                    0
Contract_O

In [10]:
# Churn is encoded as a boolean: 'No' -> False, 'Yes' -> True.
encode_dict_Churn = {
    'No': False,
    'Yes': True,
}

# Read from the untouched `df` column so the cell is safe to re-run: mapping
# the already-encoded booleans through the string-keyed dict would turn them
# into NaN.
df_encoded['Churn'] = df['Churn'].map(encode_dict_Churn)
df_encoded.head(10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,No,No phone service,No,Yes,No,No,No,...,True,False,False,True,False,False,False,False,True,False
1,0,0,0,Yes,No,Yes,No,Yes,No,No,...,True,False,False,False,True,False,False,False,False,True
2,0,0,0,Yes,No,Yes,Yes,No,No,No,...,True,False,False,True,False,False,False,False,False,True
3,0,0,0,No,No phone service,Yes,No,Yes,Yes,No,...,True,False,False,False,True,False,True,False,False,False
4,0,0,0,Yes,No,No,No,No,No,No,...,False,True,False,True,False,False,False,False,True,False
5,0,0,0,Yes,Yes,No,No,Yes,No,Yes,...,False,True,False,True,False,False,False,False,True,False
6,0,0,1,Yes,Yes,No,Yes,No,No,Yes,...,False,True,False,True,False,False,False,True,False,False
7,0,0,0,No,No phone service,Yes,No,No,No,No,...,True,False,False,True,False,False,False,False,False,True
8,0,1,0,Yes,Yes,No,No,Yes,Yes,Yes,...,False,True,False,True,False,False,False,False,True,False
9,0,0,1,Yes,No,Yes,Yes,No,No,No,...,True,False,False,False,True,False,True,False,False,False


In [11]:
# --- PhoneServiceBins ---
# 0 = no phone service, 1 = phone service, 2 = phone service with multiple lines.
phone_flag = {'No': 0, 'Yes': 1}
multiple_lines_flag = {'No': 0, 'Yes': 1, 'No phone service': 0}

has_phone = df_encoded['PhoneService'].map(phone_flag)
has_multiple_lines = df_encoded['MultipleLines'].map(multiple_lines_flag)
df_encoded['PhoneServiceBins'] = has_phone + has_multiple_lines

# --- InternetServiceBins ---
# 1 for having internet service at all, plus 1 for every add-on answered 'Yes'.
internet_addon_columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                          'TechSupport', 'StreamingTV', 'StreamingMovies']
addon_flag = {'No': 0, 'Yes': 1, 'No internet service': 0}

# InternetService_No is a boolean dummy column, so the mapping keys are
# True/False rather than strings.
has_internet = df_encoded['InternetService_No'].map({True: 0, False: 1})
addon_count = (
    df_encoded[internet_addon_columns]
    .apply(lambda col: col.map(addon_flag))
    .sum(axis=1)
)
df_encoded['InternetServiceBins'] = has_internet + addon_count

In [12]:
# Raw service columns that were folded into PhoneServiceBins and
# InternetServiceBins and are no longer needed as separate features.
service_columns = [
    'PhoneService', 'MultipleLines',
    'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
]

# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
df_encoded = df_encoded.drop(columns=service_columns, errors='ignore')
df_encoded.head(10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,...,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneServiceBins,InternetServiceBins
0,0,1,0,1,29.85,29.85,False,0,True,False,...,False,True,False,False,False,False,True,False,0,2
1,0,0,0,0,56.95,1889.5,False,3,False,True,...,False,False,True,False,False,False,False,True,1,3
2,0,0,0,1,53.85,108.15,True,0,False,True,...,False,True,False,False,False,False,False,True,1,3
3,0,0,0,0,42.3,1840.75,False,3,False,True,...,False,False,True,False,True,False,False,False,0,4
4,0,0,0,1,70.7,151.65,True,0,True,False,...,False,True,False,False,False,False,True,False,1,1
5,0,0,0,1,99.65,820.5,True,0,True,False,...,False,True,False,False,False,False,True,False,2,4
6,0,0,1,1,89.1,1949.4,False,1,False,True,...,False,True,False,False,False,True,False,False,2,3
7,0,0,0,0,29.75,301.9,False,0,True,False,...,False,True,False,False,False,False,False,True,0,2
8,0,1,0,1,104.8,3046.05,True,3,True,False,...,False,True,False,False,False,False,True,False,2,5
9,0,0,1,0,56.15,3487.95,False,4,False,True,...,False,False,True,False,True,False,False,False,1,3


In [13]:
# Persist the encoded frame; index=False keeps the row index out of the CSV.
encoded_csv_path = 'data/processed/ChurnModelling_Encoded.csv'
df_encoded.to_csv(encoded_csv_path, index=False)

In [14]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [15]:
# Columns treated as nominal (unordered categories).
nominal_variables = [
    'gender',
    'InternetService',
    'Contract',
    'PaymentMethod',
]

In [16]:
# Preview the gender column as a 2-D array with a single column. Using -1
# lets NumPy infer the row count from the data instead of hard-coding the
# size of one particular dataset.
df['gender'].values.reshape(-1, 1)

array([['Female'],
       ['Male'],
       ['Male'],
       ...,
       ['Female'],
       ['Male'],
       ['Male']], shape=(7043, 1), dtype=object)

In [17]:
# Reload the binning-applied data so the scikit-learn encoders start from the
# original string columns.
df = pd.read_csv('data/processed/ChurnModelling_Binning_Applied.csv')

# One encoder per nominal column.
ohe_gender = OneHotEncoder()
ohe_InternetService = OneHotEncoder()
ohe_Contract = OneHotEncoder()
ohe_PaymentMethod = OneHotEncoder()

# NOTE(review): these label encoders are created but never fitted in this
# cell, and TotalCharges looks numeric in the displayed data — confirm that a
# LabelEncoder is really intended for it.
le_tenure = LabelEncoder()
le_TotalCharges = LabelEncoder()

# The encoders are fitted on a 2-D array with a single column; -1 lets NumPy
# infer the row count instead of hard-coding the size of one dataset.
ohe_gender.fit(df['gender'].values.reshape(-1, 1))
ohe_InternetService.fit(df['InternetService'].values.reshape(-1, 1))
ohe_Contract.fit(df['Contract'].values.reshape(-1, 1))
ohe_PaymentMethod.fit(df['PaymentMethod'].values.reshape(-1, 1))

In [18]:
# Encode the gender column with the already-fitted encoder and convert the
# result to a dense array. -1 lets NumPy infer the row count instead of
# hard-coding the size of one particular dataset.
gender_ohe = ohe_gender.transform(df['gender'].values.reshape(-1, 1)).toarray()
gender_ohe

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]], shape=(7043, 2))

In [19]:
# Fit the encoder on the InternetService column and encode it in one step,
# then convert the result to a dense array.
internet_service_2d = df['InternetService'].values.reshape(-1, 1)
InternetService_ohe = ohe_InternetService.fit_transform(internet_service_2d).toarray()
InternetService_ohe

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], shape=(7043, 3))

In [20]:
# Fit the encoder on the Contract column and encode it in one step, then
# convert the result to a dense array.
contract_2d = df['Contract'].values.reshape(-1, 1)
Contract_ohe = ohe_Contract.fit_transform(contract_2d).toarray()
Contract_ohe

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.]], shape=(7043, 3))

In [21]:
# Fit the encoder on the PaymentMethod column and encode it in one step, then
# convert the result to a dense array.
payment_method_2d = df['PaymentMethod'].values.reshape(-1, 1)
PaymentMethod_ohe = ohe_PaymentMethod.fit_transform(payment_method_2d).toarray()
PaymentMethod_ohe

array([[0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]], shape=(7043, 4))