### 1. Import Dependencies

In [53]:
import os
import numpy as np
import pandas as pd # alias
import seaborn as sns
import matplotlib.pyplot as plt

| Variable Type | Preferred Encoding | Why? |
|--------------|-------------------|------|
| Nominal | One-Hot Encoding | No inherent order → avoids implying false ordinal relationships |
| Ordinal | Label Encoding | Preserves order → small integers represent increasing levels |

Gender <Male, Female>

        Gender_Male, Gender_Female
Male    [1            0]
Female  [0            1]

**One-hot vector:** each row contains a single 1, in the column of its category.

Label encoding (if Gender were treated as ordinal):
    Male -> 1
    Female -> 0

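Gender -> Nominal
Geography -> Nominal
CreditScoreBins -> Ordinal

In this notebook's dataset: gender and PaymentMethod are treated as nominal (one-hot encoded); tenureBins and Contract are treated as ordinal (integer-mapped).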

In [54]:
# Load the processed dataset (the file name indicates binning was already applied).
df = pd.read_csv(
    filepath_or_buffer='data/processed/ChurnModelling_Binning_Applied.csv'
)
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenureBins
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,New
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No,Established
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,New
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No,Established
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes,New
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No,Established
7039,Female,0,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No,Loyal
7040,Female,0,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No,New
7041,Male,1,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes,New


In [55]:
# Frequency of each Contract category. Left as the cell's last expression so the
# notebook displays it directly instead of routing it through print().
df['Contract'].value_counts()

Contract
Month-to-month    3875
Two year          1695
One year          1473
Name: count, dtype: int64


In [56]:
# Frequency of each PaymentMethod category. Left as the cell's last expression so
# the notebook displays it directly instead of routing it through print().
df['PaymentMethod'].value_counts()

PaymentMethod
Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: count, dtype: int64


In [79]:
# Frequency of each PaperlessBilling value. Left as the cell's last expression so
# the notebook displays it directly instead of routing it through print().
df['PaperlessBilling'].value_counts()

PaperlessBilling
Yes    4171
No     2872
Name: count, dtype: int64


### 2. Encode Nominal Variables

In [57]:
# Columns with no inherent order: each becomes a set of one-hot indicator columns.
nominal_variables = ['gender', 'PaymentMethod']

gender_dummies = pd.get_dummies(df['gender'], prefix='gender')
PaymentMethod_dummies = pd.get_dummies(df['PaymentMethod'], prefix='PaymentMethod')

# Remove the original label columns and append their indicator columns at the end
# (gender first, then PaymentMethod). `df` itself is left unmodified.
df_encoded = pd.concat(
    [df.drop(columns=nominal_variables), gender_dummies, PaymentMethod_dummies],
    axis=1,
)

df_encoded

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,...,29.85,29.85,No,New,True,False,False,False,True,False
1,0,No,No,Yes,No,DSL,Yes,No,Yes,No,...,56.95,1889.50,No,Established,False,True,False,False,False,True
2,0,No,No,Yes,No,DSL,Yes,Yes,No,No,...,53.85,108.15,Yes,New,False,True,False,False,False,True
3,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,...,42.30,1840.75,No,Established,False,True,True,False,False,False
4,0,No,No,Yes,No,Fiber optic,No,No,No,No,...,70.70,151.65,Yes,New,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,...,84.80,1990.50,No,Established,False,True,False,False,False,True
7039,0,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,...,103.20,7362.90,No,Loyal,True,False,False,True,False,False
7040,0,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,...,29.60,346.45,No,New,True,False,False,False,True,False
7041,1,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,...,74.40,306.60,Yes,New,False,True,False,False,False,True


In [58]:
# Sanity check: count missing values per column after the one-hot encoding step.
df_encoded.isna().sum()

SeniorCitizen                              0
Partner                                    0
Dependents                                 0
PhoneService                               0
MultipleLines                              0
InternetService                            0
OnlineSecurity                             0
OnlineBackup                               0
DeviceProtection                           0
TechSupport                                0
StreamingTV                                0
StreamingMovies                            0
Contract                                   0
PaperlessBilling                           0
MonthlyCharges                             0
TotalCharges                               0
Churn                                      0
tenureBins                                 0
gender_Female                              0
gender_Male                                0
PaymentMethod_Bank transfer (automatic)    0
PaymentMethod_Credit card (automatic)      0
PaymentMet

In [59]:
# Category frequencies of tenureBins before it is converted to integers.
df_encoded.loc[:, 'tenureBins'].value_counts()

tenureBins
Established    2671
Loyal          2303
New            2069
Name: count, dtype: int64

### 3. Encode Ordinal Variables

In [60]:
# Ordinal encoding for tenureBins: New < Established < Loyal.
encode_dict_tenure = {
    'New': 0,
    'Established': 1,
    'Loyal': 2,
}

# Map from the untouched source frame `df` (df_encoded was built from it, so the
# indexes match). Mapping df_encoded['tenureBins'] onto itself is not idempotent:
# on a second run the already-encoded integers are not keys of the dict and
# Series.map would turn the whole column into NaN.
df_encoded['tenureBins'] = df['tenureBins'].map(encode_dict_tenure)
df_encoded['tenureBins'].head(10)

0    0
1    1
2    0
3    1
4    0
5    0
6    1
7    0
8    1
9    2
Name: tenureBins, dtype: int64

In [61]:
# Ordinal encoding for Contract, ordered by commitment length.
encode_dict_contract = {
    'Month-to-month': 0,
    'One year': 1,
    'Two year': 2,
}

# Map from the untouched source frame `df` (df_encoded was built from it, so the
# indexes match). Mapping df_encoded['Contract'] onto itself is not idempotent:
# on a second run the already-encoded integers are not keys of the dict and
# Series.map would turn the whole column into NaN.
df_encoded['Contract'] = df['Contract'].map(encode_dict_contract)
df_encoded['Contract'].head(10)

0    0
1    1
2    0
3    1
4    0
5    0
6    0
7    0
8    0
9    1
Name: Contract, dtype: int64

In [62]:
# Preview the first ten rows after the tenureBins and Contract encodings.
df_encoded.head(n=10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,...,29.85,29.85,No,0,True,False,False,False,True,False
1,0,No,No,Yes,No,DSL,Yes,No,Yes,No,...,56.95,1889.5,No,1,False,True,False,False,False,True
2,0,No,No,Yes,No,DSL,Yes,Yes,No,No,...,53.85,108.15,Yes,0,False,True,False,False,False,True
3,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,...,42.3,1840.75,No,1,False,True,True,False,False,False
4,0,No,No,Yes,No,Fiber optic,No,No,No,No,...,70.7,151.65,Yes,0,True,False,False,False,True,False
5,0,No,No,Yes,Yes,Fiber optic,No,No,Yes,No,...,99.65,820.5,Yes,0,True,False,False,False,True,False
6,0,No,Yes,Yes,Yes,Fiber optic,No,Yes,No,No,...,89.1,1949.4,No,1,False,True,False,True,False,False
7,0,No,No,No,No phone service,DSL,Yes,No,No,No,...,29.75,301.9,No,0,True,False,False,False,False,True
8,0,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,Yes,...,104.8,3046.05,Yes,1,True,False,False,False,True,False
9,0,No,Yes,Yes,No,DSL,Yes,Yes,No,No,...,56.15,3487.95,No,2,False,True,True,False,False,False


In [63]:
# Sanity check: Series.map leaves NaN for any label missing from its dict, so
# count missing values per column after the ordinal mappings.
df_encoded.isna().sum()

SeniorCitizen                              0
Partner                                    0
Dependents                                 0
PhoneService                               0
MultipleLines                              0
InternetService                            0
OnlineSecurity                             0
OnlineBackup                               0
DeviceProtection                           0
TechSupport                                0
StreamingTV                                0
StreamingMovies                            0
Contract                                   0
PaperlessBilling                           0
MonthlyCharges                             0
TotalCharges                               0
Churn                                      0
tenureBins                                 0
gender_Female                              0
gender_Male                                0
PaymentMethod_Bank transfer (automatic)    0
PaymentMethod_Credit card (automatic)      0
PaymentMet

In [64]:
# Binary encoding for Partner: No -> 0, Yes -> 1.
encode_dict_Partner = {
    'No': 0,
    'Yes': 1,
}

# Map from the untouched source frame `df` (df_encoded was built from it, so the
# indexes match). Re-running a self-map on df_encoded['Partner'] would replace the
# already-encoded 0/1 values with NaN, since they are not keys of the dict.
df_encoded['Partner'] = df['Partner'].map(encode_dict_Partner)
df_encoded.head(10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,No,No,No phone service,DSL,No,Yes,No,No,...,29.85,29.85,No,0,True,False,False,False,True,False
1,0,0,No,Yes,No,DSL,Yes,No,Yes,No,...,56.95,1889.5,No,1,False,True,False,False,False,True
2,0,0,No,Yes,No,DSL,Yes,Yes,No,No,...,53.85,108.15,Yes,0,False,True,False,False,False,True
3,0,0,No,No,No phone service,DSL,Yes,No,Yes,Yes,...,42.3,1840.75,No,1,False,True,True,False,False,False
4,0,0,No,Yes,No,Fiber optic,No,No,No,No,...,70.7,151.65,Yes,0,True,False,False,False,True,False
5,0,0,No,Yes,Yes,Fiber optic,No,No,Yes,No,...,99.65,820.5,Yes,0,True,False,False,False,True,False
6,0,0,Yes,Yes,Yes,Fiber optic,No,Yes,No,No,...,89.1,1949.4,No,1,False,True,False,True,False,False
7,0,0,No,No,No phone service,DSL,Yes,No,No,No,...,29.75,301.9,No,0,True,False,False,False,False,True
8,0,1,No,Yes,Yes,Fiber optic,No,No,Yes,Yes,...,104.8,3046.05,Yes,1,True,False,False,False,True,False
9,0,0,Yes,Yes,No,DSL,Yes,Yes,No,No,...,56.15,3487.95,No,2,False,True,True,False,False,False


In [65]:
# Binary encoding for Dependents: No -> 0, Yes -> 1.
encode_dict_Dependents = {
    'No': 0,
    'Yes': 1,
}

# Map from the untouched source frame `df` (df_encoded was built from it, so the
# indexes match). Re-running a self-map on df_encoded['Dependents'] would replace
# the already-encoded 0/1 values with NaN, since they are not keys of the dict.
df_encoded['Dependents'] = df['Dependents'].map(encode_dict_Dependents)
df_encoded.head(10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,No,No phone service,DSL,No,Yes,No,No,...,29.85,29.85,No,0,True,False,False,False,True,False
1,0,0,0,Yes,No,DSL,Yes,No,Yes,No,...,56.95,1889.5,No,1,False,True,False,False,False,True
2,0,0,0,Yes,No,DSL,Yes,Yes,No,No,...,53.85,108.15,Yes,0,False,True,False,False,False,True
3,0,0,0,No,No phone service,DSL,Yes,No,Yes,Yes,...,42.3,1840.75,No,1,False,True,True,False,False,False
4,0,0,0,Yes,No,Fiber optic,No,No,No,No,...,70.7,151.65,Yes,0,True,False,False,False,True,False
5,0,0,0,Yes,Yes,Fiber optic,No,No,Yes,No,...,99.65,820.5,Yes,0,True,False,False,False,True,False
6,0,0,1,Yes,Yes,Fiber optic,No,Yes,No,No,...,89.1,1949.4,No,1,False,True,False,True,False,False
7,0,0,0,No,No phone service,DSL,Yes,No,No,No,...,29.75,301.9,No,0,True,False,False,False,False,True
8,0,1,0,Yes,Yes,Fiber optic,No,No,Yes,Yes,...,104.8,3046.05,Yes,1,True,False,False,False,True,False
9,0,0,1,Yes,No,DSL,Yes,Yes,No,No,...,56.15,3487.95,No,2,False,True,True,False,False,False


In [66]:
# Binary encoding for PaperlessBilling: No -> 0, Yes -> 1.
encode_dict_PaperlessBilling = {
    'No': 0,
    'Yes': 1,
}

# Normalise the labels (trim whitespace, title-case) before mapping.
# The labels are read from the untouched source frame `df` (df_encoded was built
# from it, so the indexes match): once df_encoded['PaperlessBilling'] holds
# integers, the .str accessor fails on it, so a self-map cannot be re-run.
df_encoded['PaperlessBilling'] = (
    df['PaperlessBilling']
    .str.strip()
    .str.title()
    .map(encode_dict_PaperlessBilling)
)

df_encoded.head(10)


Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,No,No phone service,DSL,No,Yes,No,No,...,29.85,29.85,No,0,True,False,False,False,True,False
1,0,0,0,Yes,No,DSL,Yes,No,Yes,No,...,56.95,1889.5,No,1,False,True,False,False,False,True
2,0,0,0,Yes,No,DSL,Yes,Yes,No,No,...,53.85,108.15,Yes,0,False,True,False,False,False,True
3,0,0,0,No,No phone service,DSL,Yes,No,Yes,Yes,...,42.3,1840.75,No,1,False,True,True,False,False,False
4,0,0,0,Yes,No,Fiber optic,No,No,No,No,...,70.7,151.65,Yes,0,True,False,False,False,True,False
5,0,0,0,Yes,Yes,Fiber optic,No,No,Yes,No,...,99.65,820.5,Yes,0,True,False,False,False,True,False
6,0,0,1,Yes,Yes,Fiber optic,No,Yes,No,No,...,89.1,1949.4,No,1,False,True,False,True,False,False
7,0,0,0,No,No phone service,DSL,Yes,No,No,No,...,29.75,301.9,No,0,True,False,False,False,False,True
8,0,1,0,Yes,Yes,Fiber optic,No,No,Yes,Yes,...,104.8,3046.05,Yes,1,True,False,False,False,True,False
9,0,0,1,Yes,No,DSL,Yes,Yes,No,No,...,56.15,3487.95,No,2,False,True,True,False,False,False


In [67]:
# Sanity check: count missing values per column after the Partner, Dependents
# and PaperlessBilling mappings.
df_encoded.isna().sum()

SeniorCitizen                              0
Partner                                    0
Dependents                                 0
PhoneService                               0
MultipleLines                              0
InternetService                            0
OnlineSecurity                             0
OnlineBackup                               0
DeviceProtection                           0
TechSupport                                0
StreamingTV                                0
StreamingMovies                            0
Contract                                   0
PaperlessBilling                           0
MonthlyCharges                             0
TotalCharges                               0
Churn                                      0
tenureBins                                 0
gender_Female                              0
gender_Male                                0
PaymentMethod_Bank transfer (automatic)    0
PaymentMethod_Credit card (automatic)      0
PaymentMet

In [68]:
# Encode the Churn column as booleans: No -> False, Yes -> True.
encode_dict_Churn = {
    'No': False,
    'Yes': True,
}

# Map from the untouched source frame `df` (df_encoded was built from it, so the
# indexes match). Re-running a self-map on df_encoded['Churn'] would not leave
# the column unchanged, because the encoded booleans are no longer the
# 'No'/'Yes' labels the dict is keyed on.
df_encoded['Churn'] = df['Churn'].map(encode_dict_Churn)
df_encoded.head(10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,No,No phone service,DSL,No,Yes,No,No,...,29.85,29.85,False,0,True,False,False,False,True,False
1,0,0,0,Yes,No,DSL,Yes,No,Yes,No,...,56.95,1889.5,False,1,False,True,False,False,False,True
2,0,0,0,Yes,No,DSL,Yes,Yes,No,No,...,53.85,108.15,True,0,False,True,False,False,False,True
3,0,0,0,No,No phone service,DSL,Yes,No,Yes,Yes,...,42.3,1840.75,False,1,False,True,True,False,False,False
4,0,0,0,Yes,No,Fiber optic,No,No,No,No,...,70.7,151.65,True,0,True,False,False,False,True,False
5,0,0,0,Yes,Yes,Fiber optic,No,No,Yes,No,...,99.65,820.5,True,0,True,False,False,False,True,False
6,0,0,1,Yes,Yes,Fiber optic,No,Yes,No,No,...,89.1,1949.4,False,1,False,True,False,True,False,False
7,0,0,0,No,No phone service,DSL,Yes,No,No,No,...,29.75,301.9,False,0,True,False,False,False,False,True
8,0,1,0,Yes,Yes,Fiber optic,No,No,Yes,Yes,...,104.8,3046.05,True,1,True,False,False,False,True,False
9,0,0,1,Yes,No,DSL,Yes,Yes,No,No,...,56.15,3487.95,False,2,False,True,True,False,False,False


In [69]:
# Collapse the individual service columns into two additive scores so the
# feature set is smaller.

# --- PhoneServiceBins ---
# PhoneService contributes 0/1 and MultipleLines contributes 0/1/2.
phone_score = df_encoded['PhoneService'].map({'No': 0, 'Yes': 1})
lines_score = df_encoded['MultipleLines'].map({'No phone service': 0, 'No': 1, 'Yes': 2})
df_encoded['PhoneServiceBins'] = phone_score + lines_score

# --- InternetServiceBins ---
# Base score from the connection type, taken from the string labels.
# Fiber optic gets the highest score; the author's stated rationale is that
# fiber-optic customers are less likely to leave -- TODO verify against the data.
internet_score = df_encoded['InternetService'].map({'No': 0, 'DSL': 1, 'Fiber optic': 2})

# Each add-on column contributes 0/1/2; the row-wise sum is added to the base score.
addon_scores = (
    df_encoded[['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport', 'StreamingTV', 'StreamingMovies']]
    .apply(lambda col: col.map({'No internet service': 0, 'No': 1, 'Yes': 2}))
    .sum(axis=1)
)
df_encoded['InternetServiceBins'] = internet_score + addon_scores

In [70]:
# The individual service columns are now summarised by PhoneServiceBins and
# InternetServiceBins, so remove them. The frame is rebound rather than mutated
# with inplace=True, which keeps the step explicit and chainable.
df_encoded = df_encoded.drop(columns=[
    'PhoneService', 'MultipleLines',
    'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
])
df_encoded.head(15)

Unnamed: 0,SeniorCitizen,Partner,Dependents,Contract,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneServiceBins,InternetServiceBins
0,0,1,0,0,1,29.85,29.85,False,0,True,False,False,False,True,False,0,8
1,0,0,0,1,0,56.95,1889.5,False,1,False,True,False,False,False,True,2,9
2,0,0,0,0,1,53.85,108.15,True,0,False,True,False,False,False,True,2,9
3,0,0,0,1,0,42.3,1840.75,False,1,False,True,True,False,False,False,0,10
4,0,0,0,0,1,70.7,151.65,True,0,True,False,False,False,True,False,2,8
5,0,0,0,0,1,99.65,820.5,True,0,True,False,False,False,True,False,3,11
6,0,0,1,0,1,89.1,1949.4,False,1,False,True,False,True,False,False,3,10
7,0,0,0,0,0,29.75,301.9,False,0,True,False,False,False,False,True,0,8
8,0,1,0,0,1,104.8,3046.05,True,1,True,False,False,False,True,False,3,12
9,0,0,1,1,0,56.15,3487.95,False,2,False,True,True,False,False,False,2,9


In [71]:
# Persist the encoded frame; index=False keeps the row index out of the file.
df_encoded.to_csv(
    path_or_buf='data/processed/ChurnModelling_Encoded.csv',
    index=False,
)

In [72]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [73]:
# Columns listed as nominal for the scikit-learn encoding cells below.
nominal_variables = [
    'gender',
    'InternetService',
    'Contract',
    'PaymentMethod',
]

In [74]:
# scikit-learn encoders expect a 2-D (n_samples, 1) array. Using -1 lets NumPy
# infer the row count instead of hard-coding 7043, so the cell keeps working if
# the number of rows changes.
df['gender'].values.reshape(-1, 1)

array([['Female'],
       ['Male'],
       ['Male'],
       ...,
       ['Female'],
       ['Male'],
       ['Male']], dtype=object)

In [75]:
# Reload the binned dataset so the encoders are fitted on the original string labels.
df = pd.read_csv('data/processed/ChurnModelling_Binning_Applied.csv')

# One OneHotEncoder per column encoded in the following cells.
# (An InternetService encoder had been commented out by the author and is not created.)
ohe_gender = OneHotEncoder()
ohe_Contract = OneHotEncoder()
ohe_PaymentMethod = OneHotEncoder()

# Label encoders are only instantiated here; they are not fitted in this cell.
le_tenure = LabelEncoder()
le_TotalCharges = LabelEncoder()

# reshape(-1, 1) gives the (n_samples, 1) shape the encoders require without
# hard-coding the row count (previously 7043).
ohe_gender.fit(df['gender'].values.reshape(-1, 1))
ohe_Contract.fit(df['Contract'].values.reshape(-1, 1))
ohe_PaymentMethod.fit(df['PaymentMethod'].values.reshape(-1, 1))

In [76]:
# Transform gender into a dense one-hot array. reshape(-1, 1) infers the row
# count instead of hard-coding 7043.
gender_ohe = ohe_gender.transform(df['gender'].values.reshape(-1, 1)).toarray()
gender_ohe

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [77]:
# Fit the Contract encoder and transform the same column in one call, then
# convert the sparse result to a dense array for display.
Contract_ohe = ohe_Contract.fit_transform(
    df['Contract'].values.reshape(-1, 1)
).toarray()
Contract_ohe

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [78]:
# Fit the PaymentMethod encoder and transform the same column in one call, then
# convert the sparse result to a dense array for display.
PaymentMethod_ohe = ohe_PaymentMethod.fit_transform(
    df['PaymentMethod'].values.reshape(-1, 1)
).toarray()
PaymentMethod_ohe

array([[0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]])