### 1. Import Dependencies

In [22]:
import os
import numpy as np
import pandas as pd # alias
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

### 2. Important Concepts

#### 2.1 Normalization vs Standardization

#### 2.1.1 What is Normalization ?

Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as Min-Max scaling.

#### 2.1.2 What is Standardization ?

Standardization is another scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of the attribute becomes zero and the resultant distribution has a unit standard deviation.

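For example, after standardization the mean is 0 and the standard deviation is 1, so one standard deviation on either side of the mean spans $0 - 1 = -1$ to $0 + 1 = 1$. Unlike Min-Max scaling, the values are not bounded and can fall outside this interval.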

### 3. Basic Processing

In [23]:
# Read the encoded churn dataset from the processed-data folder and display it.
encoded_csv_path = 'data/processed/ChurnModelling_Encoded.csv'
df = pd.read_csv(encoded_csv_path)
df

Unnamed: 0,SeniorCitizen,Partner,Dependents,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneServiceBins,InternetServiceBins
0,0,1,0,1,29.85,29.85,False,0,True,False,True,False,False,False,False,True,False,0,8
1,0,0,0,0,56.95,1889.50,False,1,False,True,False,True,False,False,False,False,True,2,9
2,0,0,0,1,53.85,108.15,True,0,False,True,True,False,False,False,False,False,True,2,9
3,0,0,0,0,42.30,1840.75,False,1,False,True,False,True,False,True,False,False,False,0,10
4,0,0,0,1,70.70,151.65,True,0,True,False,True,False,False,False,False,True,False,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,1,84.80,1990.50,False,1,False,True,False,True,False,False,False,False,True,3,12
7039,0,1,1,1,103.20,7362.90,False,2,True,False,False,True,False,False,True,False,False,3,12
7040,0,1,1,1,29.60,346.45,False,0,True,False,True,False,False,False,False,True,False,0,8
7041,1,1,0,1,74.40,306.60,True,0,False,True,True,False,False,False,False,False,True,3,8


| Condition | Min-Max Scaling | Standardization (Z-score) |
|-----------|----------------|------------------------|
| Data has a known, fixed range | ✅ Yes | ❌ Not ideal |
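| Data contains outliers | ❌ Very sensitive to outliers (min and max are set by them) | ⚠️ Less sensitive than Min-Max, but the mean and standard deviation are still affected; consider RobustScaler |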
| Data is normally distributed | ❌ Not necessary | ✅ Preferred |
| Data is not normally distributed (e.g., skewed) | ✅ If shape needs to be preserved | ✅ Often works well after log-transform |
| Model is distance-based (KNN, SVM) | ✅ Recommended | ✅ Also acceptable |
| Model is neural network | ✅ Recommended (bounded inputs) | ✅ Also works well (zero-centered inputs) |
| Model is linear or uses regularization | ❌ Not ideal | ✅ Helps with convergence |
| Input features need bounded values (0–1) | ✅ Required | ❌ Not bounded |
| Applying PCA or LDA | ❌ May distort variance | ✅ Required (centering needed) |
| Want to preserve original distribution shape | ✅ Maintains feature shape | ✅ Maintains shape but centers data |
| Working with tree-based models | ❌ Not needed | ❌ Not needed |

In [24]:
# Preview the first ten rows before any scaling is applied.
df.head(n=10)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneServiceBins,InternetServiceBins
0,0,1,0,1,29.85,29.85,False,0,True,False,True,False,False,False,False,True,False,0,8
1,0,0,0,0,56.95,1889.5,False,1,False,True,False,True,False,False,False,False,True,2,9
2,0,0,0,1,53.85,108.15,True,0,False,True,True,False,False,False,False,False,True,2,9
3,0,0,0,0,42.3,1840.75,False,1,False,True,False,True,False,True,False,False,False,0,10
4,0,0,0,1,70.7,151.65,True,0,True,False,True,False,False,False,False,True,False,2,8
5,0,0,0,1,99.65,820.5,True,0,True,False,True,False,False,False,False,True,False,3,11
6,0,0,1,1,89.1,1949.4,False,1,False,True,True,False,False,False,True,False,False,3,10
7,0,0,0,0,29.75,301.9,False,0,True,False,True,False,False,False,False,False,True,0,8
8,0,1,0,1,104.8,3046.05,True,1,True,False,True,False,False,False,False,True,False,3,12
9,0,0,1,0,56.15,3487.95,False,2,False,True,False,True,False,True,False,False,False,2,9


In [25]:
# MonthlyCharges and TotalCharges are scaled with RobustScaler, which by default
# centers each feature on its median and scales it by its interquartile range.
# (StandardScaler and MinMaxScaler are drop-in alternatives to experiment with.)
columns_need_to_be_robust_scaled = ['MonthlyCharges', 'TotalCharges']

# scikit-learn scalers compute their statistics per column, so fitting once on
# the 2-D column selection gives the same result as fitting one scaler per
# column. Passing the DataFrame slice directly also removes the hard-coded
# reshape(7043, 1), which raised a ValueError whenever the row count changed.
scaler = RobustScaler()
df[columns_need_to_be_robust_scaled] = scaler.fit_transform(
    df[columns_need_to_be_robust_scaled]
)

df

Unnamed: 0,SeniorCitizen,Partner,Dependents,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneServiceBins,InternetServiceBins
0,0,1,0,1,-0.745170,-0.405008,False,0,True,False,True,False,False,False,False,True,False,0,8
1,0,0,0,0,-0.246550,0.144473,False,1,False,True,False,True,False,False,False,False,True,2,9
2,0,0,0,1,-0.303588,-0.381873,True,0,False,True,True,False,False,False,False,False,True,2,9
3,0,0,0,0,-0.516099,0.130068,False,1,False,True,False,True,False,True,False,False,False,0,10
4,0,0,0,1,0.006440,-0.369019,True,0,True,False,True,False,False,False,False,True,False,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,1,0.265869,0.174316,False,1,False,True,False,True,False,False,False,False,True,3,12
7039,0,1,1,1,0.604416,1.761729,False,2,True,False,False,True,False,False,True,False,False,3,12
7040,0,1,1,1,-0.749770,-0.311461,False,0,True,False,True,False,False,False,False,True,False,0,8
7041,1,1,0,1,0.074517,-0.323235,True,0,False,True,True,False,False,False,False,False,True,3,8


In [26]:
# tenureBins, PhoneServiceBins and InternetServiceBins take values in a small,
# well-defined range, so MinMaxScaler is used to map each of them onto 0-1.
columns_need_to_be_minmax_scaled = ['tenureBins', 'PhoneServiceBins', 'InternetServiceBins']

# MinMaxScaler learns a separate min and max for every column, so a single fit
# on the 2-D column selection matches scaling the columns one at a time.
# Passing the DataFrame slice directly also removes the hard-coded
# reshape(7043, 1), which raised a ValueError whenever the row count changed.
scaler = MinMaxScaler()
df[columns_need_to_be_minmax_scaled] = scaler.fit_transform(
    df[columns_need_to_be_minmax_scaled]
)

df

Unnamed: 0,SeniorCitizen,Partner,Dependents,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneServiceBins,InternetServiceBins
0,0,1,0,1,-0.745170,-0.405008,False,0.0,True,False,True,False,False,False,False,True,False,0.000000,0.571429
1,0,0,0,0,-0.246550,0.144473,False,0.5,False,True,False,True,False,False,False,False,True,0.666667,0.642857
2,0,0,0,1,-0.303588,-0.381873,True,0.0,False,True,True,False,False,False,False,False,True,0.666667,0.642857
3,0,0,0,0,-0.516099,0.130068,False,0.5,False,True,False,True,False,True,False,False,False,0.000000,0.714286
4,0,0,0,1,0.006440,-0.369019,True,0.0,True,False,True,False,False,False,False,True,False,0.666667,0.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,1,0.265869,0.174316,False,0.5,False,True,False,True,False,False,False,False,True,1.000000,0.857143
7039,0,1,1,1,0.604416,1.761729,False,1.0,True,False,False,True,False,False,True,False,False,1.000000,0.857143
7040,0,1,1,1,-0.749770,-0.311461,False,0.0,True,False,True,False,False,False,False,True,False,0.000000,0.571429
7041,1,1,0,1,0.074517,-0.323235,True,0.0,False,True,True,False,False,False,False,False,True,1.000000,0.571429


In [27]:
# Single brackets select one column and return it as a pandas Series;
# the values shown are the RobustScaler output for TotalCharges.
df['TotalCharges']

0      -0.405008
1       0.144473
2      -0.381873
3       0.130068
4      -0.369019
          ...   
7038    0.174316
7039    1.761729
7040   -0.311461
7041   -0.323235
7042    1.608554
Name: TotalCharges, Length: 7043, dtype: float64

In [28]:
# Single brackets again: MonthlyCharges comes back as a pandas Series.
df['MonthlyCharges']

0      -0.745170
1      -0.246550
2      -0.303588
3      -0.516099
4       0.006440
          ...   
7038    0.265869
7039    0.604416
7040   -0.749770
7041    0.074517
7042    0.649494
Name: MonthlyCharges, Length: 7043, dtype: float64

In [29]:
# With the extra pair of brackets (a list of column names) the result is a
# one-column DataFrame in and of itself, rather than a Series.
df[['MonthlyCharges']]

Unnamed: 0,MonthlyCharges
0,-0.745170
1,-0.246550
2,-0.303588
3,-0.516099
4,0.006440
...,...
7038,0.265869
7039,0.604416
7040,-0.749770
7041,0.074517


In [30]:
# Final look at the first five rows after both scaling steps.
df.head(n=5)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneServiceBins,InternetServiceBins
0,0,1,0,1,-0.74517,-0.405008,False,0.0,True,False,True,False,False,False,False,True,False,0.0,0.571429
1,0,0,0,0,-0.24655,0.144473,False,0.5,False,True,False,True,False,False,False,False,True,0.666667,0.642857
2,0,0,0,1,-0.303588,-0.381873,True,0.0,False,True,True,False,False,False,False,False,True,0.666667,0.642857
3,0,0,0,0,-0.516099,0.130068,False,0.5,False,True,False,True,False,True,False,False,False,0.0,0.714286
4,0,0,0,1,0.00644,-0.369019,True,0.0,True,False,True,False,False,False,False,True,False,0.666667,0.571429


In [31]:
# Persist the scaled frame; index=False keeps the row index out of the CSV.
final_csv_path = 'data/processed/ChurnModelling_Final.csv'
df.to_csv(final_csv_path, index=False)