### 1. Import Dependencies

In [1]:
import os
import numpy as np
import pandas as pd # alias
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### 2. Important Concepts

#### 2.1 Normalization vs Standardization

#### 2.1.1 What is Normalization ?

Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as Min-Max scaling.

#### 2.1.2 What is Standardization ?

Standardization is another scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of the attribute becomes zero and the resultant distribution has a unit standard deviation.

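After standardization the mean is $0$ and the standard deviation is $1$, so values within one standard deviation of the mean fall between $0 - 1$ and $0 + 1$, i.e. in $[-1, 1]$. Unlike Min-Max scaling, the result is not bounded: values further from the mean fall outside this interval.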

### 3. Basic Processing

In [2]:
# Read the encoded churn CSV into a DataFrame and display it.
encoded_csv_path = 'data/processed/ChurnModelling_Encoded.csv'
df = pd.read_csv(encoded_csv_path)
df

Unnamed: 0,SeniorCitizen,Partner,Dependents,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,...,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneServiceBins,InternetServiceBins
0,0,1,0,1,29.85,29.85,False,0,True,False,...,False,True,False,False,False,False,True,False,0,2
1,0,0,0,0,56.95,1889.50,False,3,False,True,...,False,False,True,False,False,False,False,True,1,3
2,0,0,0,1,53.85,108.15,True,0,False,True,...,False,True,False,False,False,False,False,True,1,3
3,0,0,0,0,42.30,1840.75,False,3,False,True,...,False,False,True,False,True,False,False,False,0,4
4,0,0,0,1,70.70,151.65,True,0,True,False,...,False,True,False,False,False,False,True,False,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,1,84.80,1990.50,False,3,False,True,...,False,False,True,False,False,False,False,True,2,6
7039,0,1,1,1,103.20,7362.90,False,4,True,False,...,False,False,True,False,False,True,False,False,2,5
7040,0,1,1,1,29.60,346.45,False,0,True,False,...,False,True,False,False,False,False,True,False,0,2
7041,1,1,0,1,74.40,306.60,True,0,False,True,...,False,True,False,False,False,False,False,True,2,1


| Condition | Min-Max Scaling | Standardization (Z-score) |
|-----------|----------------|------------------------|
| Data has a known, fixed range | ✅ Yes | ❌ Not ideal |
| Data contains outliers | ❌ Very sensitive (min/max are set by the outliers) | ⚠️ Less sensitive than Min-Max, but mean and std are still affected by outliers |
| Data is normally distributed | ❌ Not necessary | ✅ Preferred |
| Data is not normally distributed (e.g., skewed) | ✅ If shape needs to be preserved | ✅ Often works well after log-transform |
| Model is distance-based (KNN, SVM) | ✅ Recommended | ✅ Also acceptable |
| Model is neural network | ✅ Recommended (e.g., bounded inputs such as pixel values) | ✅ Also recommended (zero-centered inputs help gradient descent) |
| Model is linear or uses regularization | ❌ Not ideal | ✅ Helps with convergence |
| Input features need bounded values (0–1) | ✅ Required | ❌ Not bounded |
| Applying PCA or LDA | ❌ May distort variance | ✅ Required (centering needed) |
| Want to preserve original distribution shape | ✅ Maintains feature shape | ✅ Maintains shape but centers data |
| Working with tree-based models | ❌ Not needed | ❌ Not needed |

In [3]:
columns_need_to_be_scaled = ['MonthlyCharges', 'TotalCharges']

# Standardize each listed column to zero mean and unit standard deviation.
# A fresh scaler is fitted per column, so after the loop `scaler` holds the
# one fitted on the last column. Use MinMaxScaler() instead of
# StandardScaler() to rescale to the 0-1 range.
# NOTE(review): the scaler is fitted on every row of `df`; if a train/test
# split happens later, fit on the training rows only to avoid leakage - confirm.
for col in columns_need_to_be_scaled:
    scaler = StandardScaler()

    # scikit-learn expects a 2-D (n_samples, n_features) array. reshape(-1, 1)
    # lets NumPy infer the row count instead of hard-coding it, so the cell
    # works for a dataset of any length. ravel() flattens the (n, 1) result
    # back to one value per row for the column assignment.
    df[col] = scaler.fit_transform(df[col].to_numpy().reshape(-1, 1)).ravel()

df

Unnamed: 0,SeniorCitizen,Partner,Dependents,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tenureBins,gender_Female,gender_Male,...,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneServiceBins,InternetServiceBins
0,0,1,0,1,-1.160323,-0.992611,False,0,True,False,...,False,True,False,False,False,False,True,False,0,2
1,0,0,0,0,-0.259629,-0.172165,False,3,False,True,...,False,False,True,False,False,False,False,True,1,3
2,0,0,0,1,-0.362660,-0.958066,True,0,False,True,...,False,True,False,False,False,False,False,True,1,3
3,0,0,0,0,-0.746535,-0.193672,False,3,False,True,...,False,False,True,False,True,False,False,False,0,4
4,0,0,0,1,0.197365,-0.938874,True,0,True,False,...,False,True,False,False,False,False,True,False,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,1,0.665992,-0.127605,False,3,False,True,...,False,False,True,False,False,False,False,True,2,6
7039,0,1,1,1,1.277533,2.242606,False,4,True,False,...,False,False,True,False,False,True,False,False,2,5
7040,0,1,1,1,-1.168632,-0.852932,False,0,True,False,...,False,True,False,False,False,False,True,False,0,2
7041,1,1,0,1,0.320338,-0.870513,True,0,False,True,...,False,True,False,False,False,False,False,True,2,1


In [4]:
# Single brackets with one column name return that column as a pandas Series.
df['TotalCharges']

0      -0.992611
1      -0.172165
2      -0.958066
3      -0.193672
4      -0.938874
          ...   
7038   -0.127605
7039    2.242606
7040   -0.852932
7041   -0.870513
7042    2.013897
Name: TotalCharges, Length: 7043, dtype: float64

In [5]:
# Single-bracket selection: the scaled MonthlyCharges column as a Series.
df['MonthlyCharges']

0      -1.160323
1      -0.259629
2      -0.362660
3      -0.746535
4       0.197365
          ...   
7038    0.665992
7039    1.277533
7040   -1.168632
7041    0.320338
7042    1.358961
Name: MonthlyCharges, Length: 7043, dtype: float64

In [6]:
# Double brackets pass a list of column names, so the result is a
# one-column DataFrame in and of itself rather than a Series.
df[['MonthlyCharges']]

Unnamed: 0,MonthlyCharges
0,-1.160323
1,-0.259629
2,-0.362660
3,-0.746535
4,0.197365
...,...
7038,0.665992
7039,1.277533
7040,-1.168632
7041,0.320338


In [7]:
# Write the scaled frame to disk; index=False keeps the row index out of the CSV.
final_csv_path = 'data/processed/ChurnModelling_Final.csv'
df.to_csv(final_csv_path, index=False)