# 02 - Feature Engineering

We will prepare the dataset for model training through the following steps:
- Removing duplicates
- Converting data types
- Encoding categorical variables
- Scaling numerical variables
- Saving the processed dataset

In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import joblib
import os

# Input / output locations, expressed relative to the notebook's folder.
DATA_DIR = "../data"
RAW_DATA_PATH = DATA_DIR + "/raw/Telco-Customer-Churn.csv"  # raw input CSV
PROCESSED_DATA_PATH = DATA_DIR + "/processed/processed_churn.csv"  # processed output CSV

In [50]:
# Load the raw CSV and report its dimensions before any cleaning is applied.
df = pd.read_csv(RAW_DATA_PATH)
raw_shape = df.shape
print("Shape before cleaning: ", raw_shape)

# Preview the first rows (rendered as the cell output).
df.head()

Shape before cleaning:  (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [51]:
# customerID is not useful for modeling, so remove it from the frame.
# The membership check makes the cell a no-op when the column is already gone,
# which keeps it safe to re-run.
if 'customerID' in df:
    df.drop(columns=['customerID'], inplace=True)

print("Shape after dropping customerID: ", df.shape)

Shape after dropping customerID:  (7043, 20)


In [52]:
# Remove rows that are exact duplicates across every remaining column,
# reporting the duplicate count before and after.
# NOTE(review): if an identifier column has already been removed from `df`,
# rows flagged here may belong to different customers who merely share the
# same attribute values — confirm that discarding them is intended.
n_duplicates_before = df.duplicated().sum()
print("Duplicate rows before: ", n_duplicates_before)

df = df.drop_duplicates()

n_duplicates_after = df.duplicated().sum()
print("Duplicate rows after: ", n_duplicates_after)
print("New shape:", df.shape)

Duplicate rows before:  22
Duplicate rows after:  0
New shape: (7021, 20)


In [53]:
# Convert TotalCharges to float and handle missing values.
# errors='coerce' turns any entry that cannot be parsed as a number into NaN;
# those gaps are then filled with the column median.
# NOTE(review): the median is computed over the whole frame, i.e. before any
# train/test split — confirm this is acceptable for downstream evaluation.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Convert SeniorCitizen to categorical ("No"/"Yes" labels).
# Only map while the column is still numeric: running .map() a second time on
# the already-converted labels would match neither key and turn every value
# into NaN, so this guard keeps the cell safe to re-run.
if pd.api.types.is_numeric_dtype(df['SeniorCitizen']):
    df['SeniorCitizen'] = df['SeniorCitizen'].map({0: "No", 1: "Yes"})

print(df.dtypes)


gender               object
SeniorCitizen        object
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object


In [54]:
# Separate the target column (y) from the feature columns (X).
target_col = 'Churn'
y = df[target_col]
X = df.drop(columns=[target_col])

print("Features shape: ", X.shape)
print("---------------------------")
target_counts = y.value_counts()
print("Target distribution:\n ", target_counts)

Features shape:  (7021, 19)
---------------------------
Target distribution:
  Churn
No     5164
Yes    1857
Name: count, dtype: int64


In [55]:
# Split the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
object_features = X.select_dtypes(include=['object'])
numeric_features = X.select_dtypes(include=['int64', 'float64'])

categorical_cols = list(object_features.columns)
numerical_cols = list(numeric_features.columns)

print("Categorical columns: ", categorical_cols)
print("Numerical columns: ", numerical_cols)

Categorical columns:  ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numerical columns:  ['tenure', 'MonthlyCharges', 'TotalCharges']


In [56]:
# Define preprocessing for categorical & numerical data.
# - Categorical columns are one-hot encoded; handle_unknown='ignore' lets the
#   fitted encoder accept categories it did not see during fit instead of raising.
# - Numerical columns are standardized with StandardScaler.
categorical_transfomer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

# sparse_threshold=0 forces the combined output to be a dense array.
# With the default threshold (0.3) ColumnTransformer returns a SciPy sparse
# matrix whenever the stacked result is sparse enough, which would break any
# code that wraps the transformed matrix in a pandas DataFrame.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transfomer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
    ],
    sparse_threshold=0,
)

In [57]:

# Fit the preprocessor on the feature frame and transform it in a single pass.
X_processed = preprocessor.fit_transform(X)

processed_shape = X_processed.shape
print("Processed feature matrix shape: ", processed_shape)

Processed feature matrix shape:  (7021, 46)


In [58]:
# Persist the fitted preprocessor so the training step can reload it.
models_dir = "../models"
preprocessor_path = "../models/preprocessor.pkl"

os.makedirs(models_dir, exist_ok=True)  # no error if the folder already exists
joblib.dump(preprocessor, preprocessor_path)

['../models/preprocessor.pkl']

In [59]:
# Convert the processed features to a DataFrame so they can be saved to file.
# Column labels follow the transformer order used by the preprocessor:
# one-hot encoded categorical names first, then the numerical columns.
onehot_encoder = preprocessor.named_transformers_['cat']
onehot_feature_names = list(onehot_encoder.get_feature_names_out(categorical_cols))
processed_feature_names = onehot_feature_names + numerical_cols

# y.values attaches the target by position rather than by index label, so the
# index of `y` does not need to match the fresh 0..n-1 index of the new frame.
X_processed_df = pd.DataFrame(X_processed, columns=processed_feature_names).assign(
    Churn=y.values
)

X_processed_df.head()

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges,Churn
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,-1.282728,-1.164135,-0.997334,No
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.062387,-0.262811,-0.176352,No
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,-1.241967,-0.365914,-0.962766,Yes
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.510759,-0.750058,-0.197874,No
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,-1.241967,0.194503,-0.943562,Yes


In [60]:
# Save processed dataset.
# The output folder is derived from PROCESSED_DATA_PATH instead of being
# repeated as a separate literal, so changing the constant cannot leave the
# directory creation and the write pointing at different locations.
processed_dir = os.path.dirname(PROCESSED_DATA_PATH)
if processed_dir:  # a bare filename has no parent directory to create
    os.makedirs(processed_dir, exist_ok=True)

# index=False keeps the DataFrame index out of the CSV.
X_processed_df.to_csv(PROCESSED_DATA_PATH, index=False)
print("Processed dataset saved to: ", PROCESSED_DATA_PATH)

Processed dataset saved to:  ../data/processed/processed_churn.csv
