### Data Preprocessing

In [1]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the Telco customer churn data from CSV and preview the first five rows
df = pd.read_csv(filepath_or_buffer="../data/Telco-Customer-Churn.csv")
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Fixing TotalCharges
# Coerce TotalCharges to a numeric dtype; values that cannot be parsed become
# NaN and the affected rows are dropped.
df = (
    df.assign(TotalCharges=pd.to_numeric(df["TotalCharges"], errors="coerce"))
    .dropna(subset=["TotalCharges"])
    # Explicit copy: later cells assign columns on this filtered frame, and
    # pandas versions without copy-on-write may otherwise emit
    # SettingWithCopyWarning.
    .copy()
)
print(f"Dataset shape after cleaning TotalCharges is: {df.shape}")


Dataset shape after cleaning TotalCharges is: (7032, 21)


In [4]:
# Encoding Target Variable
# Map the "No"/"Yes" labels to 0/1. The dtype guard keeps this cell safe to
# re-run: mapping an already-encoded column would turn every value into NaN.
churn_mapping = {"No": 0, "Yes": 1}
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    churn_encoded = df["Churn"].map(churn_mapping)
    if churn_encoded.isna().any():
        # Fail loudly instead of passing NaN targets to the split below.
        unexpected_labels = sorted(
            df.loc[churn_encoded.isna(), "Churn"].astype(str).unique()
        )
        raise ValueError(f"Unexpected Churn labels: {unexpected_labels}")
    df["Churn"] = churn_encoded.astype("int64")

print("Target variable distribution after encoding:")
print(df["Churn"].value_counts())
print(f"Churn rate: {df['Churn'].mean() * 100:.2f}%")


# Feature / Target Split
# customerID is an identifier, not a predictor, so it is dropped with the target.
X = df.drop(["Churn", "customerID"], axis=1)
y = df["Churn"]

# Train / Test Split
# 80/20 split, stratified on the target so both parts keep the same churn
# ratio; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

Target variable distribution after encoding:
Churn
0    5163
1    1869
Name: count, dtype: int64
Churn rate: 26.58%


In [5]:
# Identifying Feature Types
# Partition the columns into "numeric" and "everything else" rather than
# listing exact dtype names: a column stored as e.g. int32, category or a
# string extension dtype would otherwise match neither group and be silently
# dropped by ColumnTransformer (its default is remainder="drop").
numeric_features = X_train.select_dtypes(include="number").columns
categorical_features = X_train.select_dtypes(exclude="number").columns

print("Feature Type Summary:")
print("-" * 40)
print(f"Numerical features ({len(numeric_features)}):")
print(list(numeric_features))

print(f"\nCategorical features ({len(categorical_features)}):")
print(list(categorical_features))


Feature Type Summary:
----------------------------------------
Numerical features (4):
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

Categorical features (15):
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [6]:
# Numeric columns: a single-step pipeline that applies StandardScaler
numeric_transformer = Pipeline([("scaler", StandardScaler())])

In [7]:
# Categorical columns: one-hot encode into a dense array; handle_unknown="ignore"
# keeps transform from raising on categories that were not present during fit
categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))]
)

In [8]:
# Route each column group to its own transformer
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    # Library default, spelled out: columns in neither group are discarded
    remainder="drop",
)

In [9]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to both splits
preprocessor.fit(X_train)
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [10]:
# Report the shapes of both transformed splits (they should share a column count)
print(
    "Preprocessing Output Shapes:",
    "-" * 30,
    f"X_train_processed shape: {X_train_processed.shape}",
    f"X_test_processed shape:  {X_test_processed.shape}",
    sep="\n",
)


Preprocessing Output Shapes:
------------------------------
X_train_processed shape: (5625, 45)
X_test_processed shape:  (1407, 45)


In [11]:
# Output column names of the fitted preprocessor
feature_names = preprocessor.get_feature_names_out()

print(
    "Feature Names After Preprocessing:",
    "-" * 30,
    f"Total number of features: {len(feature_names)}",
    feature_names,
    sep="\n",
)


Feature Names After Preprocessing:
------------------------------
Total number of features: 45
['num__SeniorCitizen' 'num__tenure' 'num__MonthlyCharges'
 'num__TotalCharges' 'cat__gender_Female' 'cat__gender_Male'
 'cat__Partner_No' 'cat__Partner_Yes' 'cat__Dependents_No'
 'cat__Dependents_Yes' 'cat__PhoneService_No' 'cat__PhoneService_Yes'
 'cat__MultipleLines_No' 'cat__MultipleLines_No phone service'
 'cat__MultipleLines_Yes' 'cat__InternetService_DSL'
 'cat__InternetService_Fiber optic' 'cat__InternetService_No'
 'cat__OnlineSecurity_No' 'cat__OnlineSecurity_No internet service'
 'cat__OnlineSecurity_Yes' 'cat__OnlineBackup_No'
 'cat__OnlineBackup_No internet service' 'cat__OnlineBackup_Yes'
 'cat__DeviceProtection_No' 'cat__DeviceProtection_No internet service'
 'cat__DeviceProtection_Yes' 'cat__TechSupport_No'
 'cat__TechSupport_No internet service' 'cat__TechSupport_Yes'
 'cat__StreamingTV_No' 'cat__StreamingTV_No internet service'
 'cat__StreamingTV_Yes' 'cat__StreamingMovies_No

In [12]:
# Persist the transformed feature matrices and the target vectors as .npy files
np.save(file="../data/X_train_processed.npy", arr=X_train_processed)
np.save(file="../data/X_test_processed.npy", arr=X_test_processed)
np.save(file="../data/y_train.npy", arr=y_train.values)
np.save(file="../data/y_test.npy", arr=y_test.values)

print(
    "Preprocessed datasets saved successfully:",
    " - X_train_processed.npy",
    " - X_test_processed.npy",
    " - y_train.npy",
    " - y_test.npy",
    sep="\n",
)

Preprocessed datasets saved successfully:
 - X_train_processed.npy
 - X_test_processed.npy
 - y_train.npy
 - y_test.npy


In [13]:
# Save the feature names as a unicode string array. get_feature_names_out()
# returns an object-dtype array; np.save would pickle it and np.load would
# refuse to read the file back unless allow_pickle=True is passed.
np.save("../data/feature_names.npy", np.asarray(feature_names, dtype=str))
print("Feature names saved successfully.")

print("Preprocessed Training Data Summary:")
print(f"Number of samples: {X_train_processed.shape[0]}")
print(f"Number of features: {X_train_processed.shape[1]}")


Feature names saved successfully.
Preprocessed Training Data Summary:
Number of samples: 5625
Number of features: 45


### Preprocessing Summary

- Converted TotalCharges to numeric and dropped the rows whose value could not be parsed (7,032 rows remain).
- Encoded churn as a binary target variable.
- Performed a stratified 80/20 train-test split so that both sets keep the original churn ratio (about 26.6% churners).
- Applied feature-type-specific preprocessing using ColumnTransformer:
  - StandardScaler for numerical features
  - OneHotEncoder for categorical features
- Fitted preprocessing only on training data to prevent data leakage.