# 📌 Title: 2. Data Preprocessing & Feature Engineering
# 🎯 Goal: Prepare data for modeling with clean features and handling of edge cases

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import os

# All paths in this notebook are relative to the parent of the directory the
# kernel starts in (presumably the project root — confirm). Only step up when
# the raw CSV is not reachable from the current working directory, so that
# re-running this cell does not keep climbing the directory tree.
RAW_DATA_PATH = 'data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'
if not os.path.exists(RAW_DATA_PATH):
    os.chdir("..")

# Load the raw churn table and report its size as a quick sanity check.
df = pd.read_csv(RAW_DATA_PATH)
print(f"Original shape: {df.shape}")

Original shape: (7043, 21)


In [5]:
# TotalCharges is parsed as numeric; entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced by the same row's MonthlyCharges.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric.where(
    total_charges_numeric.notna(), df['MonthlyCharges']
)

In [6]:
# Service columns in which "no underlying service" is collapsed into a plain
# 'No', leaving each of them with only the values that remain after the merge.
cols_to_fix = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'MultipleLines',
]
no_service_labels = {'No internet service': 'No', 'No phone service': 'No'}

# One vectorised replace across all listed columns.
df[cols_to_fix] = df[cols_to_fix].replace(no_service_labels)

In [7]:
# Remove the customerID column before features are selected.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer raises KeyError because the
# column is already gone.
df = df.drop(columns='customerID', errors='ignore')

In [8]:
# Ratio of the monthly charge to the total charged so far.
# The denominator is shifted by +1 to avoid divide by 0.
total_plus_one = df['TotalCharges'].add(1)
df['Monthly_to_Total_Ratio'] = df['MonthlyCharges'].div(total_plus_one)

In [9]:
# Bucket tenure into four labelled bands (edges 0, 12, 24, 48, 72).
# pd.cut intervals are left-open by default, so without include_lowest=True
# a tenure of exactly 0 would fall outside the first bin and become NaN;
# with it, 0 is assigned to '0-1Y'.
df['TenureGroup'] = pd.cut(
    df['tenure'],
    bins=[0, 12, 24, 48, 72],
    labels=['0-1Y', '1-2Y', '2-4Y', '4-6Y'],
    include_lowest=True,
)

In [10]:
# Flag (1/0) rows that have an internet service AND StreamingTV == 'Yes'
# AND OnlineBackup == 'Yes'.
has_internet = df['InternetService'].ne('No')
has_streaming_tv = df['StreamingTV'].eq('Yes')
has_online_backup = df['OnlineBackup'].eq('Yes')

premium_mask = has_internet & has_streaming_tv & has_online_backup
df['HasPremiumServices'] = premium_mask.astype(int)

In [11]:
# Flag (1/0) rows whose MonthlyCharges is strictly above 80.
exceeds_monthly_threshold = df['MonthlyCharges'].gt(80)
df['IsHighMonthly'] = exceeds_monthly_threshold.astype(int)

In [12]:
# Categorical inputs: every object-dtype column except the target.
# NOTE(review): TenureGroup is produced by pd.cut and therefore has category
# dtype, so this object-only selection does not pick it up and it is never
# encoded. Confirm whether that feature was meant to reach the model; adding
# it here would change the columns the saved encoder expects.
categorical_cols = [
    col
    for col in df.select_dtypes(include='object').columns
    if col != 'Churn'  # Target
]

# Dense one-hot encoding; the first level of each column is dropped and
# categories unseen at fit time are ignored at transform time.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_features = encoder.fit_transform(df[categorical_cols])

# Carry df's index over to the encoded frame so that a later column-wise
# concat aligns row-for-row even if df no longer has a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

In [13]:
# Numeric / already-binary columns that go into the model unchanged.
numeric_feature_names = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
    'Monthly_to_Total_Ratio',
    'HasPremiumServices',
    'IsHighMonthly',
]
numerical_df = df[numeric_feature_names]

# Feature matrix: numeric columns followed by the one-hot encoded columns.
X = pd.concat([numerical_df, encoded_df], axis=1)

# Target: 'Yes' -> 1, 'No' -> 0 (any other value maps to NaN).
churn_to_int = {'Yes': 1, 'No': 0}
y = df['Churn'].map(churn_to_int)

In [14]:
# 80/20 train/test split, stratified on y, with a fixed seed so the split is
# reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [15]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; X_test / y_test are not touched here.
# NOTE(review): X_train contains one-hot encoded columns, and SMOTE builds
# synthetic rows by interpolation, so those columns may receive non-binary
# values — confirm this is acceptable for the downstream model.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = resampled

# Report the per-class row counts before and after resampling.
class_counts_before = np.bincount(y_train)
class_counts_after = np.bincount(y_train_balanced)
print(f"Original: {class_counts_before} → Balanced: {class_counts_after}")

Original: [4139 1495] → Balanced: [4139 4139]


In [16]:
# Make sure the output folders exist so the writes below do not fail on a
# fresh checkout; exist_ok=True keeps the cell safe to re-run.
os.makedirs('data/processed', exist_ok=True)
os.makedirs('models', exist_ok=True)

# Save datasets
# Note: X_train_balanced.csv holds the balanced features AND the target
# column side by side, whereas the test features and target are written to
# separate files.
pd.concat([X_train_balanced, y_train_balanced], axis=1).to_csv('data/processed/X_train_balanced.csv', index=False)
X_test.to_csv('data/processed/X_test.csv', index=False)
y_test.to_csv('data/processed/y_test.csv', index=False)

# Save encoder and feature names for future use
joblib.dump(encoder, 'models/onehot_encoder.pkl')
joblib.dump(X_train_balanced.columns.tolist(), 'models/feature_columns.pkl')

print("✅ Preprocessing complete! Files saved.")

✅ Preprocessing complete! Files saved.
