In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('../data/cleaned_data.csv')

print("="*50)
print("FEATURE ENGINEERING")
print("="*50)

# Create a copy for feature engineering
df_fe = df.copy()

print("\n1. Creating tenure-based features...")

#Tenure groups (research shows different behavior patterns)
def create_tenure_group(tenure):
    if tenure <= 12:
        return '0-12 months'
    elif tenure <= 24:
        return '12-24 months'
    elif tenure <= 48:
        return '24-48 months'
    else:
        return '48+ months'

df_fe['tenure_group'] = df_fe['tenure'].apply(create_tenure_group)

# Binary flags fro critical periods
df_fe['is_new_customer'] = (df_fe['tenure'] <= 12).astype(int)
df_fe['is_loyal_customer'] = (df_fe['tenure'] >= 48).astype(int)

print(f"   - tenure_group created")
print(f"   - is_new_customer: {df_fe['is_new_customer'].sum()} customers")
print(f"   - is_loyal_customer: {df_fe['is_loyal_customer'].sum()} customers")

FEATURE ENGINEERING

1. Creating tenure-based features...
   - tenure_group created
   - is_new_customer: 2186 customers
   - is_loyal_customer: 2303 customers


In [4]:
print("\n2. Creating charge-based features...")

# Average monthly charges (TotalCharges / tenure)
# Handle tenure = 0 edge case
df_fe['avg_monthly_charges'] = df_fe.apply(
    lambda row: row['MonthlyCharges'] if row['tenure'] == 0
    else row['TotalCharges'] / row['tenure'],
    axis=1
)

# Charge increase rate (Monthly vs Average)
df_fe['charge_increase_rate'] = (
    df_fe['MonthlyCharges'] - df_fe['avg_monthly_charges']
) / (df_fe['avg_monthly_charges'] + 1)  # +1 to avoid division by zero

# High charges flag (top 25%)
df_fe['high_monthly_charges'] = (
    df_fe['MonthlyCharges'] > df_fe['MonthlyCharges'].quantile(0.75)
).astype(int)

print(f"   - avg_monthly_charges created")
print(f"   - charge_increases_rate created")
print(f"   - high_monthly_charges: {df_fe['high_monthly_charges'].sum()} customers")


2. Creating charge-based features...
   - avg_monthly_charges created
   - charge_increases_rate created
   - high_monthly_charges: 1758 customers


In [5]:
print("\n3. Creating service-based features...")

# Count of additional services
service_cols = ['PhoneService', 'MultipleLines', 'InternetService', 
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
               'TechSupport', 'StreamingTV', 'StreamingMovies']

#Convert to binary (Yes=1, No/No internet service=0)
for col in service_cols:
    df_fe[f'{col}_binary'] = df_fe[col].apply(
        lambda x: 1 if x == 'Yes' else 0
    )

#Total service count
binary_service_cols = [f'{col}_binary' for col in service_cols]
df_fe['total_services'] = df_fe[binary_service_cols].sum(axis=1)

# Premium support services (protective features)
df_fe['has_premium_support'] = (
    (df_fe['OnlineSecurity'] == 'Yes') |
    (df_fe['TechSupport'] == 'Yes')
).astype(int)

# Streaming services (entertainment)
df_fe['has_streaming'] = (
    (df_fe['StreamingTV'] == 'Yes') |
    (df_fe['StreamingMovies'] == 'Yes')
).astype(int)

# Protection bundle 
df_fe['has_protection_bundle'] = (
    (df_fe['OnlineSecurity'] == 'Yes') &
    (df_fe['OnlineBackup'] == 'Yes') &
    (df_fe['DeviceProtection'] == 'Yes')
).astype(int)

print(f"   - total_services created (mean: {df_fe['total_services'].mean():.2f})")
print(f"   - has_premium_support: {df_fe['has_premium_support'].sum()} customers")
print(f"   - has_streaming: {df_fe['has_streaming'].sum()} customers")
print(f"   - has_protection_bundle: {df_fe['has_protection_bundle'].sum()} customers")


3. Creating service-based features...
   - total_services created (mean: 3.36)
   - has_premium_support: 2964 customers
   - has_streaming: 3499 customers
   - has_protection_bundle: 693 customers


In [7]:
print("\n4. Creating contract & payment risk features...")

# Contract risk (month-to-month is risky)
df_fe['is_month_to_month'] = (df_fe['Contract'] == 'Month-to-month').astype(int)

# Payment method risk scores (based on EDA insights)
payment_risk = {
    'Electronic check': 3,  #Highest churn
    'Mailed check': 2,
    'Bank transfer (automatic)': 1,
    'Credit card (automatic)': 1  #Lowest churn
}
df_fe['payment_risk_score'] = df_fe['PaymentMethod'].map(payment_risk)

# Automatic payment (protective)
df_fe['is_auto_payment'] = df_fe['PaymentMethod'].apply(
    lambda x: 1 if 'automatic' in x.lower() else 0 
)

# Paperless billing (slight risk factor)
df_fe['paperless_billing_binary'] = (df_fe['PaperlessBilling'] == 'Yes').astype(int)

print(f"   - is_month_to_month: {df_fe['is_month_to_month'].sum()} customers")
print(f"   - payment_risk_score created")
print(f"   - is_auto_payment: {df_fe['is_auto_payment'].sum()} customers")


4. Creating contract & payment risk features...
   - is_month_to_month: 3875 customers
   - payment_risk_score created
   - is_auto_payment: 3066 customers


In [10]:
print("\n5. Processing demographic features...")

# Senior citizen is already binary (0/1)
df_fe['SeniorCitizen_binary'] = df_fe['SeniorCitizen']

# Partner and dependents
df_fe['has_partner'] = (df_fe['Partner'] == 'Yes').astype(int)
df_fe['has_dependents'] = (df_fe['Dependents'] == 'Yes').astype(int)

# Family flag (partner Or dependents)
df_fe['has_family'] = (
    (df_fe['has_partner'] == 1) | (df_fe['has_dependents'] == 1)
).astype(int)

print(f"   - has_family: {df_fe['has_family'].sum()} customers")


5. Processing demographic features...
   - has_family: 3763 customers


In [11]:
print("\n6. Creating interaction features...")

# High rish profile: new customer + month-to-month + no premium support
df_fe['high_risk_profile'] = (
    (df_fe['is_new_customer'] == 1) &
    (df_fe['is_month_to_month'] == 1) &
    (df_fe['has_premium_support'] == 0)
).astype(int)

# Loyal profitable customer
df_fe['loyal_profitable'] = (
    (df_fe['is_loyal_customer'] == 1) &
    (df_fe['high_monthly_charges'] == 1)
).astype(int)

# Services per dollar (value preception)
df_fe['services_per_dollar'] = df_fe['total_services'] / (df_fe['MonthlyCharges'] + 1)

print(f"   - high_risk_profile: {df_fe['high_risk_profile'].sum()} customers")
print(f"   - loyal_profitable: {df_fe['loyal_profitable'].sum()} customers")

print("\n" + "="*60)
print("FEATURE SUMMARY")
print("="*60)
print(f"Original features: {df.shape[1]}")
print(f"After feature engineering: {df_fe.shape[1]}")
print(f"New features created: {df_fe.shape[1] - df.shape[1]}")


6. Creating interaction features...
   - high_risk_profile: 1576 customers
   - loyal_profitable: 921 customers

FEATURE SUMMARY
Original features: 21
After feature engineering: 51
New features created: 30


In [14]:
print("\n" + "="*50)
print("Preparing Data For Modeling")
print("="*50)

# Drop unnecessary columns
df_model = df_fe.drop(['customerID'], axis=1)

# Separate target variable
X = df_model.drop('Churn', axis=1)
y = df_model['Churn'].map({'Yes': 1, 'No': 0})

print(f"\nTarget distribution:")
print(y.value_counts())
print(f"Churn rate: {y.mean()*100:.2f}%")

# Identify categorical columns that need encoding
categorical_to_encode = ['gender', 'Partner', 'Dependents', 'PhoneService', 
                          'MultipleLines', 'InternetService', 'OnlineSecurity',
                          'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                          'StreamingTV', 'StreamingMovies', 'Contract', 
                          'PaperlessBilling', 'PaymentMethod', 'tenure_group']

# Label encoding for binary and ordinal features
print("ENCODING CATEGORICAL FEATURES")

le_dict = {}
for col in categorical_to_encode:
    if col in X.columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        le_dict[col] = le
        print(f"Encoded: {col}")


Preparing Data For Modeling

Target distribution:
Churn
0    5174
1    1869
Name: count, dtype: int64
Churn rate: 26.54%
ENCODING CATEGORICAL FEATURES
Encoded: gender
Encoded: Partner
Encoded: Dependents
Encoded: PhoneService
Encoded: MultipleLines
Encoded: InternetService
Encoded: OnlineSecurity
Encoded: OnlineBackup
Encoded: DeviceProtection
Encoded: TechSupport
Encoded: StreamingTV
Encoded: StreamingMovies
Encoded: Contract
Encoded: PaperlessBilling
Encoded: PaymentMethod
Encoded: tenure_group


In [16]:
print("\n" + "="*60)
print("TRAIN TEST SPLIT")
print("="*60)

# Stratified split to maintain churn distributon
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"\nTrain churn rate: {y_train.mean()*100:.2f}%")
print(f"Test churn rate: {y_test.mean()*100:.2f}%")


TRAIN TEST SPLIT
Training set: 5634 samples
Test set: 1409 samples

Train churn rate: 26.54%
Test churn rate: 26.54%


In [18]:
print("\n" + "="*60)
print("FEATURE SCALING")
print("="*60)

# Scale numerical features
scaler = StandardScaler()

# Identify numerical columns (excluding already binary features)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 
                      'avg_monthly_charges', 'charge_increase_rate',
                      'total_services', 'services_per_dollar']

# Only scale if they exit in the dataset
numerical_features = [col for col in numerical_features if col in X_train.columns]

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test_scaled[numerical_features] = scaler.transform(X_test[numerical_features])

print(f"Scaled {len(numerical_features)} numerical features")


FEATURE SCALING
Scaled 7 numerical features


In [20]:
print("\n" + "="*60)
print("SAVING PROCESSED DATA")
print("="*60)

# Save to CSV
X_train_scaled.to_csv('../data/X_train.csv', index=False)
X_test_scaled.to_csv('../data/X_test.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)


SAVING PROCESSED DATA


In [22]:
# Save preprocessors
import pickle

with open('../models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('../models/label_encoders.pkl', 'wb') as f:
    pickle.dump(le_dict, f)

print("✓ Training data saved to 'data/X_train.csv'")
print("✓ Test data saved to 'data/X_test.csv'")
print("✓ Scaler saved to 'models/scaler.pkl'")
print("✓ Label encoders saved to 'models/label_encoders.pkl'")

# Display final feature list
print("\n" + "="*60)
print(f"FINAL FEATURE SET ({len(X_train_scaled.columns)} features)")
print("="*60)
print(X_train_scaled.columns.tolist())

✓ Training data saved to 'data/X_train.csv'
✓ Test data saved to 'data/X_test.csv'
✓ Scaler saved to 'models/scaler.pkl'
✓ Label encoders saved to 'models/label_encoders.pkl'

FINAL FEATURE SET (49 features)
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'tenure_group', 'is_new_customer', 'is_loyal_customer', 'avg_monthly_charges', 'charge_increase_rate', 'high_monthly_charges', 'PhoneService_binary', 'MultipleLines_binary', 'InternetService_binary', 'OnlineSecurity_binary', 'OnlineBackup_binary', 'DeviceProtection_binary', 'TechSupport_binary', 'StreamingTV_binary', 'StreamingMovies_binary', 'total_services', 'has_premium_support', 'has_streaming', 'has_protection_bundle', 'is_month_to_month', 'payment_risk_score', 'paperless_billing_b