In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv('../data/cleaned_data.csv')

print("="*50)
print("FEATURE ENGINEERING")
print("="*50)

# Create a copy for feature engineering
df_fe = df.copy()

print("\n1. Creating tenure-based features...")

#Tenure groups (research shows different behavior patterns)
def create_tenure_group(tenure):
    if tenure <= 12:
        return '0-12 months'
    elif tenure <= 24:
        return '12-24 months'
    elif tenure <= 48:
        return '24-48 months'
    else:
        return '48+ months'

df_fe['tenure_group'] = df_fe['tenure'].apply(create_tenure_group)

# Binary flags fro critical periods
df_fe['is_new_customer'] = (df_fe['tenure'] <= 12).astype(int)
df_fe['is_loyal_customer'] = (df_fe['tenure'] >= 48).astype(int)

print(f"   - tenure_group created")
print(f"   - is_new_customer: {df_fe['is_new_customer'].sum()} customers")
print(f"   - is_loyal_customer: {df_fe['is_loyal_customer'].sum()} customers")

FEATURE ENGINEERING

1. Creating tenure-based features...
   - tenure_group created
   - is_new_customer: 2186 customers
   - is_loyal_customer: 2303 customers


In [5]:
print("\n2. Creating charge-based features...")

# Average monthly charges (TotalCharges / tenure)
# Handle tenure = 0 edge case
df_fe['avg_monthly_charges'] = df_fe.apply(
    lambda row: row['MonthlyCharges'] if row['tenure'] == 0
    else row['TotalCharges'] / row['tenure'],
    axis=1
)

# Charge increase rate (Monthly vs Average)
df_fe['charge_increase_rate'] = (
    df_fe['MonthlyCharges'] - df_fe['avg_monthly_charges']
) / (df_fe['avg_monthly_charges'] + 1)  # +1 to avoid division by zero

# High charges flag (top 25%)
df_fe['high_monthly_charges'] = (
    df_fe['MonthlyCharges'] > df_fe['MonthlyCharges'].quantile(0.75)
).astype(int)

print(f"   - avg_monthly_charges created")
print(f"   - charge_increases_rate created")
print(f"   - high_monthly_charges: {df_fe['high_monthly_charges'].sum()} customers")


2. Creating charge-based features...
   - avg_monthly_charges created
   - charge_increases_rate created
   - high_monthly_charges: 1758 customers


In [7]:
print("\n3. Creating service-based features...")

# Count of additional services
service_cols = ['PhoneService', 'MultipleLines', 'InternetService', 
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
               'TechSupport', 'StreamingTV', 'StreamingMovies']

#Convert to binary (Yes=1, No/No internet service=0)
for col in service_cols:
    df_fe[f'{col}_binary'] = df_fe[col].apply(
        lambda x: 1 if x == 'Yes' else 0
    )

#Total service count
binary_service_cols = [f'{col}_binary' for col in service_cols]
df_fe['total_services'] = df_fe[binary_service_cols].sum(axis=1)

# Premium support services (protective features)
df_fe['has_premium_support'] = (
    (df_fe['OnlineSecurity'] == 'Yes') |
    (df_fe['TechSupport'] == 'Yes')
).astype(int)

# Streaming services (entertainment)
df_fe['has_streaming'] = (
    (df_fe['StreamingTV'] == 'Yes') |
    (df_fe['StreamingMovies'] == 'Yes')
).astype(int)

# Protection bundle 
df_fe['has_protection_bundle'] = (
    (df_fe['OnlineSecurity'] == 'Yes') &
    (df_fe['OnlineBackup'] == 'Yes') &
    (df_fe['DeviceProtection'] == 'Yes')
).astype(int)

print(f"   - total_services created (mean: {df_fe['total_services'].mean():.2f})")
print(f"   - has_premium_support: {df_fe['has_premium_support'].sum()} customers")
print(f"   - has_streaming: {df_fe['has_streaming'].sum()} customers")
print(f"   - has_protection_bundle: {df_fe['has_protection_bundle'].sum()} customers")


3. Creating service-based features...
   - total_services created (mean: 3.36)
   - has_premium_support: 2964 customers
   - has_streaming: 3499 customers
   - has_protection_bundle: 693 customers
