# Churn Prediction Model - Production Notebook

## Overview
This notebook implements a churn prediction model using LightGBM with the following pipeline:

### Feature Set (36 features)
- **Removed**: 5 raw event counts (Add Friend, Add to Playlist, Error, Thumbs Down, Thumbs Up)
- **Kept**: 
  - Rolling averages (7-day and 14-day windows)
  - Trend features (comparison of 7d vs 14d)
  - Page type counts
  - User metadata
  
### Workflow
1. **Data Loading & Preprocessing**: Load and clean training data
2. **Feature Engineering**: Create rolling averages and trend features
3. **Model Training**: Train LightGBM with the baseline hyperparameters defined in the training cell
4. **Threshold Optimization**: Find optimal classification threshold
5. **Submission Generation**: Generate final predictions for test data

### Model Configuration
- **Algorithm**: LightGBM (Gradient Boosting)
- **Class Balancing**: Enabled via scale_pos_weight
- **Validation**: Time-series split (70/30) at a fixed cutoff date (2018-11-01)
- **Optimization**: Balanced accuracy maximization

In [None]:
import pandas as pd
import numpy as np
import sys, os
sys.path.append(os.path.abspath(".."))

import src.preprocessing
from importlib import reload
reload(src.preprocessing)

from src.preprocessing import (
    aggregate_user_day_activity, 
    add_rolling_averages,
    compute_cancellation_batch
)

# Import sklearn components
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import lightgbm as lgb
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
class CancellationTargetTransformer(BaseEstimator, TransformerMixin):
    """
    Attach churn targets to a user-day frame.

    The targets are produced by ``compute_cancellation_batch``, which is
    handed the raw event log (``raw_df``), the incoming frame and
    ``window_days``. ``raw_df`` must be supplied at construction time;
    ``transform`` raises ``ValueError`` when it is missing.
    """
    def __init__(self, window_days=10, raw_df=None):
        self.window_days = window_days
        self.raw_df = raw_df

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` left-joined with the computed targets on
        (``userId``, ``date``) and print the ``churn_status`` distribution.
        """
        if self.raw_df is None:
            raise ValueError("raw_df must be provided")

        print(f"Computing churn targets (vectorized, window={self.window_days}d)...")

        targets = compute_cancellation_batch(
            self.raw_df,
            X,
            window_days=self.window_days
        )

        frame = X.copy()

        # Bring the join keys of both sides to the same dtypes so the merge
        # matches on value: datetime for 'date', int for 'userId'.
        for side in (frame, targets):
            side['date'] = pd.to_datetime(side['date'])
            side['userId'] = side['userId'].astype(int)

        merged = frame.merge(targets, on=['userId', 'date'], how='left')

        print(f"Churn status distribution:\n{merged['churn_status'].value_counts()}")
        return merged


class RollingAverageTransformer(BaseEstimator, TransformerMixin):
    """
    Add rolling-average features by delegating to ``add_rolling_averages``.

    ``columns`` lists the source columns (defaults to ``['NextSong']``) and
    ``window_days`` is forwarded to the helper as its ``n`` argument.
    """
    def __init__(self, columns=None, window_days=7):
        if columns is None:
            columns = ['NextSong']
        self.columns = columns
        self.window_days = window_days

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return whatever ``add_rolling_averages`` produces for ``X``."""
        print(f"Computing rolling averages (window={self.window_days}d)...")
        return add_rolling_averages(
            X,
            columns=self.columns,
            n=self.window_days,
        )


class ThumbsRatioTransformer(BaseEstimator, TransformerMixin):
    """
    Derive ``thumbs_ratio_<N>d`` = up / (up + down) from the rolling averages
    ``thumbs_up_avg_<N>d`` and ``thumbs_down_avg_<N>d``.
    """
    def __init__(self, window_days=7):
        self.window_days = window_days

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with the ratio column added. When either
        source column is absent the copy is returned unchanged.
        """
        out = X.copy()
        suffix = f'{self.window_days}d'
        up_name = f'thumbs_up_avg_{suffix}'
        down_name = f'thumbs_down_avg_{suffix}'
        ratio_name = f'thumbs_ratio_{suffix}'

        if up_name not in out.columns or down_name not in out.columns:
            return out

        total = out[up_name] + out[down_name]
        # A zero total becomes NaN so the division cannot blow up; the
        # resulting NaN ratios are then reported as 0.
        safe_total = total.replace(0, np.nan)
        out[ratio_name] = out[up_name].div(safe_total).fillna(0)
        return out


class TrendFeaturesTransformer(BaseEstimator, TransformerMixin):
    """
    Add ``<name>_trend`` columns comparing the 7-day average against the
    14-day average of each configured column.

    trend = (avg_7d / avg_14d) - 1, so positive values mean activity is
    increasing and negative values mean it is decreasing.
    """
    def __init__(self, columns=None):
        if columns is None:
            columns = ['NextSong']
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with one trend column per configured column
        whose 7d and 14d averages are both present; others are skipped.
        """
        out = X.copy()

        for source in self.columns:
            # Column names are derived from the lower-cased source name with
            # spaces replaced by underscores.
            slug = source.lower().replace(" ", "_")
            short_name = f'{slug}_avg_7d'
            long_name = f'{slug}_avg_14d'

            if short_name not in out.columns or long_name not in out.columns:
                continue

            # A zero 14d average becomes NaN to avoid dividing by zero; the
            # resulting NaN trend is then reported as 0.
            long_avg = out[long_name].replace(0, np.nan)
            out[f'{slug}_trend'] = out[short_name].div(long_avg).sub(1).fillna(0)

        return out


class FeaturePreprocessor(BaseEstimator, TransformerMixin):
    """
    Final type clean-up on a copy of the frame:

    - ``level`` becomes 1 for ``'paid'`` and 0 otherwise,
    - ``is_weekend`` is derived from ``date`` (day-of-week >= 5),
    - every column whose name contains "ratio" is coerced to numeric with
      missing or unparsable values set to 0.

    Each step is skipped when its source column is absent.
    """
    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the cleaned copy of ``X``."""
        out = X.copy()
        present = out.columns

        if 'level' in present:
            out['level'] = (out['level'] == 'paid').astype(int)

        # The established-user feature (days_since_registration >= 30) is
        # currently disabled.

        if 'date' in present:
            out['date'] = pd.to_datetime(out['date'])
            out['is_weekend'] = (out['date'].dt.dayofweek >= 5).astype(int)

        for name in [col for col in out.columns if 'ratio' in col.lower()]:
            out[name] = pd.to_numeric(out[name], errors='coerce').fillna(0)

        return out

In [None]:
# Project directory; every data path in the notebook is built from it.
root = '/Users/mdiaspinto/Documents/School/Python Data Science/Final Project/kaggle-churn'
df_raw = pd.read_parquet(root + '/data/train.parquet')

# Store string columns as categoricals, then discard the columns that the
# rest of the notebook does not use.
object_cols = df_raw.select_dtypes(include="object").columns
df_raw[object_cols] = df_raw[object_cols].astype("category")
df_raw = df_raw.drop(
    columns=[
        'gender', 'firstName', 'lastName', 'location',
        'userAgent', 'status', 'auth', 'method',
    ]
)

# Parse the event timestamps once for the summary line.
_event_times = pd.to_datetime(df_raw['time'])
print(f"Raw data shape: {df_raw.shape}")
print(f"Date range: {_event_times.min()} to {_event_times.max()}")

In [None]:
# ============================================================================
# CHECK FOR COLUMN MISMATCHES BETWEEN TRAIN AND TEST DATA
# ============================================================================
print("=" * 80)
print("COMPARING TRAIN.PARQUET vs TEST.PARQUET COLUMNS")
print("=" * 80)

# Read both raw files as-is (before any column is dropped).
df_train_raw = pd.read_parquet(root + '/data/train.parquet')
df_test_raw_check = pd.read_parquet(root + '/data/test.parquet')

print(f"\nTrain data shape: {df_train_raw.shape}")
print(f"Test data shape: {df_test_raw_check.shape}")

train_cols = set(df_train_raw.columns)
test_cols = set(df_test_raw_check.columns)

print(f"\nTrain columns count: {len(train_cols)}")
print(f"Test columns count: {len(test_cols)}")

# Set arithmetic gives the one-sided differences and the overlap.
cols_only_in_train = train_cols.difference(test_cols)
cols_only_in_test = test_cols.difference(train_cols)
common_cols = train_cols.intersection(test_cols)

print("\n" + "-" * 80)
print("COLUMN DIFFERENCES:")
print("-" * 80)

# The same report is produced for each direction of the comparison.
for _only, _here, _there in (
    (cols_only_in_train, "train", "test"),
    (cols_only_in_test, "test", "train"),
):
    if not _only:
        print(f"\n✓ No columns unique to {_here}")
        continue
    print(f"\n⚠️  Columns ONLY in {_here.upper()} (not in {_there}): {len(_only)}")
    for _name in sorted(_only):
        print(f"   - {_name}")

print(f"\n✓ Common columns: {len(common_cols)}")
print(f"   {sorted(common_cols)}")

# For reference: the columns the loading cells drop from the raw data.
dropped_cols = ['gender', 'firstName', 'lastName', 'location', 'userAgent', 'status', 'auth', 'method']
print("\n" + "-" * 80)
print(f"Currently dropping these columns: {dropped_cols}")
print("-" * 80)

In [None]:
# Collapse the raw event log to user-day level via the project helper.
print("\nAggregating events to user-day level...")
df_agg = aggregate_user_day_activity(df_raw)
# userId is cast to int so it matches the key dtype used in later merges.
df_agg['userId'] = df_agg['userId'].astype(int)

_first_day, _last_day = df_agg['date'].min(), df_agg['date'].max()
print(f"Aggregated data shape: {df_agg.shape}")
print(f"Date range: {_first_day} to {_last_day}")

In [None]:
# ============================================================================
# FEATURE REDUCTION: DROP RARE/LOW-SIGNAL PAGE TYPE FEATURES
# ============================================================================
# Nine page-type columns are removed from the aggregated frame before any
# rolling or trend feature is built. Columns that are not present in the
# data are simply skipped.
# ============================================================================
print("\n" + "=" * 60)
print("FEATURE REDUCTION: DROPPING RARE/LOW-SIGNAL PAGE TYPES")
print("=" * 60)

# Candidate page types to drop.
features_to_remove = [
    'About', 'Help', 'Settings', 'Save Settings', 'Roll Advert',
    'Home', 'Logout', 'Submit Downgrade', 'Submit Upgrade'
]

# Only columns that actually exist can be dropped.
existing_features_to_remove = [
    name for name in features_to_remove if name in df_agg.columns
]

print(f"\nOriginal aggregated data shape: {df_agg.shape}")
print(f"Features to remove: {len(features_to_remove)} rare page types")

if not existing_features_to_remove:
    print("\n✓ No features to remove (already absent from data)")
else:
    print(f"  Removing: {existing_features_to_remove}")
    df_agg = df_agg.drop(columns=existing_features_to_remove)
    print(f"\n✓ Removed {len(existing_features_to_remove)} rare/low-signal features")
    print(f"✓ New data shape: {df_agg.shape}")
    print(f"✓ KEEPING: 7d + 14d rolling windows, trend features, thumbs ratios")

In [None]:
# Split the user-day frame by calendar date: rows strictly before the cutoff
# go to training, rows on or after it go to the hold-out set.
print("\n" + "=" * 60)
print("TEMPORAL TRAIN/TEST SPLIT")
print("=" * 60)

cutoff_date = pd.to_datetime('2018-11-01')
df_agg['date'] = pd.to_datetime(df_agg['date'])

df_train = df_agg.loc[df_agg['date'].lt(cutoff_date)].copy()
df_test = df_agg.loc[df_agg['date'].ge(cutoff_date)].copy()

print(f"Training set: {df_train.shape}")
print(f"Test set: {df_test.shape}")
for _label, _part in (("Train", df_train), ("Test", df_test)):
    print(f"{_label} dates: {_part['date'].min()} to {_part['date'].max()}")

In [None]:
# ============================================================================
# BUILD AND APPLY FEATURE ENGINEERING PIPELINE
# ============================================================================
print("\n" + "=" * 60)
print("FEATURE ENGINEERING PIPELINE (VECTORIZED & EFFICIENT)")
print("=" * 60)

# Event-count columns that receive rolling-average and trend features.
feature_cols_list = ['Add Friend', 'Add to Playlist', 'Thumbs Down', 'Thumbs Up', 'Error']


def _make_feature_pipeline():
    """Build a fresh feature pipeline; train and test each get their own.

    Steps: churn target (10-day window, computed from ``df_raw``), 7-day and
    14-day rolling averages, trend features, final preprocessing. The
    ThumbsRatioTransformer steps (7d/14d) are currently disabled.
    """
    return Pipeline([
        ('churn_target', CancellationTargetTransformer(window_days=10, raw_df=df_raw)),
        ('rolling_avg_7d', RollingAverageTransformer(columns=feature_cols_list, window_days=7)),
        ('rolling_avg_14d', RollingAverageTransformer(columns=feature_cols_list, window_days=14)),
        ('trend_features', TrendFeaturesTransformer(columns=feature_cols_list)),
        ('preprocessor', FeaturePreprocessor()),
    ])


print("\n1. Transforming TRAINING data...")
train_pipeline = _make_feature_pipeline()
df_train_features = train_pipeline.fit_transform(df_train)

print(f"\nTraining features shape: {df_train_features.shape}")
print(f"Columns: {df_train_features.columns.tolist()[:10]}... (showing first 10)")

# The hold-out split goes through an identically configured pipeline.
print("\n2. Transforming TEST data...")
test_pipeline = _make_feature_pipeline()
df_test_features = test_pipeline.fit_transform(df_test)

print(f"Test features shape: {df_test_features.shape}")

In [None]:
# ============================================================================
# EXTRACT FEATURES AND TARGET
# ============================================================================
print("\n" + "=" * 60)
print("EXTRACTING FEATURES AND TARGET")
print("=" * 60)

# Identifier, date and target columns are not model inputs.
exclude_cols = ['userId', 'date', 'churn_status', 'Cancel']
feature_cols = [
    name for name in df_train_features.columns
    if name not in exclude_cols
]

# The feature list is taken from the training frame and applied to both
# splits so their matrices share the same columns in the same order.
X_train = df_train_features[feature_cols].copy()
X_test = df_test_features[feature_cols].copy()
y_train = df_train_features['churn_status'].copy()
y_test = df_test_features['churn_status'].copy()

print(f"\nTraining set:")
print(f"  X_train shape: {X_train.shape}")
print(f"  y_train shape: {y_train.shape}")
print(f"  Churn rate: {y_train.mean():.4f}")
print(f"  Churn distribution:\n{y_train.value_counts()}")

print(f"\nTest set:")
print(f"  X_test shape: {X_test.shape}")
print(f"  y_test shape: {y_test.shape}")
print(f"  Churn rate: {y_test.mean():.4f}")
print(f"  Churn distribution:\n{y_test.value_counts()}")

In [None]:
# ============================================================================
# SKLEARN PREPROCESSING + MODEL TRAINING PIPELINE (BASELINE PARAMS)
# ============================================================================
# Imported here because this cell and the later threshold/reduced-model cells
# use it.
from sklearn.metrics import balanced_accuracy_score

print("\n" + "=" * 60)
print("BUILDING FINAL PIPELINE: PREPROCESSING + LightGBM (BASELINE)")
print("=" * 60)

# Route columns to a transformer by dtype.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['category', 'object']).columns.tolist()

print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# ColumnTransformer drops every column that is not listed in a transformer
# (remainder='drop' is the default). Report such columns instead of losing
# them silently, e.g. int32/bool/datetime columns.
_routed = set(numeric_features) | set(categorical_features)
_unrouted = [col for col in X_train.columns if col not in _routed]
if _unrouted:
    print(f"⚠️  Columns not matched by dtype will be DROPPED by the ColumnTransformer: {_unrouted}")

# Preprocessing: scale numeric columns, one-hot encode categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)

# Calculate scale_pos_weight (negatives / positives) to handle class imbalance.
neg_count = (y_train == 0).sum()
pos_count = (y_train == 1).sum()
if pos_count == 0:
    # Without positives the ratio is undefined and the model cannot learn churn.
    raise ValueError("y_train contains no positive (churn) samples; cannot compute scale_pos_weight")
scale_pos_weight = neg_count / pos_count
print(f"\nClass imbalance adjustment:")
print(f"  Negative class (no churn): {neg_count}")
print(f"  Positive class (churn): {pos_count}")
print(f"  Scale pos weight: {scale_pos_weight:.2f}")

# Baseline hyperparameters (no search is performed in this notebook).
baseline_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

print(f"\nUsing baseline hyperparameters:")
for param, value in baseline_params.items():
    print(f"  {param}: {value}")

# Full pipeline: preprocessing followed by the LightGBM classifier.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier(
        random_state=42,
        verbose=-1,
        scale_pos_weight=scale_pos_weight,
        n_jobs=-1,  # Use all processors
        **baseline_params
    ))
])

print("\nTraining model...")
model_pipeline.fit(X_train, y_train)

print("\nEvaluating on test set...")
y_pred = model_pipeline.predict(X_test)
# Column 1 of predict_proba is the probability of the positive (churn) class.
y_pred_proba = model_pipeline.predict_proba(X_test)[:, 1]

print("\n" + "=" * 60)
print("RESULTS")
print("=" * 60)
print(classification_report(y_test, y_pred))
print(f'ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}')

# Balanced accuracy at the default decision threshold; later cells compare
# against this value.
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(f'Balanced Accuracy: {balanced_acc:.4f}')

In [None]:
# ============================================================================
# OPTIMIZE CLASSIFICATION THRESHOLD
# ============================================================================
# NOTE(review): the threshold is selected on the same hold-out set that the
# balanced accuracy is then reported on, so the "optimal" score is likely
# optimistic. Consider tuning on a separate validation slice.
print("\n" + "=" * 60)
print("THRESHOLD OPTIMIZATION")
print("=" * 60)

# Candidate thresholds 0.05, 0.06, ..., 0.95. linspace with an explicit point
# count includes the 0.95 endpoint announced below (a float-step arange stops
# short of it), and rounding makes each value equal to what is printed.
thresholds = np.round(np.linspace(0.05, 0.95, 91), 2)
balanced_accuracies = []

print("\nTesting thresholds from 0.05 to 0.95...")
for threshold in thresholds:
    y_pred_threshold = (y_pred_proba >= threshold).astype(int)
    bal_acc = balanced_accuracy_score(y_test, y_pred_threshold)
    balanced_accuracies.append(bal_acc)

# Pick the threshold with the highest balanced accuracy (first one on ties).
optimal_idx = np.argmax(balanced_accuracies)
optimal_threshold = thresholds[optimal_idx]
optimal_bal_acc = balanced_accuracies[optimal_idx]

print(f"\n✓ Optimization complete!")
print(f"\nDefault threshold (0.5):")
print(f"  Balanced Accuracy: {balanced_acc:.4f}")
print(f"\nOptimal threshold: {optimal_threshold:.2f}")
print(f"  Balanced Accuracy: {optimal_bal_acc:.4f}")
# ':+' prints the real sign, so a negative delta is shown as "-x" rather
# than "+-x".
print(f"  Improvement: {(optimal_bal_acc - balanced_acc) * 100:+.2f}%")

# Show predictions with optimal threshold
y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)
print(f"\n" + "=" * 60)
print(f"RESULTS WITH OPTIMAL THRESHOLD ({optimal_threshold:.2f})")
print("=" * 60)
print(classification_report(y_test, y_pred_optimal))
print(f'ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}')
print(f'Balanced Accuracy: {optimal_bal_acc:.4f}')

# Visualize threshold vs balanced accuracy
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(thresholds, balanced_accuracies, linewidth=2)
plt.axvline(optimal_threshold, color='red', linestyle='--', label=f'Optimal threshold: {optimal_threshold:.2f}')
plt.axvline(0.5, color='gray', linestyle='--', alpha=0.5, label='Default threshold: 0.50')
# Horizontal reference line: balanced accuracy at the default threshold.
plt.axhline(balanced_acc, color='gray', linestyle=':', alpha=0.5)
plt.xlabel('Classification Threshold', fontsize=12)
plt.ylabel('Balanced Accuracy', fontsize=12)
plt.title('Threshold Optimization for Balanced Accuracy', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\n✓ Threshold optimization visualization complete")

In [None]:
# ============================================================================
# CREATE REDUCED FEATURE SET (Remove raw event counts)
# ============================================================================
# Relies on `thresholds`, `scale_pos_weight`, `baseline_params`,
# `optimal_bal_acc` and `balanced_accuracy_score` from the earlier cells.
print("\n" + "=" * 80)
print("CREATING REDUCED FEATURE SET")
print("=" * 80)

# Remove raw event count features (keeping only derived features)
raw_features_to_remove = ['Add Friend', 'Add to Playlist', 'Error', 'Thumbs Down', 'Thumbs Up']

print(f"\nRemoving {len(raw_features_to_remove)} raw event count features:")
for feat in raw_features_to_remove:
    print(f"  - {feat}")

print(f"\nKEEPING all derived features:")
print(f"  - Rolling averages (7d + 14d)")
print(f"  - Trend features (7d vs 14d comparison)")

# errors='ignore' tolerates raw columns that are already absent.
X_train_reduced = X_train.drop(columns=raw_features_to_remove, errors='ignore')
X_test_reduced = X_test.drop(columns=raw_features_to_remove, errors='ignore')

print(f"\nOriginal features: {X_train.shape[1]}")
print(f"Reduced features: {X_train_reduced.shape[1]}")
print(f"Removed: {X_train.shape[1] - X_train_reduced.shape[1]} features")

# Retrain model with reduced features
print("\n" + "=" * 80)
print("RETRAINING MODEL WITH REDUCED FEATURES")
print("=" * 80)

numeric_features_reduced = X_train_reduced.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features_reduced = X_train_reduced.select_dtypes(include=['category', 'object']).columns.tolist()

# ColumnTransformer drops every column that is not listed in a transformer
# (remainder='drop' is the default). Report such columns instead of losing
# them silently.
_routed_reduced = set(numeric_features_reduced) | set(categorical_features_reduced)
_unrouted_reduced = [col for col in X_train_reduced.columns if col not in _routed_reduced]
if _unrouted_reduced:
    print(f"⚠️  Columns not matched by dtype will be DROPPED by the ColumnTransformer: {_unrouted_reduced}")

preprocessor_reduced = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features_reduced),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features_reduced)
    ]
)

# Same classifier configuration as the full model, so only the feature set
# differs between the two.
model_pipeline_reduced = Pipeline([
    ('preprocessor', preprocessor_reduced),
    ('classifier', lgb.LGBMClassifier(
        random_state=42,
        verbose=-1,
        scale_pos_weight=scale_pos_weight,
        n_jobs=-1,
        **baseline_params
    ))
])

print("\nTraining model...")
model_pipeline_reduced.fit(X_train_reduced, y_train)

print("\nEvaluating on validation set...")
y_pred_reduced = model_pipeline_reduced.predict(X_test_reduced)
# Column 1 of predict_proba is the probability of the positive (churn) class.
y_pred_proba_reduced = model_pipeline_reduced.predict_proba(X_test_reduced)[:, 1]

balanced_acc_reduced = balanced_accuracy_score(y_test, y_pred_reduced)
roc_auc_reduced = roc_auc_score(y_test, y_pred_proba_reduced)

print(f"\nResults (default threshold 0.5):")
print(f"  Balanced Accuracy: {balanced_acc_reduced:.4f}")
print(f"  ROC-AUC: {roc_auc_reduced:.4f}")

# Optimize threshold over the same grid as the full model.
print("\n" + "=" * 80)
print("OPTIMIZING THRESHOLD")
print("=" * 80)

balanced_accuracies_reduced = []
for threshold in thresholds:
    y_pred_threshold = (y_pred_proba_reduced >= threshold).astype(int)
    bal_acc = balanced_accuracy_score(y_test, y_pred_threshold)
    balanced_accuracies_reduced.append(bal_acc)

optimal_idx_reduced = np.argmax(balanced_accuracies_reduced)
optimal_threshold_reduced = thresholds[optimal_idx_reduced]
optimal_bal_acc_reduced = balanced_accuracies_reduced[optimal_idx_reduced]

print(f"\nOptimal threshold: {optimal_threshold_reduced:.2f}")
print(f"Balanced Accuracy: {optimal_bal_acc_reduced:.4f}")
# ':+' prints the real sign (matches the comparison line below) so a negative
# delta is shown as "-x" rather than "+-x".
print(f"Improvement over default: {(optimal_bal_acc_reduced - balanced_acc_reduced) * 100:+.2f}%")

# Compare with full feature model
print("\n" + "=" * 80)
print("COMPARISON")
print("=" * 80)
print(f"Full model ({X_train.shape[1]} features with raw counts):")
print(f"  Balanced Accuracy: {optimal_bal_acc:.4f}")
print(f"\nReduced model ({X_train_reduced.shape[1]} features without raw counts):")
print(f"  Balanced Accuracy: {optimal_bal_acc_reduced:.4f}")
print(f"  Change: {(optimal_bal_acc_reduced - optimal_bal_acc):+.4f}")

print(f"\n✓ Reduced model ready for submission generation")

In [None]:
# ============================================================================
# GENERATE FINAL SUBMISSION CSV
# ============================================================================
# Scores the last available user-day row of every test user with the reduced
# model and writes the thresholded predictions to a CSV file.
print("\n" + "=" * 80)
print("GENERATING FINAL SUBMISSION")
print("=" * 80)

print("\nMODEL CONFIGURATION:")
print(f"  - Features: {X_train_reduced.shape[1]} (removed 5 raw event counts)")
print(f"  - Optimal threshold: {optimal_threshold_reduced:.2f}")
print(f"  - Balanced Accuracy: {optimal_bal_acc_reduced:.4f}")
print(f"  - Model: LightGBM with class balancing")

# Load and process test data
print("\n1. Loading test data...")
df_test_raw_final = pd.read_parquet(root + '/data/test.parquet')

# Same clean-up as for the training file: categoricals, then drop unused columns.
object_cols_test = df_test_raw_final.select_dtypes(include="object").columns
df_test_raw_final[object_cols_test] = df_test_raw_final[object_cols_test].astype("category")
df_test_raw_final = df_test_raw_final.drop(columns=['gender', 'firstName', 'lastName', 'location', 'userAgent', 'status', 'auth', 'method'])

print(f"   Test raw data shape: {df_test_raw_final.shape}")

# Aggregate to user-day level
print("\n2. Aggregating test data to user-day level...")
df_test_agg_final = aggregate_user_day_activity(df_test_raw_final)
df_test_agg_final['userId'] = df_test_agg_final['userId'].astype(int)
df_test_agg_final['date'] = pd.to_datetime(df_test_agg_final['date'])

max_date_final = df_test_agg_final['date'].max()
print(f"   Test aggregated shape: {df_test_agg_final.shape}")
print(f"   Unique users: {df_test_agg_final['userId'].nunique()}")
print(f"   Max date: {max_date_final.date()}")

# Steps 3-5 reuse the transformers that built the training features, so the
# submission features cannot drift from them (this also applies the
# ratio-column clean-up in FeaturePreprocessor, which a hand-written copy of
# the logic would skip). The transformers are stateless, so transform() can
# be called directly.
print("\n3. Computing rolling average features (7d + 14d)...")
feature_cols_list = ['Add Friend', 'Add to Playlist', 'Thumbs Down', 'Thumbs Up', 'Error']

test_with_features = df_test_agg_final
for _window in (7, 14):
    test_with_features = RollingAverageTransformer(
        columns=feature_cols_list, window_days=_window
    ).transform(test_with_features)

# Compute trend features (7d vs 14d)
print("\n4. Computing trend features...")
test_with_features = TrendFeaturesTransformer(columns=feature_cols_list).transform(test_with_features)

# Apply preprocessing transformations (level -> binary, is_weekend, ratio clean-up)
print("\n5. Applying preprocessing...")
test_with_features = FeaturePreprocessor().transform(test_with_features)

# Keep only the most recent row of each user: one prediction per user.
last_user_data = test_with_features.sort_values('date').groupby('userId').tail(1).reset_index(drop=True)
print(f"   Users in test: {len(last_user_data)}")

# Align features with training data
print("\n6. Aligning features with training data...")
feature_cols_reduced = list(X_train_reduced.columns)

# Columns the model was trained on but that are absent from the test data
# are filled with 0 so the feature matrix has the expected layout.
for col in feature_cols_reduced:
    if col not in last_user_data.columns:
        print(f"   - Adding missing column: {col}")
        last_user_data[col] = 0

X_test_submission_final = last_user_data[feature_cols_reduced].copy()
print(f"   Final feature matrix: {X_test_submission_final.shape}")

# Make predictions
print("\n7. Making predictions...")
print(f"   Using optimized threshold: {optimal_threshold_reduced:.2f}")

# Column 1 of predict_proba is the probability of the positive (churn) class.
y_pred_proba_final = model_pipeline_reduced.predict_proba(X_test_submission_final)[:, 1]
y_pred_final = (y_pred_proba_final >= optimal_threshold_reduced).astype(int)

# Create submission
submission_final = pd.DataFrame({
    'id': last_user_data['userId'].astype(int).values,
    'target': y_pred_final
})

# Save to CSV
output_path_final = root + '/data/submission_final.csv'
submission_final.to_csv(output_path_final, index=False)

print(f"\n{'=' * 80}")
print("✓ SUBMISSION GENERATED")
print(f"{'=' * 80}")
print(f"\nFile: {output_path_final}")
print(f"Shape: {submission_final.shape}")
print(f"Users: {len(submission_final):,}")
print(f"\nPredicted churn rate: {submission_final['target'].mean():.2%}")
print(f"\nTarget distribution:")
print(submission_final['target'].value_counts())
print(f"\nModel Summary:")
print(f"  - Total features: {X_train_reduced.shape[1]}")
print(f"  - Removed: 5 raw event counts (Add Friend, Add to Playlist, Error, Thumbs Down, Thumbs Up)")
print(f"  - Kept: All rolling windows (7d + 14d), trend features")
print(f"  - Threshold: {optimal_threshold_reduced:.2f} (optimized)")
print(f"  - Balanced accuracy: {optimal_bal_acc_reduced:.4f}")
# The reported window spans the 10 days after the last date in the test data.
print(f"  - Prediction window: {(max_date_final + pd.Timedelta(days=1)).date()} to {(max_date_final + pd.Timedelta(days=10)).date()}")
print(f"\n✓ Ready for Kaggle submission!")