In [None]:
# Third-party / standard-library imports used throughout the notebook.
import pandas as pd
import sys, os
# Append the parent directory to the import path; the `src` package imported
# below is presumably located there (notebook assumed to run from a subfolder).
sys.path.append(os.path.abspath(".."))

# Import the preprocessing module and reload it right away so that edits made
# to it after the kernel started are picked up without a restart.
import src.preprocessing
from importlib import reload
reload(src.preprocessing)

from typing import Union
# Project helpers for labelling and feature engineering used by later cells.
from src.preprocessing import (
    compute_cancellation, 
    aggregate_user_day_activity, 
    add_days_since, 
    add_rolling_averages,
    add_thumbs_ratio,
    add_days_active_last_n_days
)


In [None]:
# Project root directory. The default is the original author's checkout; set
# the CHURN_PROJECT_ROOT environment variable to run this notebook on another
# machine without editing the cell. Later cells build every data path from it.
root = os.environ.get(
    'CHURN_PROJECT_ROOT',
    '/Users/mdiaspinto/Documents/School/Python Data Science/Final Project/kaggle-churn',
)

# Raw training data.
df = pd.read_parquet(root + '/data/train.parquet')

# Convert object-dtype columns to pandas categoricals.
object_cols = df.select_dtypes(include="object").columns
df[object_cols] = df[object_cols].astype("category")
df.head()

In [None]:
# Build the training feature table: aggregate the raw events, then append
# rolling averages with n=30 and n=7 for the listed activity columns.
# NOTE(review): the test-data cell further down builds its features through a
# different sequence of helper calls (fill_missing_days=True, add_days_since,
# add_thumbs_ratio, add_days_active_last_n_days) - confirm train/test parity.
df_new = (
    aggregate_user_day_activity(df)
    .pipe(add_rolling_averages, columns=['Add Friend', 'Add to Playlist', 'Thumbs Down', 'Thumbs Up'], n=30)
    .pipe(add_rolling_averages, columns=['Thumbs Down', 'Thumbs Up', 'Error'], n=7)
)

# thumbs_up_avg_7d as a share of (thumbs_up_avg_7d + thumbs_down_avg_7d).
df_new['thumbs_ratio'] = df_new['thumbs_up_avg_7d'].div(
    df_new['thumbs_up_avg_7d'].add(df_new['thumbs_down_avg_7d'])
)
df_new.head()


In [None]:
# Every distinct date in the feature table gets its own labelling pass.
unique_dates = sorted(df_new['date'].unique())
print(f"Computing cancellation targets for {len(unique_dates)} unique dates...")

# Call compute_cancellation once per date (window_days=10) and tag each result
# with the date it was computed for.
cancellation_targets = [
    compute_cancellation(df, present_time=present_date, window_days=10).assign(date=present_date)
    for present_date in unique_dates
]

# Stack the per-date results and expose the label under the name `churn_status`.
target_by_date = pd.concat(cancellation_targets, ignore_index=True).rename(
    columns={'cancellation_confirmed': 'churn_status'}
)

print(f"\nCancellation targets shape: {target_by_date.shape}")
print("Sample:", target_by_date.head(10), sep="\n")

In [None]:
# Cache of the intermediate frames produced by the two cells above.
# If a cache file is missing, the frame currently in memory is written first,
# so the notebook runs on a fresh checkout. The frames are then always
# re-read from disk, so downstream cells see the same CSV-parsed dtypes
# whichever path was taken. Delete the CSV files to force regeneration from
# the frames computed above.
transformed_path = root + '/data/df_transformed.csv'
churn_status_path = root + '/data/churn_status.csv'

if not os.path.exists(transformed_path):
    df_new.to_csv(transformed_path, index=False)
if not os.path.exists(churn_status_path):
    target_by_date.to_csv(churn_status_path, index=False)

df_new = pd.read_csv(transformed_path)
target_by_date = pd.read_csv(churn_status_path)

In [None]:
# Left-join the labels onto the feature rows by (userId, date); feature rows
# with no matching label keep NaN in the label column.
df_train = pd.merge(df_new, target_by_date, how='left', on=['userId', 'date'])

print(f"df_train shape: {df_train.shape}")
print(f"Columns: {df_train.columns.to_list()}")
print("\nChurn distribution:", df_train['churn_status'].value_counts(), sep="\n")
print("\nSample:", df_train.head(), sep="\n")

In [None]:
# Train a Random Forest model to predict churn_status
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Prepare data: rows without a label cannot be used for supervised training
df_model = df_train.dropna(subset=['churn_status']).copy()
print(f"Training data shape after dropping NaN targets: {df_model.shape}")

# Separate features and target
# Exclude the identifier columns (userId, date) and the label itself
exclude_cols = ['userId', 'date', 'churn_status']
feature_cols = [col for col in df_model.columns if col not in exclude_cols]

# One-hot encode categorical features and fill remaining NaNs.
# Detect object, pandas string AND categorical dtypes: checking only
# `dtype == 'object'` misses `category` columns (created in the data-loading
# cell) and string-dtype columns, which would then reach fillna(0) / the
# classifier un-encoded and fail.
obj_cols = [
    c for c in feature_cols
    if pd.api.types.is_object_dtype(df_model[c])
    or pd.api.types.is_string_dtype(df_model[c])
    or isinstance(df_model[c].dtype, pd.CategoricalDtype)
]
X = pd.get_dummies(df_model[feature_cols], columns=obj_cols, drop_first=True).fillna(0)
# From here on feature_cols names the encoded columns; later cells reuse it
feature_cols = X.columns.tolist()
y = df_model['churn_status'].astype(int)

print(f"\nFeatures: {len(feature_cols)} columns")
print(f"Feature columns: {feature_cols}")
print(f"\nTarget distribution:")
print(y.value_counts())

# Split into train/test sets (80/20, stratified on the label).
# NOTE(review): the split is row-wise, and each row carries a userId and a
# date, so rows of one user can land on both sides of the split. That may
# make the hold-out metrics optimistic - consider a group-aware split by
# userId; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTrain set: {X_train.shape}, Test set: {X_test.shape}")

# Train Random Forest (class_weight='balanced' reweights classes by frequency)
print("\nTraining Random Forest...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, n_jobs=-1, class_weight='balanced')
rf_model.fit(X_train, y_train)

# Evaluate on the hold-out split: hard labels and probability of class 1
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

print("\n=== Model Performance ===")
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance, most important first
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

In [None]:
# Read the test events and run them through the feature-engineering helpers
print("Loading test data...")
df_test = pd.read_parquet(root + '/data/test.parquet')
print(f"Test data shape: {df_test.shape}")
print(f"Date range: {df_test['time'].min()} to {df_test['time'].max()}")

print("\nApplying transformations to test data...")

# Step 1 - aggregate the raw events (with fill_missing_days enabled)
df_test_agg = aggregate_user_day_activity(df_test, fill_missing_days=True)
print(f"After aggregation: {df_test_agg.shape}")
print(f"Available columns: {df_test_agg.columns.tolist()}")

# Step 2 - days-since features, restricted to the event columns that are
# actually present in the aggregated test frame
available_tracking_cols = [
    name
    for name in ('Submit Downgrade', 'Submit Upgrade', 'Cancel')
    if name in df_test_agg.columns
]

if not available_tracking_cols:
    print("No tracking columns found, skipping days_since")
else:
    df_test_agg = add_days_since(df_test_agg, columns_to_track=available_tracking_cols)
    print(f"After days_since: {df_test_agg.shape}")

# Step 3 - rolling averages with n=30, again only for columns that exist
activity_cols_30 = ['Add Friend', 'Add to Playlist', 'Thumbs Down', 'Thumbs Up']
available_cols_30 = [name for name in activity_cols_30 if name in df_test_agg.columns]
if available_cols_30:
    df_test_agg = add_rolling_averages(df_test_agg, columns=available_cols_30, n=30)
    print(f"After 30d rolling averages: {df_test_agg.shape}")

# Step 4 - rolling averages with n=7
activity_cols_7 = ['Thumbs Down', 'Thumbs Up', 'Error']
available_cols_7 = [name for name in activity_cols_7 if name in df_test_agg.columns]
if available_cols_7:
    df_test_agg = add_rolling_averages(df_test_agg, columns=available_cols_7, n=7)
    print(f"After 7d rolling averages: {df_test_agg.shape}")

# Step 5 - derived features built on the columns produced above
df_test_agg = add_thumbs_ratio(
    df_test_agg,
    thumbs_up_col='thumbs_up_avg_7d',
    thumbs_down_col='thumbs_down_avg_7d',
)
df_test_agg = add_days_active_last_n_days(df_test_agg, n_days=30)

print("\nTest data ready for prediction!")
print(f"Final test features shape: {df_test_agg.shape}")
print(f"Columns: {df_test_agg.columns.tolist()}")


In [None]:
# Score the test set with the Random Forest and write a submission file.
# The churn window opens on the last date in the test features and covers
# 10 days in total.
from src.modeling import prepare_test_data, make_predictions, create_submission
from datetime import timedelta

print("Preparing test data for model prediction...")

# Latest date present in the engineered test features
last_date = df_test_agg['date'].max()

# The window includes last_date itself, hence a 9-day offset to its final day
window_end = last_date + timedelta(days=9)
print(f"Churn prediction window: {last_date} to {window_end}")

# Model input plus the accompanying frame that carries the userId column
X_test_pred, df_test_final = prepare_test_data(
    df_test_agg, feature_cols, last_date=last_date
)

# make_predictions returns a pair; only the first element is submitted below
test_predictions, test_predictions_proba = make_predictions(rf_model, X_test_pred)

# Build the submission and save it under the project's data folder
submission = create_submission(
    df_test_final['userId'].values,
    test_predictions,
    output_path=root + '/data/submission.csv',
)


In [None]:
# XGBoost model: reload the modeling module so recent edits are picked up,
# then (re)import the helpers used in this cell.
reload(src.modeling)
from src.modeling import (
    train_xgboost,
    evaluate_model,
    get_feature_importance,
)

# Fit on the same training split that the Random Forest used
xgb_model = train_xgboost(X_train, y_train)

# Score against the same hold-out split
print("\n=== XGBoost Performance ===")
xgb_results = evaluate_model(xgb_model, X_test, y_test)

# Ten highest-ranked features according to the XGBoost model
xgb_feature_importance = get_feature_importance(
    xgb_model, feature_cols, top_n=10
)


In [None]:
# Score the test set with the XGBoost model and write a second submission file
print("Making XGBoost predictions...")

# Reuse the model input prepared for the Random Forest submission
xgb_predictions, xgb_predictions_proba = make_predictions(
    xgb_model, X_test_pred
)

# Save under a different file name so the Random Forest submission is kept
xgb_submission = create_submission(
    df_test_final['userId'].values,
    xgb_predictions,
    output_path=root + '/data/submissionx.csv',
)
