In [None]:
import pandas as pd
import sys, os
sys.path.append(os.path.abspath(".."))

import src.preprocessing
from importlib import reload
reload(src.preprocessing)

from typing import Union
from src.preprocessing import (
    compute_cancellation, 
    aggregate_user_day_activity, 
    add_rolling_averages
)


In [None]:
# Project root. Set the KAGGLE_CHURN_ROOT environment variable to run the
# notebook on another machine; the default keeps the original local checkout.
root = os.environ.get(
    'KAGGLE_CHURN_ROOT',
    '/Users/mdiaspinto/Documents/School/Python Data Science/Final Project/kaggle-churn',
)
df = pd.read_parquet(root + '/data/train.parquet')

# Convert every object-dtype column to a pandas categorical.
object_cols = df.select_dtypes(include="object").columns
df[object_cols] = df[object_cols].astype("category")

# Drop the user-descriptive columns before aggregation. Rebinding is used
# instead of inplace=True so the frame is never mutated in place.
df = df.drop(columns=['gender', 'firstName', 'lastName', 'location', 'userAgent'])

In [None]:
# Build the aggregated user/day activity table and cast the user key to int
# in the same expression.
df_new = aggregate_user_day_activity(df).astype({'userId': int})
df_new.head()

In [None]:
# Add rolling averages (window n=7) for the selected activity columns; these
# show up later as the *_avg_7d feature columns.
df_new = add_rolling_averages(
    df_new,
    columns=['Add Friend', 'Add to Playlist', 'Thumbs Down', 'Thumbs Up', 'Error'],
    n=7,
)

In [None]:
# Share of thumbs-up among all thumbs (up + down) over the 7-day averages.
denominator = df_new['thumbs_up_avg_7d'] + df_new['thumbs_down_avg_7d']

# Mask zero denominators with NaN instead of pd.NA: replacing with pd.NA turns
# the float Series into object dtype, which then has to be coerced back to
# numeric before modelling. `where` keeps the column float64 and yields NaN
# wherever there were no thumbs at all.
df_new['thumbs_ratio_7d'] = df_new['thumbs_up_avg_7d'] / denominator.where(denominator != 0)
df_new.head()

In [None]:
# Every distinct activity date is used in turn as the "present" reference date.
unique_dates = sorted(df_new['date'].unique())
print(f"Computing cancellation targets for {len(unique_dates)} unique dates...")

# One target frame per reference date (10-day window), each tagged with the
# date it was computed for.
cancellation_targets = [
    compute_cancellation(df, present_time=present_date, window_days=10).assign(date=present_date)
    for present_date in unique_dates
]

# Stack the per-date frames and expose the label under the name used downstream.
target_by_date = (
    pd.concat(cancellation_targets, ignore_index=True)
    .rename(columns={'cancellation_confirmed': 'churn_status'})
)

print(f"\nCancellation targets shape: {target_by_date.shape}")
print("Sample:")
print(target_by_date.head(10))

In [None]:
# Persistence cell: the active lines reload the churn targets from disk and
# write the transformed features out; the commented lines are the opposite
# direction for each file.
# NOTE(review): as written, this replaces the target_by_date computed in the
# previous cell with whatever is stored in churn_status.csv - confirm intended.
churn_status_csv = root + '/data/churn_status.csv'
df_transformed_csv = root + '/data/df_transformed.csv'

# df_new = pd.read_csv(df_transformed_csv)
target_by_date = pd.read_csv(churn_status_csv)

df_new.to_csv(df_transformed_csv, index=False)
# target_by_date.to_csv(churn_status_csv, index=False)

In [None]:
# Parse the join key to datetime on both sides so the merge compares like
# with like (the targets may have been reloaded from CSV as strings).
for frame in (df_new, target_by_date):
    frame['date'] = pd.to_datetime(frame['date'])

# Left join: every feature row is kept; rows without a matching target get NaN.
df_train = pd.merge(df_new, target_by_date, how='left', on=['userId', 'date'])

print("\nChurn distribution:")
print(df_train['churn_status'].value_counts())
print("\nSample:")
print(df_train.head())

In [17]:
# Split data and define variables for modeling
from sklearn.model_selection import train_test_split

# Define feature columns (exclude userId, date, and target)
feature_cols = [col for col in df_train.columns if col not in ['userId', 'date', 'churn_status']]

# Separate features and target; copy so the dtype fixes below do not touch df_train
X = df_train[feature_cols].copy()
y = df_train['churn_status']

# Fix data types for XGBoost compatibility
# Convert 'level' category to numeric (1 for 'paid', 0 for anything else)
if 'level' in X.columns:
    X['level'] = (X['level'] == 'paid').astype(int)

# Make sure thumbs_ratio_7d is numeric; undefined ratios (NaN) become 0
if 'thumbs_ratio_7d' in X.columns:
    X['thumbs_ratio_7d'] = pd.to_numeric(X['thumbs_ratio_7d'], errors='coerce').fillna(0)

# Split into train and test sets (80-20 split), stratified on the target.
# NOTE(review): this is a row-level split of user/day rows, so the same user
# can appear in both train and test - consider a group-wise split by userId.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print(f"Feature columns ({len(feature_cols)}): {feature_cols}")
print(f"\nData types after fixes:")
print(X_train.dtypes)

# For predictions on final test data
X_test_pred = X_test
# X_test.index holds index *labels*, so select with .loc. The previous .iloc
# only gave the right rows because df_train has a default 0..n-1 index.
df_test_final = df_train.loc[X_test.index]

Training set size: (780912, 30)
Test set size: (195228, 30)
Feature columns (30): ['About', 'Add Friend', 'Add to Playlist', 'Cancel', 'Downgrade', 'Error', 'Help', 'Home', 'Logout', 'NextSong', 'Roll Advert', 'Save Settings', 'Settings', 'Submit Downgrade', 'Submit Upgrade', 'Thumbs Down', 'Thumbs Up', 'Upgrade', 'event_count', 'session_count', 'active_flag', 'events_per_session', 'level', 'days_since_registration', 'add_friend_avg_7d', 'add_to_playlist_avg_7d', 'thumbs_down_avg_7d', 'thumbs_up_avg_7d', 'error_avg_7d', 'thumbs_ratio_7d']

Data types after fixes:
About                        int64
Add Friend                   int64
Add to Playlist              int64
Cancel                       int64
Downgrade                    int64
Error                        int64
Help                         int64
Home                         int64
Logout                       int64
NextSong                     int64
Roll Advert                  int64
Save Settings                int64
Settings  

In [20]:
# Train XGBoost and generate predictions
import src.modeling
reload(src.modeling)  # re-import so edits to src/modeling.py are picked up
from src.modeling import (
    train_xgboost,
    evaluate_model,
    get_feature_importance,
    make_predictions,
    create_submission,
)

# Train the model
xgb_model = train_xgboost(X_train, y_train)
print("\n=== XGBoost Performance ===")
xgb_results = evaluate_model(xgb_model, X_test, y_test)
xgb_feature_importance = get_feature_importance(xgb_model, feature_cols, top_n=10)

# Load and preprocess test data to mirror train
print("\nLoading test data and applying preprocessing...")
df_test_raw = pd.read_parquet(root + '/data/test.parquet')

object_cols_test = df_test_raw.select_dtypes(include="object").columns
df_test_raw[object_cols_test] = df_test_raw[object_cols_test].astype("category")

# errors='ignore': the test file is not required to contain every one of these columns
df_test_raw = df_test_raw.drop(columns=['gender', 'firstName', 'lastName', 'location', 'userAgent'], errors='ignore')

df_test_agg = aggregate_user_day_activity(df_test_raw)
df_test_agg['userId'] = df_test_agg['userId'].astype(int)

df_test_agg = add_rolling_averages(
    df_test_agg,
    columns=['Add Friend', 'Add to Playlist', 'Thumbs Down', 'Thumbs Up', 'Error'],
    n=7
)

# Mask zero denominators with NaN rather than pd.NA so the ratio column stays
# float64 instead of being upcast to object dtype.
denominator_test = df_test_agg['thumbs_up_avg_7d'] + df_test_agg['thumbs_down_avg_7d']
df_test_agg['thumbs_ratio_7d'] = df_test_agg['thumbs_up_avg_7d'] / denominator_test.where(denominator_test != 0)

# Use the most recent date per user for prediction
df_test_latest = df_test_agg.sort_values('date').groupby('userId', as_index=False).tail(1)

# Align test features to the training feature set: same columns, same order;
# any feature column absent from the test data is created and filled with 0
X_test_submission = df_test_latest.reindex(columns=feature_cols, fill_value=0)

# Apply same data type fixes as training data
if 'level' in X_test_submission.columns:
    X_test_submission['level'] = (X_test_submission['level'] == 'paid').astype(int)

if 'thumbs_ratio_7d' in X_test_submission.columns:
    X_test_submission['thumbs_ratio_7d'] = pd.to_numeric(X_test_submission['thumbs_ratio_7d'], errors='coerce').fillna(0)

print(f"Test submission matrix shape: {X_test_submission.shape}")
print(f"Test data types: {X_test_submission.dtypes.value_counts()}")

# Predict and write submission (one row per user, taken from df_test_latest)
xgb_test_predictions, xgb_test_proba = make_predictions(xgb_model, X_test_submission)
xgb_submission_final = create_submission(
    df_test_latest['userId'].values,
    xgb_test_predictions,
    output_path=root + '/data/submissionx.csv'
)

Calculated scale_pos_weight: 24.92
Training XGBoost optimized for balanced accuracy...
Model training complete!

=== XGBoost Performance ===

=== Model Performance ===
Balanced Accuracy: 0.6771
ROC-AUC Score: 0.7416

Confusion Matrix:
[[137279  50416]
 [  2842   4691]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.73      0.84    187695
           1       0.09      0.62      0.15      7533

    accuracy                           0.73    195228
   macro avg       0.53      0.68      0.49    195228
weighted avg       0.95      0.73      0.81    195228


Top 10 Most Important Features:
                    feature  importance
18              event_count    0.376100
3                    Cancel    0.307514
26       thumbs_down_avg_7d    0.104401
25   add_to_playlist_avg_7d    0.031323
7                      Home    0.021535
27         thumbs_up_avg_7d    0.016635
22                    level    0.016474
19            session_coun

  per_day_counts = df_copy.groupby([user_col, 'date']).size().reset_index(name='event_count')
  session_counts = df_copy.groupby([user_col, 'date'])['sessionId'].nunique().reset_index(name='session_count')
  user_registration = df_copy.groupby(user_col)[registration_col].first().reset_index()
  level_per_day = df_copy.groupby([user_col, 'date'])[level_col].last().reset_index()
  df_aggregated = df_copy.groupby([user_col, 'date', page_col]).size().unstack(fill_value=0).reset_index()
  for user_id, user_data in df_aggregated.groupby(user_col):


Test submission matrix shape: (2904, 30)
Test data types: int64      23
float64     7
Name: count, dtype: int64
Making predictions...
Predictions shape: (2904,)
Prediction distribution:
0    2105
1     799
Name: count, dtype: int64

Submission dataframe shape: (2904, 2)
Sample submissions:
        id  target
0  1995115       0
1  1993285       0
2  1979129       0
3  1997769       0
4  1997880       0
5  1985914       0
6  1987068       0
7  1988412       1
8  1994524       1
9  1988592       0
Submission target distribution:
target
0    2105
1     799
Name: count, dtype: int64
Saved submission to /Users/mdiaspinto/Documents/School/Python Data Science/Final Project/kaggle-churn/data/submissionx.csv
