In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

In [2]:
root = '/Users/mdiaspinto/Documents/School/Python Data Science/Final Project/kaggle-churn'
df_raw = pd.read_parquet(root + '/data/train.parquet')
unused = ['status', 'firstName', 'lastName', 'ts', 'method', 'auth', 'userAgent']
df_raw.drop(columns=unused, inplace=True)
df_raw.head()

Unnamed: 0,gender,level,userId,page,sessionId,location,itemInSession,length,song,artist,time,registration
0,M,paid,1749042,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",278,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21
992,M,paid,1749042,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",279,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21
1360,M,paid,1749042,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",280,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21
1825,M,paid,1749042,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",281,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21
2366,M,paid,1749042,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",282,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21


In [3]:
root = '/Users/mdiaspinto/Documents/School/Python Data Science/Final Project/kaggle-churn'
df_test = pd.read_parquet(root + '/data/test.parquet')
df_test.drop(columns=unused, inplace=True)
df_test.head()

Unnamed: 0,gender,level,userId,page,sessionId,location,itemInSession,length,song,artist,time,registration
7,M,free,1465194,NextSong,22483,"New York-Newark-Jersey City, NY-NJ-PA",29,250.8273,Mockingbird,Eminem,2018-10-01 00:00:06,2018-09-27 17:29:36
54,M,free,1465194,Roll Advert,22483,"New York-Newark-Jersey City, NY-NJ-PA",30,,,,2018-10-01 00:00:28,2018-09-27 17:29:36
477,M,free,1465194,NextSong,22483,"New York-Newark-Jersey City, NY-NJ-PA",31,355.7873,Thank You (Precious Memories Album Version),Ray Boltz,2018-10-01 00:04:16,2018-09-27 17:29:36
1170,M,free,1465194,NextSong,22483,"New York-Newark-Jersey City, NY-NJ-PA",32,191.68608,Mathletics,Foals,2018-10-01 00:10:11,2018-09-27 17:29:36
1552,M,free,1465194,NextSong,22483,"New York-Newark-Jersey City, NY-NJ-PA",33,275.25179,Proceed,The Roots,2018-10-01 00:13:22,2018-09-27 17:29:36


In [4]:
def feature_builder(df: pd.DataFrame, cutoff_date: pd.Timestamp) -> pd.DataFrame:
    
    # Create a slice of the dataframe up to the cutoff date and makes userId the index
    df_slice = df[df['time'] < cutoff_date].copy()
    idx = pd.Index(np.sort(df_slice['userId'].unique()), name='userId')
    final_df = pd.DataFrame(index=idx)

    # Get key features from users at cutoff date
    user_group = df.groupby('userId')
    final_df['level'] = user_group['level'].last().reindex(idx)
    final_df['days_registered'] = \
        (cutoff_date.normalize() - user_group['registration'].min().reindex(idx).dt.normalize()).dt.days.astype(int)

    # Group sessions and defines start and end for each one
    session_group = df_slice.groupby(['userId', 'sessionId']).agg(
        session_start=('time', 'min'),
        session_end=('time', 'max'),
        song_count=('song', 'count')
    )

    # Calculate session length in seconds
    session_group['session_length'] = (
    session_group['session_end'] - session_group['session_start']
    ).dt.total_seconds()
    
    # Aggregate session statistics per user
    session_stats = session_group.groupby('userId').agg(
        num_sessions=('session_start', 'count'),
        avg_songs_per_session=('song_count', 'mean'),
        avg_session_length=('session_length', 'mean'),
        days_since_last_session=('session_end', 'max'),
    )

    # Convert to days and handle NaT
    session_stats['days_since_last_session'] = (
        (cutoff_date - session_stats['days_since_last_session']).dt.days
    )

    # Convert to hours
    session_stats['avg_session_length'] /= 3600

    # Calculate proportion of activity on weekends
    df_slice['day'] = df_slice['time'].dt.dayofweek
    df_slice['weekend'] = df_slice['day'].isin([5, 6]).astype(int)
    final_df['weekend_perc'] = (df_slice.groupby('userId')['weekend'].sum()\
        /df_slice.groupby('userId')['weekend'].count()).reindex(idx, fill_value=0)
    final_df['weekend_perc'] *= 100

    # Calculate proportion of weekend days in the target window
    target_window = pd.date_range(start=cutoff_date + pd.Timedelta(days=1), periods=10)
    weekend_window_perc = (target_window.dayofweek.isin([5, 6])).sum()
    final_df['weekend_target_perc'] = weekend_window_perc * 10

    # Build thumbs up and thumbs down features for last 5 and 10 sessions
    for n_sessions in (5, 10):
        # Find last N sessions per user
        lastN = (
            session_group.reset_index()[['userId', 'sessionId', 'session_end']]
            .sort_values(['userId', 'session_end'])
            .groupby('userId', as_index=False)
            .tail(n_sessions)
        )
        lastN_keys = pd.MultiIndex.from_frame(lastN[['userId', 'sessionId']])

        # Filter events to those sessions
        df_lastN = (
            df_slice.set_index(['userId', 'sessionId'])
            .loc[lambda d: d.index.isin(lastN_keys)]
            .reset_index()
        )

        # Get user and page counts in last N sessions
        page_group = (
            df_lastN.groupby(['userId', 'page'])
            .size()
            .unstack()
            .reindex(idx)
            .fillna(0)
        )

        # Total songs in last N sessions
        user_songs = (
            session_group.loc[session_group.index.isin(lastN_keys), 'song_count']
            .groupby(level=0)
            .sum()
            .reindex(idx, fill_value=0)
        )
        denom = user_songs.replace(0, 1)

        suffix = f'_last{n_sessions}'

        if n_sessions == 5:
            # Calculate unique artists played in last 5 sessions
            plays_lastN = df_lastN[df_lastN['page'] == 'NextSong']
            unique_artists = (
            plays_lastN.groupby('userId')['artist']
            .nunique()
            .reindex(idx, fill_value=0)
            .astype(int)
            )
            final_df[f'unique_artists_last5'] = 100 * unique_artists / denom

            # Calculate several feature counts for last 5 sessions
            final_df['roll_advert_count_last5'] = (
                page_group.get('Roll Advert', pd.Series(0, index=idx)).astype(int)
            )
            final_df['error_count_last5'] = (
                page_group.get('Error', pd.Series(0, index=idx)).astype(int)
            )
            final_df['about_count_last5'] = (
                page_group.get('About', pd.Series(0, index=idx)).astype(int)
            )
            final_df['add_playlist_count_last5'] = (
                page_group.get('Add to Playlist', pd.Series(0, index=idx)).astype(int)
            )

            final_df['roll_advert_perc_last5'] = 100 * final_df['roll_advert_count_last5'] / denom
            final_df['error_perc_last5'] = 100 * final_df['error_count_last5'] / denom
            final_df['about_perc_last5'] = 100 * final_df['about_count_last5'] / denom
            final_df['add_playlist_perc_last5'] = 100 * final_df['add_playlist_count_last5'] / denom
            columns_drop = ['roll_advert_count_last5', 'error_count_last5', \
                            'about_count_last5', 'add_playlist_count_last5']
            final_df.drop(columns = columns_drop, inplace=True)

        final_df[f'thumbs_up_perc{suffix}'] = 100 * page_group.get('Thumbs Up', 0) / denom
        final_df[f'thumbs_down_perc{suffix}'] = 100 * page_group.get('Thumbs Down', 0) / denom
        final_df[f'thumbs_up_down_perc{suffix}'] = (
            final_df[f'thumbs_up_perc{suffix}'] - final_df[f'thumbs_down_perc{suffix}']
        )

    # Calculate trends between last 5 and last 10 sessions
    final_df['thumbs_up_trend'] = final_df['thumbs_up_perc_last5'] - final_df['thumbs_up_perc_last10']
    final_df['thumbs_down_trend'] = final_df['thumbs_down_perc_last5'] - final_df['thumbs_down_perc_last10']
    final_df['thumbs_up_down_trend'] = final_df['thumbs_up_perc_last5'] - final_df['thumbs_up_perc_last10']

    # Calculate how long user has been premium on a proportion of observed activity
    premium_count = (
    df_slice.loc[df_slice["level"].eq("paid")]
    .groupby("userId")
    .size()
    .reindex(idx, fill_value=0)
    )

    events_user = (
    df_slice.groupby("userId")
    .size()
    .reindex(idx, fill_value=0)
    )
    final_df["paid_perc"] = 100 * premium_count / events_user.replace(0, 1)

    # Produce final dataframe for output
    final_df = final_df.join(session_stats.reindex(idx))
    num_features = ['num_sessions', 'avg_songs_per_session', \
                    'avg_session_length', 'days_since_last_session']
    final_df[num_features] = final_df[num_features].fillna(0).astype(float)
    

    return final_df

In [5]:
test = feature_builder(df_raw, pd.Timestamp('2018-10-20'))
test.head()

Unnamed: 0_level_0,level,days_registered,weekend_perc,weekend_target_perc,unique_artists_last5,roll_advert_perc_last5,error_perc_last5,about_perc_last5,add_playlist_perc_last5,thumbs_up_perc_last5,...,thumbs_down_perc_last10,thumbs_up_down_perc_last10,thumbs_up_trend,thumbs_down_trend,thumbs_up_down_trend,paid_perc,num_sessions,avg_songs_per_session,avg_session_length,days_since_last_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000025,paid,102,6.882793,30,82.103321,0.553506,0.184502,0.0,2.767528,4.059041,...,0.331858,4.646018,-0.918836,0.221647,-0.918836,97.306733,17.0,97.764706,6.746552,1.0
1000035,paid,38,0.250627,30,90.031153,1.246106,0.0,0.623053,2.803738,8.411215,...,0.308642,8.024691,0.077882,0.002885,0.077882,59.899749,7.0,46.285714,3.059206,0.0
1000083,paid,43,0.0,30,88.17734,0.985222,0.0,0.0,1.231527,4.187192,...,0.406504,3.861789,-0.081101,0.086107,-0.081101,62.583893,11.0,45.545455,3.101742,7.0
1000103,paid,28,0.0,30,100.0,7.692308,0.0,0.0,2.564103,0.0,...,2.564103,-2.564103,0.0,0.0,0.0,11.764706,1.0,39.0,2.706667,15.0
1000164,paid,69,3.649635,30,90.045249,3.167421,0.0,0.452489,3.167421,2.714932,...,0.598802,3.592814,-1.476685,0.306175,-1.476685,45.742092,9.0,37.111111,2.532253,0.0


In [6]:
def label_builder(df: pd.DataFrame,
                 cutoff_date: pd.Timestamp,
                 window_size: int = 10,
                 buffer: int = 3) -> pd.Series:
    
    # Define the time window including buffer period
    window_end = cutoff_date + pd.Timedelta(days=window_size)
    buffer_end = window_end + pd.Timedelta(days=buffer)
    window_users = df.loc[df['time'] <= cutoff_date, 'userId'].unique()

    # Get the cancellation time for each user
    cancel_time = (
        df.loc[df['page'] == 'Cancellation Confirmation']
          .groupby('userId')['time']
          .min()
          .reindex(window_users)
    )

    # Set target labels based on cancellation time
    y = pd.Series(0, index=window_users, name='target')
    y[cancel_time <= cutoff_date] = np.nan
    y[(cancel_time > window_end) & (cancel_time <= buffer_end)] = np.nan
    y[(cancel_time > cutoff_date) & (cancel_time <= window_end)] = 1

    return y

In [7]:
test_y = label_builder(df_raw, pd.Timestamp('2018-10-20'))
test_y.head()

1749042    1.0
1563081    0.0
1697168    0.0
1222580    NaN
1714398    0.0
Name: target, dtype: float64

In [8]:
def window_builder(df: pd.DataFrame,
                start_date,
                end_date,
                *,
                step_days: int = 7,
                window_size: int = 10,
                buffer: int = 3,
                corr_threshold: float = 0.95,
                categorical_cols=('level',),
                verbose: bool = True):

    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)

    all_windows = []
    current = start_date

    if verbose:
        print(f'Generating rolling training data from {start_date.date()} to {end_date.date()}...')

    while current <= end_date:
        if verbose:
            print(f'  - Processing window: {current.date()}')

        feats = feature_builder(df, current)
        labels = label_builder(df, current, window_size=window_size, buffer=buffer)

        labels = labels.reindex(feats.index)
        mask = labels.notna()

        window = feats.loc[mask].copy()
        window['target'] = labels.loc[mask].astype(int)
        window['snapshot_date'] = current

        all_windows.append(window)
        current += pd.Timedelta(days=step_days)

    df_window = pd.concat(all_windows, axis=0)

    # Drop userId index
    groups = df_window.index.to_numpy()
    df_window = df_window.reset_index(drop=True)

    # Define X and y
    X = df_window.drop(columns=['target', 'snapshot_date'], errors='ignore')
    y = df_window['target'].astype(int)

    # Mark categoricals as category dtype
    for c in categorical_cols:
        if c in X.columns:
            X[c] = X[c].astype('category')

    # Drop highly correlated numeric columns
    dropped_cols = []
    if corr_threshold is not None:
        X_num = X.select_dtypes(include=[np.number])
        if X_num.shape[1] >= 2:
            corr = X_num.corr().abs()
            upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
            dropped_cols = [col for col in upper.columns if (upper[col] > corr_threshold).any()]
            if verbose:
                print(f'Dropping correlated (>{corr_threshold}): {dropped_cols}')
            X = X.drop(columns=dropped_cols, errors='ignore')

    return X, y, groups, df_window


start_dt = pd.Timestamp('2018-10-01')
end_dt = pd.Timestamp('2018-11-05')

X_window, y_window, groups_window, df_window = window_builder(
    df_raw,
    start_dt,
    end_dt
 )

print(f'Total Samples: {len(df_window)}')

Generating rolling training data from 2018-10-01 to 2018-11-05...
  - Processing window: 2018-10-01
  - Processing window: 2018-10-08
  - Processing window: 2018-10-15
  - Processing window: 2018-10-22
  - Processing window: 2018-10-29
  - Processing window: 2018-11-05
Dropping correlated (>0.95): ['thumbs_up_perc_last10', 'thumbs_down_perc_last10', 'thumbs_up_down_perc_last10', 'thumbs_up_down_trend', 'avg_session_length']
Total Samples: 72948


In [None]:
# === Simple Tests ===
print("Running tests...")

# Test 1: feature_builder
test_features = feature_builder(df_raw, pd.Timestamp('2018-10-20'))
assert isinstance(test_features, pd.DataFrame), "feature_builder should return DataFrame"
assert len(test_features) > 0, "feature_builder should have rows"
assert 'level' in test_features.columns, "Should have level column"
assert test_features['days_registered'].min() >= 0, "Days registered should be non-negative"
print("✓ feature_builder tests passed")

# Test 2: label_builder
test_labels = label_builder(df_raw, pd.Timestamp('2018-10-20'))
assert isinstance(test_labels, pd.Series), "label_builder should return Series"
assert test_labels.isin([0, 1]).sum() > 0, "Should have 0/1 labels"
assert (test_labels.dropna() >= 0).all(), "Labels should be non-negative"
print("✓ label_builder tests passed")

# Test 3: window_builder
X_test_wb, y_test_wb, groups_test_wb, df_test_wb = window_builder(
    df_raw, '2018-10-01', '2018-10-08', verbose=False
)
assert len(X_test_wb) == len(y_test_wb), "X and y should have same length"
assert len(groups_test_wb) == len(y_test_wb), "Groups should match y length"
assert y_test_wb.isin([0, 1]).all(), "y should only contain 0/1"
print("✓ window_builder tests passed")

print("\n" + "="*60)
print("All tests passed! Starting Optuna optimization...")
print("="*60 + "\n")

# Run Optuna optimization
import optuna
from sklearn.model_selection import GroupKFold
from sklearn.metrics import balanced_accuracy_score

def objective(trial):
    """Optuna objective function optimizing for balanced accuracy."""
    # Sample hyperparameters
    params = {
        'objective': 'binary',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_jobs': -1,
        'is_unbalance': True,
        'verbose': -1
    }
    
    # Cross-validation with GroupKFold to prevent data leakage
    gkf = GroupKFold(n_splits=5)
    scores = []
    
    for train_idx, val_idx in gkf.split(X_window, y_window, groups=groups_window):
        X_train, X_val = X_window.iloc[train_idx], X_window.iloc[val_idx]
        y_train, y_val = y_window.iloc[train_idx], y_window.iloc[val_idx]
        
        model = lgb.LGBMClassifier(**params, random_state=42)
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_val)
        score = balanced_accuracy_score(y_val, y_pred)
        scores.append(score)
    
    return np.mean(scores)

# Run optimization study
study = optuna.create_study(direction='maximize', study_name='churn_lgb_balanced_acc')
study.optimize(objective, n_trials=50, show_progress_bar=True)

print(f'\n{"="*60}')
print(f'Optimization Complete!')
print(f'{"="*60}')
print(f'Best Balanced Accuracy: {study.best_value:.4f}')
print(f'\nBest Hyperparameters:')
for key, value in study.best_params.items():
    print(f'  {key}: {value}')
print(f'{"="*60}\n')

# Train final ensemble with optimized params
best_params = study.best_params.copy()
best_params.update({'objective': 'binary', 'n_jobs': -1, 'is_unbalance': True, 'verbose': -1})

optimized_models = []
print("Training final ensemble with optimized hyperparameters...")
for i in range(5):
    print(f'  - Training Optimized Model {i+1}/5...')
    model = lgb.LGBMClassifier(**best_params, random_state=42+i)
    model.fit(X_window, y_window)
    optimized_models.append(model)

print("\nOptimized models ready for prediction!")

Running tests...
✓ feature_builder tests passed
✓ label_builder tests passed
✓ window_builder tests passed

All tests passed! Starting Optuna optimization...



[I 2025-12-14 03:31:04,160] A new study created in memory with name: churn_lgb_balanced_acc


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-12-14 03:32:15,840] Trial 0 finished with value: 0.5277756554869745 and parameters: {'n_estimators': 1000, 'learning_rate': 0.21200414989951172, 'num_leaves': 63, 'max_depth': 8, 'min_child_samples': 89, 'subsample': 0.887718183569945, 'colsample_bytree': 0.5032136468816937, 'reg_alpha': 0.07969091437748382, 'reg_lambda': 1.4016564392988256}. Best is trial 0 with value: 0.5277756554869745.
[I 2025-12-14 03:32:51,682] Trial 1 finished with value: 0.5403489973805616 and parameters: {'n_estimators': 1000, 'learning_rate': 0.1778172590148665, 'num_leaves': 31, 'max_depth': 7, 'min_child_samples': 13, 'subsample': 0.7914468512238524, 'colsample_bytree': 0.5972443122993195, 'reg_alpha': 1.7509928071255452e-08, 'reg_lambda': 3.494754720266377e-08}. Best is trial 1 with value: 0.5403489973805616.
[I 2025-12-14 03:33:02,327] Trial 2 finished with value: 0.6727030355650021 and parameters: {'n_estimators': 600, 'learning_rate': 0.08324970784356568, 'num_leaves': 49, 'max_depth': 3, 'min_c

In [None]:
threshold = 0.62

# Generate features for test set
test_date = df_test['time'].max()
X_test = feature_builder(df_test, test_date)

# Match training dtype / categories for categoricals
if 'level' in X_test.columns:
    X_test['level'] = X_test['level'].astype('category')
    if 'level' in X_window.columns and str(X_window['level'].dtype) == 'category':
        X_test['level'] = X_test['level'].cat.set_categories(X_window['level'].cat.categories)

model_test = optimized_models[0]
trained_feature_names = list(model_test.booster_.feature_name())

# Align columns to training features
X_final = X_test.reindex(columns=trained_feature_names, fill_value=0)

total_prob = np.zeros(len(X_final), dtype=float)

# Aggregate predictions from optimized models
for m in optimized_models:
    prob = m.predict_proba(X_final)[:, 1]
    total_prob += prob

test_probs = total_prob / len(optimized_models)
predict_labels = (test_probs >= threshold).astype(int)

# Create submission file
submission = pd.DataFrame({'id': X_test.index, 'target': predict_labels})
submission.to_csv(root + '/data/submission_nopipeline.csv', index=False)

print('Using threshold:', float(threshold))
print('Predicted positives:', int(predict_labels.sum()), 'out of', int(len(predict_labels)))

Using threshold: 0.625
Predicted positives: 1276 out of 2904
