<a href="https://colab.research.google.com/github/migub/recommender-systems/blob/main/Notebooks/05_Hybrid_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Logistic Regression


In [1]:
## 1. Setup
!pip install scikit-surprise lightgbm imbalanced-learn

import numpy as np
import pandas as pd

# For model building
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)

# LightGBM
import lightgbm as lgb

# For matrix factorization (Surprise)
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.model_selection import GridSearchCV as surprise_GridSearch
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
## 2. Data Loading
df = pd.read_csv("/content/drive/MyDrive/Recommender_Systems/train_preprocessed.csv")

In [10]:
## 3. Prepare Training Data
df = df.drop(columns=['release_date', 'ts_listen'], errors='ignore')  # for example

# Convert booleans to int if needed
if df['is_listened'].dtype == bool:
    df['is_listened'] = df['is_listened'].astype(int)

# Let's separate the target
target_col = 'is_listened'
y = df[target_col]
X = df.drop(columns=[target_col])

# 2.2 Train-Validation Split for final ensemble
# We'll hold out 20% for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Shapes:")
print("  X_train:", X_train.shape)
print("  y_train:", y_train.shape)
print("  X_val:  ", X_val.shape)
print("  y_val:  ", y_val.shape)

Shapes:
  X_train: (5699117, 15)
  y_train: (5699117,)
  X_val:   (1424780, 15)
  y_val:   (1424780,)


In [11]:
## 4. MATRIX FACTORIZATION (using Surprise SVD)
train_df = pd.DataFrame({
    'user_id': X_train['user_id'],
    'item_id': X_train['media_id'],
    'rating': y_train  # 0 or 1
})

val_df = pd.DataFrame({
    'user_id': X_val['user_id'],
    'item_id': X_val['media_id'],
    'rating': y_val
})

# Surprise requires a "Reader" that defines the rating_scale
reader = Reader(rating_scale=(0, 1))

# Build the full Surprise dataset from train_df
train_data = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
trainset = train_data.build_full_trainset()

# We'll also build a separate "testset" object from val_df for evaluating predictions
# Surprise's testset format is list of (user, item, rating)
valset = list(val_df[['user_id','item_id','rating']].itertuples(index=False, name=None))

# 4.2 Train an SVD model
svd_model = SVD(n_factors=50, random_state=42)  # tweak n_factors as needed
svd_model.fit(trainset)

# 4.3 Generate predictions on validation set
# We'll store these predictions as a Pandas Series, aligned with X_val order
val_preds_svd = []
for row in valset:
    user, item, true_rating = row
    pred = svd_model.predict(user, item)
    val_preds_svd.append(pred.est)

# Convert list of predictions to a Series
val_preds_svd = pd.Series(val_preds_svd, index=X_val.index)

In [14]:
## 5. LOGISTIC REGRESSION
cat_cols = ['platform_name', 'platform_family', 'listen_type', 'user_gender', 'listen_hour_period', 'listen_weekpart']
num_cols = ['context_type', 'media_duration', 'user_age', 'song_age']

# Subset X_train, X_val
X_train_lr = X_train[cat_cols + num_cols].copy()
X_val_lr   = X_val[cat_cols + num_cols].copy()

# Use OneHotEncoder from scikit-learn
ohe = OneHotEncoder(drop='first', handle_unknown='ignore')
ohe.fit(X_train_lr[cat_cols])

# Transform cat cols
X_train_cat_encoded = ohe.transform(X_train_lr[cat_cols])
X_val_cat_encoded   = ohe.transform(X_val_lr[cat_cols])

# Combine with numeric columns
X_train_num = X_train_lr[num_cols].fillna(0).to_numpy()
X_val_num   = X_val_lr[num_cols].fillna(0).to_numpy()

# We'll use scipy's sparse hstack to combine
from scipy.sparse import hstack
X_train_final_lr = hstack([X_train_cat_encoded, X_train_num])
X_val_final_lr   = hstack([X_val_cat_encoded,   X_val_num])

# 4.2 Fit Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_final_lr, y_train)

# 4.3 Predict probabilities on validation set
val_preds_lr = lr_model.predict_proba(X_val_final_lr)[:, 1]
val_preds_lr = pd.Series(val_preds_lr, index=X_val.index)

In [15]:
## 6. LIGHTGBM
X_train_lgb = X_train_lr.copy()
X_val_lgb   = X_val_lr.copy()

# We'll reuse the same one-hot encoding logic from above
X_train_cat_encoded_lgb = ohe.transform(X_train_lgb[cat_cols])
X_val_cat_encoded_lgb   = ohe.transform(X_val_lgb[cat_cols])

X_train_num_lgb = X_train_lgb[num_cols].fillna(0).to_numpy()
X_val_num_lgb   = X_val_lgb[num_cols].fillna(0).to_numpy()

X_train_final_lgb = hstack([X_train_cat_encoded_lgb, X_train_num_lgb])
X_val_final_lgb   = hstack([X_val_cat_encoded_lgb,   X_val_num_lgb])

# Convert sparse matrix to LightGBM Dataset
# (You could train directly on the sparse matrix, but let's show the typical approach.)
train_dataset_lgb = lgb.Dataset(X_train_final_lgb, label=y_train)

# 6.2 Train LightGBM model (basic params)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'seed': 42
}

lgb_model = lgb.train(
    params,
    train_dataset_lgb,
    num_boost_round=100,  # for example
)

# 6.3 Predictions on validation set
val_preds_lgb = lgb_model.predict(X_val_final_lgb)
val_preds_lgb = pd.Series(val_preds_lgb, index=X_val.index)



In [18]:
## 7. ENSEMBLE: STACKING (Meta-Model)
# 7.1 Create a "meta-feature" dataframe for the validation set
meta_val_df = pd.DataFrame({
    'svd': val_preds_svd,
    'lr': val_preds_lr,
    'lgb': val_preds_lgb
}, index=X_val.index)

# 7.2 Train a meta-model (Logistic Regression) on these 3 features
meta_model = LogisticRegression()
meta_model.fit(meta_val_df, y_val)
final_val_preds = meta_model.predict_proba(meta_val_df)[:, 1]


In [19]:
## 8. EVALUATION
def print_metrics(y_true, y_probs, threshold=0.5):
    preds = (y_probs >= threshold).astype(int)
    acc   = accuracy_score(y_true, preds)
    prec  = precision_score(y_true, preds, zero_division=0)
    rec   = recall_score(y_true, preds, zero_division=0)
    f1    = f1_score(y_true, preds, zero_division=0)
    auc   = roc_auc_score(y_true, y_probs)
    print(f"Threshold = {threshold}")
    print(f" Accuracy:  {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall:    {rec:.4f}")
    print(f" F1 Score:  {f1:.4f}")
    print(f" ROC AUC:   {auc:.4f}")
    print()

# 7.1 Evaluate each base model
print("=== Base Models ===")
print("SVD Predictions:")
print_metrics(y_val, val_preds_svd)

print("Logistic Regression:")
print_metrics(y_val, val_preds_lr)

print("LightGBM:")
print_metrics(y_val, val_preds_lgb)

# 7.2 Evaluate the stacked model
print("=== Stacked Model ===")
print_metrics(y_val, final_val_preds)

# 7.3 Confusion matrix for the stacked model at threshold=0.5
from sklearn.metrics import confusion_matrix
stacked_preds = (final_val_preds >= 0.5).astype(int)
cm = confusion_matrix(y_val, stacked_preds)
print("Confusion Matrix (Stacked Model):")
print(cm)

=== Base Models ===
SVD Predictions:
Threshold = 0.5
 Accuracy:  0.7600
 Precision: 0.7861
 Recall:    0.8910
 F1 Score:  0.8353
 ROC AUC:   0.7987

Logistic Regression:
Threshold = 0.5
 Accuracy:  0.6898
 Precision: 0.6969
 Recall:    0.9655
 F1 Score:  0.8095
 ROC AUC:   0.6498

LightGBM:
Threshold = 0.5
 Accuracy:  0.6991
 Precision: 0.7144
 Recall:    0.9319
 F1 Score:  0.8088
 ROC AUC:   0.6755

=== Stacked Model ===
Threshold = 0.5
 Accuracy:  0.7628
 Precision: 0.7891
 Recall:    0.8906
 F1 Score:  0.8368
 ROC AUC:   0.8033

Confusion Matrix (Stacked Model):
[[220366 231562]
 [106433 866419]]
