In [1]:
# 3_model_development.ipynb

# Model Development for KuaiRec Recommender System
# ================================================
#
# This notebook covers the third task of the project:
# 1. Implementing collaborative filtering models
# 2. Implementing content-based filtering models
# 3. Implementing sequence-aware models
# 4. Implementing hybrid approaches
# 5. Evaluating models on the training set

# Import necessary libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
import pickle
from datetime import datetime
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Make the parent of the current working directory importable so that the
# project's `src` package can be resolved from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again and again.
project_root = os.path.abspath("../")
if project_root not in sys.path:
    sys.path.append(project_root)

# Project-local model wrappers used in the sections below
from src.models.collaborative import ALSModel
from src.models.content_based import ContentBasedModel
from src.models.sequence_aware import SequentialRules
from src.models.hybrid import LightGBMModel

# Locations of the preprocessed inputs and of the trained-model artifacts
processed_dir = "../data/processed"
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)  # no error if the directory already exists


def _read_processed(csv_name):
    """Read one CSV file from the processed-data directory into a DataFrame."""
    return pd.read_csv(os.path.join(processed_dir, csv_name), low_memory=True)


# Load the three processed tables
print("Loading processed data...")
train_features = _read_processed("train_features.csv")
user_features = _read_processed("user_features.csv")
item_features = _read_processed("item_features.csv")

# Obtain the user-item interaction matrix: reuse the cached artifacts when all
# three files can be opened, otherwise derive them from the training table and
# write them back to disk for the next run.
matrix_path = os.path.join(processed_dir, "interaction_matrix.npz")
user_index_path = os.path.join(processed_dir, "user_indices.pkl")
item_index_path = os.path.join(processed_dir, "item_indices.pkl")

try:
    interaction_matrix = sparse.load_npz(matrix_path)
    # NOTE: unpickling can execute arbitrary code; only load index files that
    # were written by this project.
    with open(user_index_path, 'rb') as user_file:
        user_indices = pickle.load(user_file)
    with open(item_index_path, 'rb') as item_file:
        item_indices = pickle.load(item_file)
    print("Loaded interaction matrix from file.")
except FileNotFoundError:
    # Any missing artifact lands here and triggers a full rebuild.
    print("Creating interaction matrix...")

    # Give every distinct user / video ID a position, in order of first appearance
    user_indices = {uid: pos for pos, uid in enumerate(train_features['user_id'].unique())}
    item_indices = {vid: pos for pos, vid in enumerate(train_features['video_id'].unique())}

    # Translate the raw IDs of each interaction into matrix coordinates
    user_rows = train_features['user_id'].map(user_indices).values
    item_cols = train_features['video_id'].map(item_indices).values

    # The watch ratio is the value stored for each interaction
    watch_ratios = train_features['watch_ratio'].values

    # Assemble the users x items CSR matrix (scipy sums entries that share a cell)
    interaction_matrix = sparse.csr_matrix(
        (watch_ratios, (user_rows, item_cols)),
        shape=(len(user_indices), len(item_indices)),
    )

    # Persist the matrix together with both ID -> position mappings
    sparse.save_npz(matrix_path, interaction_matrix)
    with open(user_index_path, 'wb') as user_file:
        pickle.dump(user_indices, user_file)
    with open(item_index_path, 'wb') as item_file:
        pickle.dump(item_indices, item_file)

# Report the size of what was loaded; density = stored entries / total cells
print(f"Data loaded. Train features shape: {train_features.shape}")
print(f"Interaction matrix shape: {interaction_matrix.shape}, density: {interaction_matrix.nnz / (interaction_matrix.shape[0] * interaction_matrix.shape[1]) * 100:.2f}%")

# 1. Collaborative Filtering Models
# ---------------------------------

print("\n--- Training Collaborative Filtering Models ---")

# Fit the ALSModel wrapper on the interaction matrix, then persist it
print("\nTraining ALS model...")
als_model_path = os.path.join(models_dir, "als_model.pkl")
als_model = ALSModel(factors=100)
als_model.fit(interaction_matrix)
als_model.save(als_model_path)
print("ALS model trained and saved.")


# 2. Content-Based Filtering
# --------------------------

print("\n--- Training Content-Based Filtering Model ---")
# The content model is fitted on the item feature table together with the
# training interactions, then written to the models directory.
content_model_path = os.path.join(models_dir, "content_model.pkl")
content_model = ContentBasedModel()
content_model.fit(item_features, train_features)
content_model.save(content_model_path)
print("Content-based model trained and saved.")

# 3. Sequence-Aware Model
# -----------------------

print("\n--- Training Sequence-Aware Model ---")
# Sequences need a time ordering, so this step only runs when the training
# table has a 'timestamp' column; otherwise it is skipped with a message.
if 'timestamp' not in train_features.columns:
    print("Timestamp column not found in training data. Skipping sequence-aware model.")
else:
    # Order each user's interactions by time before fitting
    train_features_sorted = train_features.sort_values(by=['user_id', 'timestamp'])

    seq_model = SequentialRules(max_sequence_length=5, min_support=2)
    seq_model.fit(train_features_sorted)
    seq_model.save(os.path.join(models_dir, "sequential_model.pkl"))
    print("Sequential model trained and saved.")

# 4. Hybrid Model
# ---------------

print("\n--- Training Hybrid Model ---")
# The LightGBMModel wrapper is fitted directly on the training feature table
# and saved under the "hybrid_model" name in the models directory.
hybrid_model_path = os.path.join(models_dir, "hybrid_model")
hybrid_model = LightGBMModel()
hybrid_model.fit(train_features)
hybrid_model.save(hybrid_model_path)
print("Hybrid model trained and saved.")


# 5. Summary
# ----------
# Recap of what this cell trained and saved.

print("\n=== Model Development Summary ===")
print("1. Collaborative Filtering:")
print("   - Trained ALS model with 100 factors")

print("\n2. Content-Based Filtering:")
print("   - Trained model using video categories and engagement metrics")

print("\n3. Sequence-Aware Model:")
# Report on the same condition that gates sequence-model training. Probing
# locals() for 'seq_model' is unreliable in a notebook: a model left in the
# kernel by an earlier run would be reported as trained even when this run
# skipped training because the timestamp column is missing.
if 'timestamp' in train_features.columns:
    print("   - Trained sequential rules model with max sequence length of 5")
else:
    print("   - Not trained due to missing timestamp data")

print("\n4. Hybrid Model:")
print("   - Trained LightGBM model combining user, item, and interaction features")

print("\nAll models have been saved to the models directory.")
print("Next step: Implement the recommendation algorithm to generate recommendations for users.")

Loading processed data...
Loaded interaction matrix from file.
Data loaded. Train features shape: (3741835, 21)
Interaction matrix shape: (1411, 3327), density: 79.71%

--- Training Collaborative Filtering Models ---

Training ALS model...
ALS model trained and saved.

--- Training Content-Based Filtering Model ---
Content-based model trained and saved.

--- Training Sequence-Aware Model ---
Sequential model trained and saved.

--- Training Hybrid Model ---
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.793290
[LightGBM] [Info] Total Bins 1608
[LightGBM] [Info] Number of data points in the train set: 2993468, number of used features: 14
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 12 dense feature groups 