In [None]:
# --- Install Dependencies (run this cell first in Colab) ---
# `supabase` provides the client that the imports cell pulls in (create_client, Client).
# `gcsfs` is presumably what lets pandas open the gs:// paths from the config cell — TODO confirm.
# NOTE(review): neither package is version-pinned, so a re-run may install different releases.
!pip install -q supabase gcsfs

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.0/48.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.9/123.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ============================================================
# CONFIGURATION - All hyperparameters and hardcoded variables
# ============================================================
import os
import warnings

# --- Data Paths ---
RATINGS_PATH = "gs://kanta7/ratings.csv"
MOVIES_PATH = "gs://kanta7/movies.csv"

# --- Supabase Configuration ---
# The project URL is not a secret; it keeps its previous value as the default
# and can be overridden through the SUPABASE_URL environment variable.
SUPABASE_URL = os.environ.get("SUPABASE_URL", "https://mgagzgashuexrnkoqkzy.supabase.co")

# SECURITY: the API key must never be stored in the notebook. It is read from
# the SUPABASE_KEY environment variable instead. In Colab, set it before this
# cell runs, e.g. `os.environ["SUPABASE_KEY"] = getpass.getpass()`.
# Any key that was ever committed in this file should be treated as leaked and
# rotated in the Supabase dashboard.
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "")
if not SUPABASE_KEY:
    warnings.warn(
        "SUPABASE_KEY is not set; export it in the environment before running "
        "the cells that talk to Supabase."
    )

# --- Supabase Table Names ---
RATINGS_TABLE = "movie_ratings"
MOVIE_EMBEDDINGS_TABLE = "movie_embeddings"
USER_EMBEDDINGS_TABLE = "user_embeddings"
GENRE_LAYERS_TABLE = "genre_layers"

# --- Data Processing ---
# Use 0.0 to train on ALL data (no validation)
TEST_SIZE = 0.0
RANDOM_STATE = 42
# Columns of the movies table whose name starts with this prefix are treated as genre flags.
GENRE_COLUMN_PREFIX = "g"

# --- DataLoader Settings ---
BATCH_SIZE = 2048
NUM_WORKERS = 2
PIN_MEMORY = True

# --- Model Hyperparameters ---
N_FACTORS = 64
MODEL_DROPOUT = 0.1
EMBEDDING_INIT_STD = 0.01
GENRE_WEIGHT = 0.5  # Weight for genre influence on movie embeddings (reduce to lower genre impact)

# --- Training Hyperparameters ---
EPOCHS = 5
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-6
CLIP_GRAD_NORM = None  # Set to a float value to enable gradient clipping

# --- Export Settings ---
CHUNK_SIZE = 500  # For batch inserts to Supabase
GENRE_LAYER_NAME = "hybrid_mf_v1"  # Name for the genre layer export

In [None]:
# --- Imports ---
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from supabase import create_client, Client

In [None]:
# --- Google Cloud Authentication (required for GCS bucket access) ---
# NOTE(review): this import lives outside the main imports cell, presumably
# because `google.colab` only exists inside a Colab runtime — confirm.
from google.colab import auth
# Runs Colab's user sign-in; presumably this is what authorises the gs:// reads
# configured in RATINGS_PATH / MOVIES_PATH — TODO confirm.
auth.authenticate_user()

In [None]:
# ---------- Dataset ----------
class RatingsDataset(Dataset):
    """Row-aligned (user, movie, rating, genre-vector) samples held as tensors.

    Args:
        user_idx: Integer user indices, one per rating.
        movie_idx: Integer movie indices, one per rating.
        ratings: Rating values, one per sample.
        genre_matrix: One genre vector per rating (row ``i`` belongs to
            sample ``i``), i.e. already gathered per rating rather than per movie.

    Raises:
        AssertionError: If the four inputs do not all have the same length.
    """

    def __init__(self, user_idx, movie_idx, ratings, genre_matrix):
        n_samples = len(ratings)
        assert len(user_idx) == len(movie_idx) == n_samples, (
            "user_idx, movie_idx and ratings must have the same length"
        )
        # The genre rows are indexed with the same sample index in __getitem__,
        # so a shorter/longer genre array would fail late or silently misalign.
        assert len(genre_matrix) == n_samples, (
            "genre_matrix must provide exactly one row per rating"
        )
        self.users = torch.tensor(user_idx, dtype=torch.long)
        self.movies = torch.tensor(movie_idx, dtype=torch.long)
        self.ratings = torch.tensor(ratings, dtype=torch.float32)
        self.genres = torch.tensor(genre_matrix, dtype=torch.float32)

    def __len__(self):
        """Number of rating samples."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return the ``(user, movie, rating, genre_vector)`` tensors for sample ``idx``."""
        return self.users[idx], self.movies[idx], self.ratings[idx], self.genres[idx]

In [None]:
# ---------- Utilities ----------
def build_id_maps(df: pd.DataFrame, user_col='userId', movie_col='movieId'):
    """Assign contiguous 0-based indices to the user and movie ids found in ``df``.

    Ids are numbered in ascending sorted order. The ``rating`` column is
    returned as a float32 array.

    Returns:
        ``(user2idx, movie2idx, user_idx, movie_idx, ratings)``: the two
        id -> index dicts, the per-row index arrays, and the ratings array.
    """
    def _index_by_sorted_id(column):
        # Distinct ids in ascending order, paired with their position.
        ordered_ids = np.sort(column.unique())
        return dict(zip(ordered_ids, range(len(ordered_ids))))

    user2idx = _index_by_sorted_id(df[user_col])
    movie2idx = _index_by_sorted_id(df[movie_col])

    ratings = df['rating'].to_numpy(dtype=np.float32)
    user_idx = df[user_col].map(user2idx).to_numpy()
    movie_idx = df[movie_col].map(movie2idx).to_numpy()
    return user2idx, movie2idx, user_idx, movie_idx, ratings

In [None]:
# ---------- Model ----------
class HybridMatrixFactorization(nn.Module):
    """Biased matrix factorisation whose movie vector is shifted by a genre projection.

    The prediction for a (user, movie) pair is
    ``<p_u, q_i + genre_weight * Linear(genre_vec)> + b_u + b_i + global_bias``.

    Args:
        n_users: Number of rows in the user embedding/bias tables.
        n_movies: Number of rows in the movie embedding/bias tables.
        n_factors: Embedding dimensionality.
        n_genres: Length of the genre vector fed to the linear genre layer.
        dropout: Dropout probability applied to both vectors before the dot
            product; ``0`` disables dropout entirely.
        genre_weight: Scale applied to the genre projection before it is added
            to the movie embedding.
        init_std: Standard deviation for the normal initialisation of the user
            and movie embeddings. ``None`` (default) uses the module-level
            ``EMBEDDING_INIT_STD``, which is the previous behaviour.
    """

    def __init__(self, n_users, n_movies, n_factors, n_genres, dropout=0.0, genre_weight=1.0,
                 init_std=None):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.genre_layer = nn.Linear(n_genres, n_factors)
        self.user_bias = nn.Embedding(n_users, 1)
        self.movie_bias = nn.Embedding(n_movies, 1)
        self.global_bias = nn.Parameter(torch.tensor([0.0]))
        self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.genre_weight = genre_weight

        # Resolve the default lazily so the class only needs the global when
        # the caller does not pass an explicit value.
        if init_std is None:
            init_std = EMBEDDING_INIT_STD

        nn.init.normal_(self.user_factors.weight, std=init_std)
        nn.init.normal_(self.movie_factors.weight, std=init_std)
        nn.init.constant_(self.user_bias.weight, 0.0)
        nn.init.constant_(self.movie_bias.weight, 0.0)

    def forward(self, user, movie, genre_vec):
        """Predict one rating per row of the batch.

        Args:
            user: Long tensor of user indices.
            movie: Long tensor of movie indices.
            genre_vec: Float tensor with one genre vector per row.

        Returns:
            1-D tensor of predicted ratings.
        """
        pu = self.user_factors(user)
        qi = self.movie_factors(movie)
        genre_emb = self.genre_layer(genre_vec)
        # The genre projection shifts the learned movie vector.
        qi = qi + self.genre_weight * genre_emb
        pu = self.dropout(pu)
        qi = self.dropout(qi)
        dot = (pu * qi).sum(dim=1)
        b_u = self.user_bias(user).squeeze(1)
        b_i = self.movie_bias(movie).squeeze(1)
        return dot + b_u + b_i + self.global_bias

In [None]:
# ---------- Training / Evaluation ----------
def rmse(preds, targets):
    """Root-mean-squared error between two tensors, returned as a Python float."""
    residual = preds - targets
    mean_squared = (residual * residual).mean()
    return math.sqrt(mean_squared.item())

def evaluate(model, dataloader, device):
    """Return the RMSE of ``model`` over every batch yielded by ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Predictions and targets are gathered on the CPU and scored in one pass.
    """
    model.eval()
    target_chunks = []
    pred_chunks = []
    with torch.no_grad():
        for batch in dataloader:
            users, movies, ratings, genres = (t.to(device) for t in batch)
            batch_preds = model(users, movies, genres)
            target_chunks.append(ratings.cpu())
            pred_chunks.append(batch_preds.cpu())
    all_targets = torch.cat(target_chunks)
    all_preds = torch.cat(pred_chunks)
    return rmse(all_preds, all_targets)

def train_fn(model, train_loader, val_loader, device,
             epochs=EPOCHS, lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY, clip_grad=CLIP_GRAD_NORM):
    """Train ``model`` with Adam on an MSE objective and report RMSE every epoch.

    Args:
        model: Module called as ``model(user, movie, genre_vec)``.
        train_loader: Yields ``(user, movie, rating, genre_vec)`` batches.
        val_loader: Optional loader with the same batch layout; when it is
            falsy the per-epoch metric is the training RMSE instead.
        device: Device every batch tensor is moved to.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        clip_grad: Max gradient norm; a falsy value disables clipping.

    Returns:
        The lowest per-epoch metric seen (validation RMSE when a validation
        loader is given, otherwise training RMSE).
    """
    loss_fn = nn.MSELoss()
    adam = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_metric = float('inf')

    for epoch in range(1, epochs + 1):
        model.train()
        squared_error_sum = 0.0
        samples_seen = 0
        for batch in train_loader:
            users, movies, targets, genres = (t.to(device) for t in batch)
            adam.zero_grad()
            batch_loss = loss_fn(model(users, movies, genres), targets)
            batch_loss.backward()
            if clip_grad:
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
            adam.step()
            # MSELoss averages over the batch, so weight it by the batch size
            # to recover the epoch-level mean afterwards.
            batch_count = targets.size(0)
            squared_error_sum += batch_loss.item() * batch_count
            samples_seen += batch_count

        train_rmse = math.sqrt(squared_error_sum / samples_seen)

        report = f"Epoch {epoch:03d} | train_rmse={train_rmse:.4f}"
        metric = train_rmse
        if val_loader:
            val_rmse = evaluate(model, val_loader, device)
            report += f" | val_rmse={val_rmse:.4f}"
            metric = val_rmse
        print(report)

        best_metric = min(best_metric, metric)

    return best_metric

In [None]:
# ---------- Load Data ----------
# Both tables are read directly from the CSV locations set in the config cell.
df = pd.read_csv(RATINGS_PATH)
movies_df = pd.read_csv(MOVIES_PATH)

# Movie ids are handled as integers, user ids as strings.
movies_df["movieId"] = movies_df["movieId"].astype(int)
df["userId"] = df["userId"].astype(str)

# Every movies column whose name begins with the configured prefix counts as a genre column.
genre_cols = list(filter(lambda name: name.startswith(GENRE_COLUMN_PREFIX), movies_df.columns))
n_genres = len(genre_cols)

print(f"Loaded {len(df)} ratings and {len(movies_df)} movies with {n_genres} genres")

Loaded 23585151 ratings and 62805 movies with 19 genres


In [None]:
# --- Supabase Connection ---
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

# Fetch extra ratings from Supabase.
# A single select() is capped by the API's max-rows setting (1000 on a default
# Supabase project), which would silently drop ratings beyond the cap. Page
# through the table with range() until an empty page comes back; advancing by
# the number of rows actually received keeps this correct even if the server
# cap is lower than the requested page size.
SUPABASE_PAGE_SIZE = 1000
supabase_rating_rows = []
while True:
    page_start = len(supabase_rating_rows)
    page = (
        supabase.table(RATINGS_TABLE)
        .select("user_id, movie_id, rating")
        # A fixed ordering keeps the pages consistent between requests.
        .order("user_id")
        .order("movie_id")
        .range(page_start, page_start + SUPABASE_PAGE_SIZE - 1)
        .execute()
    )
    if not page.data:
        break
    supabase_rating_rows.extend(page.data)

# Passing `columns` guarantees the expected columns exist even when no rows were returned.
df_supabase = pd.DataFrame(
    supabase_rating_rows, columns=["user_id", "movie_id", "rating"]
).rename(columns={
    "user_id": "userId", "movie_id": "movieId", "rating": "rating"
})
# The CSV ratings store userId as str; keep the merged column homogeneous so
# the ids can be sorted together when the index maps are built.
df_supabase["userId"] = df_supabase["userId"].astype(str)

# Skip the concat when nothing was fetched so an empty object-dtype frame
# cannot influence the dtypes of the main ratings table.
if not df_supabase.empty:
    df = pd.concat([df, df_supabase], ignore_index=True)

print(f"After merging Supabase data: {len(df)} total ratings")

After merging Supabase data: 23585154 total ratings


In [None]:
# ---------- Prepare Data ----------
# Hold out a validation split only when TEST_SIZE asks for one.
if TEST_SIZE > 0:
    train_df, val_df = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE)
else:
    train_df = df
    val_df = pd.DataFrame(columns=df.columns)

# Index maps are built from the training rows only.
user2idx, movie2idx, train_u, train_m, train_r = build_id_maps(train_df)

if not val_df.empty:
    # Filter validation set to known users/movies
    val_df = val_df[val_df['userId'].isin(user2idx) & val_df['movieId'].isin(movie2idx)]
    val_u = val_df['userId'].map(user2idx).to_numpy()
    val_m = val_df['movieId'].map(movie2idx).to_numpy()
    val_r = val_df['rating'].to_numpy(dtype=np.float32)
else:
    # Separate arrays with index-appropriate dtypes (the indices are not floats
    # and the three names should not alias one array).
    val_u = np.array([], dtype=np.int64)
    val_m = np.array([], dtype=np.int64)
    val_r = np.array([], dtype=np.float32)

# Build genre matrix aligned with movie indices.
# Movies that never appear in the ratings have no index and are skipped;
# rated movies missing from movies_df keep an all-zero genre row.
genre_matrix = np.zeros((len(movie2idx), n_genres), dtype=np.float32)
# Keep the last row per movieId, matching a row-by-row fill where later rows overwrite earlier ones.
movie_rows = movies_df.drop_duplicates(subset="movieId", keep="last")
movie_pos = movie_rows["movieId"].map(movie2idx)
has_index = movie_pos.notna().to_numpy()
genre_matrix[movie_pos.to_numpy()[has_index].astype(np.int64)] = (
    movie_rows.loc[has_index, genre_cols].to_numpy(dtype=np.float32)
)

# One genre row per rating, gathered by movie index.
train_genres = genre_matrix[train_m]
val_genres = genre_matrix[val_m] if len(val_m) > 0 else np.zeros((0, n_genres), dtype=np.float32)

num_users, num_movies = len(user2idx), len(movie2idx)
print(f"Training {num_users} users, {num_movies} movies, {n_genres} genres")
if TEST_SIZE > 0:
    print(f"Validation size: {len(val_df)}")
else:
    print("Using FULL dataset for training (no validation).")

Training 324487 users, 61191 movies, 19 genres


In [None]:
# ---------- DataLoaders ----------
# Settings shared by the training and validation loaders.
loader_options = dict(
    batch_size=BATCH_SIZE,
    pin_memory=PIN_MEMORY,
    num_workers=NUM_WORKERS,
)

# Only the training loader shuffles.
train_dataset = RatingsDataset(train_u, train_m, train_r, train_genres)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)

# A validation loader exists only when validation rows were prepared.
val_loader = None
if len(val_u) > 0:
    val_dataset = RatingsDataset(val_u, val_m, val_r, val_genres)
    val_loader = DataLoader(val_dataset, **loader_options)

In [None]:
# ---------- Model ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed torch's global RNG before any parameters are created so the random
# embedding initialisation is repeatable across Restart & Run All.
torch.manual_seed(RANDOM_STATE)

model = HybridMatrixFactorization(
    num_users,
    num_movies,
    N_FACTORS,
    n_genres,
    dropout=MODEL_DROPOUT,
    genre_weight=GENRE_WEIGHT
).to(device)

# Start the global bias at the mean training rating. Filling in place under
# no_grad keeps the registered parameter object (and its device) intact
# instead of swapping its storage through `.data`.
with torch.no_grad():
    model.global_bias.fill_(float(train_r.mean()))

print(f"Model initialized with {sum(p.numel() for p in model.parameters())} parameters")

Using device: cuda
Model initialized with 25070351 parameters


In [None]:
# ---------- Train ----------
best_val = train_fn(
    model,
    train_loader,
    val_loader,
    device,
    epochs=EPOCHS,
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    clip_grad=CLIP_GRAD_NORM
)
# train_fn falls back to the training RMSE when there is no validation loader,
# so label the number accordingly instead of always calling it "validation".
metric_label = "validation" if val_loader else "training"
print(f"Best {metric_label} RMSE: {best_val:.4f}")

Epoch 001 | train_rmse=1.7455 | val_rmse=1.6443
Epoch 002 | train_rmse=1.5953 | val_rmse=1.5948
Epoch 003 | train_rmse=1.5269 | val_rmse=1.5735
Epoch 004 | train_rmse=1.4760 | val_rmse=1.5667
Epoch 005 | train_rmse=1.4371 | val_rmse=1.5675
Best validation RMSE: 1.5667


In [None]:
# ---------- Export Movie Embeddings ----------
# Exported vector = learned movie factors + weighted genre projection, i.e. the
# same movie-side vector the model builds in forward() before dropout.
genre_tensor = torch.tensor(genre_matrix, dtype=torch.float32).to(device)
genre_offsets = model.genre_weight * model.genre_layer(genre_tensor)
movie_embeddings = model.movie_factors.weight + genre_offsets
movie_embeddings = np.nan_to_num(movie_embeddings.detach().cpu().numpy())

# Reverse the id -> index map so row i can be labelled with its movie id.
idx2movie = {position: movie_id for movie_id, position in movie2idx.items()}
df_movie_embeddings = pd.DataFrame({
    "movie_id": [int(idx2movie[position]) for position in range(len(idx2movie))],
    "embedding": movie_embeddings.tolist()
})

# Upsert new embeddings in chunks (avoids delete timeout)
records = df_movie_embeddings.to_dict(orient="records")
for chunk_start in range(0, len(records), CHUNK_SIZE):
    chunk = records[chunk_start:chunk_start + CHUNK_SIZE]
    supabase.table(MOVIE_EMBEDDINGS_TABLE).upsert(chunk, on_conflict='movie_id').execute()

print(f"Saved {len(records)} movie embeddings.")

Saved 61191 movie embeddings.


In [22]:
# ---------- Export User Embeddings ----------
user_embeddings_weights = model.user_factors.weight.detach().cpu().numpy()
user_embeddings_weights = np.nan_to_num(user_embeddings_weights)

# Supabase users only (handle empty df_supabase case)
if not df_supabase.empty and "userId" in df_supabase.columns:
    supabase_user_ids = set(df_supabase["userId"].unique())
else:
    supabase_user_ids = set()

idx2user = {v: k for k, v in user2idx.items()}

# 1. Collect TRAINED user embeddings
# Look the few Supabase users up directly instead of scanning every embedding
# row; sorting by row index keeps the records in embedding-row order.
trained_rows = sorted(user2idx[uid] for uid in supabase_user_ids if uid in user2idx)
trained_records = [
    {"user_id": idx2user[row], "embedding": user_embeddings_weights[row].tolist()}
    for row in trained_rows
]

print(f"Prepared {len(trained_records)} trained user embeddings.")

# 2. Handle 'Fold-in' Users (Cold Start)
# Fetch users who have preferences in user_genre_preferences table.
# The fetch is paged because a single select() is capped by the API's
# max-rows setting; paging stops at the first empty page.
print("Fetching user genre preferences...")
PREFS_PAGE_SIZE = 1000
try:
    pref_rows = []
    while True:
        pref_start = len(pref_rows)
        pref_page = (
            supabase.table("user_genre_preferences")
            .select("user_id, genre")
            # A fixed ordering keeps the pages consistent between requests.
            .order("user_id")
            .order("genre")
            .range(pref_start, pref_start + PREFS_PAGE_SIZE - 1)
            .execute()
        )
        if not pref_page.data:
            break
        pref_rows.extend(pref_page.data)
    df_prefs = pd.DataFrame(pref_rows)
except Exception as e:
    # Deliberately best-effort: fold-in users are optional, trained users still get exported.
    print(f"Skipping fold-in users (error fetching preferences): {e}")
    df_prefs = pd.DataFrame()

fold_in_records = []
if not df_prefs.empty:
    W_genre = model.genre_layer.weight.detach().cpu().numpy() # (n_factors, n_genres)
    b_genre = model.genre_layer.bias.detach().cpu().numpy()   # (n_factors,)

    # Column name -> position in genre_cols (first occurrence wins, like list.index).
    genre_col_index = {}
    for position, column in enumerate(genre_cols):
        genre_col_index.setdefault(column, position)

    trained_user_ids = {record["user_id"] for record in trained_records}

    # Group by user_id
    grouped = df_prefs.groupby("user_id")['genre'].apply(list)

    for uid, genres in grouped.items():
        if uid in trained_user_ids:
            continue # Already exported via training

        indices = []
        for g in genres:
            if not isinstance(g, str):
                continue  # e.g. a NULL genre; cannot be matched to a column
            # Map "Action" -> "gAction" based on GENRE_COLUMN_PREFIX
            target_col = GENRE_COLUMN_PREFIX + g
            if target_col in genre_col_index:
                indices.append(genre_col_index[target_col])
            # Fallback for exact match
            elif g in genre_col_index:
                indices.append(genre_col_index[g])

        if indices:
            # Mean of the selected genre columns plus the bias: the genre layer
            # applied to the average of the user's one-hot genre vectors.
            vectors = W_genre[:, indices]
            avg_vector = vectors.mean(axis=1) + b_genre
            fold_in_records.append({
                "user_id": uid,
                "embedding": avg_vector.tolist()
            })

print(f"Computed {len(fold_in_records)} fold-in user embeddings from preferences.")

# 3. Combine and Upsert
all_records = trained_records + fold_in_records

if all_records:
    # Safe to upsert
    for i in range(0, len(all_records), CHUNK_SIZE):
        chunk = all_records[i:i+CHUNK_SIZE]
        supabase.table(USER_EMBEDDINGS_TABLE).upsert(chunk, on_conflict='user_id').execute()

    print(f"Successfully saved {len(all_records)} TOTAL user embeddings.")
else:
    print("No user embeddings to export.")

Prepared 3 trained user embeddings.
Fetching user genre preferences...
Computed 2 fold-in user embeddings from preferences.
Successfully saved 5 TOTAL user embeddings.


In [23]:
# ----------- Export genre_layer -------------
# Pull the linear genre layer's parameters off the device as NumPy arrays.
W, b = (
    param.detach().cpu().numpy()
    for param in (model.genre_layer.weight, model.genre_layer.bias)
)  # W: (n_factors, n_genres), b: (n_factors,)

print(f"Genre layer weight shape: {W.shape}")
print(f"Genre layer bias shape: {b.shape}")

# Prepare the genre layer record for Supabase (arrays become plain lists for JSONB)
genre_layer_record = dict(
    name=GENRE_LAYER_NAME,
    genre_names=genre_cols,
    weight=W.tolist(),
    bias=b.tolist(),
)

# Replace any existing record with the same name: delete first, then insert.
# NOTE(review): the two calls are not atomic — if the insert fails, the old
# record is already gone.
supabase.table(GENRE_LAYERS_TABLE).delete().eq("name", GENRE_LAYER_NAME).execute()
supabase.table(GENRE_LAYERS_TABLE).insert(genre_layer_record).execute()

print(f"Genre layer '{GENRE_LAYER_NAME}' saved to {GENRE_LAYERS_TABLE}.")
print("All embeddings saved successfully!")

Genre layer weight shape: (64, 19)
Genre layer bias shape: (64,)
Genre layer 'hybrid_mf_v1' saved to genre_layers.
All embeddings saved successfully!
