In [None]:
import pandas as pd
import numpy as np

# Per-movie table; later cells read its "movieId" column and the columns
# whose names start with "emb_".
df_embs = pd.read_csv("df_movie_embs.csv")

# Tab-separated ratings file with no header row. The timestamp column is
# discarded immediately because nothing downstream reads it.
df_ratings = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    names=["userId", "movieId", "rating", "timestamp"],
).drop(columns=["timestamp"])


In [None]:
# Mean rating per movie, as a two-column frame: movieId, avg_rating.
movie_avg_rating = (
    df_ratings.groupby("movieId")["rating"]
    .mean()
    .rename("avg_rating")
    .reset_index()
)

# Left-join the averages onto the embedding table, then discard every row
# that has a missing value in any column (this includes movies that have
# no rating at all, whose avg_rating is NaN after the left join).
df_features = df_embs.merge(movie_avg_rating, on="movieId", how="left").dropna()


In [None]:
from sklearn.cluster import KMeans


# Stage 1: coarse clustering on the embedding columns only.
emb_cols = [name for name in df_features.columns if name.startswith("emb_")]
k1 = 20
coarse_kmeans = KMeans(n_clusters=k1, random_state=42)
df_features["cluster_1"] = coarse_kmeans.fit_predict(df_features[emb_cols])

# Stage 2: inside each coarse cluster, re-cluster on embeddings plus the
# raw avg_rating column. Coarse clusters with fewer than two movies keep
# the sentinel value -1.
df_features["cluster_2"] = -1
for coarse_id in range(k1):
    members = df_features[df_features["cluster_1"] == coarse_id]
    if len(members) < 2:
        continue
    # Never request more sub-clusters than there are members.
    fine_kmeans = KMeans(n_clusters=min(5, len(members)), random_state=42)
    fine_labels = fine_kmeans.fit_predict(members[emb_cols + ["avg_rating"]])
    df_features.loc[members.index, "cluster_2"] = fine_labels


In [None]:
# Combine both clustering levels into one string label "<cluster_1>_<cluster_2>".
coarse_label = df_features["cluster_1"].astype(str)
fine_label = df_features["cluster_2"].astype(str)
df_features["final_cluster"] = coarse_label.str.cat(fine_label, sep="_")

# movieId -> final cluster label.
movie_cluster_map = df_features.set_index("movieId")["final_cluster"].to_dict()

# Tag each rating with its movie's cluster, then drop rows containing any
# missing value (ratings for movies absent from the map get NaN here).
df_ratings = df_ratings.assign(
    cluster=df_ratings["movieId"].map(movie_cluster_map)
).dropna()


In [None]:
from collections import defaultdict

# Fixed ordering of cluster labels; position in this list is the feature index.
cluster_list = sorted(df_ratings["cluster"].unique())
cluster_to_idx = {label: position for position, label in enumerate(cluster_list)}

# Per-user running sums and counts of ratings, one slot per cluster.
user_vectors = defaultdict(lambda: np.zeros(len(cluster_list)))
user_counts = defaultdict(lambda: np.zeros(len(cluster_list)))

for uid, cid, rating in zip(
    df_ratings["userId"], df_ratings["cluster"], df_ratings["rating"]
):
    slot = cluster_to_idx[cid]
    user_vectors[uid][slot] += rating
    user_counts[uid][slot] += 1

# Per-user mean rating in each cluster; clusters the user never rated stay 0.
user_features = {}
for uid, rating_sums in user_vectors.items():
    rating_counts = user_counts[uid]
    user_features[uid] = np.divide(
        rating_sums,
        rating_counts,
        out=np.zeros_like(rating_sums),
        where=rating_counts != 0,
    )


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class UserAttentionEncoder(nn.Module):
    """Map per-user feature vectors to `hidden_dim`-sized embeddings.

    Each user vector is treated as a sequence of length one, passed through
    multi-head self-attention, and then linearly projected.

    Args:
        input_dim: Width of each user vector. It is used directly as the
            attention `embed_dim`, so it must be divisible by `heads`
            (a requirement of `nn.MultiheadAttention`).
        hidden_dim: Width of the returned embedding.
        heads: Number of attention heads.
    """

    def __init__(self, input_dim, hidden_dim=64, heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=input_dim,
            num_heads=heads,
            batch_first=True,
        )
        self.proj = nn.Linear(input_dim, hidden_dim)

    def forward(self, user_matrix):
        """Encode a `(num_users, input_dim)` tensor into `(num_users, hidden_dim)`."""
        # Insert a length-1 sequence axis: (batch, 1, input_dim).
        tokens = user_matrix[:, None, :]
        # Self-attention: query, key and value are all the same tensor.
        attended, _weights = self.attn(tokens, tokens, tokens)
        # Remove the sequence axis again before projecting.
        return self.proj(attended[:, 0, :])

# Row i of user_matrix / user_embeddings / user_sim_matrix corresponds to user_ids[i].
user_ids = sorted(user_features.keys())
# Stack into one ndarray first: building a tensor straight from a Python list
# of ndarrays goes through a slow element-by-element path.
user_matrix = torch.tensor(
    np.stack([user_features[uid] for uid in user_ids]),
    dtype=torch.float32,
)

# nn.MultiheadAttention requires embed_dim to be divisible by num_heads, and
# the number of clusters is data dependent. Use the largest of 4/2/1 heads
# that divides it (4 whenever possible, which is the encoder's default).
input_dim = len(cluster_list)
num_heads = next(h for h in (4, 2, 1) if input_dim % h == 0)

# NOTE: the encoder is used with its freshly initialised weights; nothing in
# this cell trains it.
encoder = UserAttentionEncoder(input_dim=input_dim, heads=num_heads)
with torch.no_grad():
    user_embeddings = encoder(user_matrix)  # (num_users, hidden_dim)

from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all user embeddings: (num_users, num_users).
user_sim_matrix = cosine_similarity(user_embeddings.numpy())


In [None]:
import torch
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Build the movieId -> feature-vector lookup ONCE. Doing the index rebuild and
# a boolean scan of df_features inside the loop costs a full pass over the
# movie table for every rating row.
# drop_duplicates keeps the first row per movieId, which is the row that a
# `df_features[df_features["movieId"] == mid].values[0]` lookup would return.
movie_feature_cols = emb_cols + ["avg_rating"]
movie_feature_table = df_features.drop_duplicates(subset="movieId")
movie_vectors = dict(
    zip(
        movie_feature_table["movieId"],
        movie_feature_table[movie_feature_cols].to_numpy(),
    )
)

# One training example per rating row: (user feature vector, movie feature
# vector, rating). Rows whose user or movie has no features are skipped.
X_user, X_movie, y = [], [], []

for uid, mid, rating in zip(
    df_ratings["userId"], df_ratings["movieId"], df_ratings["rating"]
):
    if uid not in user_features or mid not in movie_vectors:
        continue

    X_user.append(user_features[uid])
    X_movie.append(movie_vectors[mid])
    y.append(rating)

# Stack into single ndarrays before tensor conversion; converting a list of
# ndarrays directly is very slow for this many rows.
X_user_tensor = torch.tensor(np.array(X_user), dtype=torch.float32).to(device)
X_movie_tensor = torch.tensor(np.array(X_movie), dtype=torch.float32).to(device)
y_tensor = torch.tensor(y, dtype=torch.float32).to(device)

user_dim = X_user_tensor.shape[1]
movie_dim = X_movie_tensor.shape[1]


In [None]:
import torch
import torch.nn as nn

import torch.nn as nn
import torch

class TransformerRecommender(nn.Module):
    """Predict a rating from a user feature vector and a movie feature vector.

    Both inputs are linearly projected to `d_model`, arranged as a two-token
    sequence (user, movie), given a learned positional embedding, passed
    through a Transformer encoder, and the two encoded tokens are
    concatenated and fed to a small MLP that outputs one scalar per example.

    Args:
        raw_user_dim: Width of the user feature vector.
        movie_dim: Width of the movie feature vector.
        d_model: Token width inside the Transformer encoder.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, raw_user_dim, movie_dim, d_model=128, nhead=4, num_layers=2):
        super().__init__()

        # Project each input to the shared token width.
        self.pad_user = nn.Linear(raw_user_dim, d_model)   # raw_user_dim -> d_model
        self.pad_movie = nn.Linear(movie_dim, d_model)     # movie_dim -> d_model

        # Learned positional embedding for the two token slots (user, movie).
        self.pos_embedding = nn.Parameter(torch.randn(1, 2, d_model))

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Rating head over the concatenated (user, movie) encodings.
        self.predictor = nn.Sequential(
            nn.Linear(d_model * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user_vec, movie_vec):
        """Return predicted ratings of shape `(batch,)`.

        Args:
            user_vec: Tensor of shape `(batch, raw_user_dim)`.
            movie_vec: Tensor of shape `(batch, movie_dim)`.
        """
        u = self.pad_user(user_vec)     # (batch, d_model)
        m = self.pad_movie(movie_vec)   # (batch, d_model)

        x = torch.stack([u, m], dim=1)  # (batch, 2, d_model)

        x = x + self.pos_embedding[:, :2, :]

        encoded = self.encoder(x)  # (batch, 2, d_model)

        combined = torch.cat([encoded[:, 0, :], encoded[:, 1, :]], dim=1)  # (batch, d_model*2)

        # Squeeze only the trailing size-1 axis. A bare .squeeze() would also
        # remove the batch axis when batch == 1 and return a 0-d tensor,
        # which cannot be iterated or extended by callers.
        return self.predictor(combined).squeeze(-1)


In [None]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Random 80/20 hold-out split over rating rows; the three tensors are split
# with the same row permutation so examples stay aligned.
(
    X_user_train, X_user_test,
    X_movie_train, X_movie_test,
    y_train, y_test,
) = train_test_split(
    X_user_tensor,
    X_movie_tensor,
    y_tensor,
    test_size=0.2,
    random_state=42,
)

# Each dataset item is a (user_vec, movie_vec, rating) triple.
train_dataset, test_dataset = (
    TensorDataset(X_user_train, X_movie_train, y_train),
    TensorDataset(X_user_test, X_movie_test, y_test),
)

# Only the training loader shuffles; evaluation order is kept fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=512)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=512)


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random

# Sample rows of X_user_tensor (one row per rating example, so the same user
# can appear several times) and compare their user vectors pairwise.
# Cap the sample at the number of available rows: np.random.choice with
# replace=False raises if asked for more items than exist.
num_rows = len(X_user_tensor)
sample_indices = np.random.choice(num_rows, size=min(4000, num_rows), replace=False)
user_vectors_sample = X_user_tensor[sample_indices].cpu().numpy()

cos_sim_matrix = cosine_similarity(user_vectors_sample)

# The rejection loop below only accepts pairs that are clearly similar
# (> 0.95) or clearly dissimilar (< 0.3). If no off-diagonal pair qualifies
# it would spin forever, so fail loudly instead.
qualifying = (cos_sim_matrix > 0.95) | (cos_sim_matrix < 0.3)
np.fill_diagonal(qualifying, False)
if not qualifying.any():
    raise ValueError(
        "No sampled pair has cosine similarity > 0.95 or < 0.3; "
        "cannot build similarity pairs."
    )

# user_pairs holds (row_a, row_b) indices into X_user_tensor;
# labels[k] is 1 for a similar pair and 0 for a dissimilar pair.
# NOTE: nn.CosineEmbeddingLoss expects targets of 1 / -1, so code feeding
# these labels to that loss must map 0 to -1 first.
user_pairs = []
labels = []

while len(user_pairs) < 3000:
    i, j = random.sample(range(len(user_vectors_sample)), 2)
    sim = cos_sim_matrix[i, j]
    if sim > 0.95:
        label = 1
    elif sim < 0.3:
        label = 0
    else:
        # Ambiguous similarity: draw another pair.
        continue
    user_pairs.append((sample_indices[i], sample_indices[j]))
    labels.append(label)


In [None]:
# Input widths are read off the prepared tensors (second axis).
user_dim, movie_dim = X_user_tensor.shape[1], X_movie_tensor.shape[1]

# Rating model with its default encoder hyper-parameters, placed on `device`.
model = TransformerRecommender(raw_user_dim=user_dim, movie_dim=movie_dim)
model = model.to(device)

# Adam over all model parameters; squared-error loss on the predicted rating.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()


In [None]:

# Hybrid objective: rating MSE plus a cosine-embedding term that pulls the
# encoded vectors of "similar" user pairs together and pushes "dissimilar"
# pairs apart (up to the margin).
cosine_loss_fn = nn.CosineEmbeddingLoss(margin=0.3)
lambda_sim = 0.5  # weight of the similarity term relative to the MSE term
num_epochs = 30
pairs_per_step = 512

for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # sum of per-batch hybrid losses over the epoch

    for xu, xm, yt in train_loader:
        xu, xm, yt = xu.to(device), xm.to(device), yt.to(device)
        optimizer.zero_grad()

        # Rating regression term.
        preds = model(xu, xm)
        mse_loss = loss_fn(preds, yt)

        # Draw a fresh random subset of the precomputed user pairs each step
        # (never more than are available, which random.sample would reject).
        selected_pairs = random.sample(
            range(len(user_pairs)), min(pairs_per_step, len(user_pairs))
        )
        u1_indices = [user_pairs[i][0] for i in selected_pairs]
        u2_indices = [user_pairs[i][1] for i in selected_pairs]
        # nn.CosineEmbeddingLoss only recognises targets 1 (similar) and
        # -1 (dissimilar); a target of 0 contributes no loss at all. The pair
        # labels are stored as 1/0, so remap every non-1 label to -1 here.
        sim_labels = torch.tensor(
            [1.0 if labels[i] == 1 else -1.0 for i in selected_pairs],
            dtype=torch.float32,
        ).to(device)

        # Encode each user of the pair as a single-token sequence using the
        # model's user projection, first positional slot, and encoder.
        u1_vec = X_user_tensor[u1_indices].to(device)
        u2_vec = X_user_tensor[u2_indices].to(device)
        u1_emb = model.pad_user(u1_vec)
        u2_emb = model.pad_user(u2_vec)
        u1_out = model.encoder(u1_emb.unsqueeze(1) + model.pos_embedding[:, :1])
        u2_out = model.encoder(u2_emb.unsqueeze(1) + model.pos_embedding[:, :1])
        u1_final = u1_out.squeeze(1)
        u2_final = u2_out.squeeze(1)

        sim_loss = cosine_loss_fn(u1_final, u2_final, sim_labels)

        loss = mse_loss + lambda_sim * sim_loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f" Epoch {epoch+1}/{num_epochs} - Total Hybrid Loss: {total_loss:.4f}")


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Evaluate on the held-out loader: collect targets and predictions across
# all batches, then report RMSE and MAE.
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for xu, xm, yt in test_loader:
        xu, xm = xu.to(device), xm.to(device)
        preds = model(xu, xm)
        # Flatten to 1-D before extending. If the last batch holds a single
        # example the model output can be 0-d, and list.extend() cannot
        # iterate a 0-d array.
        y_true.extend(yt.reshape(-1).cpu().numpy())
        y_pred.extend(preds.reshape(-1).cpu().numpy())

mae = mean_absolute_error(y_true, y_pred)
# Square root of the MSE gives RMSE.
rmse = mean_squared_error(y_true, y_pred) ** 0.5

print(f"RMSE: {rmse:.4f}")
print(f" MAE:  {mae:.4f}")