# Module 10: Two-Tower Models

This is a basic two-tower recommender model trained on the MovieLens 1M dataset. The code was initially generated by ChatGPT (OpenAI 2025); I then debugged it and verified that it behaves as expected.

OpenAI. 2025. Chat with ChatGPT about Two-Tower Recommendation Systems and Evaluation Metrics. ChatGPT, June 3, 2025. https://chat.openai.com/.

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

# Read the three MovieLens 1M tables. The .dat files have no header row and use
# the multi-character separator '::', which requires pandas' python parsing engine.
ratings = pd.read_csv(
    './data/ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python',
)
movies = pd.read_csv(
    './data/ml-1m/movies.dat',
    sep='::',
    header=None,
    names=['movie_id', 'title', 'genres'],
    engine='python',
    encoding="latin-1",
)
users = pd.read_csv(
    './data/ml-1m/users.dat',
    sep='::',
    header=None,
    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'],
    engine='python',
)

# Attach each movie's title to its ratings (join on movie_id).
data = ratings.merge(movies[['movie_id', 'title']], on='movie_id')

# Map raw ids to contiguous 0-based indices, numbered in order of first
# appearance, so they can be used directly as embedding-table rows.
user_to_idx = dict(zip(data['user_id'].unique(), range(data['user_id'].nunique())))
movie_to_idx = dict(zip(data['movie_id'].unique(), range(data['movie_id'].nunique())))

data = data.assign(
    user_idx=data['user_id'].map(user_to_idx),
    movie_idx=data['movie_id'].map(movie_to_idx),
)

# Row-wise 80/20 split with a fixed seed for reproducibility.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Dataset
class RecDataset(Dataset):
    """Tensor-backed ratings table yielding (user_idx, movie_idx, rating) triples."""

    def __init__(self, data):
        """Copy the index and rating columns of ``data`` into tensors.

        Args:
            data: DataFrame with ``user_idx``, ``movie_idx`` and ``rating``
                columns. Ratings are stored as float32; the index columns
                keep whatever dtype torch infers from the column values.
        """
        users = data['user_idx'].to_numpy()
        items = data['movie_idx'].to_numpy()
        scores = data['rating'].to_numpy()

        self.user_idx = torch.tensor(users)
        self.movie_idx = torch.tensor(items)
        self.rating = torch.tensor(scores, dtype=torch.float32)

    def __len__(self):
        """Return the number of rating rows."""
        return self.user_idx.shape[0]

    def __getitem__(self, idx):
        """Return the ``(user_idx, movie_idx, rating)`` tensors at position ``idx``."""
        sample = (self.user_idx[idx], self.movie_idx[idx], self.rating[idx])
        return sample

# Training split: wrapped as a dataset and served in shuffled mini-batches of 1024.
train_dataset = RecDataset(train_data)
train_loader = DataLoader(dataset=train_dataset, batch_size=1024, shuffle=True)

# Test split: same batch size, but iterated in its stored order.
test_dataset = RecDataset(test_data)
test_loader = DataLoader(dataset=test_dataset, batch_size=1024, shuffle=False)

# Two-Tower Model
class TwoTowerModel(nn.Module):
    """Embedding-only two-tower scorer.

    Each "tower" is a single embedding table (one for users, one for movies);
    the score of a (user, movie) pair is the dot product of the two embeddings.
    """

    def __init__(self, n_users, n_movies, emb_dim=512):
        """Create the two embedding tables.

        Args:
            n_users: number of rows in the user embedding table.
            n_movies: number of rows in the movie embedding table.
            emb_dim: width of both embeddings (they must match for the dot product).
        """
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.movie_emb = nn.Embedding(n_movies, emb_dim)

    def forward(self, user_idx, movie_idx):
        """Return the dot-product score for each (user, movie) index pair.

        Args:
            user_idx: integer tensor of user indices.
            movie_idx: integer tensor of movie indices whose embedded shape is
                broadcast-compatible with that of ``user_idx``.

        Returns:
            Tensor of scores with the (broadcast) shape of the index tensors.
        """
        user_embedding = self.user_emb(user_idx)
        movie_embedding = self.movie_emb(movie_idx)
        # Reduce over the embedding axis, which is always the last one. For the
        # usual 1-D batch this equals dim=1, and it also handles scalar indices
        # and multi-dimensional index batches correctly.
        scores = (user_embedding * movie_embedding).sum(dim=-1)  # dot product
        return scores

# One embedding row per indexed user / movie; ratings are regressed with
# mean-squared error and optimised with Adam.
model = TwoTowerModel(n_users=len(user_to_idx), n_movies=len(movie_to_idx))
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training: 50 passes over the training loader.
for epoch in range(50):
    model.train()
    running_loss = 0.0
    for user_idx, movie_idx, rating in train_loader:
        # Forward pass and loss on this mini-batch.
        output = model(user_idx, movie_idx)
        loss = loss_fn(output, rating)

        # Clear stale gradients, backpropagate, and update the embeddings.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    # The mean batch loss is reported after every epoch. To report only every
    # fifth epoch, gate the print on `(epoch + 1) % 5 == 0` (the parentheses
    # matter: `%` binds tighter than `+`).
    print(f'Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}')

# Evaluation: NDCG@10, Precision@10, Recall@10
def _items_by_user(df):
    """Group the ``movie_idx`` values of ``df`` into one set per ``user_idx``."""
    grouped = defaultdict(set)
    # Iterating the two columns directly avoids building a Series per row.
    for user, movie in zip(df['user_idx'].tolist(), df['movie_idx'].tolist()):
        grouped[user].add(movie)
    return grouped


def evaluate_topk(model, test_df, k=10, train_df=None, n_movies=None):
    """Print and return mean Precision@k, Recall@k and NDCG@k over test users.

    For every user that appears in ``test_df`` the model scores the candidate
    movies, the ``k`` best-scoring ones are taken as recommendations, and they
    are compared against that user's movies in ``test_df``. Every test row
    counts as a relevant item, whatever its rating value.

    Args:
        model: module called as ``model(user_idx_tensor, movie_idx_tensor)``
            returning one score per pair.
        test_df: DataFrame with ``user_idx`` and ``movie_idx`` columns.
        k: recommendation list length; must be positive.
        train_df: optional DataFrame with the same two columns. When given,
            the movies listed there for a user are removed from that user's
            candidates, so already-seen items cannot occupy top-k slots.
            When omitted, all movies are ranked.
        n_movies: number of movie indices to rank (``0 .. n_movies - 1``).
            Defaults to ``model.movie_emb.num_embeddings``.

    Returns:
        dict with float entries ``'precision'``, ``'recall'`` and ``'ndcg'``
        (NaN when ``test_df`` contains no users).

    Raises:
        ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    model.eval()
    if n_movies is None:
        n_movies = model.movie_emb.num_embeddings

    user_positive_items = _items_by_user(test_df)
    seen_by_user = _items_by_user(train_df) if train_df is not None else {}

    # Build index tensors on the model's device (CPU for parameter-free models).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    movie_tensor = torch.arange(n_movies, dtype=torch.long, device=device)

    # Position discounts 1 / log2(rank + 1) for ranks 1..k, shared by DCG and IDCG.
    discounts = 1.0 / np.log2(np.arange(2, k + 2))

    precision_list, recall_list, ndcg_list = [], [], []

    with torch.no_grad():
        for user, true_items in user_positive_items.items():
            candidates = movie_tensor
            seen_items = seen_by_user.get(user)
            if seen_items:
                keep = torch.ones(n_movies, dtype=torch.bool, device=device)
                seen_idx = torch.tensor(sorted(seen_items), dtype=torch.long, device=device)
                keep[seen_idx] = False
                candidates = movie_tensor[keep]

            # Fewer than k candidates can remain; rank what is available.
            n_top = min(k, candidates.numel())
            if n_top:
                user_tensor = torch.full_like(candidates, int(user))
                scores = model(user_tensor, candidates)
                topk_items = candidates[torch.topk(scores, n_top).indices].tolist()
            else:
                topk_items = []

            hits = np.array([item in true_items for item in topk_items], dtype=float)
            n_hits = hits.sum()

            dcg = float(hits @ discounts[:hits.size])
            idcg = discounts[:min(len(true_items), k)].sum()

            precision_list.append(n_hits / k)
            recall_list.append(n_hits / len(true_items))
            ndcg_list.append(dcg / idcg if idcg > 0 else 0.0)

    metrics = {
        'precision': float(np.mean(precision_list)) if precision_list else float('nan'),
        'recall': float(np.mean(recall_list)) if recall_list else float('nan'),
        'ndcg': float(np.mean(ndcg_list)) if ndcg_list else float('nan'),
    }

    print(f"Precision@{k}: {metrics['precision']:.4f}")
    print(f"Recall@{k}: {metrics['recall']:.4f}")
    print(f"NDCG@{k}: {metrics['ndcg']:.4f}")
    return metrics

# Score the model on the held-out split with top-10 ranking metrics
evaluate_topk(model, test_df=test_data, k=10)


Epoch 1, Loss: 503.39047760243915
Epoch 2, Loss: 393.6391291313464
Epoch 3, Loss: 309.2055415863271
Epoch 4, Loss: 242.90896415222636
Epoch 5, Loss: 190.5340214995167
Epoch 6, Loss: 148.98679762545143
Epoch 7, Loss: 116.00413230252083
Epoch 8, Loss: 89.93755259599222
Epoch 9, Loss: 69.41950952183561
Epoch 10, Loss: 53.34633196955142
Epoch 11, Loss: 40.82120997338649
Epoch 12, Loss: 31.11166368664988
Epoch 13, Loss: 23.640140599302015
Epoch 14, Loss: 17.930266747389304
Epoch 15, Loss: 13.60136778946118
Epoch 16, Loss: 10.345775104239774
Epoch 17, Loss: 7.902506875869868
Epoch 18, Loss: 6.072773063274296
Epoch 19, Loss: 4.700501372442221
Epoch 20, Loss: 3.679887526175555
Epoch 21, Loss: 2.9311759567931484
Epoch 22, Loss: 2.379806975422003
Epoch 23, Loss: 1.9715879445185747
Epoch 24, Loss: 1.6699375293748764
Epoch 25, Loss: 1.450127925561822
Epoch 26, Loss: 1.292082395623712
Epoch 27, Loss: 1.1817159456060367
Epoch 28, Loss: 1.0970031521509371
Epoch 29, Loss: 1.0320135825277899
Epoch 30, 

In [7]:
# Score the model on the held-out split with top-10 ranking metrics
evaluate_topk(model, test_df=test_data, k=10)

Precision@10: 0.0030
Recall@10: 0.0008
NDCG@10: 0.0029


In [8]:
def recommend_for_user(model, user_id_raw, user_to_idx, movie_to_idx, movies_df, ratings_df, k=10):
    """Return the top-``k`` not-yet-rated movies for one user, best first.

    Args:
        model: module called as ``model(user_idx_tensor, movie_idx_tensor)``
            returning one score per pair.
        user_id_raw: raw user id, as it appears in ``ratings_df['user_id']``.
        user_to_idx: mapping from raw user id to model user index.
        movie_to_idx: mapping from raw movie id to model movie index.
        movies_df: DataFrame with at least ``movie_id`` and ``title`` columns.
            It is not modified.
        ratings_df: DataFrame with ``user_id`` and ``movie_id`` columns; every
            movie listed for the user is treated as already seen.
        k: maximum number of recommendations.

    Returns:
        DataFrame with columns ``movie_id`` and ``title``, ordered from the
        highest-scoring movie down. It is empty (same columns) when the user
        is unknown, ``k`` is not positive, or nothing is left to recommend.
    """
    model.eval()
    no_results = pd.DataFrame(columns=['movie_id', 'title'])

    if user_id_raw not in user_to_idx:
        print(f"User ID {user_id_raw} not found.")
        return no_results

    # Guard explicitly: slicing with [-0:] would select everything, not nothing.
    if k <= 0:
        return no_results

    user_idx = user_to_idx[user_id_raw]

    # Raw ids of the movies this user has already rated.
    is_user = ratings_df['user_id'] == user_id_raw
    watched_movie_ids = set(ratings_df.loc[is_user, 'movie_id'].tolist())

    # Keep raw ids and model indices side by side so no reverse map is needed.
    candidates = [
        (movie_id, idx)
        for movie_id, idx in movie_to_idx.items()
        if movie_id not in watched_movie_ids
    ]
    if not candidates:
        print("No unseen movies to recommend.")
        return no_results
    candidate_movie_ids, candidate_movie_idxs = zip(*candidates)

    # Build index tensors on the model's device (CPU for parameter-free models).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    movie_tensor = torch.tensor(candidate_movie_idxs, dtype=torch.long, device=device)
    user_tensor = torch.full_like(movie_tensor, int(user_idx))

    with torch.no_grad():
        scores = model(user_tensor, movie_tensor)

    # Positions of the best-scoring candidates, highest score first.
    top_positions = torch.topk(scores, min(k, len(candidates))).indices.tolist()
    top_k_movie_ids = [candidate_movie_ids[pos] for pos in top_positions]
    rank_of = {movie_id: rank for rank, movie_id in enumerate(top_k_movie_ids)}

    # Copy the selection before adding the helper column so the caller's frame
    # is untouched and pandas does not warn about assigning to a slice.
    selected = movies_df['movie_id'].isin(top_k_movie_ids)
    recommended_movies = movies_df.loc[selected, ['movie_id', 'title']].copy()
    recommended_movies['rank'] = recommended_movies['movie_id'].map(rank_of)

    return recommended_movies.sort_values('rank').drop(columns='rank').reset_index(drop=True)

# Example: print the ten best unseen movies for raw user id 1
user_id = 1
recommendations = recommend_for_user(
    model,
    user_id,
    user_to_idx,
    movie_to_idx,
    movies_df=movies,
    ratings_df=ratings,
    k=10,
)
print(recommendations)

   movie_id                                              title
0      2873                          Lulu on the Bridge (1998)
1       599                             Wild Bunch, The (1969)
2      3326                   What Planet Are You From? (2000)
3      2415                         Violets Are Blue... (1986)
4      3167                            Carnal Knowledge (1971)
5      2737                               Assassination (1987)
6      3636  Those Who Love Me Can Take the Train (Ceux qui...
7      1322            Amityville 1992: It's About Time (1992)
8      2349                                   Mona Lisa (1986)
9      1507                               Paradise Road (1997)
