# Movie Recommendation System (1M)

## Initial library load 

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader, Dataset, Subset
import numpy as np
import pandas as pd
import seaborn as sns
import random
from datetime import datetime
from unidecode import unidecode

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fc3d8027070>

In [4]:
import os
if not os.path.exists('runs'):
    os.makedirs('runs')
writer = SummaryWriter('runs/lenet5_mnist')

## Data load and preprocessing

In [5]:
ratings = pd.read_csv("./ml-1m/ratings.dat", sep="::", header=None,
                      names=['UserID','MovieID','Rating','Timestamp'], engine="python")

In [6]:
# convert timestamp to datetime
ratings['Datetime'] = ratings['Timestamp'].apply(lambda ts: datetime.fromtimestamp(ts))

In [7]:
ratings['Label'] = 1

In [8]:
# Ensure user and movie IDs are zero-indexed for embedding layers.
ratings['UserID'] = ratings['UserID'] - 1  # Users: 0 to 6039
ratings['MovieID'] = ratings['MovieID'] - 1  # Movies: 0 to (n-1)

In [9]:
ratings.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Datetime,Label
0,0,1192,5,978300760,2000-12-31 23:12:40,1
1,0,660,3,978302109,2000-12-31 23:35:09,1
2,0,913,3,978301968,2000-12-31 23:32:48,1
3,0,3407,4,978300275,2000-12-31 23:04:35,1
4,0,2354,5,978824291,2001-01-07 00:38:11,1
5,0,1196,3,978302268,2000-12-31 23:37:48,1
6,0,1286,5,978302039,2000-12-31 23:33:59,1
7,0,2803,5,978300719,2000-12-31 23:11:59,1
8,0,593,4,978302268,2000-12-31 23:37:48,1
9,0,918,4,978301368,2000-12-31 23:22:48,1


In [None]:
movies = pd.read_csv("./ml-1m/movies.dat", sep="::", header=None,
                    names=['MovieID','Title','Genre'], engine="python",
                    encoding="utf-8",
                    lineterminator="\n",
                    on_bad_lines="skip")

In [11]:
movies.head(20)

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [12]:
movies_norm = movies.copy()

In [13]:
movies_norm['Title'] = movies_norm['Title'].apply(lambda x: unidecode(x))

In [14]:
movies_norm['Genre_List'] = movies_norm['Genre'].apply(lambda x: x.split('|'))

In [15]:
movies_norm.head(10)

Unnamed: 0,MovieID,Title,Genre,Genre_List
0,1,Toy Story (1995),Animation|Children's|Comedy,"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),Adventure|Children's|Fantasy,"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama,"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),Comedy,[Comedy]
5,6,Heat (1995),Action|Crime|Thriller,"[Action, Crime, Thriller]"
6,7,Sabrina (1995),Comedy|Romance,"[Comedy, Romance]"
7,8,Tom and Huck (1995),Adventure|Children's,"[Adventure, Children's]"
8,9,Sudden Death (1995),Action,[Action]
9,10,GoldenEye (1995),Action|Adventure|Thriller,"[Action, Adventure, Thriller]"


In [16]:
users = pd.read_csv("./ml-1m/users.dat", sep="::", engine="python",
                    header=None, names=['UserID','Gender','Age','Occupation','Zip-code'])

In [17]:
users.head(10)

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


# Train / Test / Validation Split

In [18]:
def leave_one_out_split(df):
    # For each user, select the last interaction as test data.
    df = df.sort_values(['UserID', 'Timestamp'])
    test_list = df.groupby('UserID').tail(1)
    train_df = df.drop(test_list.index)
    return train_df, test_list

In [19]:
train_ratings_full, test_ratings = leave_one_out_split(ratings)
print("Train+Validation samples:", len(train_ratings_full))
print("Test samples:", len(test_ratings))

Train+Validation samples: 994169
Test samples: 6040


In [20]:
def stratified_train_val_split(df, val_frac=0.1, seed=seed):
    train_idx = []
    val_idx = []
    np.random.seed(seed)
    # Group by user and sample indices for validation per user.
    for user, group in df.groupby('UserID'):
        indices = group.index.tolist()
        n_val = int(np.ceil(len(indices) * val_frac))
        val_indices = np.random.choice(indices, size=n_val, replace=False)
        train_indices = list(set(indices) - set(val_indices))
        train_idx.extend(train_indices)
        val_idx.extend(val_indices)
    # Return DataFrames for train and validation splits.
    train_df = df.loc[train_idx].reset_index(drop=True)
    val_df = df.loc[val_idx].reset_index(drop=True)
    return train_df, val_df

In [21]:
train_ratings, val_ratings = stratified_train_val_split(train_ratings_full, val_frac=0.1)

In [22]:
print("Train samples:", len(train_ratings))
print("Validation samples:", len(val_ratings))

Train samples: 892037
Validation samples: 102132


In [23]:
# -------------------------
# Precompute User-Positive Movie Mapping for Negative Sampling
# -------------------------

def build_user_positive_dict(df):
    return df.groupby('UserID')['MovieID'].apply(set).to_dict()

In [24]:
user_positive_train = build_user_positive_dict(train_ratings)

### Negative Sampling of the dataset

In [25]:
# -------------------------
# Custom Dataset with Negative Sampling (only for train and validation)
# -------------------------

class MovieLensDatasetWithNegatives(Dataset):
    def __init__(self, df, user_positive, num_movies, num_negatives=4):
        """
        df: DataFrame with positive interactions.
        user_positive: Dictionary mapping user_id -> set of positive movie_ids.
        num_movies: Total number of movies.
        num_negatives: Number of negative samples to generate per positive instance.
        """
        self.df = df.reset_index(drop=True)
        self.user_positive = user_positive
        self.num_movies = num_movies
        self.num_negatives = num_negatives
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # Get the positive sample.
        user_id = int(self.df.loc[idx, 'UserID'])
        pos_movie_id = int(self.df.loc[idx, 'MovieID'])
        
        # Start with the positive sample.
        samples = [(user_id, pos_movie_id, 1)]
        
        # Generate negative samples for this user.
        for _ in range(self.num_negatives):
            neg_movie_id = random.randint(0, self.num_movies - 1)
            while neg_movie_id in self.user_positive.get(user_id, set()):
                neg_movie_id = random.randint(0, self.num_movies - 1)
            samples.append((user_id, neg_movie_id, 0))
        
        # Return all samples (positive + negatives) for this positive instance.
        return samples

In [26]:
# A collate function to flatten batches since __getitem__ returns a list of samples.
def collate_fn(batch):
    # batch is a list of lists (each element is a list of (user, movie, label) tuples)
    flat_batch = [sample for sublist in batch for sample in sublist]
    user_ids, movie_ids, labels = zip(*flat_batch)
    return (torch.LongTensor(user_ids),
            torch.LongTensor(movie_ids),
            torch.FloatTensor(labels))

### Parameters and Hyperparameters

In [None]:
num_users = users['UserID'].nunique()
## num_movies = movies_norm['MovieID'].nunique() creates an error becuase ids go up to 3951 but unique ids there's only 3883
max_movie_id = ratings['MovieID'].max()
num_movies = max_movie_id + 1
batch_size = 256
embedding_dim = 32  # Hyperparameter choice

In [28]:
# Create Dataset objects
train_dataset = MovieLensDatasetWithNegatives(train_ratings, user_positive_train, num_movies, num_negatives=4)
val_dataset = MovieLensDatasetWithNegatives(val_ratings, build_user_positive_dict(val_ratings), num_movies, num_negatives=4)

In [29]:
# Create DataLoaders

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [30]:
# For testing, we use only the positive interactions (held-out ones).
class MovieLensTestDataset(Dataset):
    def __init__(self, df):
        self.user_ids = torch.LongTensor(df['UserID'].values)
        self.movie_ids = torch.LongTensor(df['MovieID'].values)
        self.labels = torch.FloatTensor(df['Label'].values)
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.user_ids[idx], self.movie_ids[idx], self.labels[idx]

In [31]:
test_dataset = MovieLensTestDataset(test_ratings)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Neural Network architecture

source: https://arxiv.org/pdf/1708.05031 

In [32]:
class GMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim):
        super(GMF, self).__init__()
        # Embedding layers for users and items
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # Output layer weight (h) for combining element-wise product (Learnable weight vector)
        self.h = nn.Parameter(torch.randn(embedding_dim))
        # Sigmoid activation to map predictions to [0, 1]
        self.sigmoid = nn.Sigmoid()
      
    def forward(self, user_ids, item_ids):
        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  
        p_u = self.user_embedding(user_ids)  # shape: [batch_size, embedding_dim]
        q_i = self.item_embedding(item_ids)  # shape: [batch_size, embedding_dim]
        interaction = p_u * q_i              # Element-wise product
        # Linear combination using the weight vector h
        score = torch.sum(interaction * self.h, dim=1)  # Weighted sum
        prediction = self.sigmoid(score)     # Map to [0, 1]
        return prediction

In [33]:
model = GMF(num_users, num_movies, embedding_dim)
model = model.to(DEVICE)

In [34]:
print("UserID range:", ratings['UserID'].min(), ratings['UserID'].max())
print("MovieID range:", ratings['MovieID'].min(), ratings['MovieID'].max())


UserID range: 0 6039
MovieID range: 0 3951


In [35]:
print(num_users)
print(num_movies)

6040
3952


## Loss, Optimizer and Training

In [36]:
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [42]:
def evaluate_ranking(model, dataset, device, K=10):
    """
    Evaluates HR@K and NDCG@K on a dataset that returns
    candidate sets (1 positive + multiple negatives).
    """
    model.eval()
    hr_sum = 0.0
    ndcg_sum = 0.0
    num_users = len(dataset)  # each __getitem__ is for one user (or one positive sample)
    
    with torch.no_grad():
        for idx in range(num_users):
            candidate_list = dataset[idx]  # e.g., [(user_id, pos_item, 1), (user_id, neg_item1, 0), ...]
            
            user_ids, item_ids, labels = zip(*candidate_list)
            user_ids = torch.LongTensor(user_ids).to(device)
            item_ids = torch.LongTensor(item_ids).to(device)
            labels = torch.FloatTensor(labels).to(device)

            # Predict scores for each candidate
            scores = model(user_ids, item_ids).cpu().numpy()
            labels = labels.cpu().numpy()
            
            # Sort candidates by predicted score (descending)
            sorted_indices = np.argsort(-scores)
            
            # Find the rank of the positive item
            for rank, sorted_idx in enumerate(sorted_indices, start=1):
                if labels[sorted_idx] == 1:
                    # Hit Ratio
                    if rank <= K:
                        hr_sum += 1.0
                        # NDCG
                        ndcg_sum += 1.0 / np.log2(rank + 1)
                    break
    
    hr_avg = hr_sum / num_users
    ndcg_avg = ndcg_sum / num_users
    return hr_avg, ndcg_avg

In [38]:
# -------------------------
# Training Loop with Validation
# -------------------------
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for user_ids, movie_ids, labels in train_loader:
        user_ids = user_ids.to(DEVICE)
        movie_ids = movie_ids.to(DEVICE)
        labels = labels.to(DEVICE)
        
        optimizer.zero_grad()
        predictions = model(user_ids, movie_ids)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * user_ids.size(0)
    
    avg_loss = epoch_loss / len(train_dataset)  # based on number of positive samples
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.4f}")
    
    # Validation step
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for user_ids, movie_ids, labels in val_loader:
            user_ids = user_ids.to(DEVICE)
            movie_ids = movie_ids.to(DEVICE)
            labels = labels.to(DEVICE)
            
            predictions = model(user_ids, movie_ids)
            loss = criterion(predictions, labels)
            val_loss += loss.item() * user_ids.size(0)
    avg_val_loss = val_loss / len(val_dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")

Epoch 1/10, Train Loss: 3.4985
Epoch 1/10, Validation Loss: 3.4663
Epoch 2/10, Train Loss: 3.4527
Epoch 2/10, Validation Loss: 3.3798
Epoch 3/10, Train Loss: 2.7940
Epoch 3/10, Validation Loss: 2.4240
Epoch 4/10, Train Loss: 2.0706
Epoch 4/10, Validation Loss: 2.0849
Epoch 5/10, Train Loss: 1.8679
Epoch 5/10, Validation Loss: 1.9680
Epoch 6/10, Train Loss: 1.7923
Epoch 6/10, Validation Loss: 1.9289
Epoch 7/10, Train Loss: 1.7490
Epoch 7/10, Validation Loss: 1.9082
Epoch 8/10, Train Loss: 1.7105
Epoch 8/10, Validation Loss: 1.8796
Epoch 9/10, Train Loss: 1.6600
Epoch 9/10, Validation Loss: 1.8543
Epoch 10/10, Train Loss: 1.6011
Epoch 10/10, Validation Loss: 1.8349


In [40]:
# -------------------------
# Evaluation on Test Data
# -------------------------
model.eval()
test_loss = 0.0
with torch.no_grad():
    for user_ids, movie_ids, labels in test_loader:
        user_ids = user_ids.to(DEVICE)
        movie_ids = movie_ids.to(DEVICE)
        labels = labels.to(DEVICE)       
        
        predictions = model(user_ids, movie_ids)
        loss = criterion(predictions, labels)
        test_loss += loss.item() * user_ids.size(0)
avg_test_loss = test_loss / len(test_dataset)
print(f"Test Loss: {avg_test_loss:.4f}")

Test Loss: 1.1976


In [43]:
hr, ndcg = evaluate_ranking(model, val_dataset, DEVICE, K=10)
print(f"Validation HR@10: {hr:.4f}, NDCG@10: {ndcg:.4f}")

Validation HR@10: 1.0000, NDCG@10: 0.8323


## Optimization with optuna

In [1]:
import optuna

def objective(trial):
    # 1. Suggest hyperparameters
    embedding_dim = trial.suggest_categorical("embedding_dim", [16, 32, 64, 128])
    lr = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    neg_ratio = trial.suggest_int("neg_ratio", 2, 8)  # number of negatives per positive
    
    # 2. Build your model with these hyperparams
    model = GMF(num_users, num_movies, embedding_dim).to(DEVICE)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # Rebuild your dataset with the chosen negative sampling ratio
    train_dataset = MovieLensDatasetWithNegatives(
        train_ratings,
        user_positive_train,
        num_movies,
        num_negatives=neg_ratio
    )
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, collate_fn=collate_fn)
    
    # 3. Training loop (simplified for brevity)
    num_epochs = 5
    for epoch in range(num_epochs):
        model.train()
        for user_ids, movie_ids, labels in train_loader:
            user_ids, movie_ids, labels = user_ids.to(DEVICE), movie_ids.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            predictions = model(user_ids, movie_ids)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
    
    # 4. Evaluate on validation set for HR@10
    hr, ndcg = evaluate_ranking(model, val_dataset, DEVICE, K=10)
    
    # We want to maximize HR, but Optuna by default minimizes
    # so we return -hr (negative) to effectively maximize HR
    return -hr

# 5. Create and run the study
study = optuna.create_study(direction="minimize")  # we'll minimize -hr
study.optimize(objective, n_trials=20, n_jobs=-1)

# 6. Best hyperparameters
best_params = study.best_params
print("Best params:", best_params)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-04-01 10:39:38,476] A new study created in memory with name: no-name-be4680ae-466e-4f72-b0a4-c592aa260cda
[W 2025-04-01 10:39:38,493] Trial 1 failed with parameters: {'embedding_dim': 32, 'learning_rate': 0.0031713524379200554, 'neg_ratio': 5} because of the following error: NameError("name 'GMF' is not defined").
Traceback (most recent call last):
  File "/home/pipe/venv/deeplearningV/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_1323/1657806.py", line 10, in objective
    model = GMF(num_users, num_movies, embedding_dim).to(DEVICE)
            ^^^
NameError: name 'GMF' is not defined
[W 2025-04-01 10:39:38,493] Trial 0 failed with parameters: {'embedding_dim': 16, 'learning_rate': 0.0003254270813463736, 'neg_ratio': 5} because of the following error: NameError("name 'GMF' is not defined").
Traceback (mo

ValueError: No trials are completed yet.

## Training and Test with the best parameters (from optuna study)

In [45]:
model_best = GMF(num_users, num_movies, embedding_dim=16)
model_best = model_best.to(DEVICE)
criterion_best = nn.BCELoss()
optimizer_best = optim.Adam(model.parameters(), lr=0.00022262414278391033)

# Rebuild your dataset with the chosen negative sampling ratio
train_dataset_best = MovieLensDatasetWithNegatives(
    train_ratings,
    user_positive_train,
    num_movies,
    num_negatives=5
)
train_loader_best = DataLoader(train_dataset_best, batch_size=256, shuffle=True, collate_fn=collate_fn)
    

In [46]:
# -------------------------
# Training Loop with Validation
# -------------------------
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for user_ids, movie_ids, labels in train_loader_best:
        user_ids = user_ids.to(DEVICE)
        movie_ids = movie_ids.to(DEVICE)
        labels = labels.to(DEVICE)
        
        optimizer.zero_grad()
        predictions = model_best(user_ids, movie_ids)
        loss = criterion_best(predictions, labels)
        loss.backward()
        optimizer_best.step()
        epoch_loss += loss.item() * user_ids.size(0)
    
    avg_loss = epoch_loss / len(train_dataset)  # based on number of positive samples
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.4f}")
    
    # Validation step
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for user_ids, movie_ids, labels in val_loader:
            user_ids = user_ids.to(DEVICE)
            movie_ids = movie_ids.to(DEVICE)
            labels = labels.to(DEVICE)
            
            predictions = model(user_ids, movie_ids)
            loss = criterion(predictions, labels)
            val_loss += loss.item() * user_ids.size(0)
    avg_val_loss = val_loss / len(val_dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")

Epoch 1/10, Train Loss: 12.8513
Epoch 1/10, Validation Loss: 1.8306
Epoch 2/10, Train Loss: 12.8723
Epoch 2/10, Validation Loss: 1.8308
Epoch 3/10, Train Loss: 12.8671
Epoch 3/10, Validation Loss: 1.8283
Epoch 4/10, Train Loss: 12.8842
Epoch 4/10, Validation Loss: 1.8303
Epoch 5/10, Train Loss: 12.8884
Epoch 5/10, Validation Loss: 1.8290
Epoch 6/10, Train Loss: 12.8632
Epoch 6/10, Validation Loss: 1.8315
Epoch 7/10, Train Loss: 12.8845
Epoch 7/10, Validation Loss: 1.8330
Epoch 8/10, Train Loss: 12.8500
Epoch 8/10, Validation Loss: 1.8309
Epoch 9/10, Train Loss: 12.8920
Epoch 9/10, Validation Loss: 1.8324
Epoch 10/10, Train Loss: 12.8667
Epoch 10/10, Validation Loss: 1.8345
