# Movie Recommendation System (1M)

## Initial library load 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random
from datetime import datetime
from unidecode import unidecode

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f249450efd0>

In [4]:
writer = SummaryWriter()  

## Data load and preprocessing

In [5]:
ratings = pd.read_csv("./ml-1m/ratings.dat", sep="::", header=None,
                      names=['UserID','MovieID','Rating','Timestamp'], engine="python")

In [6]:
# convert timestamp to datetime
ratings['Datetime'] = ratings['Timestamp'].apply(lambda ts: datetime.fromtimestamp(ts))

In [7]:
ratings['Label'] = 1

In [8]:
# Ensure user and movie IDs are zero-indexed for embedding layers.
ratings['UserID'] = ratings['UserID'] - 1  # Users: 0 to 6039
ratings['MovieID'] = ratings['MovieID'] - 1  # Movies: 0 to (n-1)

In [9]:
ratings.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Datetime,Label
0,0,1192,5,978300760,2000-12-31 23:12:40,1
1,0,660,3,978302109,2000-12-31 23:35:09,1
2,0,913,3,978301968,2000-12-31 23:32:48,1
3,0,3407,4,978300275,2000-12-31 23:04:35,1
4,0,2354,5,978824291,2001-01-07 00:38:11,1
5,0,1196,3,978302268,2000-12-31 23:37:48,1
6,0,1286,5,978302039,2000-12-31 23:33:59,1
7,0,2803,5,978300719,2000-12-31 23:11:59,1
8,0,593,4,978302268,2000-12-31 23:37:48,1
9,0,918,4,978301368,2000-12-31 23:22:48,1


In [10]:
movies = pd.read_csv("./ml-1m/movies.dat", sep="::", header=None,
                     names=['MovieID','Title','Genre'], engine="python",
                    encoding="utf-8",
                    lineterminator="\n",
                    on_bad_lines="skip")

In [11]:
movies.head(20)

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [12]:
movies_norm = movies.copy()

In [13]:
movies_norm['Title'] = movies_norm['Title'].apply(lambda x: unidecode(x))

In [14]:
movies_norm['Genre_List'] = movies_norm['Genre'].apply(lambda x: x.split('|'))

In [15]:
movies_norm.head(10)

Unnamed: 0,MovieID,Title,Genre,Genre_List
0,1,Toy Story (1995),Animation|Children's|Comedy,"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),Adventure|Children's|Fantasy,"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama,"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),Comedy,[Comedy]
5,6,Heat (1995),Action|Crime|Thriller,"[Action, Crime, Thriller]"
6,7,Sabrina (1995),Comedy|Romance,"[Comedy, Romance]"
7,8,Tom and Huck (1995),Adventure|Children's,"[Adventure, Children's]"
8,9,Sudden Death (1995),Action,[Action]
9,10,GoldenEye (1995),Action|Adventure|Thriller,"[Action, Adventure, Thriller]"


In [17]:
num_movies = movies_norm['MovieID'].nunique()

In [18]:
users = pd.read_csv("./ml-1m/users.dat", sep="::", engine="python",
                    header=None, names=['UserID','Gender','Age','Occupation','Zip-code'])

In [19]:
users.head(10)

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


In [20]:
num_users = users['UserID'].nunique()

In [12]:
movies_ratings = pd.merge(ratings,movies, on="MovieID", how="outer")

In [13]:
movies_ratings.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genre
0,1.0,1,5.0,978824268.0,Toy Story (1995),Animation|Children's|Comedy
1,6.0,1,4.0,978237008.0,Toy Story (1995),Animation|Children's|Comedy
2,8.0,1,4.0,978233496.0,Toy Story (1995),Animation|Children's|Comedy
3,9.0,1,5.0,978225952.0,Toy Story (1995),Animation|Children's|Comedy
4,10.0,1,5.0,978226474.0,Toy Story (1995),Animation|Children's|Comedy
5,18.0,1,4.0,978154768.0,Toy Story (1995),Animation|Children's|Comedy
6,19.0,1,5.0,978555994.0,Toy Story (1995),Animation|Children's|Comedy
7,21.0,1,3.0,978139347.0,Toy Story (1995),Animation|Children's|Comedy
8,23.0,1,4.0,978463614.0,Toy Story (1995),Animation|Children's|Comedy
9,26.0,1,3.0,978130703.0,Toy Story (1995),Animation|Children's|Comedy


In [14]:
complete_df = pd.merge(movies_ratings, users, on="UserID", how="outer")

In [15]:
complete_df.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genre,Gender,Age,Occupation,Zip-code
0,1.0,1,5.0,978824268.0,Toy Story (1995),Animation|Children's|Comedy,F,1.0,10.0,48067
1,1.0,48,5.0,978824351.0,Pocahontas (1995),Animation|Children's|Musical|Romance,F,1.0,10.0,48067
2,1.0,150,5.0,978301777.0,Apollo 13 (1995),Drama,F,1.0,10.0,48067
3,1.0,260,4.0,978300760.0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,F,1.0,10.0,48067
4,1.0,527,5.0,978824195.0,Schindler's List (1993),Drama|War,F,1.0,10.0,48067
5,1.0,531,4.0,978302149.0,"Secret Garden, The (1993)",Children's|Drama,F,1.0,10.0,48067
6,1.0,588,4.0,978824268.0,Aladdin (1992),Animation|Children's|Comedy|Musical,F,1.0,10.0,48067
7,1.0,594,4.0,978302268.0,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical,F,1.0,10.0,48067
8,1.0,595,5.0,978824268.0,Beauty and the Beast (1991),Animation|Children's|Musical,F,1.0,10.0,48067
9,1.0,608,4.0,978301398.0,Fargo (1996),Crime|Drama|Thriller,F,1.0,10.0,48067


# Train / Test / Validation Split

In [21]:
def leave_one_out_split(df):
    # For each user, select the last interaction as test data.
    df = df.sort_values(['UserID', 'Timestamp'])
    test_list = df.groupby('UserID').tail(1)
    train_df = df.drop(test_list.index)
    return train_df, test_list

In [22]:
train_ratings_full, test_ratings = leave_one_out_split(ratings)
print("Train+Validation samples:", len(train_ratings_full))
print("Test samples:", len(test_ratings))

Train+Validation samples: 994169
Test samples: 6040


In [23]:
def stratified_train_val_split(df, val_frac=0.1, seed=42):
    train_idx = []
    val_idx = []
    np.random.seed(seed)
    # Group by user and sample indices for validation per user.
    for user, group in df.groupby('UserID'):
        indices = group.index.tolist()
        n_val = int(np.ceil(len(indices) * val_frac))
        val_indices = np.random.choice(indices, size=n_val, replace=False)
        train_indices = list(set(indices) - set(val_indices))
        train_idx.extend(train_indices)
        val_idx.extend(val_indices)
    # Return DataFrames for train and validation splits.
    train_df = df.loc[train_idx].reset_index(drop=True)
    val_df = df.loc[val_idx].reset_index(drop=True)
    return train_df, val_df

In [24]:
train_ratings, val_ratings = stratified_train_val_split(train_ratings_full, val_frac=0.1)

In [25]:
print("Train samples:", len(train_ratings))
print("Validation samples:", len(val_ratings))

Train samples: 892037
Validation samples: 102132


In [16]:
def build_user_positive_dict(df):
    return df.groupby('UserID')['MovieID'].apply(set).to_dict()

In [26]:
user_positive_train = build_user_positive_dict(train_ratings)

### Negative Sampling of the dataset

In [None]:
# -------------------------
# 4. Custom Dataset with Negative Sampling
# -------------------------

class MovieLensDatasetWithNegatives(Dataset):
    def __init__(self, df, user_positive, num_movies, num_negatives=4):
        """
        df: DataFrame with positive interactions.
        user_positive: Dictionary mapping user_id -> set of positive movie_ids.
        num_movies: Total number of movies.
        num_negatives: Number of negative samples to generate per positive instance.
        """
        self.df = df.reset_index(drop=True)
        self.user_positive = user_positive
        self.num_movies = num_movies
        self.num_negatives = num_negatives
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # Get the positive sample.
        user_id = int(self.df.loc[idx, 'UserID'])
        pos_movie_id = int(self.df.loc[idx, 'MovieID'])
        
        # Start with the positive sample.
        samples = [(user_id, pos_movie_id, 1)]
        
        # Generate negative samples for this user.
        for _ in range(self.num_negatives):
            neg_movie_id = random.randint(0, self.num_movies - 1)
            while neg_movie_id in self.user_positive.get(user_id, set()):
                neg_movie_id = random.randint(0, self.num_movies - 1)
            samples.append((user_id, neg_movie_id, 0))
        
        # Return all samples (positive + negatives) for this positive instance.
        return samples

In [None]:
# A collate function to flatten batches since __getitem__ returns a list of samples.
def collate_fn(batch):
    # batch is a list of lists (each element is a list of (user, movie, label) tuples)
    flat_batch = [sample for sublist in batch for sample in sublist]
    user_ids, movie_ids, labels = zip(*flat_batch)
    return (torch.LongTensor(user_ids),
            torch.LongTensor(movie_ids),
            torch.FloatTensor(labels))

In [None]:
# Create Dataset objects
train_dataset = MovieLensDatasetWithNegatives(train_ratings, user_positive_train, num_movies, num_negatives=4)
val_dataset = MovieLensDatasetWithNegatives(val_ratings, build_user_positive_dict(val_ratings), num_movies, num_negatives=4)

In [None]:
# Create DataLoaders
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


# Neural Network architecture

source: https://arxiv.org/pdf/1708.05031 

In [None]:
class GMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim):
        super(GMF, self).__init__()
        # Embedding layers for users and items
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # Output layer weight (h) for combining element-wise product
        self.h = nn.Parameter(torch.randn(embedding_dim))
        # Sigmoid activation to map predictions to [0, 1]
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, user_ids, item_ids):
        p_u = self.user_embedding(user_ids)
        q_i = self.item_embedding(item_ids)
        # Element-wise product
        interaction = p_u * q_i  
        # Linear combination using the weight vector h
        score = torch.sum(interaction * self.h, dim=1)  # Weighted sum
        prediction = self.sigmoid(score)     # Map to [0, 1]
        return prediction