# Movie Recommendation System (1M)

## Initial library load 

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random
from datetime import datetime
from unidecode import unidecode

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fc9141fefd0>

In [5]:
writer = SummaryWriter()  

## Data load

In [6]:
ratings = pd.read_csv("./ml-1m/ratings.dat", sep="::", header=None,
                      names=['UserID','MovieID','Rating','Timestamp'], engine="python")

In [17]:
# convert timestamp to datetime
ratings['Datetime'] = ratings['Timestamp'].apply(lambda ts: datetime.fromtimestamp(ts))

In [18]:
ratings.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Datetime
0,1,1193,5,978300760,2000-12-31 23:12:40
1,1,661,3,978302109,2000-12-31 23:35:09
2,1,914,3,978301968,2000-12-31 23:32:48
3,1,3408,4,978300275,2000-12-31 23:04:35
4,1,2355,5,978824291,2001-01-07 00:38:11
5,1,1197,3,978302268,2000-12-31 23:37:48
6,1,1287,5,978302039,2000-12-31 23:33:59
7,1,2804,5,978300719,2000-12-31 23:11:59
8,1,594,4,978302268,2000-12-31 23:37:48
9,1,919,4,978301368,2000-12-31 23:22:48


In [8]:
movies = pd.read_csv("./ml-1m/movies.dat", sep="::", header=None,
                     names=['MovieID','Title','Genre'], engine="python",
                    encoding="utf-8",
                    lineterminator="\n",
                    on_bad_lines="skip")

In [9]:
movies.head(20)

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [29]:
movies_norm = movies.copy()

In [30]:
movies_norm['Title'] = movies_norm['Title'].apply(lambda x: unidecode(x))

In [32]:
movies_norm['Genre_List'] = movies_norm['Genre'].apply(lambda x: x.split('|'))

In [33]:
movies_norm.head(10)

Unnamed: 0,MovieID,Title,Genre,Genre_List
0,1,Toy Story (1995),Animation|Children's|Comedy,"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),Adventure|Children's|Fantasy,"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama,"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),Comedy,[Comedy]
5,6,Heat (1995),Action|Crime|Thriller,"[Action, Crime, Thriller]"
6,7,Sabrina (1995),Comedy|Romance,"[Comedy, Romance]"
7,8,Tom and Huck (1995),Adventure|Children's,"[Adventure, Children's]"
8,9,Sudden Death (1995),Action,[Action]
9,10,GoldenEye (1995),Action|Adventure|Thriller,"[Action, Adventure, Thriller]"


In [35]:
movies.count()

MovieID    3883
Title      3883
Genre      3883
dtype: int64

In [10]:
users = pd.read_csv("./ml-1m/users.dat", sep="::", engine="python",
                    header=None, names=['UserID','Gender','Age','Occupation','Zip-code'])

In [11]:
users.head(10)

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


In [34]:
users.count()

UserID        6040
Gender        6040
Age           6040
Occupation    6040
Zip-code      6040
dtype: int64

In [12]:
movies_ratings = pd.merge(ratings,movies, on="MovieID", how="outer")

In [13]:
movies_ratings.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genre
0,1.0,1,5.0,978824268.0,Toy Story (1995),Animation|Children's|Comedy
1,6.0,1,4.0,978237008.0,Toy Story (1995),Animation|Children's|Comedy
2,8.0,1,4.0,978233496.0,Toy Story (1995),Animation|Children's|Comedy
3,9.0,1,5.0,978225952.0,Toy Story (1995),Animation|Children's|Comedy
4,10.0,1,5.0,978226474.0,Toy Story (1995),Animation|Children's|Comedy
5,18.0,1,4.0,978154768.0,Toy Story (1995),Animation|Children's|Comedy
6,19.0,1,5.0,978555994.0,Toy Story (1995),Animation|Children's|Comedy
7,21.0,1,3.0,978139347.0,Toy Story (1995),Animation|Children's|Comedy
8,23.0,1,4.0,978463614.0,Toy Story (1995),Animation|Children's|Comedy
9,26.0,1,3.0,978130703.0,Toy Story (1995),Animation|Children's|Comedy


In [14]:
complete_df = pd.merge(movies_ratings, users, on="UserID", how="outer")

In [15]:
complete_df.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genre,Gender,Age,Occupation,Zip-code
0,1.0,1,5.0,978824268.0,Toy Story (1995),Animation|Children's|Comedy,F,1.0,10.0,48067
1,1.0,48,5.0,978824351.0,Pocahontas (1995),Animation|Children's|Musical|Romance,F,1.0,10.0,48067
2,1.0,150,5.0,978301777.0,Apollo 13 (1995),Drama,F,1.0,10.0,48067
3,1.0,260,4.0,978300760.0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,F,1.0,10.0,48067
4,1.0,527,5.0,978824195.0,Schindler's List (1993),Drama|War,F,1.0,10.0,48067
5,1.0,531,4.0,978302149.0,"Secret Garden, The (1993)",Children's|Drama,F,1.0,10.0,48067
6,1.0,588,4.0,978824268.0,Aladdin (1992),Animation|Children's|Comedy|Musical,F,1.0,10.0,48067
7,1.0,594,4.0,978302268.0,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical,F,1.0,10.0,48067
8,1.0,595,5.0,978824268.0,Beauty and the Beast (1991),Animation|Children's|Musical,F,1.0,10.0,48067
9,1.0,608,4.0,978301398.0,Fargo (1996),Crime|Drama|Thriller,F,1.0,10.0,48067


# Data preprocessing

# Neural Network architecture

In [None]:
class GMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim):
        super(GMF, self).__init__()
        # Embedding layers for users and items
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # Output layer weight (h) for combining element-wise product
        self.h = nn.Parameter(torch.randn(embedding_dim))
        # Sigmoid activation to map predictions to [0, 1]
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, user_ids, item_ids):
        p_u = self.user_embedding(user_ids)
        q_i = self.item_embedding(item_ids)
        # Element-wise product
        interaction = p_u * q_i  
        # Linear combination using the weight vector h
        prediction = torch.sum(interaction * self.h, dim=1)
        return self.sigmoid(prediction)
