In [12]:
import pandas as pd
import torch

movies_df = pd.read_csv("data/ml-latest-small/movies.csv")
ratings_df = pd.read_csv("data/ml-latest-small/ratings.csv")

In [13]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


In [14]:
movies_dict = movies_df.set_index('movieId')["title"].to_dict()

In [15]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [16]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [17]:
ratings_df.rating.value_counts()

rating
4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: count, dtype: int64

In [18]:
print(movies_df.movieId.nunique())
print(ratings_df.movieId.nunique())

9742
9724


In [19]:
n_users = ratings_df.userId.nunique()
n_movies = ratings_df.movieId.nunique()

print("Number of unique users is:", n_users)
print("Number of unique items/ratings is:", n_movies)
print("Number of ratings", len(ratings_df))
print("Matrix size:", n_users*n_movies)
print("Percent of matrix that is filled:", len(ratings_df) / (n_users*n_movies) * 100, "%")

Number of unique users is: 610
Number of unique items/ratings is: 9724
Number of ratings 100836
Matrix size: 5931640
Percent of matrix that is filled: 1.6999683055613624 %


In [20]:
import torch
import numpy as np
#import mathplotlib.pyplot as plt
from sklearn import preprocessing, metrics, model_selection
from torch.utils.data import DataLoader, Dataset


class MovieDataset(Dataset):
    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings
        
    # len(movie_dataset)
    def __len__(self): # Number of Users
        return len(self.users)

    # movie_dataset[1]
    def __getitem__(self, idx):

        users = self.users[idx]
        movies = self.movies[idx]
        ratings = self.ratings[idx]

        return {
            "users" : torch.tensor(users, dtype=torch.long),
            "movies" : torch.tensor(movies, dtype=torch.long),
            "ratings" : torch.tensor(ratings, dtype=torch.float)
        }

In [21]:
class RecSysModel(torch.nn.Module):
    def __init__(self, n_users, n_movies, embedding_size=256, hidden_dim=256, dropout_rate=0.2):
        super().__init__()

        #create embeddings
        self.user_embed = torch.nn.Embedding(num_embeddings=n_users, embedding_dim=embedding_size)
        self.movie_embed = torch.nn.Embedding(num_embeddings=n_movies, embedding_dim=embedding_size)


        # hidden layers
        self.fc1 = torch.nn.Linear(2 * embedding_size, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

        self.dropout = torch.nn.Dropout(p=dropout_rate)

        self.relu = torch.nn.ReLU()
        
    def forward(self, users, movies, ratings=None):
        user_embeds = self.user_embed(users)
        movie_embeds = self.movie_embed(movies)
        
        output = torch.cat([user_embeds, movie_embeds], dim=1)

        x = self.relu(self.fc1(output))
        x = self.dropout(x)
        output = self.fc2(x)
        return output

In [22]:
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()
ratings_df.userId = lbl_user.fit_transform(ratings_df.userId.values)
ratings_df.movieId = lbl_movie.fit_transform(ratings_df.movieId.values)

df_train, df_valid = model_selection.train_test_split(
    ratings_df, test_size=0.1, random_state=3, stratify=ratings_df.rating.values
)

train_dataset = MovieDataset(
    users = df_train.userId.values,
    movies = df_train.movieId.values,
    ratings = df_train.rating.values
)

valid_dataset = MovieDataset(
    users = df_valid.userId.values,
    movies = df_valid.movieId.values,
    ratings = df_valid.rating.values
)

In [23]:
BATCH_SIZE = 32

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True
)

valid_loader = DataLoader(dataset=valid_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True
)

In [24]:
n_users = len(lbl_user.classes_)#num of unique users
n_movies = len(lbl_movie.classes_)#unique movies

recommendation_model = RecSysModel(n_users, n_movies, embedding_size=64, hidden_dim=128, dropout_rate=0.1).to(device)

optimizer = torch.optim.Adam(recommendation_model.parameters()) #gradient descent aka adjust to yield smallest error
sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

loss_fn = torch.nn.MSELoss()

In [25]:
losses = []

def train():
    epochs = 2
    total_loss = 0
    log_step = 100
    
    print(f'Training on size: {len(train_dataset)}')
    recommendation_model.train()
    
    for epoch_i in range(epochs):
        step_count = 0
        for i, train_data in enumerate(train_loader):
            users = train_data["users"].to(device)
            movies = train_data["movies"].to(device)
    
            output = recommendation_model(users, movies)
            output = output.squeeze()
            
            ratings = train_data["ratings"].to(torch.float32).to(device)
    
            
    
            loss = loss_fn(output, ratings)
            total_loss += loss.sum().item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
            step_count += len(train_data["users"])
    
            if (step_count % log_step == 0 or i == len(train_loader) - 1):
                avg_loss = (total_loss / log_step)
                print(f"epoch {epoch_i} loss at step {step_count} is {avg_loss}")
                losses.append(avg_loss)
                total_loss = 0

if __name__ == '__main__':
    train()

Training on size: 90752
epoch 0 loss at step 800 is 2.6386972951889036
epoch 0 loss at step 1600 is 1.3006292653083802
epoch 0 loss at step 2400 is 0.6132201683521271
epoch 0 loss at step 3200 is 0.4252203130722046
epoch 0 loss at step 4000 is 0.3355283379554749
epoch 0 loss at step 4800 is 0.334689319729805
epoch 0 loss at step 5600 is 0.3007928156852722
epoch 0 loss at step 6400 is 0.3045295637845993
epoch 0 loss at step 7200 is 0.2920352590084076
epoch 0 loss at step 8000 is 0.29119313657283785
epoch 0 loss at step 8800 is 0.3085030776262283
epoch 0 loss at step 9600 is 0.29866264522075653
epoch 0 loss at step 10400 is 0.2712466084957123
epoch 0 loss at step 11200 is 0.261068257689476
epoch 0 loss at step 12000 is 0.27726986587047575
epoch 0 loss at step 12800 is 0.29164719879627227
epoch 0 loss at step 13600 is 0.29744533717632293
epoch 0 loss at step 14400 is 0.26924890518188477
epoch 0 loss at step 15200 is 0.26626304030418396
epoch 0 loss at step 16000 is 0.2672853118181229
epoc

In [26]:
  # Root Mean Squared Error
from sklearn.metrics import root_mean_squared_error

y_true = []
y_pred = []

recommendation_model.eval()

with torch.no_grad():
    for i, valid_data in enumerate(valid_loader):
        model_output = recommendation_model(valid_data['users'].to(device), valid_data['movies'].to(device))

        ratings = valid_data['ratings'].to(device)
        y_true.extend(ratings.cpu().numpy()) 
        y_pred.extend(model_output.cpu().numpy())


# actually calc RMSE
rmse = root_mean_squared_error(y_true, y_pred)
print(f"RMSE: {rmse:.4f}")

RMSE: 0.9080


In [27]:
from collections import defaultdict


def calculate_precision_recall(user_ratings, k, threshold):
    user_ratings.sort(key=lambda x: x[0], reverse=True)
    n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
    n_rec_k = sum(est >= threshold for est, _ in user_ratings[:k])
    n_rel_and_rec_k = sum(
        (true_r >= threshold) and (est >= threshold) for est, true_r in user_ratings[:k]
    )

    precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
    recall = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
    return precision, recall


user_ratings_comparison = defaultdict(list)

with torch.no_grad():
    for valid_data in valid_loader:
        users = valid_data["users"].to(device)
        movies = valid_data["movies"].to(device)
        ratings = valid_data["ratings"].to(device)
        output = recommendation_model(users, movies)

        for user, pred, true in zip(users, output, ratings):
            user_ratings_comparison[user.item()].append((pred[0].item(), true.item()))

user_precisions = dict()
user_based_recalls = dict()

k = 50
threshold = 3

for user_id, user_ratings in user_ratings_comparison.items():
    precision, recall = calculate_precision_recall(user_ratings, k, threshold)
    user_precisions[user_id] = precision
    user_based_recalls[user_id] = recall


    average_precision = sum(prec for prec in user_precisions.values()) / len(
    user_precisions
)
average_recall = sum(rec for rec in user_based_recalls.values()) / len(
    user_based_recalls
)

print(f"precision @ {k}: {average_precision:.4f}")
print(f"recall @ {k}: {average_recall:.4f}")

precision @ 50: 0.8899
recall @ 50: 0.8818


In [31]:
def top_recommendations(user_id, all_movies, k=5, batch_size=100):
    recommendation_model.eval()


    
    watched_movies = set(ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist())
    unwatched_movies = [m for m in all_movies if m not in watched_movies]
    # fill unwatched movies
    # for m in all_movies:
    #     if m not in watched_movies:
    #         unwatched_movies.append(m)

    prediction = []
    top_k_recommendations = []

    with torch.no_grad():
        for i in range(0, len(unwatched_movies), batch_size):
            batched_unwatched = unwatched_movies[i:i+batch_size]
            movie_tensor = torch.tensor(batched_unwatched).to(device)
            user_tensor = torch.tensor([user_id] * len(batched_unwatched)).to(device)
            prediction_model = recommendation_model(user_tensor, movie_tensor).view(-1).tolist()
            prediction.extend(zip(batched_unwatched, prediction_model))

    prediction.sort(key=lambda x: x[1], reverse=True)

    for (m_id, _) in prediction[:k]:
        top_k_recommendations.append(m_id)

    # Convert this encoded movieId's back to their original ids
    top_k_recommendations = lbl_movie.inverse_transform(top_k_recommendations)
    
    return top_k_recommendations

# ---------------

all_movies = ratings_df['movieId'].unique().tolist()
user_id = 1

recommendations = top_recommendations(user_id, all_movies, k=5)
print(f"Recommendations for user {user_id}: {recommendations}")

for i in recommendations:
    print(movies_dict[i])

user_id = 5
recommendations = top_recommendations(user_id, all_movies, k=5)
print(f"Recommendations for user {user_id}: {recommendations}")

for i in recommendations:
    print(movies_dict[i])

Recommendations for user 1: [4957  260  741 4154 5685]
Sudden Impact (1983)
Star Wars: Episode IV - A New Hope (1977)
Ghost in the Shell (Kôkaku kidôtai) (1995)
Recess: School's Out (2001)
Real Women Have Curves (2002)
Recommendations for user 5: [ 187 4154  741 1178 5685]
Party Girl (1995)
Recess: School's Out (2001)
Ghost in the Shell (Kôkaku kidôtai) (1995)
Paths of Glory (1957)
Real Women Have Curves (2002)
