In [None]:
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import tqdm
import tqdm.notebook


In [None]:
from loader import *
from features import *
from training import *
from model import *

# Data loading

In [None]:
# Read the raw movie and rating tables. The two limits presumably cap how many
# users / movies are kept - confirm against load_raw_data.
movies, ratings = load_raw_data(
    user_limit=1000,
    movie_limit=500,
    path='data/ml-32m/',
)

# Derive the genre lookup and the genre collection from the movie table.
genre_lookup, genres = generate_genre_lookup(movies)

# Forward and reverse lookups between movie ids / genres and their embedding keys.
(
    lookup_movie_id_to_emb,
    lookup_emb_to_movie_id,
    lookup_genre_to_emb,
    lookup_emb_to_genre,
) = generate_embeddings(movies, genres)

# Aggregate the raw ratings (aggregation details live in the helper).
aggregated_user_scores = generate_aggregated_user_scores(ratings)

In [None]:
# Quick sanity check: display the first rows of the aggregated user scores.
aggregated_user_scores.head()

In [None]:
# User-side inputs: one feature array plus the matching rating targets.
user_features, user_movie_ratings = generate_user_features(
    aggregated_user_scores,
    genre_lookup,
    lookup_genre_to_emb,
    lookup_movie_id_to_emb,
)

# Item-side inputs: movie features and genre features.
movie_features, genre_features = generate_movie_features(
    genre_lookup,
    lookup_genre_to_emb,
    lookup_movie_id_to_emb,
)

# Report the shapes so a mismatch is visible before any training starts.
print(f"User features of shape {user_features.shape} with {user_movie_ratings.shape} ratings and movie/genre features of shape {movie_features.shape} / {genre_features.shape}")

In [None]:
# Convert the NumPy feature / rating arrays to torch tensors.
#
# This cell rebinds the same names it reads, so it must be safe to run twice.
# torch.from_numpy raises a TypeError when handed a tensor (which is what these
# names hold after the first run). torch.as_tensor converts an ndarray the same
# way (shared memory, dtype preserved) and returns an existing tensor unchanged,
# which makes the cell idempotent.
user_features = torch.as_tensor(user_features)
user_movie_ratings = torch.as_tensor(user_movie_ratings)
movie_features = torch.as_tensor(movie_features)
genre_features = torch.as_tensor(genre_features)

# Modeling section

In [None]:
# --- Model hyper-parameters ---------------------------------------------------
movie_embedding_size = 20
genre_embedding_size = 20
# The user tower's size is the sum of the movie and genre embedding sizes
# (presumably so it can be matched against both item-side embeddings - confirm
# against TwoTowerModel).
user_features_embedding_size = movie_embedding_size + genre_embedding_size
# NOTE(review): `device` is selected here but nothing in this cell moves the model
# or the batches onto it, so training runs wherever the tensors already live.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# --- Training hyper-parameters ------------------------------------------------
N_epochs = 1_000  # Each "epoch" is one optimisation step on a single random batch.
batch_size = 100
log_every = 100
loss_filter_window_length = 10  # Number of most recent losses averaged when logging.

mdl = TwoTowerModel(
    user_features, movie_features, genre_features,
    user_features_embedding_size, movie_embedding_size, genre_embedding_size
)

optimiser = torch.optim.SGD(mdl.parameters(), lr=0.00000001, momentum=0.9)
loss_fn = torch.nn.MSELoss()

# Sliding window over the most recent `loss_filter_window_length` batch losses.
filtered_loss = []
# tqdm.notebook.tqdm is the supported replacement for the deprecated tqdm.tqdm_notebook.
for epoch in tqdm.notebook.tqdm(range(1, N_epochs + 1)):
    # Generate the random batch:
    user_batch, move_batch, genre_batch, ratings_batch = generate_batch(
        user_features, movie_features, genre_features, user_movie_ratings, batch_size
    )

    # Gradients accumulate in `.grad` across backward() calls, so they have to be
    # cleared every step; otherwise each update uses the sum of all past gradients.
    optimiser.zero_grad()
    output = mdl.forward(user_batch, move_batch, genre_batch)
    loss = loss_fn(output, ratings_batch)
    loss.backward()
    optimiser.step()

    # Append the newest loss and trim from the front so the window never exceeds
    # the configured length (slicing with a negative bound is a no-op while the
    # list is still shorter than the window).
    filtered_loss.append(loss.item())
    del filtered_loss[:-loss_filter_window_length]

    if epoch % log_every == 0:
        print(f"Epoch {epoch} loss: {np.mean(filtered_loss)}")