__Imports__

In [1]:
import torch
import pandas as pd
import random

from tqdm import tqdm
from torch.utils.data import random_split
from torch import Generator
from torch import optim
from model import LightGCN, train_model
from sklearn import preprocessing
from torch_sparse import SparseTensor
from preprocessing import dataset, init_interaction_edges

* There are 610 users and 9724 movies in this dataset. 
* On average, users give a movie a 3.5/5 rating.

In [2]:
dataset.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,325.127564,3101.735561,3.501557,1205946000.0
std,182.618491,2627.050983,1.042529,216261000.0
min,0.0,0.0,0.5,828124600.0
25%,176.0,900.0,3.0,1019124000.0
50%,324.0,2252.0,3.5,1186087000.0
75%,476.0,5095.25,4.0,1435994000.0
max,609.0,9723.0,5.0,1537799000.0


In [3]:
# Ratings at or above this value are kept as interactions.
threshold = 3.5
dataset_above_thresh = dataset.loc[dataset["rating"] >= threshold]

# User/movie counts are taken from the full dataset, not the thresholded subset.
num_users = dataset["userId"].unique().size
num_movies = dataset["movieId"].unique().size

print(f"users: {num_users}, movies: {num_movies}")
# Summary statistics of the rows that survive the threshold.
dataset_above_thresh.describe()

users: 610, movies: 9724


Unnamed: 0,userId,movieId,rating,timestamp
count,61716.0,61716.0,61716.0,61716.0
mean,319.869272,3193.682513,4.176915,1219983000.0
std,181.55984,2664.966697,0.520208,209665900.0
min,0.0,0.0,3.5,828124600.0
25%,168.0,910.0,4.0,1047054000.0
50%,317.0,2335.0,4.0,1201160000.0
75%,473.0,5309.0,4.5,1436944000.0
max,609.0,9723.0,5.0,1537799000.0


__Create the graph__

In [4]:
# Build the user -> movie edge list from ratings that meet the threshold.
# In the recorded run `locations` is a 2 x E index tensor and `values` holds
# the E matching ratings (float64).
locations, values = init_interaction_edges(
    dataset,
    "userId",
    "movieId",
    "rating",
    threshold,
)

print(locations, locations.shape)
print(values, values.shape)
# The smallest kept rating should not fall below the threshold.
print(values.min())

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    2,    5,  ..., 9443, 9444, 9445]]) torch.Size([2, 61716])
tensor([4., 4., 4.,  ..., 5., 5., 5.], dtype=torch.float64) torch.Size([61716])
tensor(3.5000, dtype=torch.float64)


In [5]:
# One entry in `values` per kept interaction (edge).
num_interactions = len(values)

print(f"edges: {num_interactions}, nodes: {num_movies+num_users}")

edges: 61716, nodes: 10334


__Split into test and train sets__

In [6]:
# Split edge positions (0 .. num_interactions - 1) rather than the edges themselves.
indices = [*range(num_interactions)]

# Fixed seed so the 80/20 partition is the same on every run.
generator = Generator()
generator.manual_seed(42)
train_set_split, test_set_split = random_split(
    indices,
    [0.8, 0.2],
    generator=generator,
)

print(f"train: {len(train_set_split)} interactions")
print(f"test: {len(test_set_split)} interactions")
# Sanity check: the two splits together account for every interaction.
print(num_interactions == len(train_set_split) + len(test_set_split))

train: 49373 interactions
test: 12343 interactions
True


In [7]:
# Select the edge endpoints (columns of `locations`) and the matching ratings
# that belong to each split.
train_indices, train_values = locations[:, train_set_split], values[train_set_split]
test_indices, test_values = locations[:, test_set_split], values[test_set_split]

print(train_indices, train_indices.shape)
print(train_values, train_values.shape)

tensor([[ 102,  181,  433,  ...,  181,  205,  338],
        [6706, 1791, 1157,  ..., 1059,  115, 6241]]) torch.Size([2, 49373])
tensor([3.5000, 4.5000, 5.0000,  ..., 4.0000, 4.0000, 5.0000],
       dtype=torch.float64) torch.Size([49373])


__Train the model__

In [8]:
# Training hyper-parameters.
# NOTE(review): only LR is referenced elsewhere in the visible notebook (by the
# optimizer); train_model is called without the others -- confirm whether they
# are actually consumed or are duplicated inside the model module.
ITERATIONS = 10_000
EPOCHS = 10
BATCH_SIZE = 2 ** 10
LR = 0.001
ITERS_PER_EVAL = 200
ITERS_PER_LR_DECAY = 200
K = 10
LAMBDA = 1e-6

In [9]:
# Size the model from the counts computed earlier instead of repeating the
# literals 610 / 9724 / 10334, so this cell stays correct if the dataset or
# the filtering changes.
model = LightGCN(num_users, num_movies)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using {device}.")

model = model.to(device)
model.train()

# NOTE(review): weight_decay is hard-coded to 0.01 while a LAMBDA constant is
# defined in the hyper-parameter cell -- confirm which value is intended.
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=0.01)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

locations = locations.to(device)
train_indices = train_indices.to(device)

# Square sparse structure over all nodes (users + movies) holding the training edges.
num_nodes = num_users + num_movies
train_set_sparse = SparseTensor(
    row=train_indices[0],
    col=train_indices[1],
    sparse_sizes=(num_nodes, num_nodes),
)
train_indices

Using cpu.


tensor([[ 102,  181,  433,  ...,  181,  205,  338],
        [6706, 1791, 1157,  ..., 1059,  115, 6241]])

In [10]:
# Run training via train_model from the local `model` module. In the recorded
# run this took about 9.5 minutes for 10000 iterations on CPU.
train_model(model, device, optimizer, scheduler, train_indices, train_set_sparse)

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [09:29<00:00, 17.57it/s]


In [23]:
# Movie metadata; the "title" and "genres" columns are used when printing recommendations.
movies = pd.read_csv("../data/movies/raw/movies.csv")

# Replace the raw movieId values with consecutive integer labels.
# NOTE(review): the encoder is fit on movies.csv; predict() looks rows up by the
# movie indices used in `dataset` and by the model -- confirm both encodings agree.
movie_ids = preprocessing.LabelEncoder()
movies["movieId"] = movie_ids.fit_transform(movies["movieId"].values)

def _print_ranked_movies(movie_indices):
    """Print a 1-based numbered "title, genres" line for each movie index.

    Rows are looked up in the global `movies` frame by index label.
    """
    for rank, movie in enumerate(movie_indices, start=1):
        title = movies.at[movie, "title"]
        genres = movies.at[movie, "genres"]
        print(f"{rank}. {title}, {genres}")


def predict(model, id, num):
    """Print a user's top-rated movies and the model's top-scoring movies.

    Args:
        model: Object exposing `users_emb.weight` and `items_emb.weight`
            embedding matrices.
        id: User index; used to filter `dataset["userId"]` and to select the
            user's row in `model.users_emb.weight`.
        num: Maximum number of the user's highest-rated movies to show.

    Returns:
        None. Results are printed to stdout.

    Notes:
        Reads the notebook-level `dataset` and `movies` frames. Movies the
        user has already rated are not removed from the suggestions.
    """
    user_movies = dataset[dataset["userId"] == id]
    top_rated = (
        user_movies[["movieId", "rating"]]
        .sort_values(by="rating", ascending=False)
        .head(num)
    )
    liked_ids = top_rated["movieId"].to_list()

    # Inference only: no autograd graph is needed for scoring.
    with torch.no_grad():
        user_emb = model.users_emb.weight[id]
        # One score per movie: product of every item embedding with the user embedding.
        scores = model.items_emb.weight @ user_emb
        # Show three more suggestions than liked movies, but never ask topk for
        # more entries than there are items (torch.topk raises otherwise).
        k = min(len(liked_ids) + 3, scores.shape[0])
        _, topk = torch.topk(scores, k=k)

    print(f"User {id} likes:")
    _print_ranked_movies(liked_ids)

    print(f"\nUser {id} may like:")
    _print_ranked_movies(topk.tolist())

# Pick a random valid user index. random.randint(0, 610) includes 610, which is
# out of range for user indices 0..609; randrange excludes the upper bound.
# NOTE(review): the name `id` shadows the builtin; kept in case other cells use it.
id = random.randrange(num_users)

predict(model, id, 5)

User 237 likes:
1. Man with the Golden Arm, The (1955), Drama
2. Last Temptation of Christ, The (1988), Drama
3. Out of the Past (1947), Film-Noir
4. Fried Green Tomatoes (1991), Comedy|Crime|Drama
5. Lord of the Flies (1990), Adventure|Drama|Thriller

User 237 may like:
1. Turbo Kid (2015), Action|Adventure|Sci-Fi
2. Pollock (2000), Drama
3. Jumper (2008), Action|Adventure|Drama|Sci-Fi|Thriller
4. JCVD (2008), Action|Drama|Thriller
5. Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922), Horror
6. Searching for Sugar Man (2012), Documentary
7. Celtic Pride (1996), Comedy
8. Leap of Faith (1992), Comedy|Drama
