Note that this notebook has been adapted from https://colab.research.google.com/drive/1I5S2vhcfumg1mlfNhH5MIDE4jbWZTcFW#scrollTo=X_R02ii268lQ

the data used is the one for movielens 1m



	AGE categories:
  
- 1:  "Under 18"
- 18:  "18-24"
- 25:  "25-34"
- 35:  "35-44"
- 45:  "45-49"
- 50:  "50-55"
- 56:  "56+"

In [42]:
import pandas as pd
import torch as torch
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np
from rich import inspect

import math
from torch.utils.data import Dataset
import itertools
import torch.nn.functional as F
import joblib

In [2]:
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = "cpu"
device

'cpu'

In [3]:
movies_df = pd.read_csv(
    "../data/ml-1m/movies.dat",
    sep="::",
    names=["movieId", "title", "genres"],
    encoding="latin-1",
    engine="python",
)
movies_df["movieId_index"] = movies_df["movieId"].astype("category").cat.codes

In [4]:
movies_df.head(4)

Unnamed: 0,movieId,title,genres,movieId_index
0,1,Toy Story (1995),Animation|Children's|Comedy,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama,3


In [5]:
movies_df.movieId_index.describe(), movies_df.movieId_index.nunique()

(count    3883.000000
 mean     1941.000000
 std      1121.069876
 min         0.000000
 25%       970.500000
 50%      1941.000000
 75%      2911.500000
 max      3882.000000
 Name: movieId_index, dtype: float64,
 3883)

In [6]:
movies_df.shape

(3883, 4)

In [7]:
users_df = pd.read_csv(
    "../data/ml-1m/users.dat",
    sep="::",
    header=None,
    names=["userId", "gender", "age", "occupation", "zipcode"],
    engine="python",
)
users_df = users_df.drop(columns=["occupation"])
users_df["gender_index"] = users_df["gender"].astype("category").cat.codes
users_df["age_index"] = users_df["age"].astype("category").cat.codes
users_df["userId_index"] = users_df["userId"].astype("category").cat.codes

In [8]:
print(movies_df.shape)
users_df.head(5)

(3883, 4)


Unnamed: 0,userId,gender,age,zipcode,gender_index,age_index,userId_index
0,1,F,1,48067,0,0,0
1,2,M,56,70072,1,6,1
2,3,M,25,55117,1,2,2
3,4,M,45,2460,1,4,3
4,5,M,25,55455,1,2,4


In [9]:
users_df.age.value_counts()

25    2096
35    1193
18    1103
45     550
50     496
56     380
1      222
Name: age, dtype: int64

In [10]:
ratings = pd.read_csv(
    "../data/ml-1m/ratings.dat",
    sep="::",
    names=["userId", "movieId", "rating", "time"],
    engine="python",
)
ratings.head()

Unnamed: 0,userId,movieId,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
ratings.userId.nunique(), ratings.movieId.nunique()

(6040, 3706)

In [12]:
ratings = ratings.join(movies_df.set_index("movieId"), on="movieId")
ratings = ratings.join(users_df.set_index("userId"), on="userId")

In [13]:
ratings.head()

Unnamed: 0,userId,movieId,rating,time,title,genres,movieId_index,gender,age,zipcode,gender_index,age_index,userId_index
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,1176,F,1,48067,0,0,0
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,655,F,1,48067,0,0,0
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,902,F,1,48067,0,0,0
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,3339,F,1,48067,0,0,0
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,2286,F,1,48067,0,0,0


In [14]:
ratings.movieId_index.max() + 1, ratings.movieId_index.nunique()

(3883, 3706)

In [15]:
ratings.userId_index.max() + 1, ratings.userId_index.nunique()

(6040, 6040)

In [16]:
ratings.age_index.max() + 1, ratings.age_index.nunique()

(7, 7)

In [17]:
ratings.gender_index.max() + 1, ratings.gender_index.nunique()

(2, 2)

In [18]:
feature_columns = [
    "userId_index",
    "movieId_index",
    "age_index",
    "gender_index",
]

In [19]:
# This is the width of each feature: what is the max index that resides in the ratings df:
features_sizes = {
    "userId_index": ratings["userId_index"].max() + 1,
    "movieId_index": ratings["movieId_index"].max() + 1,
    "age_index": ratings["age_index"].max() + 1,
    "gender_index": ratings["gender_index"].max() + 1,
}

print(f" features sizes \n : {features_sizes}")

# calculate offsets.
# Each feature starts from the end of the last one.

next_offset = 0
features_offsets = {}
for k, v in features_sizes.items():
    features_offsets[k] = next_offset
    next_offset += v

print(f" features offsets \n : {features_offsets}")

 features sizes 
 : {'userId_index': 6040, 'movieId_index': 3883, 'age_index': 7, 'gender_index': 2}
 features offsets 
 : {'userId_index': 0, 'movieId_index': 6040, 'age_index': 9923, 'gender_index': 9930}


Using the following adjusted indices, we could filter for specific movie, user, gender and age-group embedding:

In [20]:
# map all column indices to start from correct offset
for column in feature_columns:
    print("\n", column)
    print("before", ratings[column].min(), ratings[column].max())
    ratings[column] = ratings[column].apply(
        lambda c: c + features_offsets[column]
    )
    print("after", ratings[column].min(), ratings[column].max())


 userId_index
before 0 6039
after 0 6039

 movieId_index
before 0 3882
after 6040 9922

 age_index
before 0 6
after 9923 9929

 gender_index
before 0 1
after 9930 9931


In [21]:
ratings[[*feature_columns, "rating"]].head(5)

Unnamed: 0,userId_index,movieId_index,age_index,gender_index,rating
0,0,7216,9923,9930,5
1,0,6695,9923,9930,3
2,0,6942,9923,9930,3
3,0,9379,9923,9930,4
4,0,8326,9923,9930,5


In [22]:
data_x = torch.tensor(ratings[feature_columns].values)
data_y = torch.tensor(ratings["rating"].values).float()
dataset = data.TensorDataset(data_x, data_y)

In [23]:
bs = 1024
train_n = int(len(dataset) * 0.9)
valid_n = len(dataset) - train_n
splits = [train_n, valid_n]
assert sum(splits) == len(dataset)
trainset, devset = torch.utils.data.random_split(dataset, splits)
train_dataloader = data.DataLoader(trainset, batch_size=bs, shuffle=True)
dev_dataloader = data.DataLoader(devset, batch_size=bs, shuffle=True)

In [24]:
# copied from fastai:
def trunc_normal_(x, mean=0.0, std=1.0):
    "Truncated normal initialization."
    # From https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
    return x.normal_().fmod_(2).mul_(std).add_(mean)

In [25]:
class FMModel(nn.Module):
    def __init__(self, n, k):
        super().__init__()

        self.w0 = nn.Parameter(torch.zeros(1))
        self.bias = nn.Embedding(n, 1)
        self.embeddings = nn.Embedding(n, k)

        # See https://arxiv.org/abs/1711.09160
        with torch.no_grad():
            trunc_normal_(self.embeddings.weight, std=0.01)
        with torch.no_grad():
            trunc_normal_(self.bias.weight, std=0.01)

    def forward(self, X):
        emb = self.embeddings(X)
        # calculate the interactions in complexity of O(nk) see lemma 3.1 from paper
        pow_of_sum = emb.sum(dim=1).pow(2)
        sum_of_pow = emb.pow(2).sum(dim=1)
        pairwise = (pow_of_sum - sum_of_pow).sum(1) * 0.5
        bias = self.bias(X).squeeze().sum(1)
        return torch.sigmoid(self.w0 + bias + pairwise) * 5.5

In [26]:
# fit/test functions
def fit(iterator, model, optimizer, criterion):
    train_loss = 0
    model.train()
    for x, y in iterator:
        optimizer.zero_grad()
        y_hat = model(x.to(device))
        loss = criterion(y_hat, y.to(device))
        train_loss += loss.item() * x.shape[0]
        loss.backward()
        optimizer.step()
    return train_loss / len(iterator.dataset)


def test(iterator, model, criterion):
    train_loss = 0
    model.eval()
    for x, y in iterator:
        with torch.no_grad():
            y_hat = model(x.to(device))
        loss = criterion(y_hat, y.to(device))
        train_loss += loss.item() * x.shape[0]
    return train_loss / len(iterator.dataset)

In [27]:
def train_n_epochs(model, n, optimizer, scheduler):
    criterion = nn.MSELoss().to(device)
    for epoch in range(n):
        start_time = time.time()
        train_loss = fit(train_dataloader, model, optimizer, criterion)
        valid_loss = test(dev_dataloader, model, criterion)
        scheduler.step()
        secs = int(time.time() - start_time)
        print(f"epoch {epoch}. time: {secs}[s]")
        print(f"\ttrain rmse: {(math.sqrt(train_loss)):.4f}")
        print(f"\tvalidation rmse: {(math.sqrt(valid_loss)):.4f}")

In [28]:
data_x.max()

tensor(9931)

In [29]:
# we use " data_x.max() + 1 " in order to get a different embedding for each of the indices (users, genders, movies and age)

model = FMModel(data_x.max() + 1, 120).to(device)
wd = 1e-5
lr = 0.001
epochs = 10
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[7], gamma=0.1)
criterion = nn.MSELoss().to(device)
for epoch in range(epochs):
    start_time = time.time()
    train_loss = fit(train_dataloader, model, optimizer, criterion)
    valid_loss = test(dev_dataloader, model, criterion)
    scheduler.step()
    secs = int(time.time() - start_time)
    print(f"epoch {epoch}. time: {secs}[s]")
    print(f"\ttrain rmse: {(math.sqrt(train_loss)):.4f}")
    print(f"\tvalidation rmse: {(math.sqrt(valid_loss)):.4f}")

epoch 0. time: 6[s]
	train rmse: 0.9444
	validation rmse: 0.9152
epoch 1. time: 6[s]
	train rmse: 0.9039
	validation rmse: 0.9038
epoch 2. time: 6[s]
	train rmse: 0.8870
	validation rmse: 0.8907
epoch 3. time: 6[s]
	train rmse: 0.8662
	validation rmse: 0.8768
epoch 4. time: 6[s]
	train rmse: 0.8442
	validation rmse: 0.8660
epoch 5. time: 6[s]
	train rmse: 0.8212
	validation rmse: 0.8569
epoch 6. time: 6[s]
	train rmse: 0.7919
	validation rmse: 0.8499
epoch 7. time: 6[s]
	train rmse: 0.7473
	validation rmse: 0.8469
epoch 8. time: 6[s]
	train rmse: 0.7413
	validation rmse: 0.8459
epoch 9. time: 6[s]
	train rmse: 0.7363
	validation rmse: 0.8454


In [41]:
torch.save(model, "../models/recommender_1_m.pt")

## Understand Embeddings

### movies

In [31]:
movies = ratings.drop_duplicates("movieId_index").copy()
movie_embeddings = model.embeddings(
    torch.tensor(movies["movieId_index"].values, device=device).long()
)
movies["embedding"] = movie_embeddings.tolist()
movie_biases = model.bias(
    torch.tensor(movies["movieId_index"].values, device=device).long()
)
movies["bias"] = movie_biases.cpu().detach().numpy()

In [32]:
movies.shape

(3706, 15)

In [34]:
movies[["title", "movieId_index", "embedding", "bias"]].to_csv('../data/movies_embeddings.csv', index=False)

In [35]:
movies.query(" title.str.lower().str.contains('star wars') ")

Unnamed: 0,userId,movieId,rating,time,title,genres,movieId_index,gender,age,zipcode,gender_index,age_index,userId_index,embedding,bias
44,1,260,4,978300760,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,6297,F,1,48067,9930,9923,0,"[0.08743041008710861, 0.2234169840812683, 0.24...",0.335761
60,2,2628,3,978300051,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Fantasy|Sci-Fi,8599,M,56,70072,9931,9929,1,"[0.25688087940216064, 0.12146784365177155, 0.2...",0.020419
64,2,1210,4,978298151,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War,7232,M,56,70072,9931,9929,1,"[0.17311148345470428, 0.14034566283226013, 0.3...",0.179256
127,2,1196,5,978298730,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War,7218,M,56,70072,9931,9929,1,"[0.03483600541949272, 0.16636419296264648, 0.1...",0.249801


In [36]:
star_wars_5_index = torch.tensor(6297, device=device)
star_war_embeddings = model.embeddings(star_wars_5_index)
cosine_similarities = torch.tensor(
    [
        F.cosine_similarity(star_war_embeddings, i, dim=0)
        for i in movie_embeddings
    ]
)
movies.iloc[cosine_similarities.argsort(descending=True).detach().numpy()][
    "title"
].values[:10]

array(['Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Star Wars: Episode VI - Return of the Jedi (1983)',
       'Raiders of the Lost Ark (1981)',
       'Indiana Jones and the Last Crusade (1989)',
       'Star Wars: Episode I - The Phantom Menace (1999)',
       'Ghostbusters (1984)', 'Soft Toilet Seats (1999)',
       'Star Trek: The Wrath of Khan (1982)', 'Batman (1989)'],
      dtype=object)

In [37]:
toy_story_index = torch.tensor(6040, device=device)
toy_story_embeddings = model.embeddings(toy_story_index)
cosine_similarities = torch.tensor(
    [
        F.cosine_similarity(toy_story_embeddings, i, dim=0)
        for i in movie_embeddings
    ]
)
[
    i
    for i in movies.iloc[
        cosine_similarities.argsort(descending=True).detach().numpy()
    ]["title"].values[:10]
]

['Toy Story (1995)',
 'Toy Story 2 (1999)',
 'Aladdin (1992)',
 "Bug's Life, A (1998)",
 'Beauty and the Beast (1991)',
 'Lion King, The (1994)',
 'Little Mermaid, The (1989)',
 'Big Combo, The (1955)',
 'Tarzan (1999)',
 'Big (1988)']

### genders

In [38]:
genders = ratings.drop_duplicates("gender_index").copy()

gender_embeddings = model.embeddings(
    torch.tensor(genders["gender_index"].values, device=device).long()
)
genders["embedding"] = gender_embeddings.tolist()
genders[['gender', 'embedding']].to_csv('../data/genders_embeddings.csv', index=False)


### age-groups:

In [40]:
ages = ratings.drop_duplicates("age_index").copy()

age_embeddings = model.embeddings(
    torch.tensor(ages["age_index"].values, device=device).long()
)
ages["embedding"] = age_embeddings.tolist()
ages = ages[['age', 'age_index']]
ages['age'] = ages['age'].replace(
    {
        1:  "Under 18",
        18:  "18-24",
        25:  "25-34",
        35:  "35-44",
        45:  "45-49",
        50:  "50-55",
        56:  "56+"
    }
)

ages.to_csv('../data/ages_embeddings.csv', index=False)
ages

Unnamed: 0,age,age_index
0,Under 18,9923
53,56+,9929
182,25-34,9925
233,45-49,9927
452,50-55,9928
523,35-44,9926
1940,18-24,9924


## Recommending

In [44]:
man_embedding = model.embeddings(torch.tensor(9754, device=device))
age18_25_embedding = model.embeddings(torch.tensor(9747, device=device))
metadata_embedding = man_embedding + age18_25_embedding
rankings = movie_biases.squeeze() + (metadata_embedding * movie_embeddings).sum(
    1
)
[
    i
    for i in movies.iloc[rankings.argsort(descending=True).cpu()][
        "title"
    ].values
][:10]

['Being John Malkovich (1999)',
 'American Beauty (1999)',
 'Fargo (1996)',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)',
 'Citizen Kane (1941)',
 'Brazil (1985)',
 'Magnolia (1999)',
 'Run Lola Run (Lola rennt) (1998)',
 'Croupier (1998)',
 'Wrong Trousers, The (1993)']

In [43]:
woman_embedding = model.embeddings(torch.tensor(9753, device=device))
age50_56_embedding = model.embeddings(torch.tensor(9751, device=device))
metadata_embedding = woman_embedding + age50_56_embedding
rankings = movie_biases.squeeze() + (metadata_embedding * movie_embeddings).sum(
    1
)
[
    i
    for i in movies.iloc[rankings.argsort(descending=True).cpu()][
        "title"
    ].values
][:10]

['Patch Adams (1998)',
 'It Takes Two (1995)',
 'Random Hearts (1999)',
 'Death Wish V: The Face of Death (1994)',
 'Jingle All the Way (1996)',
 'Baby Geniuses (1999)',
 'Jack Frost (1998)',
 'Sister Act 2: Back in the Habit (1993)',
 'All Dogs Go to Heaven 2 (1996)',
 "Porky's II: The Next Day (1983)"]

In [45]:
inspect(model)

In [47]:
model = torch.load("../models/recommender_1_m.pt")