In [12]:
import fastai
from fastai import learner
from fastai.losses import MSELossFlat
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from fastai.data.core import DataLoaders
import numpy as np
from sklearn.model_selection import train_test_split

import polars as pl

In [2]:
df_ratings = pl.read_csv("../data/ml-latest-small/ratings.csv")

df_ratings.head()

userId,movieId,rating,timestamp
i64,i64,f64,i64
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931


In [3]:
embd = nn.Embedding(5, 3)

embd

Embedding(5, 3)

In [4]:
idx = torch.tensor([1, 2], dtype=torch.int32)

embd(idx)

tensor([[ 0.4143, -0.7636, -1.4355],
        [-0.3490, -2.3625,  0.1801]], grad_fn=<EmbeddingBackward0>)

In [5]:
torch.matmul(embd(idx), embd(idx).T)

tensor([[2.8154, 1.4009],
        [1.4009, 5.7356]], grad_fn=<MmBackward0>)

In [6]:
class PolarsDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.to_numpy()
        self.features = torch.FloatTensor(self.data[:, :-1])
        self.labels = torch.FloatTensor(self.data[:, -1])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [7]:
df_ratings = df_ratings.drop("timestamp")

df_ratings.head()

userId,movieId,rating
i64,i64,f64
1,1,4.0
1,3,4.0
1,6,4.0
1,47,5.0
1,50,5.0


In [8]:
# Create mapping user id and movie id
def map_dict(values):
    vals = list(set(values))
    return {v: i for i, v in enumerate(vals)}

In [9]:
movie_ids = map_dict(df_ratings["movieId"])
user_ids = map_dict(df_ratings["userId"])

df_ratings = df_ratings.with_columns([
    pl.col("userId").replace(user_ids).alias("user_id"),
    pl.col("movieId").replace(movie_ids).alias("movie_id")
])

In [10]:
valid_size = 0.2

df_train, df_valid = train_test_split(
    df_ratings.drop(["movieId", "userId"]).select(["user_id", "movie_id", "rating"]),
    test_size=valid_size,
)

In [11]:
ds_train = PolarsCollabDataset(df_train)
ds_valid = PolarsCollabDataset(df_valid)

batch_size = 32
dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
dl_valid = DataLoader(ds_valid, batch_size=batch_size, shuffle=True)

dls_ratings = DataLoaders(dl_train, dl_valid)

NameError: name 'PolarsCollabDataset' is not defined

In [None]:
class DotProductBias(nn.Module):
    def __init__(self, n_users, n_movies, n_factors, y_range):
        super(DotProductBias, self).__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.user_bias = nn.Embedding(n_users, 1)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.movie_bias = nn.Embedding(n_movies, 1)
        self.y_range = y_range

    def forward(self, x):
        x = torch.tensor(x, dtype=torch.int32)
        users = self.user_factors(x[:, 0])
        movies = self.movie_factors(x[:, 1])
        user_biases = self.user_bias(x[:, 0])
        movie_biases = self.movie_bias(x[:, 1])

        res = (users * movies).sum(dim=1)
        res += (user_biases + movie_biases).flatten()

        return torch.sigmoid(res) * (y_range[1] - y_range[0]) + y_range[0]

In [None]:
n_users = df_ratings["userId"].unique().shape[0]
n_movies = df_ratings["movieId"].unique().shape[0]
n_factors = 50

y_range = (df_ratings["rating"].min(), df_ratings["rating"].max())

In [None]:
collab_model = DotProductBias(n_users, n_movies, n_factors, y_range)
collab_model

DotProductBias(
  (user_factors): Embedding(610, 50)
  (user_bias): Embedding(610, 1)
  (movie_factors): Embedding(9724, 50)
  (movie_bias): Embedding(9724, 1)
)

In [226]:
smp = next(iter(dl_train))[0]

collab_model(smp)

  x = torch.tensor(x, dtype=torch.int32)


tensor([4.9043, 0.5000, 0.9823, 0.5039, 4.9989, 4.7626, 0.5018, 0.5000, 4.9997,
        2.6362, 4.9989, 4.9290, 1.8125, 0.5012, 0.5044, 4.9789, 0.5003, 4.9988,
        0.6869, 1.2231, 0.5720, 4.9993, 4.8546, 4.7596, 2.3201, 0.5000, 4.9958,
        0.5130, 3.4653, 4.9999, 4.9947, 4.3934], grad_fn=<AddBackward0>)

In [228]:
collab_model(smp)

  x = torch.tensor(x, dtype=torch.int32)


tensor([4.9043, 0.5000, 0.9823, 0.5039, 4.9989, 4.7626, 0.5018, 0.5000, 4.9997,
        2.6362, 4.9989, 4.9290, 1.8125, 0.5012, 0.5044, 4.9789, 0.5003, 4.9988,
        0.6869, 1.2231, 0.5720, 4.9993, 4.8546, 4.7596, 2.3201, 0.5000, 4.9958,
        0.5130, 3.4653, 4.9999, 4.9947, 4.3934], grad_fn=<AddBackward0>)

In [229]:
collab_learner = learner.Learner(dls_ratings, collab_model, loss_func=MSELossFlat())

In [230]:
collab_learner.fit(50, 5e-3, wd=0.1)

  x = torch.tensor(x, dtype=torch.int32)


[0, 1.0714415311813354, 1.031864047050476, '00:07']
[1, 0.7664541602134705, 0.7796425819396973, '00:07']
[2, 0.673209547996521, 0.749434769153595, '00:07']
[3, 0.6734711527824402, 0.7544600963592529, '00:07']
[4, 0.631726086139679, 0.746547520160675, '00:08']


In [233]:
collab_learner.predict

<bound method Learner.predict of <fastai.learner.Learner object at 0x31ce9b950>>

In [239]:
collab_learner.movie_factors(torch.tensor(2))

tensor([-1.3871e-01, -3.8165e-02,  1.8014e-01, -1.2376e-01,  4.1744e-02,
        -5.7368e-02,  1.1342e-01,  8.4517e-02, -1.4927e-01, -1.7615e-01,
        -4.4052e-02, -4.1520e-02,  3.6822e-02, -1.0355e-01, -8.2239e-02,
        -1.4295e-01, -1.3178e-01,  9.4141e-03,  1.7219e-01, -2.8844e-02,
        -5.7889e-05,  2.5924e-02,  1.2674e-01,  7.1839e-02, -2.3760e-01,
         2.7293e-02, -1.6907e-01, -4.9631e-02, -1.5789e-01,  3.0090e-01,
        -1.3494e-01,  2.3973e-02,  1.3801e-01,  1.3494e-01,  1.9310e-01,
         5.4915e-02, -5.1729e-02, -2.1546e-02,  1.0770e-01,  1.2513e-01,
         9.2090e-02, -1.5244e-01, -9.9093e-03,  9.2629e-03,  7.4559e-02,
        -1.6656e-01,  4.4012e-03,  1.2736e-01,  7.8801e-02, -3.9277e-02],
       grad_fn=<EmbeddingBackward0>)

In [241]:
# Associate with movies and make a couple of recommendations
df_movies = pl.read_csv("../data/ml-latest-small/movies.csv")
df_movies.head()

movieId,title,genres
i64,str,str
1,"""Toy Story (1995)""","""Adventure|Animation|Children|C…"
2,"""Jumanji (1995)""","""Adventure|Children|Fantasy"""
3,"""Grumpier Old Men (1995)""","""Comedy|Romance"""
4,"""Waiting to Exhale (1995)""","""Comedy|Drama|Romance"""
5,"""Father of the Bride Part II (1…","""Comedy"""


In [243]:
df_ratings_mov = df_ratings.join(df_movies.drop("genres"), on="movieId")
df_ratings_mov.head()

userId,movieId,rating,user_id,movie_id,title
i64,i64,f64,i64,i64,str
1,1,4.0,0,0,"""Toy Story (1995)"""
1,3,4.0,0,2,"""Grumpier Old Men (1995)"""
1,6,4.0,0,5,"""Heat (1995)"""
1,47,5.0,0,46,"""Seven (a.k.a. Se7en) (1995)"""
1,50,5.0,0,49,"""Usual Suspects, The (1995)"""


In [246]:
df_movies_id = df_ratings_mov.select(["movieId", "movie_id", "title"]).unique()

In [261]:
df_movies_id = df_movies_id.sort("movie_id")

In [276]:
_movie_embds = (
    collab_learner.movie_factors(torch.tensor(df_movies_id["movie_id"].to_numpy()))
    .detach()
    .numpy()
    + collab_learner.movie_bias(torch.tensor(df_movies_id["movie_id"].to_numpy()))
    .detach()
    .numpy()
)

# Standardize
_movie_embds = _movie_embds / _movie_embds.sum(axis=1).reshape(-1, 1)

In [277]:
np.save("../data/movie_embeddings.npy", _movie_embds)

In [255]:
# df_movies_id.write_excel("../data/movies.xlsx")

<xlsxwriter.workbook.Workbook at 0x31b67c7a0>

array([[-0.0187987 ,  0.10761486, -0.29435503, ..., -0.30116737,
         0.17865072,  0.02025343],
       [ 0.01864266, -0.03724597, -0.10102499, ..., -0.09603266,
        -0.09170996, -0.06301638],
       [ 1.4784467 ,  0.4067808 , -1.919999  , ..., -1.3575077 ,
        -0.8399011 ,  0.41863376],
       ...,
       [ 0.18171374,  0.46859145,  0.5245141 , ..., -0.5189195 ,
         0.51350373, -0.5088581 ],
       [-1.2465489 , -0.04525044, -0.15683722, ...,  0.46330243,
        -0.19046976, -1.5545137 ],
       [ 0.52228975, -0.5752992 ,  0.54537344, ...,  0.5703891 ,
        -0.5257256 , -0.5341888 ]], dtype=float32)