In [2]:
!unzip ml-latest.zip

Archive:  ml-latest.zip
   creating: ml-latest/
  inflating: ml-latest/links.csv     
  inflating: ml-latest/tags.csv      
  inflating: ml-latest/genome-tags.csv  
  inflating: ml-latest/ratings.csv   
  inflating: ml-latest/README.txt    
  inflating: ml-latest/genome-scores.csv  
  inflating: ml-latest/movies.csv    


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

from torch import Tensor
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from IPython.display import display, clear_output
import matplotlib.image as mpimg
import pandas as pd
from numpy import prod
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import LabelBinarizer
import time
import glob
import json

from itertools import product
from collections import namedtuple
from collections import OrderedDict

import warnings
"""Optional[...] is a shorthand notation for Union[..., None], 
telling the type checker that either an object of the specific type is required, 
or None is required. ... stands for any valid type hint, 
including complex compound types or a Union[] of more types. 
Whenever you have a keyword argument with default value None, 
you should use Optional."""
from torch.jit.annotations import Optional, Tuple

from torch.hub import load_state_dict_from_url

from torchsummary import summary
from tqdm import tqdm



In [4]:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Device:', torch.cuda.get_device_name(torch.cuda.current_device()))

Device: NVIDIA GeForce RTX 3090


In [8]:
path='./ml-latest/'

In [9]:
ratings = pd.read_csv(path+'ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [10]:
movies = pd.read_csv(path+'movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
g = ratings.groupby('userId')['rating'].count()
topUsers = g.sort_values(ascending=False)[:15]

g=ratings.groupby('movieId')['rating'].count()
topMovies=g.sort_values(ascending=False)[:15]

top_r = ratings.join(topUsers, rsuffix='_r', how='inner', on='userId')
top_r = top_r.join(topMovies, rsuffix='_r', how='inner', on='movieId')

pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc=np.sum)

movieId,1,110,260,296,318,356,480,527,589,593,1196,1198,1210,2571,2959
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
48470,4.5,4.5,4.0,5.0,4.5,4.0,4.0,4.5,5.0,3.5,4.0,3.0,3.5,3.5,3.5
63783,5.0,1.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,4.0,5.0,5.0,5.0
77609,3.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.5,2.5,4.0,3.0,4.0,2.0
94843,1.0,2.5,2.5,3.5,2.5,3.0,1.0,0.5,,3.0,2.5,2.5,1.0,3.0,2.5
111908,5.0,3.5,5.0,5.0,5.0,2.0,3.0,5.0,4.0,2.0,5.0,5.0,3.5,5.0,4.0
117490,4.0,5.0,4.0,4.5,4.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0
123100,4.0,3.5,4.0,4.5,4.5,3.5,4.0,4.5,4.0,4.0,4.0,4.0,4.0,4.0,3.5
134596,3.0,4.0,4.0,4.0,4.5,3.0,4.0,4.0,4.0,4.0,4.0,4.5,4.0,4.5,4.0
141955,2.5,2.5,3.5,3.0,1.0,1.0,2.0,,3.0,3.0,3.5,3.0,3.0,2.5,1.5
158002,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0


In [14]:
!pip install fastai

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting fastai
  Downloading fastai-2.7.12-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.1/233.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting fastdownload<2,>=0.0.5 (from fastai)
  Downloading fastdownload-0.0.7-py3-none-any.whl (12 kB)
Collecting fastcore<1.6,>=1.5.29 (from fastai)
  Downloading fastcore-1.5.29-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting fastprogress>=0.2.4 (from fastai)
  Downloading fastprogress-1.0.3-py3-none-any.whl (12 kB)
Collecting spacy<4 (from fastai)
  Downloading spacy-3.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting spacy-legac

In [None]:
!conda install -c fastchan fastai

In [18]:
from fastai.learner import *
from fastai.column_data import *

ModuleNotFoundError: No module named 'fastai.column_data'

In [17]:
val_idxs = get_cv_idxs(len(ratings))
wd=2e-4
n_factors = 50



NameError: name 'get_cv_idxs' is not defined

In [None]:
cf = CollabFilterDataset.from_csv(path, 'ratings.csv', 'userId', 'movieId', 'rating')
learn = cf.get_learner(n_factors, val_idxs, 64, opt_fn=optim.Adam, tmp_name=tmp_path, models_name=models_path)


In [None]:
learn.fit(1e-2, 2, wds=wd, cycle_len=1, cycle_mult=2)

In [None]:
preds = learn.predict()

In [None]:
y=learn.data.val_y
sns.jointplot(preds, y, kind='hex', stat_func=None);

In [None]:
movie_names = movies.set_index('movieId')['title'].to_dict()
g=ratings.groupby('movieId')['rating'].count()
topMovies=g.sort_values(ascending=False).index.values[:3000]
topMovieIdx = np.array([cf.item2idx[o] for o in topMovies])

In [None]:
m=learn.model; m.cuda()

In [None]:
movie_bias = to_np(m.ib(V(topMovieIdx)))

In [None]:
movie_ratings = [(b[0], movie_names[i]) for i,b in zip(topMovies,movie_bias)]

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
movie_pca = pca.fit(movie_emb.T).components_

In [None]:
idxs = np.random.choice(len(topMovies), 50, replace=False)
X = fac0[idxs]
Y = fac1[idxs]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(topMovies[idxs], X, Y):
    plt.text(x,y,movie_names[i], color=np.random.rand(3)*0.7, fontsize=11)
plt.show()

In [5]:
class MovieEmbeddingModel(nn.Module):
    def __init__(self, n_users, n_movies):
        super().__init__()
        self.movie_embeddings = nn.Embedding(n_movies, n_factors)
        self.fc1 = nn.Linear(n_factors, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.movie_embeddings(x)
        x = torch.mean(x, dim=1) # average embeddings
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x.squeeze()

In [None]:
wd=1e-5
model = MovieEmbeddingModel(n_users, n_movies).cuda()
opt = optim.Adam(model.parameters(), 1e-3, weight_decay=wd)