In [1]:
import torch
print(f"{torch.__version__ = }")

torch.__version__ = '1.13.1+cpu'


In [2]:
import os
os.environ["TORCH"] = torch.__version__

In [4]:
from torch_geometric.data import download_url, extract_zip

url = (
    "https://files.grouplens.org/"
    "datasets/movielens/ml-latest-small.zip"
)
extract_zip(download_url(url, "."), ".")

Using existing file ml-latest-small.zip
Extracting ./ml-latest-small.zip


In [6]:
import numpy as np
import pandas as pd

In [6]:
movies_path = "./ml-latest-small/movies.csv"
movies_df = pd.read_csv(movies_path)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings_path = "./ml-latest-small/ratings.csv"
ratings_df = pd.read_csv(ratings_path)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


**(?0)** I think ratings are from 0 to 5, right? (With 0 being bad and 5 being good)  

In [8]:
ratings_df.loc[ratings_df.loc[:, "rating"] < 2.0, :].head()

Unnamed: 0,userId,movieId,rating,timestamp
205,1,3176,1.0,964983504
261,3,31,0.5,1306463578
262,3,527,0.5,1306464275
263,3,647,0.5,1306463619
264,3,688,0.5,1306464228


Sth similar to one-hot vector feature of `"genres"`

In [9]:
genres_df = movies_df.loc[:, "genres"].str.get_dummies("|")
genres_df.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**(?1)** `(no genre listed)`?  

In [10]:
(movies_df.loc[:, "genres"] == "").sum()

0

In [11]:
(movies_df.loc[:, "genres"] == None).sum()

0

In [12]:
(movies_df.loc[:, "genres"] == "(no genres listed)").sum()

34

**(?2)** Make sure that for those rows whose `"(no genres listed)"` equals `1` have all other columns `0`.  
**(R2)**

In [13]:
genres_df.loc[genres_df.loc[:, "(no genres listed)"] == 1, :].head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
8517,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8684,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8687,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8782,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8836,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**(?3)** I think we can get rid of the `"(no genre listed)"` column.

In [14]:
def no_dunders(obj, return_list=False):
    gen = (s for s in dir(obj) if not s.startswith("_"))
    if return_list:
        return list(gen)
    else:
        return gen

In [15]:
cols_to_rm = ["(no genres listed)"]
if all(s in genres_df.columns for s in cols_to_rm):
    genres_df.drop(columns=cols_to_rm, inplace=True)
genres_df.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


We shall use the **movie genres as feature vector** for movie nodes.

In [16]:
movie_feats = torch.from_numpy(genres_df.values).to(torch.float)
movie_feats.shape

torch.Size([9742, 19])

In [17]:
torch.float, torch.double

(torch.float32, torch.float64)

Because not every movie is being rated, the `"movieId"`'s appearing
in `ratings_df` will not cover every id, neither should it follow any
particular (e.g. increasing) order.

In [18]:
unique_movie_ids = ratings_df["movieId"].unique()
unique_movie_ids

array([     1,      3,      6, ..., 160836, 163937, 163981])

In [21]:
np.array_equal(np.sort(unique_movie_ids), unique_movie_ids)

False

In [50]:
movies_df.loc[:, "movieId"].unique()

array([     1,      2,      3, ..., 193585, 193587, 193609])

In [54]:
len(unique_movie_ids), len(movies_df.loc[:, "movieId"].unique())

(9724, 9742)

In [56]:
S1 = set(unique_movie_ids)
S2 =set(movies_df.loc[:, "movieId"].unique())
S1.issubset(S2)

True

In [22]:
unique_user_ids = ratings_df["userId"].unique()
unique_user_ids

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [23]:
len(unique_user_ids), unique_user_ids.min(), unique_user_ids.max()

(610, 1, 610)

In [24]:
pd.RangeIndex(10)

RangeIndex(start=0, stop=10, step=1)

In [33]:
movie_id_df = pd.DataFrame(data={
    "movieId": unique_movie_ids,
    "0_movieId": pd.RangeIndex(len(unique_movie_ids)),
})
movie_id_df

Unnamed: 0,movieId,0_movieId
0,1,0
1,3,1
2,6,2
3,47,3
4,50,4
...,...,...
9719,160341,9719
9720,160527,9720
9721,160836,9721
9722,163937,9722


**Rmk.** The `pd.RangeIndex` seems to be not particularly special; we
could have used `np.arange` instead.

In [32]:
user_id_df = pd.DataFrame(data={
    "userId": unique_user_ids,
    "0_userId": pd.RangeIndex(len(unique_user_ids)),
})
user_id_df.tail()

Unnamed: 0,userId,0_userId
605,606,605
606,607,606
607,608,607
608,609,608
609,610,609


In [42]:
rating_movie_id_df = pd.merge(
    ratings_df["movieId"], movie_id_df,
    left_on="movieId", right_on="movieId",
    how="left",
)
rating_movie_id_df.tail()

Unnamed: 0,movieId,0_movieId
100831,166534,3120
100832,168248,2035
100833,168250,3121
100834,168252,1392
100835,170875,2873


In [40]:
rating_movie_id_df.shape, movie_id_df.shape

((100836, 2), (9724, 2))

In [44]:
rating_user_id_df = pd.merge(
    ratings_df["userId"], user_id_df,
    left_on="userId", right_on="userId",
    how="left",
)
rating_user_id_df.tail()

Unnamed: 0,userId,0_userId
100831,610,609
100832,610,609
100833,610,609
100834,610,609
100835,610,609


In [45]:
rating_movie_ids = torch.from_numpy(rating_movie_id_df.loc[:, "0_movieId"].values)
rating_user_ids = torch.from_numpy(rating_user_id_df.loc[:, "0_userId"].values)
rating_movie_ids.shape, rating_user_ids.shape

(torch.Size([100836]), torch.Size([100836]))

In [46]:
edge_user_to_movie = torch.stack([
    rating_user_ids,
    rating_movie_ids,
])
edge_user_to_movie

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 3121, 1392, 2873]])

In [47]:
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData