# Installation

In [1]:
# Install required packages.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

[K     |████████████████████████████████| 7.9 MB 7.1 MB/s 
[K     |████████████████████████████████| 3.5 MB 7.4 MB/s 
[K     |████████████████████████████████| 482 kB 8.1 MB/s 
[K     |████████████████████████████████| 41 kB 234 kB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


> Trying to install pytorch-geometric-temporal

In [None]:
!pip install torch-geometric-temporal


In [3]:
from torch_geometric_temporal.nn.recurrent import GConvGRU


In [6]:
from torch_geometric_temporal.dataset import METRLADatasetLoader
from torch_geometric_temporal.nn.attention import *

> So Installation of PyG Temporal is working this way

In [None]:
!pip install sentence-transformers -q

[K     |████████████████████████████████| 78 kB 6.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 33.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 44.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 42.9 MB/s 
[K     |████████████████████████████████| 61 kB 542 kB/s 
[K     |████████████████████████████████| 895 kB 48.1 MB/s 
[K     |████████████████████████████████| 596 kB 35.5 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


# DataPart

In [None]:
from torch_geometric.data import download_url, extract_zip

url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
extract_zip(download_url(url, '.'), '.')

movie_path = './ml-latest-small/movies.csv'
rating_path = './ml-latest-small/ratings.csv'

Downloading https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Extracting ./ml-latest-small.zip


In [None]:
import pandas as pd

print(pd.read_csv(movie_path).head())
print(pd.read_csv(rating_path).head())

   movieId  ...                                       genres
0        1  ...  Adventure|Animation|Children|Comedy|Fantasy
1        2  ...                   Adventure|Children|Fantasy
2        3  ...                               Comedy|Romance
3        4  ...                         Comedy|Drama|Romance
4        5  ...                                       Comedy

[5 rows x 3 columns]
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


> Coming up with node level feature representation x of shape **[num_nodes, num_features]**

In [None]:
movies_df=pd.read_csv(movie_path)
ratings_df=pd.read_csv(rating_path)

In [None]:
movies_df.index.unique()

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            9732, 9733, 9734, 9735, 9736, 9737, 9738, 9739, 9740, 9741],
           dtype='int64', length=9742)

In [None]:
ratings_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [None]:
ratings_df.index

RangeIndex(start=0, stop=100836, step=1)

In [None]:
ratings_df.index.unique()

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            100826, 100827, 100828, 100829, 100830, 100831, 100832, 100833,
            100834, 100835],
           dtype='int64', length=100836)

In [None]:
movies_df.values

array([[1, 'Toy Story (1995)',
        'Adventure|Animation|Children|Comedy|Fantasy'],
       [2, 'Jumanji (1995)', 'Adventure|Children|Fantasy'],
       [3, 'Grumpier Old Men (1995)', 'Comedy|Romance'],
       ...,
       [193585, 'Flint (2017)', 'Drama'],
       [193587, 'Bungo Stray Dogs: Dead Apple (2018)',
        'Action|Animation'],
       [193609, 'Andrew Dice Clay: Dice Rules (1991)', 'Comedy']],
      dtype=object)

In [None]:
ratings_df.values

array([[1.00000000e+00, 1.00000000e+00, 4.00000000e+00, 9.64982703e+08],
       [1.00000000e+00, 3.00000000e+00, 4.00000000e+00, 9.64981247e+08],
       [1.00000000e+00, 6.00000000e+00, 4.00000000e+00, 9.64982224e+08],
       ...,
       [6.10000000e+02, 1.68250000e+05, 5.00000000e+00, 1.49427305e+09],
       [6.10000000e+02, 1.68252000e+05, 5.00000000e+00, 1.49384635e+09],
       [6.10000000e+02, 1.70875000e+05, 3.00000000e+00, 1.49384642e+09]])

In [None]:
import torch

def load_node_csv(path, index_col, encoders= None, **kwargs):
  df=pd.read_csv(path, index_col=index_col, **kwargs)
  #df=pd.read_csv(path)
  print(len(df))
  mapping= {index: i for i, index in enumerate(df.index.unique())}

  x= None
  if encoders is not None:
    xs= [encoder(df[col]) for col, encoder in encoders.items()]
    x=torch.cat(xs, dim=-1)

  return x, mapping

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
class SequenceEncoder(object):

  def __init__(self, model_name='all-MiniLM-L6-v2', device= None):
    self.device= device
    self.model= SentenceTransformer(model_name, device=device)

  @torch.no_grad()
  def __call__(self, df):
    x=self.model.encode(df.values, show_progress_bar=True,
                        convert_to_tensor=True, device= self.device)
    
    return x.cpu()

In [None]:
set(g for col in movies_df.genres.values for g in col.split("|"))

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [None]:
class GenresEncoder(object):
  def __init__(self, sep="|"):
    self.sep=sep

  def __call__(self, df):

    genres=set(g for col in df.values for g in col.split(self.sep))
    mapping= {genre: i for i, genre in enumerate(genres)}

    x= torch.zeros(len(df), len(mapping))
    for i, col in enumerate(df.values):
      for genre in col.split(self.sep):
        x[i, mapping[genre]] = 1
    return x

In [None]:
movie_x, movie_mapping=load_node_csv(movie_path,
                                     index_col='movieId',
                                     encoders= {'title': SequenceEncoder(),
                                                "genres": GenresEncoder()})

9742


Batches:   0%|          | 0/305 [00:00<?, ?it/s]

In [None]:
movie_mapping

In [None]:
movie_x, movie_x.shape

(tensor([[-0.0828,  0.0530,  0.0536,  ...,  0.0000,  0.0000,  0.0000],
         [-0.1053,  0.1508, -0.0264,  ...,  0.0000,  0.0000,  0.0000],
         [-0.0988,  0.0176, -0.0527,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [-0.1115,  0.0310, -0.0177,  ...,  0.0000,  0.0000,  1.0000],
         [ 0.0366,  0.0137,  0.0315,  ...,  0.0000,  0.0000,  0.0000],
         [-0.0500, -0.0141, -0.0031,  ...,  0.0000,  0.0000,  0.0000]]),
 torch.Size([9742, 404]))

In [None]:
_, user_mapping = load_node_csv(rating_path, index_col='userId')


100836


> Now making our HetroData

In [None]:
from torch_geometric.data import HeteroData

In [None]:
data=HeteroData()


In [None]:
data['user'].num_nodes=len(user_mapping)
data['movie'].x=movie_x
print(data)

HeteroData(
  [1muser[0m={ num_nodes=610 },
  [1mmovie[0m={ x=[9742, 404] }
)


In [None]:
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [None]:
len(ratings_df),len(movies_df)

(100836, 9742)

In [None]:
def load_edge_csv(path, src_index_col, src_mapping,
                  dst_index_col, dst_mapping, encoders=None, **kwargs):
  df=pd.read_csv(path, **kwargs)

  src=[src_mapping[index] for index in df[src_index_col]]
  dst=[dst_mapping[index] for index in df[dst_index_col]]
  edge_index=torch.tensor([src, dst])

  edge_attr= None
  if encoders is not None:
      edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
      edge_attr = torch.cat(edge_attrs, dim=-1)

  return edge_index, edge_attr

In [None]:
class IdentityEncoder(object):
  def __init__(self, dtype=None):
    self.dtype=dtype

  def __call__(self, df):
    return torch.from_numpy(df.values).view(-1, 1).to(self.dtype)

In [None]:
edge_index, edge_label = load_edge_csv(
    rating_path,
    src_index_col='userId',
    src_mapping=user_mapping,
    dst_index_col='movieId',
    dst_mapping=movie_mapping,
    encoders={'rating': IdentityEncoder(dtype=torch.long)},
)

In [None]:
edge_index.shape

torch.Size([2, 100836])

In [None]:
edge_index.t()

tensor([[   0,    0],
        [   0,    2],
        [   0,    5],
        ...,
        [ 609, 9462],
        [ 609, 9463],
        [ 609, 9503]])

In [None]:
edge_label

tensor([[4],
        [4],
        [4],
        ...,
        [5],
        [5],
        [3]])

In [None]:
data

HeteroData(
  [1muser[0m={ num_nodes=610 },
  [1mmovie[0m={ x=[9742, 404] }
)

In [None]:
data['user', 'rates', 'movie'].edge_index = edge_index
data['user', 'rates', 'movie'].edge_label = edge_label

In [None]:
print(data)

HeteroData(
  [1muser[0m={ num_nodes=610 },
  [1mmovie[0m={ x=[9742, 404] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 100836],
    edge_label=[100836, 1]
  }
)
