# Loading Dataset

# Based on PyG document

In [1]:
from torch_geometric.data import download_url, extract_zip

url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
extract_zip(download_url(url, '.'), '.')

movie_path = './ml-latest-small/movies.csv'
rating_path = './ml-latest-small/ratings.csv'

Using existing file ml-latest-small.zip
Extracting .\ml-latest-small.zip


In [142]:
import pandas as pd

movie=pd.read_csv(movie_path)
rating=pd.read_csv(rating_path)
movie.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [143]:
mx=-float("inf")
for genre in list(movie.genres):
    gen=genre.split("|")
    if len(gen)> mx:
        mx=len(gen)
print("The maximum number of genres associated with a movie:",mx)

The maximum number of genres associated with a movie: 10


In [144]:
len(movie)

9742

In [145]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [146]:
print("Maximum number of times a user A rates a movie B:", max(rating.groupby(by=["userId","movieId"]).count()['rating']))

Maximum number of times a user A rates a movie B: 1


In [147]:
import torch

def load_node_csv(path, index_col, encoders=None, **kwargs):
    df = pd.read_csv(path, index_col=index_col, **kwargs)
    mapping = {index: i for i, index in enumerate(df.index.unique())}

    x = None
    if encoders is not None:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=-1)

    return x, mapping

## Sentence Transformers

In [148]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

#Our sentences we like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.76214531e-01  1.20601252e-01 -2.93624073e-01 -2.29858026e-01
 -8.22923928e-02  2.37709522e-01  3.39984864e-01 -7.80964196e-01
  1.18127614e-01  1.63373962e-01 -1.37715712e-01  2.40282789e-01
  4.25125599e-01  1.72417849e-01  1.05279692e-01  5.18164098e-01
  6.22218400e-02  3.99285793e-01 -1.81652278e-01 -5.85578680e-01
  4.49722409e-02 -1.72750309e-01 -2.68443495e-01 -1.47386149e-01
 -1.89217970e-01  1.92150578e-01 -3.83842468e-01 -3.96007091e-01
  4.30648863e-01 -3.15320134e-01  3.65949631e-01  6.05158620e-02
  3.57325703e-01  1.59736529e-01 -3.00983816e-01  2.63250291e-01
 -3.94311100e-01  1.84855521e-01 -3.99549276e-01 -2.67889529e-01
 -5.45117497e-01 -3.13403942e-02 -4.30644333e-01  1.33278117e-01
 -1.74793795e-01 -4.35465544e-01 -4.77379113e-01  7.12555572e-02
 -7.37001151e-02  5.69137156e-01 -2.82579720e-01  5.24975285e-02
 -8.20007861e-01  1.98296756e-01  1.69511825e-01  2.71780342e-01
  2.64610

In [149]:
embedding.shape

(384,)

In [150]:
class SequenceEncoder(object):
    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    @torch.no_grad()
    def __call__(self, df):
        x = self.model.encode(df.values, show_progress_bar=True,
                              convert_to_tensor=True, device=self.device)
        return x.cpu()

In [151]:
class GenresEncoder(object):
    def __init__(self, sep='|'):
        self.sep = sep

    def __call__(self, df):
        genres = set(g for col in df.values for g in col.split(self.sep))
        mapping = {genre: i for i, genre in enumerate(genres)}

        x = torch.zeros(len(df), len(mapping))
        for i, col in enumerate(df.values):
            for genre in col.split(self.sep):
                x[i, mapping[genre]] = 1
        return x

In [152]:
movie_x, movie_mapping = load_node_csv(
    movie_path, index_col='movieId', encoders={
        'title': SequenceEncoder(),
        'genres': GenresEncoder()
    })

HBox(children=(FloatProgress(value=0.0, description='Batches', max=305.0, style=ProgressStyle(description_widt…




In [22]:
movie_x.shape

torch.Size([9742, 404])

In [26]:
movie_x[0]

tensor([-8.2835e-02,  5.3030e-02,  5.3577e-02, -2.7935e-02,  1.6134e-02,
         1.2132e-02,  2.4147e-02,  2.0295e-02, -5.5472e-03,  1.3974e-02,
         4.2932e-02,  5.0252e-02,  2.6336e-03,  5.3083e-02,  1.0653e-02,
         2.6884e-02,  1.5536e-02,  4.0110e-02,  1.5220e-02, -9.0487e-02,
         4.0381e-02, -4.1801e-03,  3.8394e-02, -1.7117e-02, -1.0566e-01,
         1.8147e-02,  3.2558e-03, -4.7458e-02, -1.1269e-01, -3.1918e-02,
         1.1425e-02,  2.4668e-02, -1.9351e-02, -6.9325e-02, -1.2954e-02,
        -2.1300e-02, -4.1694e-02, -5.4402e-02,  5.9398e-02, -2.4313e-02,
        -1.3531e-02, -5.9771e-02,  5.5338e-02, -2.7162e-02,  8.4955e-02,
         1.4287e-02, -8.2289e-02, -4.7492e-02,  3.2746e-02,  2.7004e-02,
        -1.0257e-01, -2.9592e-02,  7.4494e-02, -7.6777e-02,  5.6812e-02,
         2.4909e-02,  2.7727e-02,  2.7807e-02,  4.4575e-02, -6.6666e-02,
         5.2070e-02,  1.5745e-03,  2.6283e-02, -5.5703e-03,  8.5972e-02,
        -4.1571e-02, -1.8053e-02, -2.7062e-02, -2.7

In [27]:
_, user_mapping = load_node_csv(rating_path, index_col='userId')

In [30]:
from torch_geometric.data import HeteroData

data = HeteroData()

data['user'].num_nodes = len(user_mapping)  # Users do not have any features.
data['movie'].x = movie_x


In [31]:
print(data)

HeteroData(
  [1muser[0m={ num_nodes=610 },
  [1mmovie[0m={ x=[9742, 404] }
)


In [32]:
def load_edge_csv(path, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):
    df = pd.read_csv(path, **kwargs)

    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    edge_index = torch.tensor([src, dst])

    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr


In [34]:
class IdentityEncoder(object):
    def __init__(self, dtype=None):
        self.dtype = dtype

    def __call__(self, df):
        return torch.from_numpy(df.values).view(-1, 1).to(self.dtype)

In [35]:
edge_index, edge_label = load_edge_csv(
    rating_path,
    src_index_col='userId',
    src_mapping=user_mapping,
    dst_index_col='movieId',
    dst_mapping=movie_mapping,
    encoders={'rating': IdentityEncoder(dtype=torch.long)},
)

data['user', 'rates', 'movie'].edge_index = edge_index
data['user', 'rates', 'movie'].edge_label = edge_label

In [36]:
print(data)

HeteroData(
  [1muser[0m={ num_nodes=610 },
  [1mmovie[0m={ x=[9742, 404] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 100836],
    edge_label=[100836, 1]
  }
)


In [43]:
data['user','rates','movie'].edge_label

tensor([[4],
        [4],
        [4],
        ...,
        [5],
        [5],
        [3]])

# Movie Lens : Link Prediction

In [48]:
import os.path as osp


In [3]:
import torch
import torch.nn.functional as F
from torch.nn import Linear

import torch_geometric.transforms as T
from torch_geometric.datasets import MovieLens
from torch_geometric.nn import SAGEConv, to_hetero

In [46]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [50]:
dataset = MovieLens("/data/MovieLens", model_name='all-MiniLM-L6-v2')
data = dataset[0].to(device)


Downloading https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Extracting \data\MovieLens\raw\ml-latest-small.zip
Processing...


HBox(children=(FloatProgress(value=0.0, description='Batches', max=305.0, style=ProgressStyle(description_widt…




Done!


### Add user node features for message passing:

In [51]:
data['user'].x = torch.eye(data['user'].num_nodes, device=device)
del data['user'].num_nodes

In [53]:
data = T.ToUndirected()(data)
del data['movie', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

In [54]:
# Perform a link-level split into training, validation, and test edges:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)(data)

In [55]:
train_data

HeteroData(
  [1mmovie[0m={ x=[9742, 404] },
  [1muser[0m={ x=[610, 610] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 80670],
    edge_label=[80670],
    edge_label_index=[2, 80670]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 80670] }
)

In [56]:
weight = torch.bincount(train_data['user', 'movie'].edge_label)
weight = weight.max() / weight

In [57]:
def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [58]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

In [59]:
class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

In [60]:
class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)


In [61]:
model = Model(hidden_channels=32).to(device)


In [62]:
# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

In [63]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [72]:
train_data

HeteroData(
  [1mmovie[0m={ x=[9742, 404] },
  [1muser[0m={ x=[610, 610] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 80670],
    edge_label=[80670],
    edge_label_index=[2, 80670]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 80670] }
)

In [73]:
 train_data['user','rates', 'movie']

{'edge_index': tensor([[ 560,  560,  433,  ...,  406,   79,  602],
        [1055, 7620, 2195,  ..., 2226, 6414, 2233]]), 'edge_label': tensor([4, 3, 3,  ..., 5, 4, 2]), 'edge_label_index': tensor([[ 560,  560,  433,  ...,  406,   79,  602],
        [1055, 7620, 2195,  ..., 2226, 6414, 2233]])}

In [75]:
def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'movie'].edge_label_index)
    target = train_data['user', 'movie'].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)


In [76]:
@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'movie'].edge_label_index)
    #pred = pred.clamp(min=0, max=5)
    target = data['user', 'movie'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [78]:
for epoch in range(1, 301):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 17.7010, Train: 3.0638, Val: 3.0579, Test: 3.0648
Epoch: 002, Loss: 14.6256, Train: 2.4824, Val: 2.4788, Test: 2.4866
Epoch: 003, Loss: 10.1050, Train: 1.4062, Val: 1.4055, Test: 1.4152
Epoch: 004, Loss: 6.0586, Train: 1.5468, Val: 1.5308, Test: 1.5307
Epoch: 005, Loss: 13.8226, Train: 1.0817, Val: 1.0765, Test: 1.0843
Epoch: 006, Loss: 7.4966, Train: 1.4881, Val: 1.4875, Test: 1.4978
Epoch: 007, Loss: 6.0237, Train: 1.9552, Val: 1.9532, Test: 1.9627
Epoch: 008, Loss: 7.2789, Train: 2.1923, Val: 2.1893, Test: 2.1983
Epoch: 009, Loss: 8.3667, Train: 2.2599, Val: 2.2565, Test: 2.2656
Epoch: 010, Loss: 8.7197, Train: 2.2127, Val: 2.2096, Test: 2.2189
Epoch: 011, Loss: 8.4523, Train: 2.0748, Val: 2.0724, Test: 2.0823
Epoch: 012, Loss: 7.7458, Train: 1.8549, Val: 1.8536, Test: 1.8644
Epoch: 013, Loss: 6.8074, Train: 1.5669, Val: 1.5670, Test: 1.5788
Epoch: 014, Loss: 5.9528, Train: 1.2669, Val: 1.2679, Test: 1.2805
Epoch: 015, Loss: 5.6361, Train: 1.0768, Val: 1.0770, Test

Epoch: 124, Loss: 2.9037, Train: 1.0978, Val: 1.1358, Test: 1.1545
Epoch: 125, Loss: 2.8992, Train: 1.0966, Val: 1.1351, Test: 1.1538
Epoch: 126, Loss: 2.8964, Train: 1.1018, Val: 1.1394, Test: 1.1585
Epoch: 127, Loss: 2.8936, Train: 1.1007, Val: 1.1388, Test: 1.1579
Epoch: 128, Loss: 2.8898, Train: 1.0969, Val: 1.1359, Test: 1.1549
Epoch: 129, Loss: 2.8879, Train: 1.0994, Val: 1.1378, Test: 1.1570
Epoch: 130, Loss: 2.8843, Train: 1.0982, Val: 1.1368, Test: 1.1561
Epoch: 131, Loss: 2.8812, Train: 1.0954, Val: 1.1348, Test: 1.1541
Epoch: 132, Loss: 2.8791, Train: 1.0991, Val: 1.1378, Test: 1.1573
Epoch: 133, Loss: 2.8755, Train: 1.0988, Val: 1.1376, Test: 1.1572
Epoch: 134, Loss: 2.8727, Train: 1.0946, Val: 1.1343, Test: 1.1537
Epoch: 135, Loss: 2.8703, Train: 1.0970, Val: 1.1363, Test: 1.1557
Epoch: 136, Loss: 2.8673, Train: 1.0965, Val: 1.1362, Test: 1.1556
Epoch: 137, Loss: 2.8643, Train: 1.0959, Val: 1.1359, Test: 1.1553
Epoch: 138, Loss: 2.8620, Train: 1.0977, Val: 1.1375, Test: 1.

Epoch: 247, Loss: 2.6227, Train: 1.0632, Val: 1.1321, Test: 1.1478
Epoch: 248, Loss: 2.6164, Train: 1.0722, Val: 1.1393, Test: 1.1551
Epoch: 249, Loss: 2.6161, Train: 1.0675, Val: 1.1361, Test: 1.1517
Epoch: 250, Loss: 2.6106, Train: 1.0663, Val: 1.1357, Test: 1.1521
Epoch: 251, Loss: 2.6117, Train: 1.0806, Val: 1.1477, Test: 1.1645
Epoch: 252, Loss: 2.6076, Train: 1.0497, Val: 1.1229, Test: 1.1365
Epoch: 253, Loss: 2.6131, Train: 1.0988, Val: 1.1635, Test: 1.1812
Epoch: 254, Loss: 2.6169, Train: 1.0398, Val: 1.1163, Test: 1.1294
Epoch: 255, Loss: 2.6264, Train: 1.1162, Val: 1.1786, Test: 1.1971
Epoch: 256, Loss: 2.6350, Train: 1.0335, Val: 1.1119, Test: 1.1230
Epoch: 257, Loss: 2.6292, Train: 1.0979, Val: 1.1637, Test: 1.1813
Epoch: 258, Loss: 2.6085, Train: 1.0568, Val: 1.1303, Test: 1.1453
Epoch: 259, Loss: 2.5915, Train: 1.0562, Val: 1.1300, Test: 1.1446
Epoch: 260, Loss: 2.5862, Train: 1.0905, Val: 1.1581, Test: 1.1750
Epoch: 261, Loss: 2.5966, Train: 1.0400, Val: 1.1179, Test: 1.

In [79]:
@torch.no_grad()
def test_Predic(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'movie'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return target

In [82]:
Prediction = test_Predic(test_data)

In [85]:
Prediction==0

tensor([False, False, False,  ..., False, False, False])

In [95]:
real=test_data["user", "rates", "movie"]['edge_label']

In [97]:
len(real[real==0])

144

In [99]:
from sklearn.metrics import accuracy_score

In [102]:
Prediction = test_Predic(val_data)

In [103]:
accuracy_score(val_data["user", "rates", "movie"]['edge_label'], Prediction)

1.0

In [106]:
val_data["user", "rates", "movie"]['edge_label_index']

tensor([[   5,  104,  596,  ...,  493,   90,  466],
        [ 320, 5715,  686,  ...,  793, 2441, 1586]])

In [111]:
import numpy as np

In [115]:
org_rating=np.array(list(rating["rating"]))

In [118]:
len(org_rating)

100836

In [119]:
org_rating[org_rating==0.0]

array([], dtype=float64)

In [124]:
real=data["user", "rates", "movie"]['edge_label']

In [131]:
len(real[real==0])

1370

In [129]:
rating.groupby("rating").count()

Unnamed: 0_level_0,userId,movieId,timestamp
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.5,1370,1370,1370
1.0,2811,2811,2811
1.5,1791,1791,1791
2.0,7551,7551,7551
2.5,5550,5550,5550
3.0,20047,20047,20047
3.5,13136,13136,13136
4.0,26818,26818,26818
4.5,8551,8551,8551
5.0,13211,13211,13211


In [138]:
len(rating)

100836

In [140]:
len(movie)

9742

In [141]:
data

HeteroData(
  [1mmovie[0m={ x=[9742, 404] },
  [1muser[0m={ x=[610, 610] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 100836],
    edge_label=[100836]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 100836] }
)

In [154]:
100836/(9742*610)

0.016968273253211548