<a href="https://colab.research.google.com/github/mojtabaSefidi/Machine-Learning-with-Graphs/blob/main/MLG_Final_Project_100k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !unzip ml-100k.zip
# !pip uninstall jupyter
# !pip install jupyter

## Install Essential Packages

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pip install -q torch_geometric
!pip install -q torch-sparse==0.6.13
!pip install -q torch_scatter
# !pip install -q ogb

## Install Essential Libraries

In [3]:

import torch
import networkx as nx
import torch_geometric
from torch_geometric.utils import to_networkx
from torch_geometric.datasets import TUDataset
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.loader import DataLoader
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GraphConv, SAGEConv, global_add_pool
from torch_geometric.data import HeteroData
from torch_geometric import transforms
from torch_geometric.loader import LinkNeighborLoader

In [4]:
from numpy.random import seed
from tensorflow import keras
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense
from keras.models import Model

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
import pandas as pd
import torch

In [6]:
def plot_history(list_train_loss, list_train_acc, list_val_loss, list_val_acc, n_epochs, title):
    
    plt.figure(figsize=(18,8),linewidth = 7, edgecolor="whitesmoke")    
    n = n_epochs
    
    plt.plot(list(range(1, n_epochs+1)), list_train_acc, color='orange',marker=".")
    plt.plot(list(range(1, n_epochs+1)), list_train_loss,'b',marker=".")
    
    plt.plot(list(range(1, n_epochs+1)), list_val_acc,'r')  
    plt.plot(list(range(1, n_epochs+1)), list_val_loss,'g')
    
    plt.legend(['Train Accuracy','Train Loss','Test Accuracy','Test Loss'])
    plt.grid(True)
    
    # plt.gca().set_ylim(0,1)

    plt.xlabel("Number of Epochs")
    plt.ylabel("Value")
    plt.suptitle(title, size=16, y=0.927)
    plt.show()

## Read the Dataset

In [7]:
# from ogb.graphproppred import PygGraphPropPredDataset
# from torch_geometric.loader import DataLoader

# dataset = PygGraphPropPredDataset(name = 'ogbg-molhiv') 

# split_idx = dataset.get_idx_split() 
# train_data_loader = DataLoader(dataset[split_idx["train"]], batch_size=32, shuffle=True)
# valiation_data_loader = DataLoader(dataset[split_idx["valid"]], batch_size=32, shuffle=True)
# test_data_loader = DataLoader(dataset[split_idx["test"]], batch_size=32, shuffle=True)

In [8]:
# !unzip /content/MovieLens1M.zip

In [9]:
# movies_df = pd.read_csv('/content/ml-100k/u.item', sep='|', engine='python', encoding="latin-1",
#                         names = ['MovieID', 'Movie Title', 'Release Date', 'video release date','IMDbURL',
#                                  'Unknown','Action','Adventure','Animation', 'Childrens','Comedy',
#                                  'Crime','Documentary','Drama','Fantasy', 'Film-Noir','Horror',
#                                  'Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western'])

# rating_df = pd.read_csv('/content/ml-100k/u.data', sep='\t', engine='python', encoding="latin-1",
#                         names=['UserID','MovieID','Rating','Timestamp'])

# users_df = pd.read_csv('/content/ml-100k/u.user', sep='|', engine='python', encoding="latin-1",
#                        names=['UserID','Gender','Age','Occupation','Zipcode'])

# movies_df.to_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/movies_df.csv', index=False)
# rating_df.to_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/rating_df.csv', index=False)
# users_df.to_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/users_df.csv', index=False)

In [10]:
geners = np.array(['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama',
                   'Fantasy', 'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western'], dtype=str)

values = list(range(0, len(geners)))
geners2vec = dict(zip(geners, values))

def extract_geners(text, sep='|'):
  return text.split(sep)

def geners2vector(df_geners, maper):
  result = np.zeros((len(df_geners),len(maper)), dtype='int8')
  for index, text in enumerate(df_geners):
    geners = extract_geners(text)
    for gener in geners:
      result[index][maper.get(gener)] = 1
  
  return result

In [11]:
def year_extractor(text):
  return text[text.rfind('(')+1:text.rfind(')')]

def calculate_average_rating(rating_df, movies_df, Movie_id_col='MovieID', rating_col='Rating'):
  
  rating_avg = rating_df.groupby(Movie_id_col).mean()[rating_col].round(4).to_dict()
  result = [] 
  for id in movies_df[Movie_id_col]:
    result.append(rating_avg.get(id, 0))
  movies_df['Average Rating'] = result
  return movies_df

Occupation_mapper = { 0: "other", 1: "academic/educator", 2: "artist", 3: "clerical/admin",
                     4: "college/grad student",5: "customer service", 6: "doctor/health care",
                     7: "executive/managerial", 8: "farmer", 9: "homemaker", 10: "K-12 student",
                     11: "lawyer", 12: "programmer", 13: "retired", 14: "sales/marketing",
                     15: "scientist", 16: "self-employed", 17: "technician/engineer",
                     18: "tradesman/craftsman", 19: "unemployed", 20: "writer"}

def code2Occupation(occupation_col, mapper):
  return occupation_col.map(mapper)


def extract_user_feature(users_df):
  scaler = StandardScaler()
  age = scaler.fit_transform(users_df[['Age']])
  
  encoder = OneHotEncoder(handle_unknown='ignore')
  occupation = encoder.fit_transform(users_df[['Occupation']]).toarray()
  features = np.hstack((users_df[['Gender']], age, occupation))
  return torch.from_numpy(features).to(torch.float)

def extract_movie_feature(movies_df, mapper):
  scaler = StandardScaler()
  numerical = scaler.fit_transform(movies_df[['year',	'averge_rating']])
  
  categorical = geners2vector(movies_df['Genres'], mapper)
  features = np.hstack((numerical, categorical))
  return torch.from_numpy(features).to(torch.float)

def Timestamp2Date(timestamp):
  return datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

def add_age_group(df, age_col='Age'):
  bins= [0,20,25,39,60,110]
  labels = ['Teenage','Young Adult','Adult', 'Older Adult','Old']
  df['AgeGroup'] = pd.cut(df[age_col], bins=bins, labels=labels, right=False)
  return df

def remove_movies(rating_df, movies_df, title_col='Movie Title', movie_id_col='MovieID'):
  unknown_movies = movies_df[movies_df[title_col]=='unknown'][movie_id_col]
  rating_df = rating_df[~rating_df[movie_id_col].isin(unknown_movies)]
  movies_df = movies_df[~movies_df[movie_id_col].isin(unknown_movies)]
  return rating_df, movies_df

In [12]:
movies_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/movies_df.csv')
ratings_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/rating_df.csv')
users_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/users_df.csv')
movies_df.shape, ratings_df.shape, users_df.shape


((1682, 24), (100000, 4), (943, 5))

In [13]:
ratings_df, movies_df = remove_movies(ratings_df, movies_df)

In [14]:
ratings_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [15]:
movies_df.drop(['video release date', 'IMDbURL'], axis=1, inplace=True)
movies_df['Release Date'] = pd.to_datetime(movies_df['Release Date'])
movies_df['Release Year'] = movies_df['Release Date'].dt.year
movies_df['Release Month'] = movies_df['Release Date'].dt.month
movies_df['Release Day'] = movies_df['Release Date'].dt.strftime('%j').apply(int)
movies_df = movies_df.sort_values(by=['Release Date']).reset_index(drop=True)
movies_df = calculate_average_rating(ratings_df, movies_df)
movies_df.tail(2).T

Unnamed: 0,1679,1680
MovieID,1432,315
Movie Title,"Mighty, The (1998)",Apt Pupil (1998)
Release Date,1998-10-09 00:00:00,1998-10-23 00:00:00
Unknown,0,0
Action,0,0
Adventure,0,0
Animation,0,0
Childrens,0,0
Comedy,0,0
Crime,0,0


In [16]:
# !pip install --upgrade geopy
# from geopy.geocoders import Nominatim
# def zipcode2city(zipcode):
#   geolocator = Nominatim(timeout=10, user_agent = "dlab.berkeley.edu-workshop")
#   try:
#     return geolocator.geocode({"postalcode": zipcode})[0].split(',')[0]
#   except:
#     'not found'

# users_df['Zipcode'].apply(zipcode2city)

In [17]:
users_df = add_age_group(users_df)
users_df

Unnamed: 0,UserID,Age,Gender,Occupation,Zipcode,AgeGroup
0,1,24,M,technician,85711,Young Adult
1,2,53,F,other,94043,Older Adult
2,3,23,M,writer,32067,Young Adult
3,4,24,M,technician,43537,Young Adult
4,5,33,F,other,15213,Adult
...,...,...,...,...,...,...
938,939,26,F,student,33319,Adult
939,940,32,M,administrator,02215,Adult
940,941,20,M,student,97229,Young Adult
941,942,48,F,librarian,78209,Older Adult


In [18]:
ratings_df.loc[:,'Timestamp'] = ratings_df.loc[:,'Timestamp'].apply(Timestamp2Date)
ratings_df.sort_values(by='Timestamp', inplace=True)
ratings_df.reset_index(drop=True, inplace=True)

ratings_df.loc[:,'Timestamp'] = pd.to_datetime(ratings_df.loc[:,'Timestamp'])
ratings_df.loc[:,'Year'] = ratings_df.loc[:,'Timestamp'].dt.year
ratings_df.loc[:,'Month'] = ratings_df.loc[:,'Timestamp'].dt.month
ratings_df.loc[:,'Weekday'] = ratings_df.loc[:,'Timestamp'].dt.weekday
ratings_df.loc[:,'Hour'] = ratings_df.loc[:,'Timestamp'].dt.hour
ratings_df.loc[:,'DayofYear'] = ratings_df.loc[:,'Timestamp'].dt.strftime('%j')

ratings_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Year,Month,Weekday,Hour,DayofYear
0,259,255,4,1997-09-20 03:05:10,1997,9,5,3,263
1,259,286,4,1997-09-20 03:05:27,1997,9,5,3,263
2,259,298,4,1997-09-20 03:05:54,1997,9,5,3,263
3,259,185,4,1997-09-20 03:06:21,1997,9,5,3,263
4,259,173,4,1997-09-20 03:07:23,1997,9,5,3,263
...,...,...,...,...,...,...,...,...,...
99986,729,689,4,1998-04-22 23:10:38,1998,4,2,23,112
99987,729,300,4,1998-04-22 23:10:38,1998,4,2,23,112
99988,729,748,4,1998-04-22 23:10:38,1998,4,2,23,112
99989,729,313,3,1998-04-22 23:10:38,1998,4,2,23,112


In [19]:
def generate_graph_edges(movies_df, ratings_df, users_df, rate_threshold=4):
  unique_user_id = ratings_df['UserID'].unique()
  unique_user_id = pd.DataFrame(data={
      'UserID': unique_user_id,
      'mappedID': pd.RangeIndex(len(unique_user_id))
      })
  userid_mapper = dict(zip(unique_user_id.iloc[:,0], unique_user_id.iloc[:,-1]))
  
  print("1. Mapping UserID to consecutive values... ")

  unique_movie_id = ratings_df['MovieID'].unique()
  unique_movie_id = pd.DataFrame(data={
      'MovieID': unique_movie_id,
      'mappedID': pd.RangeIndex(len(unique_movie_id))
      })
  movieid_mapper = dict(zip(unique_movie_id.iloc[:,0], unique_movie_id.iloc[:,-1]))
  print("2. Mapping MovieID to consecutive values... ")

  users_df['UserID'] = users_df['UserID'].map(userid_mapper)
  users_df = users_df.sort_values(by='UserID').reset_index(drop=True)
  movies_df['MovieID'] =  movies_df['MovieID'].map(movieid_mapper)
  movies_df = movies_df.sort_values(by='MovieID').reset_index(drop=True)
  ratings_df['UserID'] = ratings_df['UserID'].map(userid_mapper)
  ratings_df['MovieID'] =  ratings_df['MovieID'].map(movieid_mapper)
  ratings_df = ratings_df.sort_values(by=['UserID','MovieID']).reset_index(drop=True)

  # ratings_user_id = pd.merge(ratings_df[ratings_df['Rating']>=rate_threshold]['UserID'], unique_user_id,
  #                            left_on='UserID', right_on='UserID', how='left')
  # ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
  # ratings_movie_id = pd.merge(ratings_df[ratings_df['Rating']>=rate_threshold]['MovieID'], unique_movie_id,
  #                             left_on='MovieID', right_on='MovieID', how='left')
  # ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)
  # user2movie_edge = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
  # print("3. Extract Graph Edges... ")
  # print("=================================================")
  
  return movies_df, ratings_df, users_df

In [20]:
train = pd.read_csv('/content/u1.base', sep='\t', engine='python', encoding="latin-1",
                        names=['UserID','MovieID','Rating','Timestamp'])
train = train.iloc[:50000,:]

In [21]:
test = pd.read_csv('/content/u1.test', sep='\t', engine='python', encoding="latin-1",
                        names=['UserID','MovieID','Rating','Timestamp'])

# test.loc[:,'Timestamp'] = test.loc[:,'Timestamp'].apply(Timestamp2Date)
# test.sort_values(by='Timestamp', inplace=True)
# test.reset_index(drop=True, inplace=True)

# test.loc[:,'Timestamp'] = pd.to_datetime(test.loc[:,'Timestamp'])
# test.loc[:,'Year'] = test.loc[:,'Timestamp'].dt.year
# test.loc[:,'Month'] = test.loc[:,'Timestamp'].dt.month
# test.loc[:,'Weekday'] = test.loc[:,'Timestamp'].dt.weekday
# test.loc[:,'Hour'] = test.loc[:,'Timestamp'].dt.hour
# test.loc[:,'DayofYear'] = test.loc[:,'Timestamp'].dt.strftime('%j')

test

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198
...,...,...,...,...
19995,458,648,4,886395899
19996,458,1101,4,886397931
19997,459,934,3,879563639
19998,460,10,3,882912371


In [22]:
# movies_df, test, users_df, user2movie_edge = generate_graph_edges(movies_df, test, users_df)
# movies_df, test, users_df = generate_graph_edges(movies_df, test, users_df)
# movies_df = movies_df.sort_values(by='MovieID').reset_index(drop=True)
# users_df = users_df.sort_values(by='UserID').reset_index(drop=True)
# test.reset_index(drop=True, inplace=True)

In [23]:
users_df = users_df.sort_values(by='UserID').reset_index(drop=True)
movies_df = movies_df.sort_values(by='MovieID').reset_index(drop=True)
test = test.sort_values(by=['UserID','MovieID']).reset_index(drop=True)
train = train.sort_values(by=['UserID','MovieID']).reset_index(drop=True)


In [24]:
def extract_user_feature(users_df):
  scaler = StandardScaler()
  # age = scaler.fit_transform(users_df[['Age']])
  
  encoder = OneHotEncoder(handle_unknown='ignore')
  categorical_df = encoder.fit_transform(users_df[['Occupation', 'Gender']]).toarray()
  features = np.hstack((categorical_df, users_df[['UserID', 'Age']]))
  return features

def extract_movie_feature(movies_df, exclude_cols):
  movies_df = movies_df.drop(exclude_cols, axis=1)
  scaler = StandardScaler()
  # movies_df[['Release Year', 'Release Month', 'Average Rating', 'Release Day']] = scaler.fit_transform(
  #     movies_df[['Release Year', 'Release Month', 'Average Rating', 'Release Day']])
  features = movies_df.to_numpy()
  return features

In [25]:
user_features = extract_user_feature(users_df)
movie_features = extract_movie_feature(movies_df, ['Movie Title', 'Release Date'])
movie_features.shape, user_features.shape

((1681, 24), (943, 25))

In [26]:
def build_AE(encoding_dim = 8, input_shape= 24):

  input_dim = Input(shape = (input_shape, ))

  # Encoder Layers
  encoded1 = Dense(16, activation = 'relu')(input_dim)
  encoded2 = Dense(encoding_dim, activation = 'relu')(encoded1)

  # Decoder Layers
  decoded1 = Dense(16, activation = 'relu')(encoded2)
  decoded2 = Dense(input_shape, activation = 'relu')(decoded1)

  # Combine Encoder and Deocder layers
  autoencoder = Model(inputs = input_dim, outputs = decoded2)
  encoder = Model(inputs = input_dim, outputs = encoded2)
  # Compile the Model
  autoencoder.compile(optimizer = keras.optimizers.Adadelta(learning_rate=0.001), loss = 'binary_crossentropy')
  print(autoencoder.summary())
  return autoencoder, encoder

In [27]:
scaler = StandardScaler()
movie_features = scaler.fit_transform(movie_features)
user_features = scaler.fit_transform(user_features)

In [28]:
movie_autoencoder, movie_encoder = build_AE(encoding_dim = 8, input_shape= 24)
print('---------------------------------------Trainig of Movie AE Sarts:')
movie_autoencoder.fit(movie_features, movie_features, epochs = 150, batch_size = 16, shuffle = True)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 24)]              0         
                                                                 
 dense (Dense)               (None, 16)                400       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 16)                144       
                                                                 
 dense_3 (Dense)             (None, 24)                408       
                                                                 
Total params: 1,088
Trainable params: 1,088
Non-trainable params: 0
_________________________________________________________________
None
---------------------------------------Trainig of Movi

<keras.callbacks.History at 0x7f2c8d3674f0>

In [29]:
user_autoencoder, user_encoder = build_AE(encoding_dim = 8, input_shape= 25)
print('---------------------------------------Trainig of User AE Sarts:')
user_autoencoder.fit(user_features, user_features, epochs = 150, batch_size = 8, shuffle = True)

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 25)]              0         
                                                                 
 dense_4 (Dense)             (None, 16)                416       
                                                                 
 dense_5 (Dense)             (None, 8)                 136       
                                                                 
 dense_6 (Dense)             (None, 16)                144       
                                                                 
 dense_7 (Dense)             (None, 25)                425       
                                                                 
Total params: 1,121
Trainable params: 1,121
Non-trainable params: 0
_________________________________________________________________
None
---------------------------------------Trainig of Us

<keras.callbacks.History at 0x7f2c8722d0d0>

In [30]:
movie_features_reduced = movie_encoder.predict(movie_features)
user_features_reduced = user_encoder.predict(user_features)



In [31]:
def generate_edge(rating_df, rating_threshold=4):

  graph_edges = [[],[]]
  edge_weight = []

  for userID, movieID, rating in rating_df[['UserID','MovieID','Rating']].itertuples(index=False):
    if rating >= rating_threshold:
      graph_edges[0].append(userID)
      graph_edges[1].append(movieID)
      edge_weight.append(rating)
    
    else:
      continue
  
  return torch.tensor(graph_edges, dtype=torch.long), torch.tensor(edge_weight, dtype=torch.float)

In [32]:
graph_edges_train, label_train = generate_edge(train, rating_threshold=0)
graph_edges_test, label_test = generate_edge(test, rating_threshold=0)


In [33]:
# user2movie_edge, user2movie_edge_weight = generate_edge(rating_df)
# user2movie_edge, user2movie_edge.shape

In [34]:
# torch.from_numpy(rating_df['UserID'].unique()).to(torch.int16),torch.from_numpy(rating_df['MovieID'].unique()).to(torch.int16)
# dataset["user"].node_id.dtype

In [35]:
users_df['UserID'].nunique() == len(users_df)
movies_df['MovieID'].nunique() == len(movies_df)
torch.tensor(user_features_reduced, dtype=torch.float)


tensor([[0.0000, 0.7366, 0.5950,  ..., 0.4377, 0.0000, 0.0000],
        [0.0000, 0.2081, 0.0000,  ..., 0.0000, 0.3419, 0.0000],
        [0.0000, 0.0575, 0.3367,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.1117, 0.2382,  ..., 0.6949, 0.0000, 0.9369],
        [0.0000, 0.0000, 0.5908,  ..., 0.5197, 0.3159, 0.0000],
        [0.0000, 0.1049, 0.2449,  ..., 0.6719, 0.0000, 0.9311]])

In [36]:
train_dataset = HeteroData()

train_dataset["user"].node_id = torch.arange(users_df['UserID'].nunique())
train_dataset["movie"].node_id = torch.arange(movies_df['MovieID'].nunique())
train_dataset["movie"].x = torch.tensor(movie_features_reduced, dtype=torch.float)
train_dataset["user"].x = torch.tensor(user_features_reduced, dtype=torch.float)
train_dataset["user", "rates", "movie"].edge_index = graph_edges_train
train_dataset["user", "rates", "movie"].edge_label_index = graph_edges_train
train_dataset["user", "rates", "movie"].edge_label = label_train

train_dataset = transforms.ToUndirected()(train_dataset)
train_dataset = transforms.NormalizeFeatures()(train_dataset)
train_dataset

HeteroData(
  [1muser[0m={
    node_id=[943],
    x=[943, 8]
  },
  [1mmovie[0m={
    node_id=[1681],
    x=[1681, 8]
  },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 50000],
    edge_label_index=[2, 50000],
    edge_label=[50000]
  },
  [1m(movie, rev_rates, user)[0m={
    edge_index=[2, 50000],
    edge_label=[50000]
  }
)

In [37]:
test_dataset = HeteroData()

test_dataset["user"].node_id = torch.arange(users_df['UserID'].nunique())
test_dataset["movie"].node_id = torch.arange(movies_df['MovieID'].nunique())
test_dataset["movie"].x = torch.tensor(movie_features_reduced, dtype=torch.float)
test_dataset["user"].x = torch.tensor(user_features_reduced, dtype=torch.float)
test_dataset["user", "rates", "movie"].edge_index = graph_edges_test
test_dataset["user", "rates", "movie"].edge_label_index = graph_edges_test
test_dataset["user", "rates", "movie"].edge_label = label_test

test_dataset = transforms.ToUndirected()(test_dataset)
test_dataset = transforms.NormalizeFeatures()(test_dataset)
test_dataset

HeteroData(
  [1muser[0m={
    node_id=[943],
    x=[943, 8]
  },
  [1mmovie[0m={
    node_id=[1681],
    x=[1681, 8]
  },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 20000],
    edge_label_index=[2, 20000],
    edge_label=[20000]
  },
  [1m(movie, rev_rates, user)[0m={
    edge_index=[2, 20000],
    edge_label=[20000]
  }
)

In [38]:
# dataset["movie"].x.dtype , dataset["user", "rates", "movie"].edge_index.dtype, dataset["user"].node_id.dtype, dataset["movie"].node_id.dtype

In [58]:
# data = HeteroData()
# # Save node indices:
# data["user"].node_id = torch.arange(len(unique_user_id))
# data["movie"].node_id = torch.arange(len(movies_df))
# # Add the node features and edge indices:
# data["movie"].x = movie_feat
# data["user", "rates", "movie"].edge_index = edge_index_user_to_movie
# # We also need to make sure to add the reverse edges from movies to users
# # in order to let a GNN be able to pass messages in both directions.
# # We can leverage the `T.ToUndirected()` transform for this from PyG:
# data = T.ToUndirected()(data)

In [59]:
# transform = transforms.RandomLinkSplit(
#     num_val=0.1,
#     num_test=0.2,
#     disjoint_train_ratio=0.3,
#     neg_sampling_ratio=2.0,
#     add_negative_train_samples=False,
#     edge_types=("user", "rates", "movie"),
#     rev_edge_types=("movie", "rev_rates", "user"), 
# )
# train_dataset, val_dataset, test_dataset = transform(train_dataset)

In [60]:
# train_dataset, train_data

In [61]:
#  validation_dataset, val_data

In [62]:
# test_dataset, test_data

In [39]:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = train_dataset["user", "rates", "movie"].edge_label_index
edge_label = train_dataset["user", "rates", "movie"].edge_label
train_loader = LinkNeighborLoader(
    data=train_dataset,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=64,
    shuffle=True,
)
train_loader

LinkNeighborLoader()

In [40]:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = test_dataset["user", "rates", "movie"].edge_label_index
edge_label = test_dataset["user", "rates", "movie"].edge_label
test_loader = LinkNeighborLoader(
    data=test_dataset,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=64,
    shuffle=True,
)
test_loader

LinkNeighborLoader()

In [41]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x
# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:

class Classifier(torch.nn.Module):
    def forward(self, x_user, x_movie, edge_label_index):
        # Convert node embeddings to edge-level representations:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]
        # Apply dot-product to get a prediction per supervision edge:
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)

class Model(torch.nn.Module):
    
    def __init__(self, hidden_channels, user_features_dim, movie_features_dim):
        super().__init__()
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies:
        self.user_lin = torch.nn.Linear(user_features_dim, hidden_channels)
        self.movie_lin = torch.nn.Linear(movie_features_dim, hidden_channels)
        self.user_emb = torch.nn.Embedding(train_dataset["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(train_dataset["movie"].num_nodes, hidden_channels)
        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)
        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=train_dataset.metadata())
        self.classifier = Classifier()
    
    def forward(self, data):
        # print(data["movie"].node_id)
        x_dict = {
          # "user": self.user_lin(data["user"].x) + self.user_emb(data["user"].node_id),
          "user": self.user_lin(data["user"].x) + self.user_emb(data["user"].node_id),
          "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        } 
        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )
        return pred
        
model = Model(hidden_channels=8, user_features_dim=8, movie_features_dim=8)
model

Model(
  (user_lin): Linear(in_features=8, out_features=8, bias=True)
  (movie_lin): Linear(in_features=8, out_features=8, bias=True)
  (user_emb): Embedding(943, 8)
  (movie_emb): Embedding(1681, 8)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(8, 8, aggr=mean)
      (movie__rev_rates__user): SAGEConv(8, 8, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(8, 8, aggr=mean)
      (movie__rev_rates__user): SAGEConv(8, 8, aggr=mean)
    )
  )
  (classifier): Classifier()
)

In [43]:
import tqdm
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 50):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data["user", "rates", "movie"].edge_label
        loss = F.mse_loss(pred, ground_truth)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cpu'


100%|██████████| 782/782 [00:15<00:00, 50.07it/s]


Epoch: 001, Loss: 2.3816


100%|██████████| 782/782 [00:12<00:00, 64.28it/s]


Epoch: 002, Loss: 2.0916


100%|██████████| 782/782 [00:12<00:00, 64.89it/s]


Epoch: 003, Loss: 1.9773


100%|██████████| 782/782 [00:12<00:00, 64.12it/s]


Epoch: 004, Loss: 1.9056


100%|██████████| 782/782 [00:11<00:00, 65.90it/s]


Epoch: 005, Loss: 1.8331


100%|██████████| 782/782 [00:12<00:00, 64.32it/s]


Epoch: 006, Loss: 1.7909


100%|██████████| 782/782 [00:12<00:00, 65.07it/s]


Epoch: 007, Loss: 1.7569


100%|██████████| 782/782 [00:11<00:00, 66.25it/s]


Epoch: 008, Loss: 1.7148


100%|██████████| 782/782 [00:12<00:00, 61.06it/s]


Epoch: 009, Loss: 1.6950


100%|██████████| 782/782 [00:11<00:00, 65.69it/s]


Epoch: 010, Loss: 1.6780


100%|██████████| 782/782 [00:13<00:00, 58.82it/s]


Epoch: 011, Loss: 1.6519


100%|██████████| 782/782 [00:11<00:00, 65.37it/s]


Epoch: 012, Loss: 1.6359


100%|██████████| 782/782 [00:11<00:00, 65.41it/s]


Epoch: 013, Loss: 1.6357


100%|██████████| 782/782 [00:12<00:00, 64.52it/s]


Epoch: 014, Loss: 1.6174


100%|██████████| 782/782 [00:12<00:00, 65.08it/s]


Epoch: 015, Loss: 1.6102


100%|██████████| 782/782 [00:12<00:00, 62.22it/s]


Epoch: 016, Loss: 1.6072


100%|██████████| 782/782 [00:12<00:00, 64.71it/s]


Epoch: 017, Loss: 1.5927


100%|██████████| 782/782 [00:12<00:00, 65.05it/s]


Epoch: 018, Loss: 1.5914


100%|██████████| 782/782 [00:12<00:00, 64.07it/s]


Epoch: 019, Loss: 1.5898


100%|██████████| 782/782 [00:11<00:00, 65.26it/s]


Epoch: 020, Loss: 1.5826


100%|██████████| 782/782 [00:12<00:00, 63.13it/s]


Epoch: 021, Loss: 1.5725


100%|██████████| 782/782 [00:12<00:00, 64.71it/s]


Epoch: 022, Loss: 1.5496


100%|██████████| 782/782 [00:12<00:00, 64.39it/s]


Epoch: 023, Loss: 1.5549


100%|██████████| 782/782 [00:12<00:00, 64.29it/s]


Epoch: 024, Loss: 1.5543


100%|██████████| 782/782 [00:12<00:00, 64.62it/s]


Epoch: 025, Loss: 1.5488


100%|██████████| 782/782 [00:12<00:00, 63.38it/s]


Epoch: 026, Loss: 1.5428


100%|██████████| 782/782 [00:12<00:00, 64.62it/s]


Epoch: 027, Loss: 1.5372


100%|██████████| 782/782 [00:12<00:00, 64.52it/s]


Epoch: 028, Loss: 1.5448


100%|██████████| 782/782 [00:12<00:00, 63.32it/s]


Epoch: 029, Loss: 1.5271


100%|██████████| 782/782 [00:12<00:00, 65.00it/s]


Epoch: 030, Loss: 1.5195


100%|██████████| 782/782 [00:12<00:00, 62.86it/s]


Epoch: 031, Loss: 1.5212


100%|██████████| 782/782 [00:12<00:00, 64.61it/s]


Epoch: 032, Loss: 1.5193


100%|██████████| 782/782 [00:12<00:00, 63.59it/s]


Epoch: 033, Loss: 1.5119


100%|██████████| 782/782 [00:12<00:00, 64.20it/s]


Epoch: 034, Loss: 1.5101


100%|██████████| 782/782 [00:12<00:00, 64.74it/s]


Epoch: 035, Loss: 1.5025


100%|██████████| 782/782 [00:12<00:00, 62.28it/s]


Epoch: 036, Loss: 1.5064


100%|██████████| 782/782 [00:12<00:00, 64.57it/s]


Epoch: 037, Loss: 1.4999


100%|██████████| 782/782 [00:12<00:00, 63.12it/s]


Epoch: 038, Loss: 1.4991


100%|██████████| 782/782 [00:12<00:00, 64.16it/s]


Epoch: 039, Loss: 1.4842


100%|██████████| 782/782 [00:12<00:00, 65.01it/s]


Epoch: 040, Loss: 1.4918


100%|██████████| 782/782 [00:12<00:00, 63.10it/s]


Epoch: 041, Loss: 1.4880


100%|██████████| 782/782 [00:12<00:00, 63.78it/s]


Epoch: 042, Loss: 1.4900


100%|██████████| 782/782 [00:12<00:00, 63.93it/s]


Epoch: 043, Loss: 1.4867


100%|██████████| 782/782 [00:12<00:00, 64.73it/s]


Epoch: 044, Loss: 1.4802


100%|██████████| 782/782 [00:12<00:00, 64.98it/s]


Epoch: 045, Loss: 1.4836


100%|██████████| 782/782 [00:12<00:00, 63.23it/s]


Epoch: 046, Loss: 1.4847


100%|██████████| 782/782 [00:12<00:00, 65.00it/s]


Epoch: 047, Loss: 1.4810


100%|██████████| 782/782 [00:12<00:00, 63.52it/s]


Epoch: 048, Loss: 1.4595


100%|██████████| 782/782 [00:12<00:00, 64.81it/s]

Epoch: 049, Loss: 1.4678





In [44]:
predicted = []
labels = []
i=0
for sampled_data in tqdm.tqdm(test_loader):
    with torch.no_grad():
        sampled_data.to(device)
        predicted.append(model(sampled_data))
        labels.append(sampled_data["user", "rates", "movie"].edge_label)

predicted = torch.cat(predicted, dim=0).cpu().numpy()
labels = torch.cat(labels, dim=0).cpu().numpy()
# auc = roc_auc_score(labels, predicted)
# predicted, labels
print()
# print(f"Test AUC: {auc:.4f}")

100%|██████████| 313/313 [00:01<00:00, 159.22it/s]







In [45]:
predicted

array([2.3054893, 3.377002 , 1.9046615, ..., 2.1718554, 0.4161067,
       2.8712547], dtype=float32)

In [46]:
labels

array([3., 5., 4., ..., 0., 0., 0.], dtype=float32)

In [47]:
from sklearn.metrics import mean_squared_error, roc_auc_score

mean_squared_error(labels, predicted, squared=False), mean_squared_error(labels, predicted, squared=True), 

(1.2218683, 1.4929621)

In [48]:
labels.reshape(-1, 1).shape

(60000, 1)

In [86]:
from sklearn.metrics import classification_report
l = np.where(labels>=3, 1, 0)
p = np.where(predicted>=3, 1, 0)
print(classification_report(l, p))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89     43583
           1       0.83      0.48      0.61     16417

    accuracy                           0.83     60000
   macro avg       0.83      0.72      0.75     60000
weighted avg       0.83      0.83      0.82     60000



In [None]:
from torch_geometric.data import download_url, extract_zip

url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
extract_zip(download_url(url, '.'), '.')

movies_path = './ml-latest-small/movies.csv'
ratings_path = './ml-latest-small/ratings.csv'

Using existing file ml-latest-small.zip
Extracting ./ml-latest-small.zip


In [None]:
# Load the entire movie data frame into memory:
movies_df = pd.read_csv(movies_path, index_col='movieId')

# Split genres and convert into indicator variables:
genres = movies_df['genres'].str.get_dummies('|')
print(genres[["Action", "Adventure", "Drama", "Horror"]].head())
# Use genres as movie input features:
movie_feat = torch.from_numpy(genres.values).to(torch.float)
assert movie_feat.size() == (9742, 20)  # 20 genres in total.

         Action  Adventure  Drama  Horror
movieId                                  
1             0          1      0       0
2             0          1      0       0
3             0          0      0       0
4             0          0      1       0
5             0          0      0       0


In [None]:
len(movies_df)

9742

In [None]:
movie_feat.shape

torch.Size([9742, 20])

In [None]:
# movies_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/movies_df.csv')
# ratings_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/rating_df.csv')
# users_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/users_df.csv')
# movies_df.shape, ratings_df.shape, users_df.shape


In [None]:
# def generate_graph_edges(movies_df, ratings_df, users_df, rate_threshold=4):
#   unique_user_id = ratings_df['UserID'].unique()
#   unique_user_id = pd.DataFrame(data={
#       'UserID': unique_user_id,
#       'mappedID': pd.RangeIndex(len(unique_user_id))
#       })
#   userid_mapper = dict(zip(unique_user_id.iloc[:,0], unique_user_id.iloc[:,-1]))
  
#   print("1. Mapping UserID to consecutive values... ")

#   unique_movie_id = ratings_df['MovieID'].unique()
#   unique_movie_id = pd.DataFrame(data={
#       'MovieID': unique_movie_id,
#       'mappedID': pd.RangeIndex(len(unique_movie_id))
#       })
#   movieid_mapper = dict(zip(unique_movie_id.iloc[:,0], unique_movie_id.iloc[:,-1]))
#   print("2. Mapping MovieID to consecutive values... ")

#   users_df['UserID'] = users_df['UserID'].map(userid_mapper)
#   movies_df['MovieID'] =  movies_df['MovieID'].map(movieid_mapper)
#   ratings_df['UserID'] = ratings_df['UserID'].map(userid_mapper)
#   ratings_df['MovieID'] =  ratings_df['MovieID'].map(movieid_mapper)
  
#   ratings_user_id = pd.merge(ratings_df[ratings_df['Rating']>=rate_threshold]['UserID'], unique_user_id,
#                              left_on='UserID', right_on='UserID', how='left')
#   ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
#   ratings_movie_id = pd.merge(ratings_df[ratings_df['Rating']>=rate_threshold]['MovieID'], unique_movie_id,
#                               left_on='MovieID', right_on='MovieID', how='left')
#   ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)
#   edge_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
#   print()
#   print("3. Extract Graph Edges... ")
#   print("=================================================")
  
#   return movies_df.reset_index(drop=True), ratings_df.reset_index(drop=True), users_df.reset_index(drop=True), edge_user_to_movie

In [None]:
# # Load the entire ratings data frame into memory:
# # ratings_df = pd.read_csv(ratings_path)

# # Create a mapping from unique user indices to range [0, num_user_nodes):
# unique_user_id = ratings_df['UserID'].unique()
# unique_user_id = pd.DataFrame(data={
#     'UserID': unique_user_id,
#     'mappedID': pd.RangeIndex(len(unique_user_id)),
# })
# print("Mapping of user IDs to consecutive values:")
# print("==========================================")
# print(unique_user_id.head())
# print()
# # Create a mapping from unique movie indices to range [0, num_movie_nodes):
# unique_movie_id = ratings_df['MovieID'].unique()
# unique_movie_id = pd.DataFrame(data={
#     'MovieID': unique_movie_id,
#     'mappedID': pd.RangeIndex(len(unique_movie_id)),
# })
# print("Mapping of movie IDs to consecutive values:")
# print("===========================================")
# print(unique_movie_id.head())
# # Perform merge to obtain the edges from users and movies:
# ratings_user_id = pd.merge(ratings_df[ratings_df['Rating']>=4]['UserID'], unique_user_id,
#                             left_on='UserID', right_on='UserID', how='left')
# ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
# ratings_movie_id = pd.merge(ratings_df[ratings_df['Rating']>=4]['MovieID'], unique_movie_id,
#                             left_on='MovieID', right_on='MovieID', how='left')
# ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)
# # With this, we are ready to construct our `edge_index` in COO format
# # following PyG semantics:
# edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
# # assert edge_index_user_to_movie.size() == (2, 100836)
# print()
# print("Final edge indices pointing from users to movies:")
# print("=================================================")
# print(edge_index_user_to_movie)

In [None]:
# Load the entire ratings data frame into memory:
ratings_df = pd.read_csv(ratings_path)

# Create a mapping from unique user indices to range [0, num_user_nodes):
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedID': pd.RangeIndex(len(unique_user_id)),
})
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id.head())
print()
# Create a mapping from unique movie indices to range [0, num_movie_nodes):
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedID': pd.RangeIndex(len(unique_movie_id)),
})
print("Mapping of movie IDs to consecutive values:")
print("===========================================")
print(unique_movie_id.head())
# Perform merge to obtain the edges from users and movies:
ratings_user_id = pd.merge(ratings_df[ratings_df['rating']>=4]['userId'], unique_user_id,
                            left_on='userId', right_on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
ratings_movie_id = pd.merge(ratings_df[ratings_df['rating']>=4]['movieId'], unique_movie_id,
                            left_on='movieId', right_on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)
# With this, we are ready to construct our `edge_index` in COO format
# following PyG semantics:
edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
# assert edge_index_user_to_movie.size() == (2, 100836)
print()
print("Final edge indices pointing from users to movies:")
print("=================================================")
print(edge_index_user_to_movie)

Mapping of user IDs to consecutive values:
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4

Mapping of movie IDs to consecutive values:
   movieId  mappedID
0        1         0
1        3         1
2        6         2
3       47         3
4       50         4

Final edge indices pointing from users to movies:
tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 2035, 3121, 1392]])


In [None]:
# Load the entire ratings data frame into memory:
ratings_df = pd.read_csv(ratings_path)

# Create a mapping from unique user indices to range [0, num_user_nodes):
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedID': pd.RangeIndex(len(unique_user_id)),
})
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id.head())
print()
# Create a mapping from unique movie indices to range [0, num_movie_nodes):
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedID': pd.RangeIndex(len(unique_movie_id)),
})
print("Mapping of movie IDs to consecutive values:")
print("===========================================")
print(unique_movie_id.head())
# Perform merge to obtain the edges from users and movies:
ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                            left_on='userId', right_on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
ratings_movie_id = pd.merge(ratings_df['movieId'], unique_movie_id,
                            left_on='movieId', right_on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)
# With this, we are ready to construct our `edge_index` in COO format
# following PyG semantics:
edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
weights = ratings_df['rating'].to_numpy()
# assert edge_index_user_to_movie.size() == (2, 100836)
print()
print("Final edge indices pointing from users to movies:")
print("=================================================")
print(edge_index_user_to_movie)

Mapping of user IDs to consecutive values:
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4

Mapping of movie IDs to consecutive values:
   movieId  mappedID
0        1         0
1        3         1
2        6         2
3       47         3
4       50         4

Final edge indices pointing from users to movies:
tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 3121, 1392, 2873]])


In [None]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

In [None]:
len(movies_df)

9742

In [None]:
len(edge_index_user_to_movie[0])

100836

In [None]:
# data = HeteroData()
# # Save node indices:
# data["user"].node_id = torch.arange(len(unique_user_id))
# data["movie"].node_id = torch.arange(len(movies_df)-1)
# # Add the node features and edge indices:
# data["movie"].x = movie_features
# data["user"].x = user_features
# data["user", "rates", "movie"].edge_index = edge_index_user_to_movie
# # We also need to make sure to add the reverse edges from movies to users
# # in order to let a GNN be able to pass messages in both directions.
# # We can leverage the `T.ToUndirected()` transform for this from PyG:
# data = T.ToUndirected()(data)
# data

In [None]:
data = HeteroData()
# Save node indices:
data["user"].node_id = torch.arange(len(unique_user_id))
data["movie"].node_id = torch.arange(len(movies_df))
# Add the node features and edge indices:
data["movie"].x = movie_feat
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie
# data["user", "rates", "movie"].edge_label = torch.from_numpy(weights).to(torch.long)
# We also need to make sure to add the reverse edges from movies to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)

In [None]:
data["movie"].x.dtype , data["user", "rates", "movie"].edge_index.dtype, data["user"].node_id.dtype, data["movie"].node_id.dtype

(torch.float32, torch.int64, torch.int64, torch.int64)

In [None]:
# data["user", "rates", "movie"].edge_label.size(0)

In [None]:
data

HeteroData(
  [1muser[0m={ node_id=[610] },
  [1mmovie[0m={
    node_id=[9742],
    x=[9742, 20]
  },
  [1m(user, rates, movie)[0m={ edge_index=[2, 100836] },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 100836] }
)

In [None]:
# For this, we first split the set of edges into
# training (80%), validation (10%), and testing edges (10%).
# Across the training edges, we use 70% of edges for message passing,
# and 30% of edges for supervision.
# We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
# Negative edges during training will be generated on-the-fly.
# We can leverage the `RandomLinkSplit()` transform for this from PyG:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.2,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user"), 
)
train_data, val_data, test_data = transform(data)

In [None]:
train_data

HeteroData(
  [1muser[0m={ node_id=[610] },
  [1mmovie[0m={
    node_id=[9742],
    x=[9742, 20]
  },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 49411],
    edge_label=[21175],
    edge_label_index=[2, 21175]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 49411] }
)

In [None]:
np.isin(train_data["user", "rates", "movie"].edge_label_index,train_data["user", "rates", "movie"].edge_index)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True, False,  True]])

In [None]:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = train_data["user", "rates", "movie"].edge_label_index
edge_label = train_data["user", "rates", "movie"].edge_label
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=True,
)
train_loader

In [None]:
validation_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=False,
)

In [None]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
    
    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x
# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:

class Classifier(torch.nn.Module):
    def forward(self, x_user, x_movie, edge_label_index):
        # Convert node embeddings to edge-level representations:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]
        # Apply dot-product to get a prediction per supervision edge:
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies:
        self.movie_lin = torch.nn.Linear(20, hidden_channels)
        # self.user_lin = torch.nn.Linear(20, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)
        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)
        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())
        self.classifier = Classifier()
    
    def forward(self, data):
        # print(data["movie"].node_id)
        x_dict = {
          "user": self.user_emb(data["user"].node_id),
          "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        } 
        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )
        return pred
        
model = Model(hidden_channels=8)
model

Model(
  (movie_lin): Linear(in_features=20, out_features=8, bias=True)
  (user_emb): Embedding(610, 8)
  (movie_emb): Embedding(9742, 8)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(8, 8, aggr=mean)
      (movie__rev_rates__user): SAGEConv(8, 8, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(8, 8, aggr=mean)
      (movie__rev_rates__user): SAGEConv(8, 8, aggr=mean)
    )
  )
  (classifier): Classifier()
)

In [None]:
class Learning_Evaluation(torch.nn.Module):
    def __init__(
        self,
        model,
        learning_rate=0.001,
        best_results=[0, 0, 0],
        ):
      
      super().__init__()
      self.model = model
      self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
      self.criterion = torch.nn.BCEWithLogitsLoss()
      self.best_results = best_results
    
    def train(self, data_loader):
        self.model.train()

        for data in data_loader:
            out = self.model(data)
            label = data["user", "rates", "movie"].edge_label
            loss = self.criterion(out, label)
            # loss = F.binary_cross_entropy_with_logits(out, label)
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

    def evaluate(self, data_loader):
        self.model.eval()
        
        predicted = []
        labels = []
        total_loss = total_sample = 0
        
        for data in data_loader:
          with torch.no_grad():
            data.to(device)
            out = model(data)
            label = data["user", "rates", "movie"].edge_label
            total_loss += self.criterion(out, label) * len(out)
            # total_loss += F.binary_cross_entropy_with_logits(out, label)
            total_sample += len(out)
            predicted.append(out)
            labels.append(label)
            
        predicted = torch.cat(predicted, dim=0).cpu().numpy()
        labels = torch.cat(labels, dim=0).cpu().numpy()
        # auc = roc_auc_score(labels, predicted)

        return total_loss/ total_sample

    def train_model(self, train_data_loader, validation_data_loader, n_epochs=15, best_model_saving_path='best_model.pth'):
      
      list_train_auc, list_train_loss, list_validation_auc, list_validation_loss = [], [], [], []
      print('Train and Evaluation started...')
      for epoch in range(1, n_epochs+1):
          self.train(train_data_loader)
          
          # train_auc, train_loss = self.evaluate(train_data_loader)
          train_loss = self.evaluate(train_data_loader)
          # list_train_auc.append(train_auc)
          list_train_loss.append(float(train_loss.detach()))
          
          # validation_auc, validation_loss = self.evaluate(validation_data_loader)
          validation_loss = self.evaluate(validation_data_loader)
          if self.best_results[1] < validation_loss :
            self.best_results[0], self.best_results[1], self.best_results[-1] = epoch, train_loss ,validation_loss
            torch.save(model, best_model_saving_path)

          # list_validation_auc.append(validation_auc)
          list_validation_loss.append(float(validation_loss.detach()))

          # print(f'Epoch: {epoch:03d}, Train ROC-AUC: {train_auc:.4f}, Train Loss: {train_loss:.4f}, Validation ROC-AUC: {validation_auc:.4f}, Validation Loss: {validation_loss:.4f}')
          print(f'Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Validation Loss: {validation_loss:.4f}')
      
      print('---------------------------------------------------')
      print('Train and Evaluation finished...')
      print(f'Best Results of the model : Epoch: {self.best_results[0]:03d}, Train Loss: {self.best_results[1]:.4f}, Validation Loss: {self.best_results[-1]:.4f}')
      print(f'Model weights restored from epoch: {self.best_results[0]:03d}')
      return list_train_auc, list_train_loss, list_validation_auc, list_validation_loss

    def evaluate_test(self, data_loder, best_model_path='best_model.pth'):
      model = torch.load('best_model.pth')
      predicted = []
      labels = []
      for data in tqdm.tqdm(data_loder):
          with torch.no_grad():
              data.to(device)
              predicted.append(model(data))
              labels.append(data["user", "rates", "movie"].edge_label)

      predicted = torch.cat(predicted, dim=0).cpu().numpy()
      labels = torch.cat(labels, dim=0).cpu().numpy()
      auc = roc_auc_score(labels, predicted)
      print()
      print(f"Test AUC: {auc:.4f}")
      return auc

      
      

In [None]:
experiment = Learning_Evaluation(model)
list_train_acc, list_train_loss, list_test_acc, list_test_loss = experiment.train_model(train_loader, validation_loader, n_epochs=15)

Train and Evaluation started...
Epoch: 001, Train Loss: 0.2765, Validation Loss: 0.3512
Epoch: 002, Train Loss: 0.2720, Validation Loss: 0.3526
Epoch: 003, Train Loss: 0.2690, Validation Loss: 0.3393
Epoch: 004, Train Loss: 0.2679, Validation Loss: 0.3524
Epoch: 005, Train Loss: 0.2661, Validation Loss: 0.3227
Epoch: 006, Train Loss: 0.2656, Validation Loss: 0.3493
Epoch: 007, Train Loss: 0.2615, Validation Loss: 0.3760
Epoch: 008, Train Loss: 0.2626, Validation Loss: 0.3319
Epoch: 009, Train Loss: 0.2574, Validation Loss: 0.3458
Epoch: 010, Train Loss: 0.2565, Validation Loss: 0.3506
Epoch: 011, Train Loss: 0.2604, Validation Loss: 0.3573
Epoch: 012, Train Loss: 0.2552, Validation Loss: 0.3288
Epoch: 013, Train Loss: 0.2516, Validation Loss: 0.3584
Epoch: 014, Train Loss: 0.2521, Validation Loss: 0.3503
Epoch: 015, Train Loss: 0.2508, Validation Loss: 0.3458
---------------------------------------------------
Train and Evaluation finished...
Best Results of the model : Epoch: 015, Tra

In [None]:
test_loader = LinkNeighborLoader(
    data=test_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=False,
)

model = torch.load('best_model.pth')
predicted = []
labels = []
for sampled_data in tqdm.tqdm(test_loader):
    with torch.no_grad():
        sampled_data.to(device)
        predicted.append(model(sampled_data))
        labels.append(sampled_data["user", "rates", "movie"].edge_label)

predicted = torch.cat(predicted, dim=0).cpu().numpy()
labels = torch.cat(labels, dim=0).cpu().numpy()
auc = roc_auc_score(labels, predicted)
predicted, labels
print()
print(f"Test AUC: {auc:.4f}")


experiment.evaluate_test(test_loader)

100%|██████████| 114/114 [00:01<00:00, 100.55it/s]



Test AUC: 0.9203


100%|██████████| 114/114 [00:00<00:00, 139.48it/s]


Test AUC: 0.9199





0.9199309271749129

In [None]:
import tqdm
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


for epoch in range(1, 15):
    # total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data["user", "rates", "movie"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cpu'


100%|██████████| 80/80 [00:01<00:00, 50.90it/s]


Epoch: 001, Loss: 0.3759


100%|██████████| 80/80 [00:02<00:00, 37.35it/s]


Epoch: 002, Loss: 0.3728


 45%|████▌     | 36/80 [00:00<00:01, 36.93it/s]


KeyboardInterrupt: ignored

In [None]:
edge_label_index = val_data["user", "rates", "movie"].edge_label_index
edge_label = val_data["user", "rates", "movie"].edge_label

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=False,
)
# sampled_data = next(iter(val_loader))

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
predicted = []
labels = []
for sampled_data in tqdm.tqdm(val_loader):
    with torch.no_grad():
        sampled_data.to(device)
        predicted.append(model(sampled_data))
        labels.append(sampled_data["user", "rates", "movie"].edge_label)

predicted = torch.cat(predicted, dim=0).cpu().numpy()
labels = torch.cat(labels, dim=0).cpu().numpy()
auc = roc_auc_score(labels, predicted)
predicted, labels
print()
print(f"Validation AUC: {auc:.4f}")

# print(f"Validation f1: {f1:.4f}")
# print(f"Validation acc: {acc:.4f}")

100%|██████████| 114/114 [00:00<00:00, 150.92it/s]


Validation AUC: 0.9042





In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, roc_curve, precision_recall_curve, balanced_accuracy_score, precision_recall_fscore_support

precision_recall_fscore_support(labels, predicted)

In [None]:
predicteds = np.where(predicted>0,1,0)

In [None]:
roc_auc_score(labels, predicteds), f1_score(labels, predicteds), accuracy_score(labels, predicteds)
