<a href="https://colab.research.google.com/github/mojtabaSefidi/Machine-Learning-with-Graphs/blob/main/MLG_Final_Project_100k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !unzip ml-100k.zip
# !pip uninstall jupyter
# !pip install jupyter

## Install Essential Packages

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
!pip install -q torch_geometric
!pip install -q torch-sparse==0.6.13
!pip install -q torch_scatter
# !pip install -q ogb

## Install Essential Libraries

In [4]:

import torch
import networkx as nx
import torch_geometric
from torch_geometric.utils import to_networkx
from torch_geometric.datasets import TUDataset
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.loader import DataLoader
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GraphConv, SAGEConv, global_add_pool
from torch_geometric.data import HeteroData
from torch_geometric import transforms
from torch_geometric.loader import LinkNeighborLoader

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
import pandas as pd
import torch

In [6]:
def plot_history(list_train_loss, list_train_acc, list_val_loss, list_val_acc, n_epochs, title):
    
    plt.figure(figsize=(18,8),linewidth = 7, edgecolor="whitesmoke")    
    n = n_epochs
    
    plt.plot(list(range(1, n_epochs+1)), list_train_acc, color='orange',marker=".")
    plt.plot(list(range(1, n_epochs+1)), list_train_loss,'b',marker=".")
    
    plt.plot(list(range(1, n_epochs+1)), list_val_acc,'r')  
    plt.plot(list(range(1, n_epochs+1)), list_val_loss,'g')
    
    plt.legend(['Train Accuracy','Train Loss','Test Accuracy','Test Loss'])
    plt.grid(True)
    
    # plt.gca().set_ylim(0,1)

    plt.xlabel("Number of Epochs")
    plt.ylabel("Value")
    plt.suptitle(title, size=16, y=0.927)
    plt.show()

## Read the Dataset

In [7]:
# from ogb.graphproppred import PygGraphPropPredDataset
# from torch_geometric.loader import DataLoader

# dataset = PygGraphPropPredDataset(name = 'ogbg-molhiv') 

# split_idx = dataset.get_idx_split() 
# train_data_loader = DataLoader(dataset[split_idx["train"]], batch_size=32, shuffle=True)
# valiation_data_loader = DataLoader(dataset[split_idx["valid"]], batch_size=32, shuffle=True)
# test_data_loader = DataLoader(dataset[split_idx["test"]], batch_size=32, shuffle=True)

In [8]:
# !unzip /content/MovieLens1M.zip

In [9]:
# movies_df = pd.read_csv('/content/ml-100k/u.item', sep='|', engine='python', encoding="latin-1",
#                         names = ['MovieID', 'Movie Title', 'Release Date', 'video release date','IMDbURL',
#                                  'Unknown','Action','Adventure','Animation', 'Childrens','Comedy',
#                                  'Crime','Documentary','Drama','Fantasy', 'Film-Noir','Horror',
#                                  'Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western'])

# rating_df = pd.read_csv('/content/ml-100k/u.data', sep='\t', engine='python', encoding="latin-1",
#                         names=['UserID','MovieID','Rating','Timestamp'])

# users_df = pd.read_csv('/content/ml-100k/u.user', sep='|', engine='python', encoding="latin-1",
#                        names=['UserID','Gender','Age','Occupation','Zipcode'])

# movies_df.to_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/movies_df.csv', index=False)
# rating_df.to_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/rating_df.csv', index=False)
# users_df.to_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/users_df.csv', index=False)

In [10]:
geners = np.array(['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama',
                   'Fantasy', 'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western'], dtype=str)

values = list(range(0, len(geners)))
geners2vec = dict(zip(geners, values))

def extract_geners(text, sep='|'):
  return text.split(sep)

def geners2vector(df_geners, maper):
  result = np.zeros((len(df_geners),len(maper)), dtype='int8')
  for index, text in enumerate(df_geners):
    geners = extract_geners(text)
    for gener in geners:
      result[index][maper.get(gener)] = 1
  
  return result

In [11]:
def year_extractor(text):
  return text[text.rfind('(')+1:text.rfind(')')]

def calculate_average_rating(rating_df, movies_df, Movie_id_col='MovieID', rating_col='Rating'):
  
  rating_avg = rating_df.groupby(Movie_id_col).mean()[rating_col].round(4).to_dict()
  result = [] 
  for id in movies_df[Movie_id_col]:
    result.append(rating_avg.get(id, 0))
  movies_df['Average Rating'] = result
  return movies_df

Occupation_mapper = { 0: "other", 1: "academic/educator", 2: "artist", 3: "clerical/admin",
                     4: "college/grad student",5: "customer service", 6: "doctor/health care",
                     7: "executive/managerial", 8: "farmer", 9: "homemaker", 10: "K-12 student",
                     11: "lawyer", 12: "programmer", 13: "retired", 14: "sales/marketing",
                     15: "scientist", 16: "self-employed", 17: "technician/engineer",
                     18: "tradesman/craftsman", 19: "unemployed", 20: "writer"}

def code2Occupation(occupation_col, mapper):
  return occupation_col.map(mapper)


def extract_user_feature(users_df):
  scaler = StandardScaler()
  age = scaler.fit_transform(users_df[['Age']])
  
  encoder = OneHotEncoder(handle_unknown='ignore')
  occupation = encoder.fit_transform(users_df[['Occupation']]).toarray()
  features = np.hstack((users_df[['Gender']], age, occupation))
  return torch.from_numpy(features).to(torch.float)

def extract_movie_feature(movies_df, mapper):
  scaler = StandardScaler()
  numerical = scaler.fit_transform(movies_df[['year',	'averge_rating']])
  
  categorical = geners2vector(movies_df['Genres'], mapper)
  features = np.hstack((numerical, categorical))
  return torch.from_numpy(features).to(torch.float)

def Timestamp2Date(timestamp):
  return datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

def add_age_group(df, age_col='Age'):
  bins= [0,20,25,39,60,110]
  labels = ['Teenage','Young Adult','Adult', 'Older Adult','Old']
  df['AgeGroup'] = pd.cut(df[age_col], bins=bins, labels=labels, right=False)
  return df

def remove_movies(rating_df, movies_df, title_col='Movie Title', movie_id_col='MovieID'):
  unknown_movies = movies_df[movies_df[title_col]=='unknown'][movie_id_col]
  rating_df = rating_df[~rating_df[movie_id_col].isin(unknown_movies)]
  movies_df = movies_df[~movies_df[movie_id_col].isin(unknown_movies)]
  return rating_df, movies_df

In [114]:
movies_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/movies_df.csv')
ratings_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/rating_df.csv')
users_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/users_df.csv')
movies_df.shape, ratings_df.shape, users_df.shape


((1682, 24), (100000, 4), (943, 5))

In [115]:
ratings_df, movies_df = remove_movies(ratings_df, movies_df)

In [116]:
ratings_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [117]:
movies_df.drop(['video release date', 'IMDbURL'], axis=1, inplace=True)
movies_df['Release Date'] = pd.to_datetime(movies_df['Release Date'])
movies_df['Release Year'] = movies_df['Release Date'].dt.year
movies_df['Release Month'] = movies_df['Release Date'].dt.month
movies_df['Release Day'] = movies_df['Release Date'].dt.strftime('%j').apply(int)
movies_df = movies_df.sort_values(by=['Release Date']).reset_index(drop=True)
movies_df = calculate_average_rating(ratings_df, movies_df)
movies_df.tail(2).T

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['Release Date'] = pd.to_datetime(movies_df['Release Date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['Release Year'] = movies_df['Release Date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

Unnamed: 0,1679,1680
MovieID,1432,315
Movie Title,"Mighty, The (1998)",Apt Pupil (1998)
Release Date,1998-10-09 00:00:00,1998-10-23 00:00:00
Unknown,0,0
Action,0,0
Adventure,0,0
Animation,0,0
Childrens,0,0
Comedy,0,0
Crime,0,0


In [118]:
# !pip install --upgrade geopy
# from geopy.geocoders import Nominatim
# def zipcode2city(zipcode):
#   geolocator = Nominatim(timeout=10, user_agent = "dlab.berkeley.edu-workshop")
#   try:
#     return geolocator.geocode({"postalcode": zipcode})[0].split(',')[0]
#   except:
#     'not found'

# users_df['Zipcode'].apply(zipcode2city)

In [119]:
users_df = add_age_group(users_df)
users_df

Unnamed: 0,UserID,Age,Gender,Occupation,Zipcode,AgeGroup
0,1,24,M,technician,85711,Young Adult
1,2,53,F,other,94043,Older Adult
2,3,23,M,writer,32067,Young Adult
3,4,24,M,technician,43537,Young Adult
4,5,33,F,other,15213,Adult
...,...,...,...,...,...,...
938,939,26,F,student,33319,Adult
939,940,32,M,administrator,02215,Adult
940,941,20,M,student,97229,Young Adult
941,942,48,F,librarian,78209,Older Adult


In [120]:
ratings_df.loc[:,'Timestamp'] = ratings_df.loc[:,'Timestamp'].apply(Timestamp2Date)
ratings_df.sort_values(by='Timestamp', inplace=True)
ratings_df.reset_index(drop=True, inplace=True)

ratings_df.loc[:,'Timestamp'] = pd.to_datetime(ratings_df.loc[:,'Timestamp'])
ratings_df.loc[:,'Year'] = ratings_df.loc[:,'Timestamp'].dt.year
ratings_df.loc[:,'Month'] = ratings_df.loc[:,'Timestamp'].dt.month
ratings_df.loc[:,'Weekday'] = ratings_df.loc[:,'Timestamp'].dt.weekday
ratings_df.loc[:,'Hour'] = ratings_df.loc[:,'Timestamp'].dt.hour
ratings_df.loc[:,'DayofYear'] = ratings_df.loc[:,'Timestamp'].dt.strftime('%j')

ratings_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Year,Month,Weekday,Hour,DayofYear
0,259,255,4,1997-09-20 03:05:10,1997,9,5,3,263
1,259,286,4,1997-09-20 03:05:27,1997,9,5,3,263
2,259,298,4,1997-09-20 03:05:54,1997,9,5,3,263
3,259,185,4,1997-09-20 03:06:21,1997,9,5,3,263
4,259,173,4,1997-09-20 03:07:23,1997,9,5,3,263
...,...,...,...,...,...,...,...,...,...
99986,729,689,4,1998-04-22 23:10:38,1998,4,2,23,112
99987,729,300,4,1998-04-22 23:10:38,1998,4,2,23,112
99988,729,748,4,1998-04-22 23:10:38,1998,4,2,23,112
99989,729,313,3,1998-04-22 23:10:38,1998,4,2,23,112


In [121]:
def generate_graph_edges(movies_df, ratings_df, users_df, rate_threshold=4):
  unique_user_id = ratings_df['UserID'].unique()
  unique_user_id = pd.DataFrame(data={
      'UserID': unique_user_id,
      'mappedID': pd.RangeIndex(len(unique_user_id))
      })
  userid_mapper = dict(zip(unique_user_id.iloc[:,0], unique_user_id.iloc[:,-1]))
  
  print("1. Mapping UserID to consecutive values... ")

  unique_movie_id = ratings_df['MovieID'].unique()
  unique_movie_id = pd.DataFrame(data={
      'MovieID': unique_movie_id,
      'mappedID': pd.RangeIndex(len(unique_movie_id))
      })
  movieid_mapper = dict(zip(unique_movie_id.iloc[:,0], unique_movie_id.iloc[:,-1]))
  print("2. Mapping MovieID to consecutive values... ")

  users_df['UserID'] = users_df['UserID'].map(userid_mapper)
  movies_df['MovieID'] =  movies_df['MovieID'].map(movieid_mapper)
  ratings_df['UserID'] = ratings_df['UserID'].map(userid_mapper)
  ratings_df['MovieID'] =  ratings_df['MovieID'].map(movieid_mapper)
  
  ratings_user_id = pd.merge(ratings_df[ratings_df['Rating']>=rate_threshold]['UserID'], unique_user_id,
                             left_on='UserID', right_on='UserID', how='left')
  ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
  ratings_movie_id = pd.merge(ratings_df[ratings_df['Rating']>=rate_threshold]['MovieID'], unique_movie_id,
                              left_on='MovieID', right_on='MovieID', how='left')
  ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)
  user2movie_edge = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
  print("3. Extract Graph Edges... ")
  print("=================================================")
  
  return movies_df.reset_index(drop=True), ratings_df.reset_index(drop=True), users_df.reset_index(drop=True), user2movie_edge.to(torch.long)

In [122]:
movies_df, ratings_df, users_df, user2movie_edge = generate_graph_edges(movies_df, ratings_df, users_df)

1. Mapping UserID to consecutive values... 
2. Mapping MovieID to consecutive values... 
3. Extract Graph Edges... 


In [123]:
def extract_user_feature(users_df):
  scaler = StandardScaler()
  # age = scaler.fit_transform(users_df[['Age']])
  
  encoder = OneHotEncoder(handle_unknown='ignore')
  categorical_df = encoder.fit_transform(users_df[['Occupation', 'Gender']]).toarray()
  features = np.hstack((categorical_df, users_df[['Age']]))
  return torch.from_numpy(features).to(torch.int16)

def extract_movie_feature(movies_df, exclude_cols):
  movies_df = movies_df.drop(exclude_cols, axis=1)
  scaler = StandardScaler()
  # movies_df[['Release Year', 'Release Month', 'Average Rating', 'Release Day']] = scaler.fit_transform(
  #     movies_df[['Release Year', 'Release Month', 'Average Rating', 'Release Day']])
  features = movies_df.to_numpy()
  print(features)
  return torch.from_numpy(features).to(torch.int16)

In [125]:
user_features = extract_user_feature(users_df)
movie_features = extract_movie_feature(movies_df, ['MovieID', 'Movie Title', 'Release Date'])
movie_features.shape, user_features.shape

[[  0.       0.       0.     ...   1.       1.       3.5556]
 [  0.       0.       0.     ...   1.       1.       3.    ]
 [  0.       0.       0.     ...   1.       1.       3.7778]
 ...
 [  0.       0.       0.     ...   4.      93.       3.    ]
 [  0.       0.       0.     ...  10.     282.       1.    ]
 [  0.       0.       0.     ...  10.     296.       4.1   ]]


(torch.Size([1681, 23]), torch.Size([943, 24]))

In [126]:
def generate_edge(rating_df, rating_threshold=4):

  graph_edges = [[],[]]
  edge_weight = []

  for userID, movieID, rating in rating_df[['UserID','MovieID','Rating']].itertuples(index=False):
    if rating >= rating_threshold:
      graph_edges[0].append(userID)
      graph_edges[1].append(movieID)
      edge_weight.append(rating)
    
    else:
      continue
  
  return torch.tensor(graph_edges, dtype=torch.long), torch.tensor(edge_weight, dtype=torch.long)

In [127]:
# user2movie_edge, user2movie_edge_weight = generate_edge(rating_df)
# user2movie_edge, user2movie_edge.shape

In [128]:
# torch.from_numpy(rating_df['UserID'].unique()).to(torch.int16),torch.from_numpy(rating_df['MovieID'].unique()).to(torch.int16)
# dataset["user"].node_id.dtype

In [129]:
dataset = HeteroData()

dataset["user"].node_id = torch.arange(len(users_df)).to(torch.int16)
dataset["movie"].node_id = torch.arange(len(movies_df)).to(torch.int16)
dataset["movie"].x = movie_features
dataset["user"].x = user_features
dataset["user", "rates", "movie"].edge_index = user2movie_edge
# dataset["user", "rates", "movie"].edge_weight = user2movie_edge_weight

dataset = transforms.ToUndirected()(dataset)

In [131]:
transform = transforms.RandomLinkSplit(
    num_val=0.1,
    num_test=0.2,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user"), 
)

train_dataset, validation_dataset, test_dataset = transform(dataset)

In [132]:
# train_dataset, train_data

In [133]:
#  validation_dataset, val_data

In [134]:
# test_dataset, test_data

In [None]:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = train_dataset["user", "rates", "movie"].edge_label_index
edge_label = train_dataset["user", "rates", "movie"].edge_label
train_loader = LinkNeighborLoader(
    data=train_dataset,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=32,
    shuffle=True,
)
train_loader

In [79]:
train_loader

LinkNeighborLoader()

In [82]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x
# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    def forward(self, x_user, x_movie, edge_label_index):
        # Convert node embeddings to edge-level representations:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]
        # Apply dot-product to get a prediction per supervision edge:
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)
class Model(torch.nn.Module):
    def __init__(self, hidden_channels, user_features_dim, movie_features_dim):
        super().__init__()
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies:
        self.user_lin = torch.nn.Linear(user_features_dim, hidden_channels)
        self.movie_lin = torch.nn.Linear(movie_features_dim, hidden_channels)
        self.user_emb = torch.nn.Embedding(dataset["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(dataset["movie"].num_nodes, hidden_channels)
        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)
        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=dataset.metadata())
        self.classifier = Classifier()
    def forward(self, data):
        # print(data["movie"].node_id)
        x_dict = {
          # "user": self.user_lin(data["user"].x) + self.user_emb(data["user"].node_id),
          "user": self.user_lin(data["user"].x) + self.user_emb(data["user"].node_id),
          "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        } 
        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )
        return pred
        
model = Model(hidden_channels=8, user_features_dim=24, movie_features_dim=23)
model

Model(
  (user_lin): Linear(in_features=24, out_features=8, bias=True)
  (movie_lin): Linear(in_features=23, out_features=8, bias=True)
  (user_emb): Embedding(943, 8)
  (movie_emb): Embedding(1681, 8)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(8, 8, aggr=mean)
      (movie__rev_rates__user): SAGEConv(8, 8, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(8, 8, aggr=mean)
      (movie__rev_rates__user): SAGEConv(8, 8, aggr=mean)
    )
  )
  (classifier): Classifier()
)

In [83]:
import tqdm
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 6):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data["user", "rates", "movie"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cpu'


  0%|          | 0/364 [00:00<?, ?it/s]


RuntimeError: ignored

In [None]:
from torch_geometric.data import download_url, extract_zip

url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
extract_zip(download_url(url, '.'), '.')

movies_path = './ml-latest-small/movies.csv'
ratings_path = './ml-latest-small/ratings.csv'

In [None]:
# Load the entire movie data frame into memory:
movies_df = pd.read_csv(movies_path, index_col='movieId')

# Split genres and convert into indicator variables:
genres = movies_df['genres'].str.get_dummies('|')
print(genres[["Action", "Adventure", "Drama", "Horror"]].head())
# Use genres as movie input features:
movie_feat = torch.from_numpy(genres.values).to(torch.float)
assert movie_feat.size() == (9742, 20)  # 20 genres in total.

In [None]:
len(movies_df)

9742

In [None]:
movie_feat.shape

torch.Size([9742, 20])

In [None]:
# movies_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/movies_df.csv')
# ratings_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/rating_df.csv')
# users_df = pd.read_csv('/content/gdrive/MyDrive/MLG_Final_Project/MovieLens100k/Data/users_df.csv')
# movies_df.shape, ratings_df.shape, users_df.shape


((1682, 24), (100000, 4), (943, 5))

In [None]:
# def generate_graph_edges(movies_df, ratings_df, users_df, rate_threshold=4):
#   unique_user_id = ratings_df['UserID'].unique()
#   unique_user_id = pd.DataFrame(data={
#       'UserID': unique_user_id,
#       'mappedID': pd.RangeIndex(len(unique_user_id))
#       })
#   userid_mapper = dict(zip(unique_user_id.iloc[:,0], unique_user_id.iloc[:,-1]))
  
#   print("1. Mapping UserID to consecutive values... ")

#   unique_movie_id = ratings_df['MovieID'].unique()
#   unique_movie_id = pd.DataFrame(data={
#       'MovieID': unique_movie_id,
#       'mappedID': pd.RangeIndex(len(unique_movie_id))
#       })
#   movieid_mapper = dict(zip(unique_movie_id.iloc[:,0], unique_movie_id.iloc[:,-1]))
#   print("2. Mapping MovieID to consecutive values... ")

#   users_df['UserID'] = users_df['UserID'].map(userid_mapper)
#   movies_df['MovieID'] =  movies_df['MovieID'].map(movieid_mapper)
#   ratings_df['UserID'] = ratings_df['UserID'].map(userid_mapper)
#   ratings_df['MovieID'] =  ratings_df['MovieID'].map(movieid_mapper)
  
#   ratings_user_id = pd.merge(ratings_df[ratings_df['Rating']>=rate_threshold]['UserID'], unique_user_id,
#                              left_on='UserID', right_on='UserID', how='left')
#   ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
#   ratings_movie_id = pd.merge(ratings_df[ratings_df['Rating']>=rate_threshold]['MovieID'], unique_movie_id,
#                               left_on='MovieID', right_on='MovieID', how='left')
#   ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)
#   edge_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
#   print()
#   print("3. Extract Graph Edges... ")
#   print("=================================================")
  
#   return movies_df.reset_index(drop=True), ratings_df.reset_index(drop=True), users_df.reset_index(drop=True), edge_user_to_movie

In [None]:
# # Load the entire ratings data frame into memory:
# # ratings_df = pd.read_csv(ratings_path)

# # Create a mapping from unique user indices to range [0, num_user_nodes):
# unique_user_id = ratings_df['UserID'].unique()
# unique_user_id = pd.DataFrame(data={
#     'UserID': unique_user_id,
#     'mappedID': pd.RangeIndex(len(unique_user_id)),
# })
# print("Mapping of user IDs to consecutive values:")
# print("==========================================")
# print(unique_user_id.head())
# print()
# # Create a mapping from unique movie indices to range [0, num_movie_nodes):
# unique_movie_id = ratings_df['MovieID'].unique()
# unique_movie_id = pd.DataFrame(data={
#     'MovieID': unique_movie_id,
#     'mappedID': pd.RangeIndex(len(unique_movie_id)),
# })
# print("Mapping of movie IDs to consecutive values:")
# print("===========================================")
# print(unique_movie_id.head())
# # Perform merge to obtain the edges from users and movies:
# ratings_user_id = pd.merge(ratings_df[ratings_df['Rating']>=4]['UserID'], unique_user_id,
#                             left_on='UserID', right_on='UserID', how='left')
# ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
# ratings_movie_id = pd.merge(ratings_df[ratings_df['Rating']>=4]['MovieID'], unique_movie_id,
#                             left_on='MovieID', right_on='MovieID', how='left')
# ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)
# # With this, we are ready to construct our `edge_index` in COO format
# # following PyG semantics:
# edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
# # assert edge_index_user_to_movie.size() == (2, 100836)
# print()
# print("Final edge indices pointing from users to movies:")
# print("=================================================")
# print(edge_index_user_to_movie)

In [None]:
# Load the entire ratings data frame into memory:
ratings_df = pd.read_csv(ratings_path)

# Create a mapping from unique user indices to range [0, num_user_nodes):
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedID': pd.RangeIndex(len(unique_user_id)),
})
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id.head())
print()
# Create a mapping from unique movie indices to range [0, num_movie_nodes):
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedID': pd.RangeIndex(len(unique_movie_id)),
})
print("Mapping of movie IDs to consecutive values:")
print("===========================================")
print(unique_movie_id.head())
# Perform merge to obtain the edges from users and movies:
ratings_user_id = pd.merge(ratings_df[ratings_df['rating']>=4]['userId'], unique_user_id,
                            left_on='userId', right_on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
ratings_movie_id = pd.merge(ratings_df[ratings_df['rating']>=4]['movieId'], unique_movie_id,
                            left_on='movieId', right_on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)
# With this, we are ready to construct our `edge_index` in COO format
# following PyG semantics:
edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
# assert edge_index_user_to_movie.size() == (2, 100836)
print()
print("Final edge indices pointing from users to movies:")
print("=================================================")
print(edge_index_user_to_movie)

Mapping of user IDs to consecutive values:
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4

Mapping of movie IDs to consecutive values:
   movieId  mappedID
0        1         0
1        3         1
2        6         2
3       47         3
4       50         4

Final edge indices pointing from users to movies:
tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 2035, 3121, 1392]])


In [None]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

In [None]:
len(movies_df)

9742

In [None]:
len(edge_index_user_to_movie[0])

48580

In [None]:
# data = HeteroData()
# # Save node indices:
# data["user"].node_id = torch.arange(len(unique_user_id))
# data["movie"].node_id = torch.arange(len(movies_df)-1)
# # Add the node features and edge indices:
# data["movie"].x = movie_features
# data["user"].x = user_features
# data["user", "rates", "movie"].edge_index = edge_index_user_to_movie
# # We also need to make sure to add the reverse edges from movies to users
# # in order to let a GNN be able to pass messages in both directions.
# # We can leverage the `T.ToUndirected()` transform for this from PyG:
# data = T.ToUndirected()(data)
# data

In [None]:
data = HeteroData()
# Save node indices:
data["user"].node_id = torch.arange(len(unique_user_id))
data["movie"].node_id = torch.arange(len(movies_df))
# Add the node features and edge indices:
data["movie"].x = movie_feat
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie
# We also need to make sure to add the reverse edges from movies to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)

In [None]:
data

HeteroData(
  [1muser[0m={ node_id=[610] },
  [1mmovie[0m={
    node_id=[9742],
    x=[9742, 20]
  },
  [1m(user, rates, movie)[0m={ edge_index=[2, 48580] },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 48580] }
)

In [None]:
# For this, we first split the set of edges into
# training (80%), validation (10%), and testing edges (10%).
# Across the training edges, we use 70% of edges for message passing,
# and 30% of edges for supervision.
# We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
# Negative edges during training will be generated on-the-fly.
# We can leverage the `RandomLinkSplit()` transform for this from PyG:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.2,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user"), 
)
train_data, val_data, test_data = transform(data)

In [None]:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = train_data["user", "rates", "movie"].edge_label_index
edge_label = train_data["user", "rates", "movie"].edge_label
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=32,
    shuffle=True,
)
train_loader

LinkNeighborLoader()

In [None]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x
# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    def forward(self, x_user, x_movie, edge_label_index):
        # Convert node embeddings to edge-level representations:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]
        # Apply dot-product to get a prediction per supervision edge:
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies:
        self.movie_lin = torch.nn.Linear(20, hidden_channels)
        # self.user_lin = torch.nn.Linear(20, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)
        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)
        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())
        self.classifier = Classifier()
    def forward(self, data):
        # print(data["movie"].node_id)
        x_dict = {
          "user": self.user_emb(data["user"].node_id),
          "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        } 
        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )
        return pred
        
model = Model(hidden_channels=8)
model

Model(
  (movie_lin): Linear(in_features=20, out_features=8, bias=True)
  (user_emb): Embedding(610, 8)
  (movie_emb): Embedding(9742, 8)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(8, 8, aggr=mean)
      (movie__rev_rates__user): SAGEConv(8, 8, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(8, 8, aggr=mean)
      (movie__rev_rates__user): SAGEConv(8, 8, aggr=mean)
    )
  )
  (classifier): Classifier()
)

In [None]:
import tqdm
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(1, 15):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data["user", "rates", "movie"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cpu'


 98%|█████████▊| 312/319 [00:05<00:00, 60.38it/s]


KeyboardInterrupt: ignored

In [None]:
edge_label_index = val_data["user", "rates", "movie"].edge_label_index
edge_label = val_data["user", "rates", "movie"].edge_label
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=3 * 128,
    shuffle=False,
)
sampled_data = next(iter(val_loader))

In [None]:
from sklearn.metrics import roc_auc_score
preds = []
ground_truths = []
for sampled_data in tqdm.tqdm(val_loader):
    with torch.no_grad():
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "movie"].edge_label)
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")

100%|██████████| 40/40 [00:00<00:00, 119.62it/s]


Validation AUC: 0.8714



