# Light GCN * 2 Notebook

## 0. Env Preparation

### 0.1 Install Packages

In [1]:
!pip install ipywidgets
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cu116.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.12.0+cu116.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

### 0.2 Import Packages

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import torch_geometric
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

Check our environment

In [None]:
# Show the installed PyTorch Geometric version as the cell output.
torch_geometric.__version__

Set device for torch

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## 1. Data Processing

### 1.1 Read Data


To access the data on Google Drive, first open https://drive.google.com/drive/folders/1TzyQFgm_szZMo6d6RyrwPE4Hkc_0I0og?usp=sharing. Then, in Google Drive, go to "Shared with me", right-click the folder, and select "Add a shortcut to Drive".

In [None]:
# Colab-only: mount Google Drive at /content/drive so the data files can be read from it.
from google.colab import drive
# NOTE(review): force_remount=True presumably refreshes an already-mounted drive — confirm against the Colab docs.
drive.mount('/content/drive', force_remount=True)

In [None]:
# Only these three columns are kept from each city's training file (in this order),
# and every value is cast to int.
columns_name = ['place_index', 'user_index', 'rating']
review_df_1 = (
    pd.read_csv("/content/drive/MyDrive/cities/toronto/train.tsv", sep="\t")
    .loc[:, columns_name]
    .astype(int)
)
review_df_2 = (
    pd.read_csv("/content/drive/MyDrive/cities/Los_angeles/train.tsv", sep="\t")
    .loc[:, columns_name]
    .astype(int)
)

### 1.2 Number of Users and Places

In [None]:
# Largest user index and largest place index found in the first dataset.
max_user_id_1 = review_df_1.loc[:, 'user_index'].max()
max_place_id_1 = review_df_1.loc[:, 'place_index'].max()
print(max_user_id_1, max_place_id_1, sep="\n")

In [None]:
# Largest user index and largest place index found in the second dataset.
max_user_id_2 = review_df_2.loc[:, 'user_index'].max()
max_place_id_2 = review_df_2.loc[:, 'place_index'].max()
print(max_user_id_2, max_place_id_2, sep="\n")

In [None]:
# Places are re-indexed in the graph-construction section to
# place_index + max_user_id_1 + 1, so the largest node id in the combined
# user/place index space is the sum below.
max_node_id_1 = max_user_id_1 + max_place_id_1 + 1 # since place_id starts from 0
print(max_node_id_1)

In [None]:
# Places are re-indexed in the graph-construction section to
# place_index + max_user_id_2 + 1, so the largest node id in the combined
# user/place index space is the sum below.
max_node_id_2 = max_user_id_2 + max_place_id_2 + 1 # since place_id starts from 0
print(max_node_id_2)

### 1.3 Split Data

In [None]:
# It is possible that some users or restaurants in test do not exist in
# train after this split.
# Will need to retrain the model with combined train+test after picking
# the best model structure.
# A fixed random_state makes the 90/10 split (and everything derived from it)
# reproducible across kernel restarts.
train_1, test_1 = train_test_split(review_df_1.values, test_size=0.1, random_state=42)
train_df_1 = pd.DataFrame(train_1, columns=review_df_1.columns)
test_df_1 = pd.DataFrame(test_1, columns=review_df_1.columns)
# Every review must end up in exactly one of the two splits.
assert len(train_df_1) + len(test_df_1) == len(review_df_1)

In [None]:
# Same 90/10 split for the second dataset; a fixed random_state makes it
# reproducible across kernel restarts.
train_2, test_2 = train_test_split(review_df_2.values, test_size=0.1, random_state=42)
train_df_2 = pd.DataFrame(train_2, columns=review_df_2.columns)
test_df_2 = pd.DataFrame(test_2, columns=review_df_2.columns)
# Every review must end up in exactly one of the two splits.
assert len(train_df_2) + len(test_df_2) == len(review_df_2)

### 1.4 Get distribution of different ratings

In [None]:
# Weights will be used to normalize loss function
def get_weights(df, ratings=(1, 2, 3, 4, 5)):
    """Return L2-normalised inverse-frequency weights, one per rating value.

    Args:
        df: DataFrame with a 'rating' column.
        ratings: The rating values to weight, in output order. Defaults to the
            five star ratings 1..5.

    Returns:
        A float64 array with one entry per value in ``ratings``. Entry ``i`` is
        proportional to ``1 / count(ratings[i])`` and the vector has unit L2
        norm. A rating that does not occur in ``df`` gets weight 0 (instead of
        an infinite inverse count that would turn the normalised weights into
        0/NaN). If no listed rating occurs at all, a zero vector is returned.
    """
    rating_counts = np.array(
        [(df['rating'] == rating).sum() for rating in ratings], dtype=np.float64
    )

    # Invert only the non-zero counts; absent ratings keep a weight of 0.
    inverse_count = np.zeros_like(rating_counts)
    present = rating_counts > 0
    inverse_count[present] = 1.0 / rating_counts[present]

    norm = np.linalg.norm(inverse_count)
    if norm == 0:
        # Nothing to normalise (empty frame or no listed rating present).
        return inverse_count

    return inverse_count / norm

In [None]:
# Per-rating loss weights for the first dataset, computed from its training split.
weights_1 = get_weights(train_df_1)
print(weights_1)

In [None]:
# Per-rating loss weights for the second dataset, computed from its training split.
weights_2 = get_weights(train_df_2)
print(weights_2)

In [None]:
# Give every review the weight of its rating: rating r looks up weights_1[r - 1].
# The train and test rows both use the weights derived from the training split.
train_df_1['weight'] = weights_1[train_df_1['rating'].to_numpy().astype(int) - 1]
test_df_1['weight'] = weights_1[test_df_1['rating'].to_numpy().astype(int) - 1]

In [None]:
# Give every review the weight of its rating: rating r looks up weights_2[r - 1].
# The train and test rows both use the weights derived from the training split.
train_df_2['weight'] = weights_2[train_df_2['rating'].to_numpy().astype(int) - 1]
test_df_2['weight'] = weights_2[test_df_2['rating'].to_numpy().astype(int) - 1]

In [None]:
# Preview the first five rows of the first training frame (head() defaults to 5 rows).
train_df_1.head()

In [None]:
# Preview the first five rows of the second training frame (head() defaults to 5 rows).
train_df_2.head()

### 1.5 Dataset and Dataloader

In [None]:
class MyDataset(Dataset):
    """Map-style dataset over the rows of a review DataFrame.

    The frame is converted once to a single NumPy array and rows are read by
    position: columns 0 and 1 are returned as int64 indices, column 2 as a
    one-element float32 array, and column 3 unchanged. As built in this
    notebook, those columns are place_index, user_index, rating and weight.
    """

    def __init__(self, data):
        """Store ``data`` (a DataFrame with at least four columns) as an array."""
        self.data = data.to_numpy()

    def __getitem__(self, index):
        """Return ``(column 0, column 1, column 2 as a 1-element array, column 3)`` for one row."""
        row = self.data[index]
        # np.int64 replaces np.compat.long: the np.compat shim is deprecated in
        # recent NumPy releases, and an explicit 64-bit integer does not depend
        # on the platform's default int width.
        return row[0].astype(np.int64), \
            row[1].astype(np.int64), \
            row[2:3].astype(np.float32), \
            row[3]

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

## 2. Graph Construction

In [None]:
# User ids are used as node ids directly; place ids are shifted past the largest
# user id so that both node types share one index space.
u_t_1 = torch.tensor(train_df_1['user_index'].to_numpy(), dtype=torch.long)
p_t_1 = torch.tensor(train_df_1['place_index'].to_numpy(), dtype=torch.long) + max_user_id_1 + 1

# Every training review contributes two directed edges: user -> place and place -> user.
train_edge_index_1 = torch.stack(
    (
        torch.cat((u_t_1, p_t_1)),
        torch.cat((p_t_1, u_t_1)),
    ),
    dim=0,
).to(device)

In [None]:
# User ids are used as node ids directly; place ids are shifted past the largest
# user id so that both node types share one index space.
u_t_2 = torch.tensor(train_df_2['user_index'].to_numpy(), dtype=torch.long)
p_t_2 = torch.tensor(train_df_2['place_index'].to_numpy(), dtype=torch.long) + max_user_id_2 + 1

# Every training review contributes two directed edges: user -> place and place -> user.
train_edge_index_2 = torch.stack(
    (
        torch.cat((u_t_2, p_t_2)),
        torch.cat((p_t_2, u_t_2)),
    ),
    dim=0,
).to(device)

In [None]:
# Shift place ids past the largest user id, in both splits, so they match the
# node ids used in the edge index.
# NOTE: this overwrites 'place_index' in place, so running the cell a second
# time shifts the ids again.
train_df_1['place_index'] = train_df_1['place_index'].add(max_user_id_1 + 1)
test_df_1['place_index'] = test_df_1['place_index'].add(max_user_id_1 + 1)

# After the shift no id may be used both as a user and as a place.
intersection_1 = set(train_df_1['place_index'].unique()) & set(train_df_1['user_index'].unique())
assert not intersection_1

intersection_1 = set(test_df_1['place_index'].unique()) & set(test_df_1['user_index'].unique())
assert not intersection_1

train_dataset_1 = MyDataset(train_df_1)
test_dataset_1 = MyDataset(test_df_1)

In [None]:
# Shift place ids past the largest user id, in both splits, so they match the
# node ids used in the edge index.
# NOTE: this overwrites 'place_index' in place, so running the cell a second
# time shifts the ids again.
train_df_2['place_index'] = train_df_2['place_index'].add(max_user_id_2 + 1)
test_df_2['place_index'] = test_df_2['place_index'].add(max_user_id_2 + 1)

# After the shift no id may be used both as a user and as a place.
intersection_2 = set(train_df_2['place_index'].unique()) & set(train_df_2['user_index'].unique())
assert not intersection_2

intersection_2 = set(test_df_2['place_index'].unique()) & set(test_df_2['user_index'].unique())
assert not intersection_2

train_dataset_2 = MyDataset(train_df_2)
test_dataset_2 = MyDataset(test_df_2)

## 3. Model Architecture

### 3.1 LightGCN Convolutional Layer

In [None]:
class LightGCNConv(MessagePassing):
    """Parameter-free graph convolution with symmetric degree normalisation.

    Each edge (from, to) carries the weight
    ``deg(from) ** -0.5 * deg(to) ** -0.5``, where ``deg`` counts how often a
    node appears as an edge target. Weighted neighbour features are aggregated
    by summation (unless another ``aggr`` is requested), and the aggregated
    result is returned as-is.
    """

    def __init__(self, **kwargs):
        """Create the layer.

        Args:
            **kwargs: Forwarded to ``MessagePassing.__init__``. ``aggr``
                defaults to ``'add'`` when not given.
        """
        # Forward the caller's options instead of silently discarding them.
        kwargs.setdefault('aggr', 'add')
        super().__init__(**kwargs)

    def forward(self, x, edge_index, num_nodes=None):
        """Propagate ``x`` once along ``edge_index``.

        Args:
            x: Node feature matrix with one row per node.
            edge_index: Tensor whose two rows are the edge endpoints (from, to).
            num_nodes: Accepted for interface compatibility and not used; the
                node count is always taken from ``x.size(0)``.

        Returns:
            The aggregated, normalised neighbour features returned by ``propagate``.
        """
        # Compute normalization
        from_, to_ = edge_index
        deg = degree(to_, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        # Nodes with degree 0 produce inf above; zero them so they contribute nothing.
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[from_] * deg_inv_sqrt[to_]
        # Start propagating messages (no update after aggregation)
        return self.propagate(edge_index, x=x, norm=norm)

    def message(self, x_j, norm):
        """Scale each per-edge feature row ``x_j`` by that edge's normalisation weight."""
        return norm.view(-1, 1) * x_j

In [None]:
# Sanity check on a toy graph: five nodes, each starting from a one-hot embedding.
test_x = torch.eye(5)

# Edges as (from, to) columns; every connection is listed in both directions.
test_edge_index = torch.tensor(
    [
        [0, 0, 1, 1, 2, 3, 3, 4],
        [2, 3, 3, 4, 0, 0, 1, 1],
    ],
    dtype=torch.long,
)

# Inspect the embeddings after one pass through the convolution layer.
LightGCNConv()(test_x, test_edge_index, 5)

### 3.2 Recommender System GNN

In [None]:
class LightGCN(nn.Module):
    """LightGCN encoder with a linear rating head.

    Holds one learnable embedding per node, a stack of ``LightGCNConv`` layers,
    and a linear layer that maps a concatenated (user, item) embedding pair to
    a single predicted value.
    """

    def __init__(self, latent_dim, num_layers, max_index):
        """Build the model.

        Args:
            latent_dim: Size of every node embedding.
            num_layers: Number of ``LightGCNConv`` layers.
            max_index: Number of rows in the embedding table.
        """
        super().__init__()
        self.max_index = max_index
        self.embedding = nn.Embedding(max_index, latent_dim)
        self.convs = nn.ModuleList([LightGCNConv() for _ in range(num_layers)])
        self.init_parameters()
        # The head sees a user embedding and an item embedding side by side.
        self.nn = nn.Linear(2 * latent_dim, 1)

    def init_parameters(self):
        """Re-initialise the embedding table from a normal distribution with std 0.1."""
        nn.init.normal_(self.embedding.weight, std=0.1)

    def forward(self, edge_index):
        """Return ``(initial embeddings, mean of the initial and all per-layer embeddings)``."""
        initial = self.embedding.weight
        per_layer = [initial]
        for layer in self.convs:
            # Each layer consumes the output of the previous one.
            per_layer.append(
                layer(x=per_layer[-1], edge_index=edge_index, num_nodes=self.max_index)
            )

        combined = torch.stack(per_layer, dim=0).mean(dim=0)
        return initial, combined

    def pred(self, users, items, embeddings):
        """Score each (user, item) pair by feeding their concatenated embeddings to the linear head."""
        pair_features = torch.cat((embeddings[users], embeddings[items]), 1)
        return self.nn(pair_features)

## 4. Train and evaluate models

Set Parameters

In [None]:
# Model size: embedding width and number of LightGCNConv layers.
latent_dim = 64
n_layers = 3 

# Training configuration: number of epochs, DataLoader batch size and Adam learning rate.
# NOTE(review): DECAY and K are not referenced anywhere else in the visible
# notebook — confirm whether they are still needed.
EPOCHS = 5
BATCH_SIZE = 100
DECAY = 0.0003
LR = 0.0005
K = 2

Set Model

In [None]:
# Model for the first dataset. max_node_id_1 is the largest node id, so the
# embedding table needs max_node_id_1 + 1 rows. The model is moved to `device`
# right after construction.
lightgcn_1 = LightGCN(
    latent_dim=latent_dim, num_layers=n_layers, max_index=max_node_id_1 + 1
).to(device)

In [None]:
# Model for the second dataset. max_node_id_2 is the largest node id, so the
# embedding table needs max_node_id_2 + 1 rows. The model is moved to `device`
# right after construction.
lightgcn_2 = LightGCN(
    latent_dim=latent_dim, num_layers=n_layers, max_index=max_node_id_2 + 1
).to(device)

Create a model for the shared layers

Define the Training and Evaluation Functions

In [None]:
def get_testset_loss(model_1, testset_1, embeddings_1, model_2, testset_2, embeddings_2, loss_fn):
    """Return the mean joint loss of both models over paired test batches.

    Both test sets are read in batches of ``BATCH_SIZE`` and consumed in
    lock-step. Iteration stops when the shorter loader is exhausted, so surplus
    batches of the larger test set are not evaluated.

    Args:
        model_1, model_2: Models exposing ``pred(users, items, embeddings)``.
        testset_1, testset_2: Datasets whose items unpack as
            ``(items, users, ratings, weights)``.
        embeddings_1, embeddings_2: Node embeddings passed to each model's ``pred``.
        loss_fn: Callable invoked as ``loss_fn(pred_1, ratings_1, weights_1,
            pred_2, ratings_2, weights_2, 0.5)``.

    Returns:
        The average of the per-batch losses as a float, or ``nan`` when there
        is no batch pair to evaluate.
    """
    model_1.eval()
    model_2.eval()
    test_dataloader_1 = DataLoader(testset_1, batch_size=BATCH_SIZE)
    test_dataloader_2 = DataLoader(testset_2, batch_size=BATCH_SIZE)

    loss_list = []
    with torch.no_grad():
        # Iterate the zip lazily: only one batch pair is held in memory at a time.
        for model_1_data, model_2_data in zip(test_dataloader_1, test_dataloader_2):
            items_1, users_1, ratings_1, weights_1 = model_1_data
            items_2, users_2, ratings_2, weights_2 = model_2_data
            users_1, items_1, ratings_1, weights_1 = users_1.to(device), items_1.to(device), ratings_1.to(device), weights_1.to(device)
            users_2, items_2, ratings_2, weights_2 = users_2.to(device), items_2.to(device), ratings_2.to(device), weights_2.to(device)
            pred_1 = model_1.pred(users_1, items_1, embeddings_1)
            pred_2 = model_2.pred(users_2, items_2, embeddings_2)
            loss = loss_fn(pred_1, ratings_1, weights_1, pred_2, ratings_2, weights_2, 0.5)

            loss_list.append(loss.item())

    if not loss_list:
        # Nothing was evaluated (one of the test sets is empty); avoid dividing by zero.
        return float('nan')
    return sum(loss_list) / len(loss_list)


def train(model_1, optimizer_1, train_dataset_1, test_dataset_1, train_edge_index_1, model_2, optimizer_2, train_dataset_2, test_dataset_2, train_edge_index_2, loss_fn):
    """Jointly train two models on paired batches and validate after every epoch.

    In every step one batch from each training set is drawn, both models are
    run on their own graph, and a single joint loss (mixing factor 0.5) is
    back-propagated before both optimizers step. The two loaders are consumed
    in lock-step, so each epoch ends when the shorter one is exhausted.

    After each epoch the node embeddings are recomputed from the updated
    weights (without gradient tracking) and the joint loss on the test sets is
    measured. Whenever that loss improves on the best value so far, both state
    dicts are saved to ``model_1_epoch_{epoch}.ckpt`` and
    ``model_2_epoch_{epoch}.ckpt`` in the working directory.

    Args:
        model_1, model_2: Models whose call returns ``(initial, embeddings)``
            and which expose ``pred(users, items, embeddings)``.
        optimizer_1, optimizer_2: Optimizers for the respective model.
        train_dataset_1, train_dataset_2: Training datasets whose items unpack
            as ``(items, users, ratings, weights)``.
        test_dataset_1, test_dataset_2: Datasets used for the per-epoch validation loss.
        train_edge_index_1, train_edge_index_2: Edge indices passed to the models.
        loss_fn: Joint loss, called as ``loss_fn(pred_1, ratings_1, weights_1,
            pred_2, ratings_2, weights_2, 0.5)``.

    Returns:
        ``(loss_list_epoch, valid_loss_list_epoch)``: the mean training loss
        and the validation loss of every epoch, each rounded to 4 decimals.
    """
    loss_list_epoch = []
    valid_loss_list_epoch = []
    train_dataloader_1 = DataLoader(train_dataset_1, batch_size=BATCH_SIZE)
    train_dataloader_2 = DataLoader(train_dataset_2, batch_size=BATCH_SIZE)
    # zip() stops at the shorter loader; tell tqdm so the progress bar has a total.
    n_batch_pairs = min(len(train_dataloader_1), len(train_dataloader_2))
    min_valid_loss = None
    for epoch in tqdm(range(EPOCHS)):
        loss_list = []
        model_1.train()
        model_2.train()
        # Iterate lazily instead of materialising every batch of the epoch in memory.
        for model_1_data, model_2_data in tqdm(zip(train_dataloader_1, train_dataloader_2), total=n_batch_pairs):
            items_1, users_1, ratings_1, weights_1 = model_1_data
            items_2, users_2, ratings_2, weights_2 = model_2_data
            optimizer_1.zero_grad()
            optimizer_2.zero_grad()
            users_1, items_1, ratings_1, weights_1 = users_1.to(device), items_1.to(device), ratings_1.to(device), weights_1.to(device)
            users_2, items_2, ratings_2, weights_2 = users_2.to(device), items_2.to(device), ratings_2.to(device), weights_2.to(device)
            _, embeddings_1 = model_1(train_edge_index_1)
            _, embeddings_2 = model_2(train_edge_index_2)
            pred_1 = model_1.pred(users_1, items_1, embeddings_1)
            pred_2 = model_2.pred(users_2, items_2, embeddings_2)
            loss = loss_fn(pred_1, ratings_1, weights_1, pred_2, ratings_2, weights_2, 0.5)
            loss.backward()
            optimizer_1.step()
            optimizer_2.step()
            loss_list.append(loss.item())

        # Recompute the embeddings from the weights as they are at the end of
        # the epoch. The embeddings left over from the last training batch were
        # produced before the final optimizer step and still carry its graph.
        model_1.eval()
        model_2.eval()
        with torch.no_grad():
            _, embeddings_1 = model_1(train_edge_index_1)
            _, embeddings_2 = model_2(train_edge_index_2)

        # evaluate on validation data
        valid_loss = get_testset_loss(model_1, test_dataset_1, embeddings_1, model_2, test_dataset_2, embeddings_2, loss_fn)
        if min_valid_loss is None or valid_loss < min_valid_loss:
            min_valid_loss = valid_loss
            # Checkpoint both models whenever the validation loss improves.
            torch.save(model_1.state_dict(), f"model_1_epoch_{epoch}.ckpt")
            torch.save(model_2.state_dict(), f"model_2_epoch_{epoch}.ckpt")

        valid_loss_list_epoch.append(round(valid_loss, 4))
        loss_list_epoch.append(round(np.mean(loss_list),4))

    return loss_list_epoch, valid_loss_list_epoch

Set Loss and Optimizer

In [None]:
# Calculate weights of different labels and define weighted MSE loss
def weighted_MSE(preds_1, targets_1, weights_1, preds_2, targets_2, weights_2, l):
    """Convex combination of two per-sample weighted mean squared errors.

    Args:
        preds_1, targets_1, weights_1: Predictions, targets and per-sample
            weights of the first task. Each must hold one value per sample;
            shapes such as (B,) and (B, 1) may be mixed.
        preds_2, targets_2, weights_2: The same for the second task.
        l: Mixing factor; the result is ``l * loss_1 + (1 - l) * loss_2``.

    Returns:
        A scalar tensor.
    """
    def _weighted_mean_sq_err(preds, targets, weights):
        # Flatten everything to one value per sample first. Multiplying (B,)
        # weights with (B, 1) errors would otherwise broadcast to a (B, B)
        # matrix, pairing every weight with every sample's error.
        squared_error = (preds.reshape(-1) - targets.reshape(-1)) ** 2
        return (weights.reshape(-1) * squared_error).mean()

    loss_1 = _weighted_mean_sq_err(preds_1, targets_1, weights_1)
    loss_2 = _weighted_mean_sq_err(preds_2, targets_2, weights_2)
    return l * loss_1 + (1 - l) * loss_2

In [None]:
# Both models are trained against the same joint weighted-MSE loss, but each
# has its own Adam optimizer with learning rate LR.
# NOTE(review): DECAY is defined with the other hyper-parameters but is not
# passed to either optimizer here — confirm whether that is intentional.
loss_function = weighted_MSE
optimizer_1 = torch.optim.Adam(lightgcn_1.parameters(), lr=LR)
optimizer_2 = torch.optim.Adam(lightgcn_2.parameters(), lr=LR)

Train the Model

In [None]:
# Run the joint training; returns the per-epoch training and validation losses.
loss_history, valid_loss_history = train(lightgcn_1, optimizer_1, train_dataset_1, test_dataset_1, train_edge_index_1, lightgcn_2, optimizer_2, train_dataset_2, test_dataset_2, train_edge_index_2, loss_function)

Plot the Loss

In [None]:
# Epoch numbers 1..EPOCHS for the x-axis.
epoch_list = list(range(1, EPOCHS + 1))

plt.plot(epoch_list, loss_history, label='Training Loss')
plt.plot(epoch_list, valid_loss_history, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')

# Append one "epoch train_loss valid_loss" line per epoch to the run log on Drive.
with open('/content/drive/MyDrive/CS330_Project/mt(joint_loss)_cs330_light_gcn_lr=0.0005.txt', 'a+') as fp:
  fp.writelines(
    "%s %s %s\n" % (epoch_list[idx], loss_history[idx], valid_loss_history[idx])
    for idx in range(len(epoch_list))
  )

print('loss history', loss_history)
plt.legend()