# Light GCN Notebook

## 0. Env Preparation

### 0.1 Install Packages

Make sure to install the PyG wheels that match the PyTorch/CUDA build reported by `torch.__version__` below.

In [1]:
# Show the installed PyTorch build; the PyG wheel index URL in the install cell
# must match this version/CUDA tag.
import torch
torch.__version__

'1.13.0+cu116'

In [2]:
!pip install pyg-lib torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.13.0+cu116.html
!pip install torch-geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.13.0+cu116.html
Collecting pyg-lib
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu116/pyg_lib-0.1.0%2Bpt113cu116-cp38-cp38-linux_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 4.9 MB/s 
[?25hCollecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu116/torch_scatter-2.1.0%2Bpt113cu116-cp38-cp38-linux_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 84.1 MB/s 
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu116/torch_sparse-0.6.15%2Bpt113cu116-cp38-cp38-linux_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 78.7 MB/s 
Installing collected packages: torch-sparse, torch-scatter, pyg-lib
Successfully installed pyg-lib-0.1.0+pt113cu116 torch-scatter-2.1.0+pt113cu116 torch-sparse-0.6.15+pt113cu116
Looking in in

### 0.2 Import Packages

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import torch_geometric
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from torchsummary import summary

Check our environment

In [4]:
# Record the interpreter version of the runtime for reproducibility.
!python --version

Python 3.8.16


In [5]:
# Record the installed PyTorch Geometric version.
torch_geometric.__version__

'2.2.0'

Set device for torch

In [6]:
# Run on the first CUDA device when PyTorch reports one as available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## 1. Pre-Training on Los Angeles

### 1.1 Read Data


In order to access the Google Drive, first go to: https://drive.google.com/drive/folders/1TzyQFgm_szZMo6d6RyrwPE4Hkc_0I0og?usp=sharing, then go to "Shared with me", right click on the folder and select "Add a shortcut to Drive".

In [7]:
# Mount Google Drive at /content/drive so the dataset paths used in the cells
# below resolve (Colab-only).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [8]:
# Load the Los Angeles reviews and keep only the three integer columns the
# model consumes.
columns_name = ['place_index', 'user_index', 'rating']
review_df = pd.read_csv(
    "/content/drive/MyDrive/cities/Los_angeles/train.tsv",
    sep="\t",
)
review_df = review_df[columns_name].astype(int)

### 1.2 Number of User and Place

In [9]:
# Largest user / place index present in the reviews; both are used below to
# size the node-id space of the graph.
max_user_id, max_place_id = (
    review_df[column].max() for column in ('user_index', 'place_index')
)
print(max_user_id)
print(max_place_id)

68941
9523


In [10]:
# Users keep their raw ids and places are shifted by (max_user_id + 1) when the
# graph is built, so the largest node id is max_user_id + 1 + max_place_id.
max_node_id = max_user_id + max_place_id + 1 # since place_id starts from 0
print(max_node_id)

78465


### 1.3 Split Data

In [11]:
# Random 90/10 hold-out split of the review rows.
# Caveat: a user or restaurant may end up only in the test part, i.e. unseen
# during training. Once the model structure is chosen, the model should be
# retrained on train+test combined.
train, test = train_test_split(review_df.to_numpy(), test_size=0.1)
train_df, test_df = (
    pd.DataFrame(part, columns=review_df.columns) for part in (train, test)
)

### 1.4 Get distribution of different ratings

In [12]:
# Weights will be used to normalize loss function
def get_weights(df):
    """Return one inverse-frequency weight per rating value 1..5.

    The weight of rating ``r`` is ``1 / count(r)``, and the resulting 5-vector
    is scaled to unit L2 norm. Position ``i`` of the result belongs to rating
    ``i + 1``.

    A rating value that does not occur in ``df`` gets weight 0 instead of
    triggering a division by zero (which would turn the normalised vector into
    NaN/0 values). For data where all five ratings occur the result is
    unchanged.

    Args:
        df: DataFrame with a ``rating`` column holding values in 1..5.

    Returns:
        ``np.ndarray`` of shape ``(5,)`` with the normalised weights; all zeros
        if ``df`` contains none of the five ratings.
    """
    rating_counts = np.array(
        [(df['rating'] == rating).sum() for rating in (1, 2, 3, 4, 5)],
        dtype=float,
    )

    # Invert only the non-empty classes; absent classes keep weight 0.
    inverse_count = np.zeros_like(rating_counts)
    present = rating_counts > 0
    inverse_count[present] = 1.0 / rating_counts[present]

    norm = np.linalg.norm(inverse_count)
    if norm == 0:
        # No rated rows at all: nothing to normalise.
        return inverse_count
    normalized_inverse_count = inverse_count / norm

    return normalized_inverse_count

# Class weights are derived from the training split only; index i holds the
# weight of rating i + 1.
weights = get_weights(train_df)
print(weights)

[0.5941015  0.75257173 0.25542619 0.11558842 0.0455629 ]


In [13]:
# Attach to every row the weight of its rating class (ratings are 1-based,
# the weight vector is 0-based). The same training-derived weights are used
# for both splits.
for frame in (train_df, test_df):
    frame['weight'] = frame['rating'].map(lambda val: weights[int(val)-1])

In [14]:
# Check data snippet: the first rows should now carry the per-rating `weight` column.
train_df.head(5)

Unnamed: 0,place_index,user_index,rating,weight
0,131,48989,5,0.045563
1,3842,1880,5,0.045563
2,916,10869,5,0.045563
3,2730,46697,4,0.115588
4,3504,21556,2,0.752572


### 1.5 Dataset and Dataloader

In [15]:
class MyDataset(Dataset):
    """Map-style dataset over a review DataFrame.

    The frame is converted once to a NumPy array and its columns are read by
    position, so the expected column order is
    ``place_index, user_index, rating, weight``.

    Each item is the tuple ``(place_index, user_index, rating, weight)`` where
    the two indices are ``int64`` scalars, ``rating`` is a ``float32`` array of
    shape ``(1,)`` and ``weight`` is returned as stored in the array.
    """

    def __init__(self, data):
        """Store ``data`` (a DataFrame) as a 2-D NumPy array."""
        self.data = data.to_numpy()

    def __getitem__(self, index):
        # np.int64 replaces np.compat.long: the numpy.compat Python 2/3 shim is
        # deprecated/removed in newer NumPy releases, and int64 is the dtype
        # that collates into the LongTensor indices the model expects.
        return self.data[index, 0].astype(np.int64), \
            self.data[index, 1].astype(np.int64), \
            self.data[index, 2:3].astype(np.float32), \
            self.data[index, 3]

    def __len__(self):
        """Number of review rows."""
        return len(self.data)

## 1.6 Graph Construction

In [16]:
# Node ids: users keep their raw index, places are shifted past the last user id.
u_t = torch.LongTensor(train_df.user_index.to_numpy())
p_t = torch.LongTensor(train_df.place_index.to_numpy()) + max_user_id + 1

# Every review contributes two directed edges (user -> place and place -> user).
edge_sources = torch.cat([u_t, p_t])
edge_targets = torch.cat([p_t, u_t])
train_edge_index = torch.stack((edge_sources, edge_targets)).to(device)

In [17]:
# Move place ids into the same shifted id space used for the graph edges.
# NOTE: this mutates train_df/test_df in place, so the cell must be run exactly
# once per load of the data.
place_offset = max_user_id + 1
train_df['place_index'] = train_df['place_index'] + place_offset
test_df['place_index'] = test_df['place_index'] + place_offset

# assert that there's no index overlapping
for frame in (train_df, test_df):
    intersection = set(frame['place_index'].unique()) & set(frame['user_index'].unique())
    assert len(intersection) == 0

train_dataset, test_dataset = MyDataset(train_df), MyDataset(test_df)

## 1.7 Model Architecture

### 1.7.1 LightGCN Convolutional Layer

In [18]:
class LightGCNConv(MessagePassing):
    """Parameter-free LightGCN propagation step.

    Each node's output is the sum over its incoming edges of the neighbour
    embedding scaled by ``1 / sqrt(deg(source) * deg(target))``. The layer
    defines no learnable weights and applies no update after aggregation.
    """

    def __init__(self, **kwargs):
        # Forward keyword arguments to MessagePassing instead of silently
        # dropping them; sum aggregation stays the default.
        kwargs.setdefault('aggr', 'add')
        super().__init__(**kwargs)

    def forward(self, x, edge_index, num_nodes, grad):
        """Propagate ``x`` once along ``edge_index``.

        Args:
            x: node embedding matrix; its first dimension is used as the node count.
            edge_index: tensor unpacked into (source row, target row).
            num_nodes: accepted for interface compatibility; not used here
                (the node count is taken from ``x.size(0)``).
            grad: forwarded to ``propagate`` as ``requires_grad``.
                NOTE(review): ``message`` does not accept ``requires_grad`` and
                this layer has no parameters, so this flag appears to have no
                effect on the computation or on gradients — confirm whether
                "freezing" a layer is expected to do anything.

        Returns:
            The aggregated embeddings, one row per node.
        """
        # Compute normalization
        from_, to_ = edge_index
        deg = degree(to_, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        # Isolated nodes have degree 0 -> inf after pow(-0.5); zero them out.
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[from_] * deg_inv_sqrt[to_]
        # Start propagating messages (no update after aggregation)
        return self.propagate(edge_index, x=x, norm=norm, requires_grad=grad)

    def message(self, x_j, norm):
        """Scale each source embedding ``x_j`` by its per-edge coefficient."""
        return norm.view(-1, 1) * x_j

In [19]:
# Toy check: 5 nodes with one-hot embeddings, so each output row shows which
# neighbours contributed and with what normalisation coefficient.
test_x = torch.eye(5)

# Undirected toy graph written as directed edge pairs (row 0 = source, row 1 = target)
test_edge_index = torch.tensor(
    [
        [0, 0, 1, 1, 2, 3, 3, 4],
        [2, 3, 3, 4, 0, 0, 1, 1],
    ],
    dtype=torch.long,
)

# Check out the result of passing the embeddings through our Graph Convolutional Network
demo_conv = LightGCNConv()
demo_conv(test_x, test_edge_index, 5, True)

tensor([[0.0000, 0.0000, 0.7071, 0.5000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.5000, 0.7071],
        [0.7071, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.7071, 0.0000, 0.0000, 0.0000]])

### 1.7.2 Recommender System GNN (with layer freezing option for LightGCNConv)

In [20]:
class LightGCN(nn.Module):
    """LightGCN encoder with a linear rating head.

    Node embeddings are propagated through ``num_layers`` ``LightGCNConv``
    layers; the final node representation is the mean of the initial embedding
    and every layer's output. ``pred`` concatenates a user and an item
    representation and maps them to a single score with a linear layer.
    """

    def __init__(self, latent_dim, num_layers, max_index):
        """
        Args:
            latent_dim: embedding width.
            num_layers: number of propagation layers.
            max_index: number of rows in the embedding table (one per node id).
        """
        super(LightGCN, self).__init__()
        self.embedding = nn.Embedding(max_index, latent_dim)
        self.convs = nn.ModuleList(LightGCNConv() for _ in range(num_layers))
        self.init_parameters()
        self.nn = nn.Linear(2*latent_dim, 1)
        self.num_layers = num_layers
        self.max_index = max_index

    def init_parameters(self):
        """Initialise the embedding table from N(0, 0.1^2)."""
        nn.init.normal_(self.embedding.weight, std=0.1)

    def forward(self, edge_index, freezed_layers=None):
        """Propagate the embedding table over the graph.

        Args:
            edge_index: edge tensor handed to every conv layer.
            freezed_layers: optional sequence with one boolean per layer;
                ``True`` makes the layer be called with ``grad=False``.
                ``None`` or an empty sequence means no layer is flagged.

        Returns:
            ``(emb0, out)``: the raw embedding table and the layer-averaged
            node representations.
        """
        emb0 = self.embedding.weight

        # None replaces the former mutable ``[]`` default; an explicit empty
        # list is still accepted and means "nothing frozen".
        if freezed_layers is None or len(freezed_layers) == 0:
            freezed_layers = [False] * self.num_layers
        else:
            assert len(freezed_layers) == self.num_layers, \
                "freezed_layers must contain one flag per LightGCNConv layer"

        embs = [emb0]
        emb = emb0
        for conv, frozen in zip(self.convs, freezed_layers):
            emb = conv(x=emb, edge_index=edge_index, num_nodes=self.max_index, grad=not frozen)
            embs.append(emb)
        out = torch.mean(torch.stack(embs, dim=0), dim=0)
        return emb0, out

    def pred(self, users, items, embeddings):
        """Score (user, item) pairs.

        Args:
            users: index tensor selecting user rows of ``embeddings``.
            items: index tensor selecting item rows of ``embeddings``.
            embeddings: node representation matrix (e.g. ``out`` from ``forward``).

        Returns:
            Output of the linear head on the concatenated embeddings, one row
            per pair with a single column.
        """
        user_emb = embeddings[users]
        item_emb = embeddings[items]
        x = torch.cat((user_emb, item_emb), 1)
        x = self.nn(x)
        return x

## 1.8 Train and evaluate models

Set Parameters

In [21]:
# Model size: embedding width and number of LightGCNConv layers passed to LightGCN.
latent_dim = 64
n_layers = 10

# Training settings; EPOCHS and BATCH_SIZE are read as globals by train() and
# get_testset_loss(), LR is passed to the optimizer.
EPOCHS = 5
BATCH_SIZE = 100
# NOTE(review): DECAY is not passed to the optimizer built below — confirm whether weight decay was intended.
DECAY = 0.0001
LR = 0.0005
# NOTE(review): no use of K is visible in this notebook — confirm it is still needed.
K = 2

Set Model

In [22]:
# One embedding row per node id in [0, max_node_id]; the model is moved to the
# selected device right away.
lightgcn = LightGCN(latent_dim=latent_dim, num_layers=n_layers, max_index=max_node_id + 1).to(device)

Set Train Function

In [23]:
def get_testset_loss(model, testset, loss_fn, embeddings):
    """Average ``loss_fn`` over ``testset`` using precomputed node embeddings.

    The model is switched to eval mode and gradients are disabled. The set is
    read in batches of the global ``BATCH_SIZE``; each batch is scored with
    ``model.pred`` on ``embeddings`` and the returned value is the unweighted
    mean of the per-batch losses.

    Args:
        model: object exposing ``eval()`` and ``pred(users, items, embeddings)``.
        testset: dataset yielding ``(place, user, rating, weight)`` items.
        loss_fn: callable ``(pred, ratings, weights) -> scalar tensor``.
        embeddings: node representation matrix indexed by user/place id.

    Returns:
        Mean per-batch loss as a Python float.
    """
    model.eval()
    eval_loader = DataLoader(testset, batch_size=BATCH_SIZE)
    batch_losses = []
    with torch.no_grad():
        for place_ids, user_ids, targets, sample_weights in eval_loader:
            user_ids = user_ids.to(device)
            place_ids = place_ids.to(device)
            targets = targets.to(device)
            sample_weights = sample_weights.to(device)

            scores = model.pred(user_ids, place_ids, embeddings)
            batch_losses.append(loss_fn(scores, targets, sample_weights).item())

    return sum(batch_losses) / len(batch_losses)


def train(model, optimizer, train_dataset, test_dataset, train_edge_index, loss_fn, freezed_layers):
    """Fit ``model`` for ``EPOCHS`` epochs and record train/validation loss per epoch.

    For every mini-batch the full-graph propagation ``model(train_edge_index, ...)``
    is re-run and the batch's (user, place) pairs are scored with ``model.pred``.
    After each epoch the validation loss is computed with ``get_testset_loss``;
    whenever it is the lowest so far, the model's ``state_dict`` is saved to
    ``epoch_{epoch}.ckpt`` in the working directory.

    Relies on the module-level ``EPOCHS``, ``BATCH_SIZE`` and ``device``.

    Args:
        model: the LightGCN model to optimise.
        optimizer: optimizer bound to ``model``'s parameters.
        train_dataset: dataset yielding ``(place, user, rating, weight)``.
        test_dataset: held-out dataset in the same format.
        train_edge_index: edge tensor of the training graph.
        loss_fn: callable ``(pred, ratings, weights) -> scalar tensor``.
        freezed_layers: per-layer flags forwarded to ``model``.

    Returns:
        ``(loss_list_epoch, valid_loss_list_epoch)``: per-epoch mean training
        loss and validation loss, both rounded to 4 decimals.
    """
    loss_list_epoch = []
    valid_loss_list_epoch = []
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    min_valid_loss = None
    for epoch in tqdm(range(EPOCHS)):
        loss_list = []
        model.train()
        for items, users, ratings, weights in tqdm(train_dataloader):
            optimizer.zero_grad()
            users, items, ratings, weights = users.to(device), items.to(device), ratings.to(device), weights.to(device)
            _, embeddings = model(train_edge_index, freezed_layers)
            pred = model.pred(users, items, embeddings)
            loss = loss_fn(pred, ratings, weights)
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())

        # evaluate on validation data
        # Re-propagate with the parameters as they are after the epoch's last
        # optimizer step. The embeddings left over from the final training
        # batch were computed before that step (and are undefined when the
        # loader is empty), so they would not match the checkpoint saved below.
        model.eval()
        with torch.no_grad():
            _, eval_embeddings = model(train_edge_index, freezed_layers)
        valid_loss = get_testset_loss(model, test_dataset, loss_fn, eval_embeddings)
        if min_valid_loss is None or valid_loss < min_valid_loss:
            min_valid_loss = valid_loss
            torch.save(model.state_dict(), f"epoch_{epoch}.ckpt")

        valid_loss_list_epoch.append(round(valid_loss, 4))
        loss_list_epoch.append(round(np.mean(loss_list), 4))

    return loss_list_epoch, valid_loss_list_epoch

Set Loss and Optimizer

In [24]:
# Calculate weights of different labels and define weighted MSE loss
def weighted_MSE(preds, targets, weights):
    """Mean of the per-sample squared errors, each scaled by its own weight.

    In this notebook ``preds``/``targets`` arrive with shape ``(batch, 1)``
    while the weights collated from ``MyDataset`` have shape ``(batch,)``.
    Multiplying those directly broadcasts to a ``(batch, batch)`` matrix, which
    pairs every error with every weight and reduces the loss to
    ``mean(weights) * mean(squared_error)``. The weights are therefore reshaped
    to the error's shape whenever the element counts match.

    Args:
        preds: predicted ratings (tensor).
        targets: true ratings, same shape as ``preds``.
        weights: per-sample weights (tensor with one weight per sample, or a
            scalar applied to every sample).

    Returns:
        Scalar tensor with the weighted mean squared error.
    """
    squared_error = (preds - targets) ** 2
    weights = torch.as_tensor(weights)
    if weights.shape != squared_error.shape and weights.numel() == squared_error.numel():
        # Align one-weight-per-sample with the (batch, 1) error layout.
        weights = weights.reshape(squared_error.shape)
    return (weights * squared_error).mean()

# Criterion used for both training and validation.
loss_function = weighted_MSE
# Adam over all model parameters with learning rate LR; no weight decay is passed.
optimizer = torch.optim.Adam(lightgcn.parameters(), lr=LR)

Train the Model

In [None]:
# Pre-train on Los Angeles with no layer flagged as frozen (an empty list is
# expanded to all-False inside LightGCN.forward).
loss_history, valid_loss_history = train(lightgcn, optimizer, train_dataset, test_dataset, train_edge_index, loss_function, freezed_layers=[])

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/11663 [00:00<?, ?it/s]

  0%|          | 0/11663 [00:00<?, ?it/s]

Plot and store the Loss

In [None]:
# 1-based epoch numbers for the x axis and the log file.
epoch_list = list(range(1, EPOCHS + 1))

plt.plot(epoch_list, loss_history, label='Training Loss')
plt.plot(epoch_list, valid_loss_history, label='Validation Loss')

# Append one "epoch train_loss valid_loss" line per epoch to the run log on Drive.
with open('/content/drive/MyDrive/CS330_Project/cs330_light_gcn_V5_LA_lr=0.0005_lf-.txt', 'a+') as fp:
  for record in zip(epoch_list, loss_history, valid_loss_history):
    fp.write("%s %s %s\n" % record)

print('loss history', loss_history)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

## 2. Freeze the first 8 LightGCNConv layers (leave the last 2 unfrozen) and fine-tune on Toronto



In [None]:
# Load the Toronto reviews, keeping the three integer columns the model consumes.
columns_name = ['place_index', 'user_index', 'rating']
review_df = pd.read_csv("/content/drive/MyDrive/cities/toronto/train.tsv", sep="\t")[columns_name].astype(int)

max_user_id = review_df['user_index'].max()
max_place_id = review_df['place_index'].max()
print(max_user_id)
print(max_place_id)

max_node_id = max_user_id + max_place_id + 1  # since place_id starts from 0
print(max_node_id)

# The model fine-tuned below is the one built for Los Angeles, so its embedding
# table has a fixed number of rows. Fail early with a readable message instead
# of an out-of-range index error deep inside training.
assert max_node_id < lightgcn.max_index, (
    f"Toronto needs node ids up to {max_node_id}, but the pre-trained model "
    f"only has {lightgcn.max_index} embedding rows"
)

# It is possible some users or restaurants in test do not exist in
# train after this split.
# Will need to retrain the model with combined train+test after picking
# the best model structure
train, test = train_test_split(review_df.values, test_size=0.1)
train_df = pd.DataFrame(train, columns=review_df.columns)
test_df = pd.DataFrame(test, columns=review_df.columns)


# Weights will be used to normalize loss function
def get_weights(df):
    """Return one inverse-frequency weight per rating value 1..5.

    The weight of rating ``r`` is ``1 / count(r)``, and the resulting 5-vector
    is scaled to unit L2 norm. Position ``i`` of the result belongs to rating
    ``i + 1``.

    A rating value that does not occur in ``df`` gets weight 0 instead of
    triggering a division by zero (which would turn the normalised vector into
    NaN/0 values). For data where all five ratings occur the result is
    unchanged.

    Args:
        df: DataFrame with a ``rating`` column holding values in 1..5.

    Returns:
        ``np.ndarray`` of shape ``(5,)`` with the normalised weights; all zeros
        if ``df`` contains none of the five ratings.
    """
    rating_counts = np.array(
        [(df['rating'] == rating).sum() for rating in (1, 2, 3, 4, 5)],
        dtype=float,
    )

    # Invert only the non-empty classes; absent classes keep weight 0.
    inverse_count = np.zeros_like(rating_counts)
    present = rating_counts > 0
    inverse_count[present] = 1.0 / rating_counts[present]

    norm = np.linalg.norm(inverse_count)
    if norm == 0:
        # No rated rows at all: nothing to normalise.
        return inverse_count
    normalized_inverse_count = inverse_count / norm

    return normalized_inverse_count


# Class weights come from the Toronto training split; index i holds the weight of rating i + 1.
weights = get_weights(train_df)
print(weights)

train_df['weight'] = train_df['rating'].map(lambda val: weights[int(val) - 1])
test_df['weight'] = test_df['rating'].map(lambda val: weights[int(val) - 1])

# Check data snippet (display() is needed: a bare expression in the middle of a
# cell produces no output)
display(train_df.head(5))

# Node ids: users keep their raw index, places are shifted past the last user id.
u_t = torch.LongTensor(train_df.user_index.to_numpy())
p_t = torch.LongTensor(train_df.place_index.to_numpy()) + max_user_id + 1

train_edge_index = torch.stack((torch.cat([u_t, p_t]), torch.cat([p_t, u_t]))).to(device)
print('train_edge_index toronto', train_edge_index.shape)

# NOTE: the shift below mutates the frames in place; run this cell once per data load.
train_df['place_index'] = train_df['place_index'] + max_user_id + 1
test_df['place_index'] = test_df['place_index'] + max_user_id + 1
# assert that there's no index overlapping
intersection = set(train_df['place_index'].unique()).intersection(set(train_df['user_index'].unique()))
assert len(intersection) == 0

intersection = set(test_df['place_index'].unique()).intersection(set(test_df['user_index'].unique()))
assert len(intersection) == 0

train_dataset = MyDataset(train_df)
test_dataset = MyDataset(test_df)

# Same settings as the pre-training run. latent_dim / n_layers are restated for
# reference only: the model itself is reused, not rebuilt, in this section.
latent_dim = 64
n_layers = 10

EPOCHS = 5
BATCH_SIZE = 100
DECAY = 0.0001
LR = 0.0005
K = 2


def get_testset_loss(model, testset, loss_fn, embeddings):
    """Average ``loss_fn`` over ``testset`` using precomputed node embeddings.

    The model is switched to eval mode and gradients are disabled. The set is
    read in batches of the global ``BATCH_SIZE``; each batch is scored with
    ``model.pred`` on ``embeddings`` and the returned value is the unweighted
    mean of the per-batch losses.

    Args:
        model: object exposing ``eval()`` and ``pred(users, items, embeddings)``.
        testset: dataset yielding ``(place, user, rating, weight)`` items.
        loss_fn: callable ``(pred, ratings, weights) -> scalar tensor``.
        embeddings: node representation matrix indexed by user/place id.

    Returns:
        Mean per-batch loss as a Python float.
    """
    model.eval()
    eval_loader = DataLoader(testset, batch_size=BATCH_SIZE)
    batch_losses = []
    with torch.no_grad():
        for place_ids, user_ids, targets, sample_weights in eval_loader:
            user_ids = user_ids.to(device)
            place_ids = place_ids.to(device)
            targets = targets.to(device)
            sample_weights = sample_weights.to(device)

            scores = model.pred(user_ids, place_ids, embeddings)
            batch_losses.append(loss_fn(scores, targets, sample_weights).item())

    return sum(batch_losses) / len(batch_losses)


def train(model, optimizer, train_dataset, test_dataset, train_edge_index, loss_fn, freezed_layers):
    """Fit ``model`` for ``EPOCHS`` epochs and record train/validation loss per epoch.

    For every mini-batch the full-graph propagation ``model(train_edge_index, ...)``
    is re-run and the batch's (user, place) pairs are scored with ``model.pred``.
    After each epoch the validation loss is computed with ``get_testset_loss``;
    whenever it is the lowest so far, the model's ``state_dict`` is saved to
    ``epoch_{epoch}.ckpt`` in the working directory.

    Relies on the module-level ``EPOCHS``, ``BATCH_SIZE`` and ``device``.

    Args:
        model: the LightGCN model to optimise.
        optimizer: optimizer bound to ``model``'s parameters.
        train_dataset: dataset yielding ``(place, user, rating, weight)``.
        test_dataset: held-out dataset in the same format.
        train_edge_index: edge tensor of the training graph.
        loss_fn: callable ``(pred, ratings, weights) -> scalar tensor``.
        freezed_layers: per-layer flags forwarded to ``model``.

    Returns:
        ``(loss_list_epoch, valid_loss_list_epoch)``: per-epoch mean training
        loss and validation loss, both rounded to 4 decimals.
    """
    loss_list_epoch = []
    valid_loss_list_epoch = []
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    min_valid_loss = None
    for epoch in tqdm(range(EPOCHS)):
        loss_list = []
        model.train()
        for items, users, ratings, weights in tqdm(train_dataloader):
            optimizer.zero_grad()
            users, items, ratings, weights = users.to(device), items.to(device), ratings.to(device), weights.to(device)
            _, embeddings = model(train_edge_index, freezed_layers)
            pred = model.pred(users, items, embeddings)
            loss = loss_fn(pred, ratings, weights)
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())

        # evaluate on validation data
        # Re-propagate with the parameters as they are after the epoch's last
        # optimizer step. The embeddings left over from the final training
        # batch were computed before that step (and are undefined when the
        # loader is empty), so they would not match the checkpoint saved below.
        model.eval()
        with torch.no_grad():
            _, eval_embeddings = model(train_edge_index, freezed_layers)
        valid_loss = get_testset_loss(model, test_dataset, loss_fn, eval_embeddings)
        if min_valid_loss is None or valid_loss < min_valid_loss:
            min_valid_loss = valid_loss
            torch.save(model.state_dict(), f"epoch_{epoch}.ckpt")

        valid_loss_list_epoch.append(round(valid_loss, 4))
        loss_list_epoch.append(round(np.mean(loss_list), 4))

    return loss_list_epoch, valid_loss_list_epoch


# Calculate weights of different labels and define weighted MSE loss
def weighted_MSE(preds, targets, weights):
    """Mean of the per-sample squared errors, each scaled by its own weight.

    In this notebook ``preds``/``targets`` arrive with shape ``(batch, 1)``
    while the weights collated from ``MyDataset`` have shape ``(batch,)``.
    Multiplying those directly broadcasts to a ``(batch, batch)`` matrix, which
    pairs every error with every weight and reduces the loss to
    ``mean(weights) * mean(squared_error)``. The weights are therefore reshaped
    to the error's shape whenever the element counts match.

    Args:
        preds: predicted ratings (tensor).
        targets: true ratings, same shape as ``preds``.
        weights: per-sample weights (tensor with one weight per sample, or a
            scalar applied to every sample).

    Returns:
        Scalar tensor with the weighted mean squared error.
    """
    squared_error = (preds - targets) ** 2
    weights = torch.as_tensor(weights)
    if weights.shape != squared_error.shape and weights.numel() == squared_error.numel():
        # Align one-weight-per-sample with the (batch, 1) error layout.
        weights = weights.reshape(squared_error.shape)
    return (weights * squared_error).mean()


# Fresh Adam optimizer over the already pre-trained model; same criterion as before.
loss_function = weighted_MSE
optimizer = torch.optim.Adam(lightgcn.parameters(), lr=LR)

# One flag per LightGCNConv layer: the first eight are marked frozen, the last two are not.
toronto_layer_flags = [True] * 8 + [False] * 2
loss_history, valid_loss_history = train(
    lightgcn,
    optimizer,
    train_dataset,
    test_dataset,
    train_edge_index,
    loss_function,
    freezed_layers=toronto_layer_flags,
)

# 1-based epoch numbers for the x axis and the log file.
epoch_list = list(range(1, EPOCHS + 1))

plt.plot(epoch_list, loss_history, label='Training Loss')
plt.plot(epoch_list, valid_loss_history, label='Validation Loss')

# Append one "epoch train_loss valid_loss" line per epoch to the run log on Drive.
with open('/content/drive/MyDrive/CS330_Project/cs330_light_gcn_V5_Toronto_lr=0.0005_lf+.txt', 'a+') as fp:
  for record in zip(epoch_list, loss_history, valid_loss_history):
    fp.write("%s %s %s\n" % record)

print('loss history', loss_history)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

## 3. Freeze the first 8 LightGCNConv layers (leave the last 2 unfrozen) and fine-tune on Austin

Important: Added pandas dropna(), please check if it changes the output!

In [None]:
# Load the Austin reviews, keeping the three columns the model consumes. Rows
# with a missing value in any of them are dropped before the integer cast.
columns_name = ['place_index', 'user_index', 'rating']
review_df = pd.read_csv("/content/drive/MyDrive/cities/Austin/train.tsv", sep="\t")[columns_name].dropna().astype(int) # dropna()!!!

max_user_id = review_df['user_index'].max()
max_place_id = review_df['place_index'].max()
print(max_user_id)
print(max_place_id)

max_node_id = max_user_id + max_place_id + 1  # since place_id starts from 0
print(max_node_id)

# The model fine-tuned below is the one built for Los Angeles, so its embedding
# table has a fixed number of rows. Fail early with a readable message instead
# of an out-of-range index error deep inside training.
assert max_node_id < lightgcn.max_index, (
    f"Austin needs node ids up to {max_node_id}, but the pre-trained model "
    f"only has {lightgcn.max_index} embedding rows"
)

# It is possible some users or restaurants in test do not exist in
# train after this split.
# Will need to retrain the model with combined train+test after picking
# the best model structure
train, test = train_test_split(review_df.values, test_size=0.1)
train_df = pd.DataFrame(train, columns=review_df.columns)
test_df = pd.DataFrame(test, columns=review_df.columns)


# Weights will be used to normalize loss function
def get_weights(df):
    """Return one inverse-frequency weight per rating value 1..5.

    The weight of rating ``r`` is ``1 / count(r)``, and the resulting 5-vector
    is scaled to unit L2 norm. Position ``i`` of the result belongs to rating
    ``i + 1``.

    A rating value that does not occur in ``df`` gets weight 0 instead of
    triggering a division by zero (which would turn the normalised vector into
    NaN/0 values). For data where all five ratings occur the result is
    unchanged.

    Args:
        df: DataFrame with a ``rating`` column holding values in 1..5.

    Returns:
        ``np.ndarray`` of shape ``(5,)`` with the normalised weights; all zeros
        if ``df`` contains none of the five ratings.
    """
    rating_counts = np.array(
        [(df['rating'] == rating).sum() for rating in (1, 2, 3, 4, 5)],
        dtype=float,
    )

    # Invert only the non-empty classes; absent classes keep weight 0.
    inverse_count = np.zeros_like(rating_counts)
    present = rating_counts > 0
    inverse_count[present] = 1.0 / rating_counts[present]

    norm = np.linalg.norm(inverse_count)
    if norm == 0:
        # No rated rows at all: nothing to normalise.
        return inverse_count
    normalized_inverse_count = inverse_count / norm

    return normalized_inverse_count


# Class weights come from the Austin training split; index i holds the weight of rating i + 1.
weights = get_weights(train_df)
print(weights)

train_df['weight'] = train_df['rating'].map(lambda val: weights[int(val) - 1])
test_df['weight'] = test_df['rating'].map(lambda val: weights[int(val) - 1])

# Check data snippet (display() is needed: a bare expression in the middle of a
# cell produces no output)
display(train_df.head(5))

# Node ids: users keep their raw index, places are shifted past the last user id.
u_t = torch.LongTensor(train_df.user_index.to_numpy())
p_t = torch.LongTensor(train_df.place_index.to_numpy()) + max_user_id + 1

train_edge_index = torch.stack((torch.cat([u_t, p_t]), torch.cat([p_t, u_t]))).to(device)
print('train_edge_index austin', train_edge_index.shape)

# NOTE: the shift below mutates the frames in place; run this cell once per data load.
train_df['place_index'] = train_df['place_index'] + max_user_id + 1
test_df['place_index'] = test_df['place_index'] + max_user_id + 1
# assert that there's no index overlapping
intersection = set(train_df['place_index'].unique()).intersection(set(train_df['user_index'].unique()))
assert len(intersection) == 0

intersection = set(test_df['place_index'].unique()).intersection(set(test_df['user_index'].unique()))
assert len(intersection) == 0

train_dataset = MyDataset(train_df)
test_dataset = MyDataset(test_df)

# Same settings as the pre-training run. latent_dim / n_layers are restated for
# reference only: the model itself is reused, not rebuilt, in this section.
latent_dim = 64
n_layers = 10

EPOCHS = 5
BATCH_SIZE = 100
DECAY = 0.0001
LR = 0.0005
K = 2


def get_testset_loss(model, testset, loss_fn, embeddings):
    """Average ``loss_fn`` over ``testset`` using precomputed node embeddings.

    The model is switched to eval mode and gradients are disabled. The set is
    read in batches of the global ``BATCH_SIZE``; each batch is scored with
    ``model.pred`` on ``embeddings`` and the returned value is the unweighted
    mean of the per-batch losses.

    Args:
        model: object exposing ``eval()`` and ``pred(users, items, embeddings)``.
        testset: dataset yielding ``(place, user, rating, weight)`` items.
        loss_fn: callable ``(pred, ratings, weights) -> scalar tensor``.
        embeddings: node representation matrix indexed by user/place id.

    Returns:
        Mean per-batch loss as a Python float.
    """
    model.eval()
    eval_loader = DataLoader(testset, batch_size=BATCH_SIZE)
    batch_losses = []
    with torch.no_grad():
        for place_ids, user_ids, targets, sample_weights in eval_loader:
            user_ids = user_ids.to(device)
            place_ids = place_ids.to(device)
            targets = targets.to(device)
            sample_weights = sample_weights.to(device)

            scores = model.pred(user_ids, place_ids, embeddings)
            batch_losses.append(loss_fn(scores, targets, sample_weights).item())

    return sum(batch_losses) / len(batch_losses)


def train(model, optimizer, train_dataset, test_dataset, train_edge_index, loss_fn, freezed_layers):
    """Fit ``model`` for ``EPOCHS`` epochs and record train/validation loss per epoch.

    For every mini-batch the full-graph propagation ``model(train_edge_index, ...)``
    is re-run and the batch's (user, place) pairs are scored with ``model.pred``.
    After each epoch the validation loss is computed with ``get_testset_loss``;
    whenever it is the lowest so far, the model's ``state_dict`` is saved to
    ``epoch_{epoch}.ckpt`` in the working directory.

    Relies on the module-level ``EPOCHS``, ``BATCH_SIZE`` and ``device``.

    Args:
        model: the LightGCN model to optimise.
        optimizer: optimizer bound to ``model``'s parameters.
        train_dataset: dataset yielding ``(place, user, rating, weight)``.
        test_dataset: held-out dataset in the same format.
        train_edge_index: edge tensor of the training graph.
        loss_fn: callable ``(pred, ratings, weights) -> scalar tensor``.
        freezed_layers: per-layer flags forwarded to ``model``.

    Returns:
        ``(loss_list_epoch, valid_loss_list_epoch)``: per-epoch mean training
        loss and validation loss, both rounded to 4 decimals.
    """
    loss_list_epoch = []
    valid_loss_list_epoch = []
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    min_valid_loss = None
    for epoch in tqdm(range(EPOCHS)):
        loss_list = []
        model.train()
        for items, users, ratings, weights in tqdm(train_dataloader):
            optimizer.zero_grad()
            users, items, ratings, weights = users.to(device), items.to(device), ratings.to(device), weights.to(device)
            _, embeddings = model(train_edge_index, freezed_layers)
            pred = model.pred(users, items, embeddings)
            loss = loss_fn(pred, ratings, weights)
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())

        # evaluate on validation data
        # Re-propagate with the parameters as they are after the epoch's last
        # optimizer step. The embeddings left over from the final training
        # batch were computed before that step (and are undefined when the
        # loader is empty), so they would not match the checkpoint saved below.
        model.eval()
        with torch.no_grad():
            _, eval_embeddings = model(train_edge_index, freezed_layers)
        valid_loss = get_testset_loss(model, test_dataset, loss_fn, eval_embeddings)
        if min_valid_loss is None or valid_loss < min_valid_loss:
            min_valid_loss = valid_loss
            torch.save(model.state_dict(), f"epoch_{epoch}.ckpt")

        valid_loss_list_epoch.append(round(valid_loss, 4))
        loss_list_epoch.append(round(np.mean(loss_list), 4))

    return loss_list_epoch, valid_loss_list_epoch


# Calculate weights of different labels and define weighted MSE loss
def weighted_MSE(preds, targets, weights):
    """Mean of the per-sample squared errors, each scaled by its own weight.

    In this notebook ``preds``/``targets`` arrive with shape ``(batch, 1)``
    while the weights collated from ``MyDataset`` have shape ``(batch,)``.
    Multiplying those directly broadcasts to a ``(batch, batch)`` matrix, which
    pairs every error with every weight and reduces the loss to
    ``mean(weights) * mean(squared_error)``. The weights are therefore reshaped
    to the error's shape whenever the element counts match.

    Args:
        preds: predicted ratings (tensor).
        targets: true ratings, same shape as ``preds``.
        weights: per-sample weights (tensor with one weight per sample, or a
            scalar applied to every sample).

    Returns:
        Scalar tensor with the weighted mean squared error.
    """
    squared_error = (preds - targets) ** 2
    weights = torch.as_tensor(weights)
    if weights.shape != squared_error.shape and weights.numel() == squared_error.numel():
        # Align one-weight-per-sample with the (batch, 1) error layout.
        weights = weights.reshape(squared_error.shape)
    return (weights * squared_error).mean()


# Fresh Adam optimizer over the already pre-trained model; same criterion as before.
loss_function = weighted_MSE
optimizer = torch.optim.Adam(lightgcn.parameters(), lr=LR)

# One flag per LightGCNConv layer: the first eight are marked frozen, the last two are not.
austin_layer_flags = [True] * 8 + [False] * 2
loss_history, valid_loss_history = train(
    lightgcn,
    optimizer,
    train_dataset,
    test_dataset,
    train_edge_index,
    loss_function,
    freezed_layers=austin_layer_flags,
)

# 1-based epoch numbers for the x axis and the log file.
epoch_list = list(range(1, EPOCHS + 1))

plt.plot(epoch_list, loss_history, label='Training Loss')
plt.plot(epoch_list, valid_loss_history, label='Validation Loss')

# Append one "epoch train_loss valid_loss" line per epoch to the run log on Drive.
with open('/content/drive/MyDrive/CS330_Project/cs330_light_gcn_V5_Austin_lr=0.0005_lf_10L_ttttttttff.txt', 'a+') as fp:
    for record in zip(epoch_list, loss_history, valid_loss_history):
        fp.write("%s %s %s\n" % record)

print('loss history', loss_history)
print('valid loss history', valid_loss_history)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()