# Recommending Books using LightGCN

In [195]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [196]:
import torch

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 



## Preprocessing the dataset

### Import necessary libraries

In [197]:
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F
from torch import nn, optim, Tensor
import torch.sparse as sparse

from torch_geometric.utils import structured_negative_sampling
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn import LGConv

In [198]:
import torch
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.version.cuda)         # Should print the CUDA version PyTorch is built with


False
None


### Read in the raw data

In [199]:
users_df = pd.read_csv('data/BX-Users.csv', sep=';', encoding='latin-1')
# rename 'Location' and 'Age' to lowercase
users_df.rename(columns={'Location': 'location', 'Age': 'age'}, inplace=True)
# Example age bins
bins = [0, 18, 35, 55, 75, float('inf')]
labels = ['0-18', '19-35', '36-55', '56-75', '76+']

users_df['age'].fillna(users_df['age'].median(), inplace=True)

users_df['age_group'] = pd.cut(users_df['age'], bins=bins, labels=labels, right=False)

# rename 'User-ID' to 'user_id' 
users_df.rename(columns={'User-ID': 'user_id'}, inplace=True)

items_df = pd.read_csv('data/BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines='skip')
# rename 'Book-Title' to 'title', 'Book-Author' to 'author'
items_df.rename(columns={'Book-Title': 'title', 'Book-Author': 'author'}, inplace=True)
# rename 'ISBN' to 'item_id'
items_df.rename(columns={'ISBN': 'item_id'}, inplace=True)



  items_df = pd.read_csv('data/BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines='skip')


In [200]:
items_df.head()

Unnamed: 0,item_id,title,author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [201]:
users_df.head()

Unnamed: 0,user_id,location,age,age_group
0,1,"nyc, new york, usa",32.0,19-35
1,2,"stockton, california, usa",18.0,19-35
2,3,"moscow, yukon territory, russia",32.0,19-35
3,4,"porto, v.n.gaia, portugal",17.0,0-18
4,5,"farnborough, hants, united kingdom",32.0,19-35


In [202]:
items_df.head()

Unnamed: 0,item_id,title,author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [203]:
items_df.shape

(271360, 8)

In [204]:
items_df['author'] = items_df['author'].fillna('NA')

In [205]:
items_df.isna().sum()

item_id                0
title                  0
author                 0
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [206]:
# delete rows with missing values
# items_df.dropna(inplace=True)

In [207]:
items_df.shape

(271360, 8)

In [208]:
ratings_df = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', encoding='latin-1')  
# rename 'User-ID' to 'user_id', 'Book-Rating' to 'rating', 'ISBN' to 'item_id'
ratings_df.rename(columns={'User-ID': 'user_id', 'Book-Rating': 'rating', 'ISBN': 'item_id'}, inplace=True)
ratings_df.head()


Unnamed: 0,user_id,item_id,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [209]:

# Preprocessing
ratings_df = ratings_df.loc[ratings_df['item_id'].isin(items_df['item_id'].unique()) & ratings_df['user_id'].isin(users_df['user_id'].unique())]

print(ratings_df.shape)
# # Keep the 100k highest ratings
ratings_df = ratings_df[ratings_df['rating'] >= 8].iloc[:100000]



(1031136, 3)


In [210]:
# subsert users_df where user_id is in ratings_df
users_df = users_df[users_df['user_id'].isin(ratings_df['user_id'].unique())].copy()
# subsert items_df where item_id is in ratings_df
items_df = items_df[items_df['item_id'].isin(ratings_df['item_id'].unique())].copy()

In [211]:
print(users_df.shape)
print(items_df.shape)

(19694, 4)
(56948, 8)


In [212]:
# Create mappings
user_mapping = {userid: i for i, userid in enumerate(users_df['user_id'].unique())}
item_mapping = {isbn: i for i, isbn in enumerate(items_df['item_id'].unique())}

In [213]:
user_mapping[16]

1

In [214]:
# Count users and items
num_users = len(user_mapping)
num_items = len(item_mapping)
num_total = num_users + num_items

# Map user and item indices
users_df['user_idx'] = users_df['user_id'].map(user_mapping)
items_df['item_idx'] = items_df['item_id'].map(item_mapping)
ratings_df['user_idx'] = ratings_df['user_id'].map(user_mapping)
ratings_df['item_idx'] = ratings_df['item_id'].map(item_mapping)

# Merge the user and item features with the ratings
# merged_df = ratings_df.merge(users_df, on='user_id').merge(items_df, on='item_id')

users_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19694 entries, 11 to 278853
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   user_id    19694 non-null  int64   
 1   location   19694 non-null  object  
 2   age        19694 non-null  float64 
 3   age_group  19694 non-null  category
 4   user_idx   19694 non-null  int64   
dtypes: category(1), float64(1), int64(2), object(1)
memory usage: 788.7+ KB


In [215]:
ratings_df.item_idx.min()

0

In [216]:
items_df.head()

Unnamed: 0,item_id,title,author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,item_idx
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,0
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,1
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,2
5,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,3
6,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,4


### Preprocessing the dataset: transforming the user features and item features

In [217]:
# fill missing values of locations, age_group, subject, author with 'unknown'
def fillna_category(df, column):
    df[column] = df[column].astype('category')
    df[column].cat.add_categories('unknown')
    df[column].fillna('unknown', inplace=True)

# check if there are any missing values
items_df.isna().sum()

# delete rows with missing values in 'title' and 'author'
# items_df.dropna(subset=['title', 'author'], inplace=True)


item_id                0
title                  0
author                 0
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
item_idx               0
dtype: int64

In [218]:

# Function to map categories to indices
def map_categories(df, column):
    unique_values = df[column].unique()
    value_to_idx = {value: idx for idx, value in enumerate(unique_values)}
    return df[column].map(value_to_idx), len(unique_values)

# Map each categorical feature
users_df['location_idx'], num_locations = map_categories(users_df, 'location')
users_df['age_group_idx'], num_age_groups = map_categories(users_df, 'age_group')
items_df['subject_idx'], num_subjects = map_categories(items_df, 'title')
items_df['author_idx'], num_authors = map_categories(items_df, 'author')

### Prepare Input data for the Neural Network Model

In [219]:
# Convert indices to tensors
user_location_indices = torch.tensor(users_df['location_idx'].values, dtype=torch.long)
user_age_group_indices = torch.tensor(users_df['age_group_idx'].values, dtype=torch.long)
item_subject_indices = torch.tensor(items_df['subject_idx'].values, dtype=torch.long)
item_author_indices = torch.tensor(items_df['author_idx'].values, dtype=torch.long)


### Prepare Edge Data

In [220]:

# # Tensors for user and item indices
# user_indices = torch.tensor(merged_df['user_idx'].values, dtype=torch.long)
# item_indices = torch.tensor(merged_df['item_idx'].values, dtype=torch.long)


# # Tensor for ratings
# ratings = torch.tensor(merged_df['rating'].values, dtype=torch.float)

# # ratings = torch.tensor(merged_df['rating'].values, dtype=torch.float)


In [221]:
users_df.head()

Unnamed: 0,user_id,location,age,age_group,user_idx,location_idx,age_group_idx
11,12,"fort bragg, california, usa",32.0,19-35,0,0,0
15,16,"albuquerque, new mexico, usa",32.0,19-35,1,1,0
25,26,"bellevue, washington, usa",32.0,19-35,2,2,0
31,32,"portland, oregon, usa",32.0,19-35,3,3,0
38,39,"cary, north carolina, usa",32.0,19-35,4,4,0


In [222]:
items_df.head()

Unnamed: 0,item_id,title,author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,item_idx,subject_idx,author_idx
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,0,0,0
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,1,1,1
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,2,2,2
5,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,3,3,3
6,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,4,4,4


In [223]:
ratings_df.head()

Unnamed: 0,user_id,item_id,rating,user_idx,item_idx
16,276747,60517794,9,19324,3147
19,276747,671537458,9,19324,1258
20,276747,679776818,8,19324,4053
27,276751,3596218098,8,19325,18749
28,276754,684867621,8,19326,1346


### Create Adjacency Matrix

In [224]:
ratings_df.item_idx.max()

56947

In [225]:
# in ratings_df, exclude rows with item_idx < 0
ratings_df = ratings_df[ratings_df['item_idx'] >= 0]

In [226]:
%%timeit
import numpy as np
import torch

def create_adjacency_matrix(users_df, items_df, ratings_df):
    # Basic data validation
    if not all(col in users_df.columns for col in ['user_idx']) or \
       not all(col in items_df.columns for col in ['item_idx']) or \
       not all(col in ratings_df.columns for col in ['user_idx', 'item_idx']):
        raise ValueError("Dataframes do not have the required columns")

    num_users = users_df['user_idx'].nunique()
    num_items = items_df['item_idx'].nunique()

    # Convert indices to tensor-friendly format
    user_ids = ratings_df['user_idx'].values.astype(np.int64)
    item_ids = ratings_df['item_idx'].values.astype(np.int64) + num_users

    # Create edge index tensors
    start_idx = torch.LongTensor([user_ids, item_ids])
    end_idx = torch.LongTensor([item_ids, user_ids])

    # Create values tensor
    values = torch.FloatTensor([1] * 2 * len(ratings_df))

    # Construct sparse adjacency matrix
    adj_matrix = torch.sparse.FloatTensor(torch.cat([start_idx, end_idx], dim=1), values, torch.Size([num_users + num_items, num_users + num_items]))

    # Normalize adjacency matrix
    # Computing degree (sum of edges for each node)
    deg = torch.sparse.sum(adj_matrix, dim=1).to_dense()

    # Efficient normalization for user rows
    indices = adj_matrix._indices()
    values = adj_matrix._values()

    for i in range(indices.size(1)):
        row = indices[0, i]
        if row < num_users:  # Normalize only user rows
            values[i] /= deg[row]

    # Reconstruct normalized adjacency matrix
    norm_adj = torch.sparse.FloatTensor(indices, values, adj_matrix.size())

    return norm_adj

# Example usage
adj_matrix = create_adjacency_matrix(users_df, items_df, ratings_df)


1.93 s ± 14.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### split the train-validtion-test adj matrix

In [227]:
import torch

def split_sparse_tensor(adj_matrix, train_ratio=0.8, val_ratio=0.1):
    # Extract the total number of users and items from the dimensions of the adjacency matrix
    total_nodes = adj_matrix.size(0)
    num_users = torch.max(adj_matrix._indices()[0]).item() + 1
    num_items = total_nodes - num_users

    train_indices = []
    val_indices = []
    test_indices = []

    for user in range(num_users):
        # Filter items for this user
        user_mask = adj_matrix._indices()[0] == user
        user_items = adj_matrix._indices()[1][user_mask] - num_users  # Adjust item index

        # Shuffle items
        shuffled_indices = torch.randperm(user_items.size(0))
        user_items = user_items[shuffled_indices]

        # Split items
        num_train = int(user_items.size(0) * train_ratio)
        num_val = int(user_items.size(0) * val_ratio)

        train_items = user_items[:num_train]
        val_items = user_items[num_train:num_train + num_val]
        test_items = user_items[num_train + num_val:]

        # Add to indices lists
        train_indices.append(torch.stack([torch.full_like(train_items, user), train_items + num_users], dim=0))
        val_indices.append(torch.stack([torch.full_like(val_items, user), val_items + num_users], dim=0))
        test_indices.append(torch.stack([torch.full_like(test_items, user), test_items + num_users], dim=0))

    def create_sparse_tensor(indices_list):
        all_indices = torch.cat(indices_list, dim=1)
        values = torch.ones(all_indices.size(1))
        return torch.sparse.FloatTensor(all_indices, values, adj_matrix.size())

    train_adj_matrix = create_sparse_tensor(train_indices)
    val_adj_matrix = create_sparse_tensor(val_indices)
    test_adj_matrix = create_sparse_tensor(test_indices)

    return train_adj_matrix, val_adj_matrix, test_adj_matrix

# Example usage
train_adj_matrix, val_adj_matrix, test_adj_matrix = split_sparse_tensor(adj_matrix)


In [228]:
test_adj_matrix

tensor(indices=tensor([[    0,     1,     2,  ..., 76639, 76640, 76641],
                       [19704, 19706, 19712,  ..., 19222, 19222, 18920]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(76642, 76642), nnz=86678, layout=torch.sparse_coo)

### Graph Dataloader

In [229]:
print(items_df.shape)
print(items_df.item_idx.nunique())

(56948, 11)
56948


In [230]:
import numpy as np
import torch

In [231]:
class GraphDataLoader:
    def __init__(self, adj_matrix, users_df, items_df, num_negatives=1, batch_size=1024, device='cpu'):
        self.adj_matrix = adj_matrix.to(device)
        self.num_negatives = num_negatives
        self.batch_size = batch_size
        self.num_users = users_df['user_idx'].nunique()
        self.num_items = items_df['item_idx'].nunique()
        self.device = device
        self.negatives = self.precompute_negatives()

    def precompute_negatives(self):
        negatives = {}
        for user in range(self.num_users):
            user_row = self.adj_matrix[user].coalesce()
            if user_row._nnz() > 0:
                pos_items = user_row.indices()[0].cpu().numpy()
                neg_items = np.setdiff1d(np.arange(self.num_items), pos_items)
                negatives[user] = neg_items
            else:
                negatives[user] = np.arange(self.num_items)
        return negatives

    def generate_batch(self):
        for batch_start in range(0, self.num_users, self.batch_size):
            batch_users, batch_pos_items, batch_neg_items = [], [], []

            for user in range(batch_start, min(batch_start + self.batch_size, self.num_users)):
                user_row = self.adj_matrix[user].coalesce()
                pos_items = user_row.indices()[0].cpu().numpy() if user_row._nnz() > 0 else np.array([])

                if len(pos_items) > 0:
                    pos_item = np.random.choice(pos_items)
                else:
                    pos_item = np.random.randint(0, self.num_items)  # Fallback if no positive items

                # Adjust the number of negatives based on available samples
                num_available_negs = len(self.negatives[user])
                num_negatives = min(self.num_negatives, num_available_negs)

                # If less negatives are available, allow repetition
                neg_samples = np.random.choice(self.negatives[user], num_negatives, replace=num_negatives < self.num_negatives)

                for neg_sample in neg_samples:
                    batch_users.append(user)
                    batch_pos_items.append(pos_item)
                    batch_neg_items.append(neg_sample)
                    
                
                # Add a check to ensure equal lengths
                assert len(batch_pos_items) == len(batch_neg_items), "Mismatch in batch sizes of positives and negatives during batch generation"
                                      
            if batch_users:
                  # print(f"Generated batch sizes - Users: {len(batch_users)}, Pos: {len(batch_pos_items)}, Neg: {len(batch_neg_items)}")
                  yield torch.tensor(batch_users, dtype=torch.long, device=self.device), \
                      torch.tensor(batch_pos_items, dtype=torch.long, device=self.device), \
                      torch.tensor(batch_neg_items, dtype=torch.long, device=self.device)
                
            

In [232]:
# train_adj_matrix, val_adj_matrix, test_adj_matrix = split_adj_matrix(adj_matrix)
train_loader = GraphDataLoader(train_adj_matrix, users_df, items_df, num_negatives=1, batch_size=128)

## Implementing the LightGCN architecture

In [233]:
import torch
import torch.nn as nn
from torch.sparse import mm as sparse_mm

In [234]:
import torch
import torch.nn as nn
from torch.sparse import mm as sparse_mm

class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, num_layers):
        super(LightGCN, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        # Initialize user and item embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # Initialization
        nn.init.normal_(self.user_embedding.weight, std=0.1)
        nn.init.normal_(self.item_embedding.weight, std=0.1)

    def forward(self, user_indices, item_indices, adj_matrix):
        # Initial batch size check
        initial_batch_size = user_indices.size(0)
        assert initial_batch_size == item_indices.size(0), "Initial batch sizes of user and item indices must be equal"

        # Check and filter out-of-range indices
        valid_user_mask = (user_indices >= 0) & (user_indices < self.num_users)
        valid_item_mask = (item_indices >= 0) & (item_indices < self.num_items)
        valid_indices_mask = valid_user_mask & valid_item_mask

        # Filter based on the mask
        user_indices = user_indices[valid_indices_mask]
        item_indices = item_indices[valid_indices_mask]

        # Handling case where all indices are invalid
        if len(user_indices) == 0 or len(item_indices) == 0:
            return torch.zeros(initial_batch_size)

        # Create initial embeddings
        all_embeddings = torch.cat([self.user_embedding.weight, self.item_embedding.weight], dim=0)

        # List to hold all embeddings for each layer
        all_user_embs = [self.user_embedding(user_indices)]
        all_item_embs = [self.item_embedding(item_indices)]

        # Perform graph convolutions
        for _ in range(self.num_layers):
            all_embeddings = sparse_mm(adj_matrix, all_embeddings)

            user_emb = all_embeddings[:self.num_users]
            item_emb = all_embeddings[self.num_users:]

            all_user_embs.append(user_emb[user_indices])
            all_item_embs.append(item_emb[item_indices])

        # Compute final embeddings as the mean of all layers' embeddings
        final_user_emb = torch.mean(torch.stack(all_user_embs), dim=0)
        final_item_emb = torch.mean(torch.stack(all_item_embs), dim=0)

        # Predict ratings by computing the dot product of user and item embeddings
        scores = torch.sum(final_user_emb * final_item_emb, dim=1)

        # Ensuring output scores match the initial batch size
        if scores.size(0) != initial_batch_size:
            # Padding scores to match initial batch size
            padded_scores = torch.zeros(initial_batch_size)
            padded_scores[:scores.size(0)] = scores
            return padded_scores

        return scores

# Example usage:
# model = LightGCN(num_users=1000, num_items=500, embedding_dim=64, num_layers=3)
# ... (setup data and training loop) ...


In [235]:
def precision_at_k(targets, predictions, k=10):
    top_k_preds = predictions.topk(k, dim=1).indices
    relevant = targets.gather(1, top_k_preds)
    precision = relevant.sum().float() / (k * targets.size(0))
    return precision

def recall_at_k(targets, predictions, k=10):
    top_k_preds = predictions.topk(k, dim=1).indices
    relevant = targets.gather(1, top_k_preds)
    recall = relevant.sum().float() / targets.sum()
    return recall


In [236]:
   
# BPR Loss
def bpr_loss(pos_scores, neg_scores):
    if pos_scores.shape != neg_scores.shape:
        raise ValueError(f"pos_scores and neg_scores must be of the same shape, got {pos_scores.shape} and {neg_scores.shape}")
    
    return -torch.mean(torch.log(torch.sigmoid(pos_scores - neg_scores)))

## Model Training

In [237]:
# train_adj_matrix, val_adj_matrix, test_adj_matrix = split_adj_matrix(adj_matrix)
train_loader = GraphDataLoader(train_adj_matrix, users_df, items_df, num_negatives=1, batch_size=128)

In [238]:
val_loader = GraphDataLoader(val_adj_matrix, users_df, items_df, num_negatives=1, batch_size=128)
test_loader = GraphDataLoader(test_adj_matrix, users_df, items_df, num_negatives=1, batch_size=128)

In [239]:
import torch

def train_and_validate(model, train_loader, val_loader, epochs, optimizer, loss_fn, adj_matrix, early_stopping_rounds=5, device='cuda'):
    best_val_loss = float('inf')
    no_improvement_epochs = 0

    # Move the model to the specified device
    model.to(device)
    adj_matrix = adj_matrix.to(device)

    for epoch in range(epochs):
        # Training phase
        model.train()
        total_train_loss = 0
        num_train_batches = 0

        for users, pos_items, neg_items in train_loader.generate_batch():
            # Add a check here to ensure consistent batch sizes
            assert pos_items.shape == neg_items.shape, "Mismatch in shapes of pos_items and neg_items during training"

            # Move tensors to the specified device
            users = users.to(device)
            pos_items = pos_items.to(device)
            neg_items = neg_items.to(device)

            optimizer.zero_grad()

            # Forward pass
            pos_scores = model(users, pos_items, adj_matrix)
            neg_scores = model(users, neg_items, adj_matrix)

            # Compute loss
            loss = loss_fn(pos_scores, neg_scores)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            num_train_batches += 1

        avg_train_loss = total_train_loss / num_train_batches if num_train_batches > 0 else 0
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_train_loss:.4f}", end='')

        # Validation phase
        model.eval()
        total_val_loss = 0
        num_val_batches = 0

        with torch.no_grad():
            for users, pos_items, neg_items in val_loader.generate_batch():
                # Move tensors to the specified device
                users = users.to(device)
                pos_items = pos_items.to(device)
                neg_items = neg_items.to(device)

                # Forward pass
                pos_scores = model(users, pos_items, adj_matrix)
                neg_scores = model(users, neg_items, adj_matrix)

                # Compute loss
                loss = loss_fn(pos_scores, neg_scores)

                total_val_loss += loss.item()
                num_val_batches += 1

        avg_val_loss = total_val_loss / num_val_batches if num_val_batches > 0 else 0
        print(f" - Validation Loss: {avg_val_loss:.4f}")

        # Check for early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            no_improvement_epochs = 0
            torch.save(model.state_dict(), 'models/best_model.pth')  # Save the best model
        else:
            no_improvement_epochs += 1
            if no_improvement_epochs >= early_stopping_rounds:
                print(f"Early stopping triggered after {epoch + 1} epochs.")
                break

# Example usage
# model = YourModel(...)
# train_loader = YourDataLoader(...)
# val_loader = YourDataLoader(...)
# optimizer = torch.optim.Adam(model.parameters(), ...)
# loss_fn = your_loss_function
# adj_matrix = your_adjacency_matrix
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# train_and_validate(model, train_loader, val_loader, epochs, optimizer, loss_fn, adj_matrix, early_stopping_rounds=5, device=device)


In [241]:
# Define the model and optimizer with L2 regularization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LightGCN(num_users, num_items, embedding_dim=64, num_layers=3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)
loss_fn = bpr_loss
adj_matrix = adj_matrix.to(device)
# Corrected function call
train_and_validate(model, train_loader, val_loader, 10, optimizer, loss_fn, adj_matrix, early_stopping_rounds=5, device=device)
    


Epoch 1/10 - Training Loss: 0.7003 - Validation Loss: 0.7025
Epoch 2/10 - Training Loss: 0.7007 - Validation Loss: 0.7017
Epoch 3/10 - Training Loss: 0.7001 - Validation Loss: inf
Epoch 4/10 - Training Loss: 0.6966 - Validation Loss: 0.7018
Epoch 5/10 - Training Loss: 0.7008 - Validation Loss: 0.6998
Epoch 6/10 - Training Loss: 0.6997 - Validation Loss: 0.7038
Epoch 7/10 - Training Loss: 0.6938 - Validation Loss: 0.7037
Epoch 8/10 - Training Loss: 0.6945 - Validation Loss: 0.7054
Epoch 9/10 - Training Loss: 0.6950 - Validation Loss: 0.7016
Epoch 10/10 - Training Loss: 0.6934 - Validation Loss: 0.7063
Early stopping triggered after 10 epochs.


### Test the Model

In [253]:
import torch

def precision_at_k(targets, predictions, k=10):
    k = min(k, predictions.size(1))  # Adjust k to the size of predictions if needed
    top_k_preds = predictions.topk(k, dim=1).indices
    relevant = targets.gather(1, top_k_preds)
    precision = relevant.sum().float() / (k * targets.size(0))
    return precision

def recall_at_k(targets, predictions, k=10):
    k = min(k, predictions.size(1))  # Adjust k to the size of predictions if needed
    top_k_preds = predictions.topk(k, dim=1).indices
    relevant = targets.gather(1, top_k_preds)
    recall = relevant.sum().float() / targets.sum()
    return recall

def test_model(model, test_loader, adj_matrix, k=10, device='cuda'):
    model.to(device)
    adj_matrix = adj_matrix.to(device)

    # Metrics
    total_precision = 0.0
    total_recall = 0.0
    num_batches = 0

    model.eval()
    with torch.no_grad():
        for users, pos_items, neg_items in test_loader.generate_batch():
            users = users.to(device)
            pos_items = pos_items.to(device)
            neg_items = neg_items.to(device)

            # Forward pass
            pos_scores = model(users, pos_items, adj_matrix)
            neg_scores = model(users, neg_items, adj_matrix)

            # Ensure pos_scores and neg_scores are 2D before concatenation
            if pos_scores.dim() == 1:
                pos_scores = pos_scores.unsqueeze(1)
            if neg_scores.dim() == 1:
                neg_scores = neg_scores.unsqueeze(1)

            # Combine scores and sort them
            all_scores = torch.cat([pos_scores, neg_scores], dim=1)
            sorted_scores, sorted_indices = torch.sort(all_scores, dim=1, descending=True)

            # Ensure all_scores is 2D before creating targets
            if all_scores.dim() == 1:
                all_scores = all_scores.unsqueeze(1)

            # Generate 'targets' tensor
            targets = torch.zeros_like(all_scores)
            num_positives = pos_items.size(1) if pos_items.dim() > 1 else 1
            targets[:, :num_positives] = 1  # Marking the first 'n' items as positives

            # Adjust k based on the number of items
            k_adjusted = min(k, all_scores.size(1))

            # Compute precision and recall
            precision = precision_at_k(targets, sorted_scores, k=k_adjusted)
            recall = recall_at_k(targets, sorted_scores, k=k_adjusted)

            total_precision += precision.item()
            total_recall += recall.item()
            num_batches += 1

    avg_precision = total_precision / num_batches
    avg_recall = total_recall / num_batches

    print(f"Test Results - Precision@{k}: {avg_precision:.4f}, Recall@{k}: {avg_recall:.4f}")
    return avg_precision, avg_recall

# Example usage:
# model = YourTrainedModel(...)
# test_loader = YourDataLoader(...)
# adj_matrix = YourAdjacencyMatrix(...)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# test_model(model, test_loader, adj_matrix, k=10, device=device)


In [254]:
test_model(model, test_loader, adj_matrix, k=10, device=device)

Test Results - Precision@10: 0.5000, Recall@10: 1.0000


(0.5, 1.0)

## Recommending books for a particular user

In [None]:
model = torch.load('models/book_recommend_lightgcn_model_epochs30.pth')
model.eval()

In [None]:
bookid_title = pd.Series(items_df['title'].values, index=items_df.ISBN).to_dict()
bookid_author = pd.Series(items_df['author'].values, index=items_df.ISBN).to_dict()
user_pos_items = get_user_items(edge_index)

In [None]:
def recommend(user_id, num_recs):
    user = user_mapping.get(user_id)
    print("user_idx is ", user)
    if user is None:
        print(f"User ID {user_id} not found.")
        return [], []

    emb_user = model.emb_users.weight[user]
    ratings = model.emb_items.weight @ emb_user

    values, indices = torch.topk(ratings, k=100)

    ids = [index.cpu().item() for index in indices if index in user_pos_items[user]][:num_recs]
    item_isbns = [list(item_mapping.keys())[list(item_mapping.values()).index(book)] for book in ids]
    titles = [bookid_title[id] for id in item_isbns]
    authors = [bookid_author[id] for id in item_isbns]

    print(f'Favorite books from user n°{user_id}:')
    for i in range(len(item_isbns)):
        print(f'- {titles[i]}, by {authors[i]}')

    ids = [index.cpu().item() for index in indices if index not in user_pos_items[user]][:num_recs]
    item_isbns = [list(item_mapping.keys())[list(item_mapping.values()).index(book)] for book in ids]
    titles = [bookid_title[id] for id in item_isbns]
    authors = [bookid_author[id] for id in item_isbns]

    print(f'\nRecommended books for user n°{user_id}')
    for i in range(num_recs):
        print(f'- {titles[i]}, by {authors[i]}') 

In [None]:
recommend(2084, 5)

In [None]:
recommend(3305, 5)

In [None]:
recommend(277427, 5)

In [None]:
recommend(277427, 5)

In [None]:
recommend(2084, 5)

In [None]:
recommend(1519, 5)