# Recommending Books using LightGCN

In [5]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [6]:
import torch

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 



## Preprocessing the dataset

### Import necessary libraries

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F
from torch import nn, optim, Tensor
import torch.sparse as sparse

from torch_geometric.utils import structured_negative_sampling
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn import LGConv

In [8]:
import torch
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.version.cuda)         # Should print the CUDA version PyTorch is built with


True
11.8


### Read in the raw data

In [9]:
users_df = pd.read_csv('data/BX-Users.csv', sep=';', encoding='latin-1')
# rename 'Location' and 'Age' to lowercase
users_df.rename(columns={'Location': 'location', 'Age': 'age'}, inplace=True)
# Example age bins
bins = [0, 18, 35, 55, 75, float('inf')]
labels = ['0-18', '19-35', '36-55', '56-75', '76+']

users_df['age'].fillna(users_df['age'].median(), inplace=True)

users_df['age_group'] = pd.cut(users_df['age'], bins=bins, labels=labels, right=False)

# rename 'User-ID' to 'user_id' 
users_df.rename(columns={'User-ID': 'user_id'}, inplace=True)

items_df = pd.read_csv('data/BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines='skip')
# rename 'Book-Title' to 'title', 'Book-Author' to 'author'
items_df.rename(columns={'Book-Title': 'title', 'Book-Author': 'author'}, inplace=True)
# rename 'ISBN' to 'item_id'
items_df.rename(columns={'ISBN': 'item_id'}, inplace=True)



  items_df = pd.read_csv('data/BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines='skip')


In [10]:
items_df.head()

Unnamed: 0,item_id,title,author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [11]:
users_df.head()

Unnamed: 0,user_id,location,age,age_group
0,1,"nyc, new york, usa",32.0,19-35
1,2,"stockton, california, usa",18.0,19-35
2,3,"moscow, yukon territory, russia",32.0,19-35
3,4,"porto, v.n.gaia, portugal",17.0,0-18
4,5,"farnborough, hants, united kingdom",32.0,19-35


In [12]:
items_df.head()

Unnamed: 0,item_id,title,author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [13]:
items_df.shape

(271360, 8)

In [14]:
# delete rows with missing values
items_df.dropna(inplace=True)

In [15]:
items_df.shape

(271353, 8)

In [16]:
ratings_df = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', encoding='latin-1')  
# rename 'User-ID' to 'user_id', 'Book-Rating' to 'rating', 'ISBN' to 'item_id'
ratings_df.rename(columns={'User-ID': 'user_id', 'Book-Rating': 'rating', 'ISBN': 'item_id'}, inplace=True)
ratings_df.head()


Unnamed: 0,user_id,item_id,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [17]:

# Preprocessing
ratings_df = ratings_df.loc[ratings_df['item_id'].isin(items_df['item_id'].unique()) & ratings_df['user_id'].isin(users_df['user_id'].unique())]

print(ratings_df.shape)
# # Keep the 100k highest ratings
ratings_df = ratings_df[ratings_df['rating'] >= 8].iloc[:100000]



(1031128, 3)


In [18]:
# subsert users_df where user_id is in ratings_df
users_df = users_df[users_df['user_id'].isin(ratings_df['user_id'].unique())]
# subsert items_df where item_id is in ratings_df
items_df = items_df[items_df['item_id'].isin(ratings_df['item_id'].unique())]

In [19]:
print(users_df.shape)
print(items_df.shape)

(19694, 4)
(56946, 8)


In [20]:

# Create mappings
user_mapping = {userid: i for i, userid in enumerate(users_df['user_id'].unique())}
item_mapping = {isbn: i for i, isbn in enumerate(items_df['item_id'].unique())}

# Count users and items
num_users = len(user_mapping)
num_items = len(item_mapping)
num_total = num_users + num_items

# Map user and item indices
users_df['user_idx'] = users_df['user_id'].map(user_mapping)
items_df['item_idx'] = items_df['item_id'].map(item_mapping)
ratings_df['user_idx'] = ratings_df['user_id'].map(user_mapping)
ratings_df['item_idx'] = ratings_df['item_id'].map(item_mapping)

# Merge the user and item features with the ratings
merged_df = ratings_df.merge(users_df, on='user_id').merge(items_df, on='item_id')

users_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19694 entries, 11 to 278853
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   user_id    19694 non-null  int64   
 1   location   19694 non-null  object  
 2   age        19694 non-null  float64 
 3   age_group  19694 non-null  category
 4   user_idx   19694 non-null  int64   
dtypes: category(1), float64(1), int64(2), object(1)
memory usage: 788.7+ KB


In [21]:
ratings_df.item_idx.min()

0

In [22]:
items_df.head()

Unnamed: 0,item_id,title,author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,item_idx
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,0
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,1
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,2
5,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,3
6,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,4


### Preprocessing the dataset: transforming the user features and item features

In [23]:
# fill missing values of locations, age_group, subject, author with 'unknown'
def fillna_category(df, column):
    df[column] = df[column].astype('category')
    df[column].cat.add_categories('unknown')
    df[column].fillna('unknown', inplace=True)

# check if there are any missing values
items_df.isna().sum()

# delete rows with missing values in 'title' and 'author'
items_df.dropna(subset=['title', 'author'], inplace=True)


In [24]:

# Function to map categories to indices
def map_categories(df, column):
    unique_values = df[column].unique()
    value_to_idx = {value: idx for idx, value in enumerate(unique_values)}
    return df[column].map(value_to_idx), len(unique_values)

# Map each categorical feature
users_df['location_idx'], num_locations = map_categories(users_df, 'location')
users_df['age_group_idx'], num_age_groups = map_categories(users_df, 'age_group')
items_df['subject_idx'], num_subjects = map_categories(items_df, 'title')
items_df['author_idx'], num_authors = map_categories(items_df, 'author')

### Prepare Input data for the Neural Network Model

In [25]:
# Convert indices to tensors
user_location_indices = torch.tensor(users_df['location_idx'].values, dtype=torch.long)
user_age_group_indices = torch.tensor(users_df['age_group_idx'].values, dtype=torch.long)
item_subject_indices = torch.tensor(items_df['subject_idx'].values, dtype=torch.long)
item_author_indices = torch.tensor(items_df['author_idx'].values, dtype=torch.long)


### Prepare Edge Data

In [26]:

# # Tensors for user and item indices
# user_indices = torch.tensor(merged_df['user_idx'].values, dtype=torch.long)
# item_indices = torch.tensor(merged_df['item_idx'].values, dtype=torch.long)


# # Tensor for ratings
# ratings = torch.tensor(merged_df['rating'].values, dtype=torch.float)

# # ratings = torch.tensor(merged_df['rating'].values, dtype=torch.float)


In [27]:
users_df.head()

Unnamed: 0,user_id,location,age,age_group,user_idx,location_idx,age_group_idx
11,12,"fort bragg, california, usa",32.0,19-35,0,0,0
15,16,"albuquerque, new mexico, usa",32.0,19-35,1,1,0
25,26,"bellevue, washington, usa",32.0,19-35,2,2,0
31,32,"portland, oregon, usa",32.0,19-35,3,3,0
38,39,"cary, north carolina, usa",32.0,19-35,4,4,0


In [28]:
items_df.head()

Unnamed: 0,item_id,title,author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,item_idx,subject_idx,author_idx
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,0,0,0
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,1,1,1
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,2,2,2
5,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,3,3,3
6,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,4,4,4


In [29]:
ratings_df.head()

Unnamed: 0,user_id,item_id,rating,user_idx,item_idx
16,276747,60517794,9,19324,3147
19,276747,671537458,9,19324,1258
20,276747,679776818,8,19324,4053
27,276751,3596218098,8,19325,18749
28,276754,684867621,8,19326,1346


### Create Adjacency Matrix

In [30]:
ratings_df.item_idx.min()

0

In [31]:
# in ratings_df, exclude rows with item_idx < 0
ratings_df = ratings_df[ratings_df['item_idx'] >= 0]

In [61]:
import numpy as np
import torch

def create_adjacency_matrix(users_df, items_df, ratings_df):
    # Basic data validation
    if not all(col in users_df.columns for col in ['user_idx']) or \
       not all(col in items_df.columns for col in ['item_idx']) or \
       not all(col in ratings_df.columns for col in ['user_idx', 'item_idx']):
        raise ValueError("Dataframes do not have the required columns")

    num_users = users_df['user_idx'].nunique()
    num_items = items_df['item_idx'].nunique()

    # Convert indices to tensor-friendly format
    user_ids = ratings_df['user_idx'].values.astype(np.int64)
    item_ids = ratings_df['item_idx'].values.astype(np.int64) + num_users

    # Create edge index tensors
    start_idx = torch.LongTensor([user_ids, item_ids])
    end_idx = torch.LongTensor([item_ids, user_ids])

    # Create values tensor
    values = torch.FloatTensor([1] * 2 * len(ratings_df))

    # Construct sparse adjacency matrix
    adj_matrix = torch.sparse.FloatTensor(torch.cat([start_idx, end_idx], dim=1), values, torch.Size([num_users + num_items, num_users + num_items]))

    # Normalize adjacency matrix
    # Computing degree (sum of edges for each node)
    deg = torch.sparse.sum(adj_matrix, dim=1).to_dense()

    # Efficient normalization for user rows
    indices = adj_matrix._indices()
    values = adj_matrix._values()

    for i in range(indices.size(1)):
        row = indices[0, i]
        if row < num_users:  # Normalize only user rows
            values[i] /= deg[row]

    # Reconstruct normalized adjacency matrix
    norm_adj = torch.sparse.FloatTensor(indices, values, adj_matrix.size())

    return norm_adj

# Example usage
adj_matrix = create_adjacency_matrix(users_df, items_df, ratings_df)


### split the train-validtion-test adj matrix

In [None]:
import torch

def split_sparse_tensor(adj_matrix, train_ratio=0.8, val_ratio=0.1):
    # Extract the total number of users and items from the dimensions of the adjacency matrix
    total_nodes = adj_matrix.size(0)
    num_users = torch.max(adj_matrix._indices()[0]).item() + 1
    num_items = total_nodes - num_users

    train_indices = []
    val_indices = []
    test_indices = []

    for user in range(num_users):
        # Filter items for this user
        user_mask = adj_matrix._indices()[0] == user
        user_items = adj_matrix._indices()[1][user_mask] - num_users  # Adjust item index

        # Shuffle items
        shuffled_indices = torch.randperm(user_items.size(0))
        user_items = user_items[shuffled_indices]

        # Split items
        num_train = int(user_items.size(0) * train_ratio)
        num_val = int(user_items.size(0) * val_ratio)

        train_items = user_items[:num_train]
        val_items = user_items[num_train:num_train + num_val]
        test_items = user_items[num_train + num_val:]

        # Add to indices lists
        train_indices.append(torch.stack([torch.full_like(train_items, user), train_items + num_users], dim=0))
        val_indices.append(torch.stack([torch.full_like(val_items, user), val_items + num_users], dim=0))
        test_indices.append(torch.stack([torch.full_like(test_items, user), test_items + num_users], dim=0))

    def create_sparse_tensor(indices_list):
        all_indices = torch.cat(indices_list, dim=1)
        values = torch.ones(all_indices.size(1))
        return torch.sparse.FloatTensor(all_indices, values, adj_matrix.size())

    train_adj_matrix = create_sparse_tensor(train_indices)
    val_adj_matrix = create_sparse_tensor(val_indices)
    test_adj_matrix = create_sparse_tensor(test_indices)

    return train_adj_matrix, val_adj_matrix, test_adj_matrix

# Example usage
train_adj_matrix, val_adj_matrix, test_adj_matrix = split_sparse_tensor(adj_matrix)


In [99]:
test_adj_matrix

tensor(indices=tensor([[    0,     1,     2,  ..., 76637, 76638, 76639],
                       [19704, 19706, 19711,  ..., 19222, 19222, 18920]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(76640, 76640), nnz=86674, layout=torch.sparse_coo)

### Graph Dataloader

In [100]:
print(items_df.shape)
print(items_df.item_idx.nunique())

(56946, 11)
56946


In [123]:
import numpy as np
import torch

class GraphDataLoader:
    def __init__(self, adj_matrix, users_df, items_df, num_negatives=1, batch_size=1024, device='cuda'):
        self.adj_matrix = adj_matrix.to(device)  # Move adjacency matrix to the specified device
        self.num_negatives = num_negatives
        self.batch_size = batch_size
        self.num_users = users_df['user_idx'].nunique()
        self.num_items = items_df['item_idx'].nunique()
        self.device = device  # Store the device
        self.negatives = self.precompute_negatives()

    def precompute_negatives(self):
        negatives = {}
        for user in range(self.num_users):
            user_row = self.adj_matrix[user].coalesce()
            if user_row._nnz() > 0:
                pos_items = user_row.indices()[0].cpu().numpy()  # Move to CPU for numpy compatibility
                neg_items = np.setdiff1d(np.arange(self.num_items), pos_items)
                negatives[user] = neg_items
            else:
                negatives[user] = np.arange(self.num_items)
        return negatives

    def generate_batch(self):
        for batch_start in range(0, self.num_users, self.batch_size):
            batch_users, batch_pos_items, batch_neg_items = [], [], []

            for user in range(batch_start, min(batch_start + self.batch_size, self.num_users)):
                user_row = self.adj_matrix[user].coalesce()
                if user_row._nnz() > 0:
                    pos_items = user_row.indices()[0].cpu().numpy()
                    pos_item = np.random.choice(pos_items)
                    neg_samples = np.random.choice(self.negatives[user], self.num_negatives, replace=False)

                    batch_users.extend([user] * self.num_negatives)
                    batch_pos_items.extend([pos_item] * self.num_negatives)
                    batch_neg_items.extend(neg_samples)  # No offset added here

            if batch_users:
                yield torch.tensor(batch_users, dtype=torch.long, device=self.device), \
                      torch.tensor(batch_pos_items, dtype=torch.long, device=self.device), \
                      torch.tensor(batch_neg_items, dtype=torch.long, device=self.device)

# Example usage
# data_loader = GraphDataLoader(adj_matrix, users_df, items_df)
# for batch in data_loader.generate_batch():
#     # process the batch


## Implementing the LightGCN architecture


In [117]:
import torch
import torch.nn as nn
from torch.sparse import mm as sparse_mm

class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, num_layers):
        super(LightGCN, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        # Initialize user and item embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # Initialization
        nn.init.normal_(self.user_embedding.weight, std=0.1)
        nn.init.normal_(self.item_embedding.weight, std=0.1)

    def forward(self, user_indices, item_indices, adj_matrix):
        # Ensure that user and item indices are within the valid range
        assert user_indices.min() >= 0 and user_indices.max() < self.num_users, "User indices out of range"
        assert item_indices.min() >= 0 and item_indices.max() < self.num_items, "Item indices out of range"

        # Create initial embeddings
        all_embeddings = torch.cat([self.user_embedding.weight, self.item_embedding.weight], dim=0)

        # List to hold all embeddings for each layer
        all_user_embs = [self.user_embedding(user_indices)]
        all_item_embs = [self.item_embedding(item_indices)]

        # Perform graph convolutions
        for _ in range(self.num_layers):
            all_embeddings = sparse_mm(adj_matrix, all_embeddings)

            user_emb = all_embeddings[:self.num_users]
            item_emb = all_embeddings[self.num_users:]

            all_user_embs.append(user_emb[user_indices])
            all_item_embs.append(item_emb[item_indices])

        # Compute final embeddings as the mean of all layers' embeddings
        final_user_emb = torch.mean(torch.stack(all_user_embs), dim=0)
        final_item_emb = torch.mean(torch.stack(all_item_embs), dim=0)

        # Predict ratings by computing the dot product of user and item embeddings
        scores = torch.sum(final_user_emb * final_item_emb, dim=1)

        return scores

# Example usage:
# model = LightGCN(num_users=1000, num_items=500, embedding_dim=64, num_layers=3)
# ... (setup data and training loop) ...


In [75]:
def train(model, data_loader, epochs, optimizer, loss_fn, adj_matrix):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        num_batches = 0  # Counter for the number of batches processed

        for batch_data in data_loader.generate_batch():
            users, pos_items, neg_items = batch_data
            if users is None:
                continue  # Skip the batch if it's empty

            optimizer.zero_grad()

            # Ensure tensors are on the GPU and adjust item indices
            users = users.cuda()
            pos_items = (pos_items - data_loader.num_users).cuda()  # Adjust positive item indices
            neg_items = (neg_items - data_loader.num_users).cuda()  # Adjust negative item indices

            # Positive and negative predictions
            pos_scores = model(users, pos_items, adj_matrix.cuda())
            neg_scores = model(users, neg_items, adj_matrix.cuda())

            # Compute loss
            loss = loss_fn(pos_scores, neg_scores)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1  # Increment batch counter

        # Compute average loss
        avg_loss = total_loss / num_batches if num_batches > 0 else 0
        print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

# Example usage:
# model = LightGCN(num_users, num_items, embedding_dim=64, num_layers=3).cuda()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# loss_fn = bpr_loss  # or any other loss function you defined
# data_loader = GraphDataLoader(adj_matrix, users_df, items_df, num_negatives=1, batch_size=64)
# train(model, data_loader, epochs=10, optimizer, loss_fn, adj_matrix)


In [118]:
def precision_at_k(targets, predictions, k=10):
    top_k_preds = predictions.topk(k, dim=1).indices
    relevant = targets.gather(1, top_k_preds)
    precision = relevant.sum().float() / (k * targets.size(0))
    return precision

def recall_at_k(targets, predictions, k=10):
    top_k_preds = predictions.topk(k, dim=1).indices
    relevant = targets.gather(1, top_k_preds)
    recall = relevant.sum().float() / targets.sum()
    return recall


In [119]:
def evaluate_model(model, data_loader):
    model.eval()
    precision_sum, recall_sum = 0, 0
    with torch.no_grad(): 
        for batch in data_loader:
            users, items = batch
            scores = model(users, items, adj_matrix)
            targets = adj_matrix[users].to_dense()  # Convert to dense for simplicity

            precision_sum += precision_at_k(targets, scores, k)
            recall_sum += recall_at_k(targets, scores, k)
    
    precision = precision_sum / len(data_loader)
    recall = recall_sum / len(data_loader)
    return precision, recall


In [120]:
   
# BPR Loss
def bpr_loss(pos_scores, neg_scores):
    return -torch.mean(torch.log(torch.sigmoid(pos_scores - neg_scores)))


## Model Training

In [114]:
# train_adj_matrix, val_adj_matrix, test_adj_matrix = split_adj_matrix(adj_matrix)
train_loader = GraphDataLoader(train_adj_matrix, users_df, items_df, num_negatives=1, batch_size=64)
val_loader = GraphDataLoader(val_adj_matrix, users_df, items_df, num_negatives=1, batch_size=64)
test_loader = GraphDataLoader(test_adj_matrix, users_df, items_df, num_negatives=1, batch_size=64)



In [121]:
import torch
def train_and_validate(model, train_loader, val_loader, epochs, optimizer, loss_fn, adj_matrix, early_stopping_rounds=5, device='cuda'):
    best_val_loss = float('inf')
    no_improvement_epochs = 0

    model = model.to(device)
    # adj_matrix = adj_matrix.to(device)
    # train_loader = train_loader.to(device)
    # val_loader = val_loader.to(device)
    # model = model.to(device)

    for epoch in range(epochs):
        # Training phase
        model.train()
        total_train_loss = 0
        num_train_batches = 0

        for users, pos_items, neg_items in train_loader.generate_batch():
            # Move tensors to the specified device
            users, pos_items, neg_items = users.to(device), pos_items.to(device), neg_items.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            pos_scores = model(users, pos_items, adj_matrix)
            neg_scores = model(users, neg_items, adj_matrix)

            # Compute loss
            loss = loss_fn(pos_scores, neg_scores)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            num_train_batches += 1

        avg_train_loss = total_train_loss / num_train_batches if num_train_batches > 0 else 0
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_train_loss:.4f}", end='')

        # Validation phase
        model.eval()
        total_val_loss = 0
        num_val_batches = 0

        with torch.no_grad():
            for users, pos_items, neg_items in val_loader.generate_batch():
                # Move tensors to the specified device
                users, pos_items, neg_items = users.to(device), pos_items.to(device), neg_items.to(device)

                # Forward pass
                pos_scores = model(users, pos_items, adj_matrix)
                neg_scores = model(users, neg_items, adj_matrix)

                # Compute loss
                loss = loss_fn(pos_scores, neg_scores)

                total_val_loss += loss.item()
                num_val_batches += 1

        avg_val_loss = total_val_loss / num_val_batches if num_val_batches > 0 else 0
        print(f" - Validation Loss: {avg_val_loss:.4f}")

        # Check for early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            no_improvement_epochs = 0
            # Save the best model
            torch.save(model.state_dict(), 'models/best_model.pth')
        else:
            no_improvement_epochs += 1
            if no_improvement_epochs >= early_stopping_rounds:
                print(f"Early stopping triggered after {epoch + 1} epochs.")
                break

# Load the best model after training is complete or early stopping is triggered
# model.load_state_dict(torch.load('best_model.pth'))


In [122]:
# Define the model and optimizer with L2 regularization
device = 'cuda'
model = LightGCN(num_users, num_items, embedding_dim=64, num_layers=3).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
loss_fn = bpr_loss  # Define your loss function
# data_loader = GraphDataLoader(adj_matrix, users_df, items_df, num_negatives=1, batch_size=64)
adj_matrix = adj_matrix.to(device)
# Corrected function call
train_and_validate(model, train_loader, val_loader, 10, optimizer, loss_fn, adj_matrix, early_stopping_rounds=5, device='cuda')
    


AssertionError: Item indices out of range

In [None]:

# Train the model
train(model, data_loader, epochs=10, optimizer=optimizer, loss_fn=bpr_loss)

# Evaluate the model
# Define an appropriate metric function depending on your task
print("Evaluation Metric:", evaluate_model(model, data_loader))


In [None]:

# Save the model
torch.save(model.state_dict(), 'models/book_recommend_lightgcn_features_model_epochs30.pth')

In [None]:
K = 20
LAMBDA = 1e-6
BATCH_SIZE = 1024

num_user_features = user_features.shape[1]-1
print(num_user_features)
num_item_features = item_features.shape[1]-1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model = LightGCN(num_users, num_items, num_user_features, num_item_features, num_layers=4, dim_h=64)
model = LightGCN(num_users, num_items, num_user_features, num_item_features)

model = model.to(device)
edge_index = edge_index.to(device)
train_edge_index = train_edge_index.to(device)
val_edge_index = val_edge_index.to(device)
user_features = user_features.to(device)

# check the dimensions of the user_features tensor
print(user_features.shape)
# 
item_features = item_features.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
%%time

n_batch = int(len(train_index) / BATCH_SIZE)

for epoch in range(31):
    model.train()  # Set the model to training mode

    total_train_loss = 0  # To accumulate loss over the epoch

    for _ in range(n_batch):
        optimizer.zero_grad()

        # Forward pass
        emb_users_final, emb_users, emb_items_final, emb_items = model(
            train_edge_index, user_features, item_features)

        # Sample a mini-batch
        user_indices, pos_item_indices, neg_item_indices = sample_mini_batch(train_edge_index)

        # Select the corresponding embeddings
        emb_users_final_batch = emb_users_final[user_indices]
        emb_users_batch = emb_users[user_indices]
        emb_pos_items_final_batch = emb_items_final[pos_item_indices]
        emb_pos_items_batch = emb_items[pos_item_indices]
        emb_neg_items_final_batch = emb_items_final[neg_item_indices]
        emb_neg_items_batch = emb_items[neg_item_indices]

        # Compute loss
        train_loss = bpr_loss(emb_users_final_batch, emb_users_batch, 
                              emb_pos_items_final_batch, emb_pos_items_batch, 
                              emb_neg_items_final_batch, emb_neg_items_batch)

        # Backward pass and optimization
        train_loss.backward()
        optimizer.step()

        total_train_loss += train_loss.item()

    # Average training loss for the epoch
    avg_train_loss = total_train_loss / n_batch

    # Validation step
    if epoch % 5 == 0:
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            val_loss, recall, ndcg = test(model, val_edge_index, [train_edge_index])
            print(f"Epoch {epoch} | Train loss: {avg_train_loss:.5f} | Val loss: {val_loss:.5f} | Val recall@{K}: {recall:.5f} | Val ndcg@{K}: {ndcg:.5f}")

# Save the model after training is complete
model_path = 'models/book_recommend_lightgcn_features_model_epochs30.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved at {model_path}")

In [None]:
print("Max index in user_features:", user_features[:, 0].max().item())
print("Number of users:", num_users)


In [None]:
test_loss, test_recall, test_ndcg = test(model, test_edge_index.to(device), [train_edge_index, val_edge_index])

print(f"Test loss: {test_loss:.5f} | Test recall@{K}: {test_recall:.5f} | Test ndcg@{K}: {test_ndcg:.5f}")

In [None]:
# Assuming 'model' is your GNN model
torch.save(model, 'models/book_recommend_lightgcn_features_model_epochs30.pth')


## Recommending books for a particular user

In [None]:
model = torch.load('models/book_recommend_lightgcn_model_epochs30.pth')
model.eval()

In [None]:
bookid_title = pd.Series(items_df['title'].values, index=items_df.ISBN).to_dict()
bookid_author = pd.Series(items_df['author'].values, index=items_df.ISBN).to_dict()
user_pos_items = get_user_items(edge_index)

In [None]:
def recommend(user_id, num_recs):
    user = user_mapping.get(user_id)
    print("user_idx is ", user)
    if user is None:
        print(f"User ID {user_id} not found.")
        return [], []

    emb_user = model.emb_users.weight[user]
    ratings = model.emb_items.weight @ emb_user

    values, indices = torch.topk(ratings, k=100)

    ids = [index.cpu().item() for index in indices if index in user_pos_items[user]][:num_recs]
    item_isbns = [list(item_mapping.keys())[list(item_mapping.values()).index(book)] for book in ids]
    titles = [bookid_title[id] for id in item_isbns]
    authors = [bookid_author[id] for id in item_isbns]

    print(f'Favorite books from user n°{user_id}:')
    for i in range(len(item_isbns)):
        print(f'- {titles[i]}, by {authors[i]}')

    ids = [index.cpu().item() for index in indices if index not in user_pos_items[user]][:num_recs]
    item_isbns = [list(item_mapping.keys())[list(item_mapping.values()).index(book)] for book in ids]
    titles = [bookid_title[id] for id in item_isbns]
    authors = [bookid_author[id] for id in item_isbns]

    print(f'\nRecommended books for user n°{user_id}')
    for i in range(num_recs):
        print(f'- {titles[i]}, by {authors[i]}') 

In [None]:
recommend(2084, 5)

In [None]:
recommend(3305, 5)

In [None]:
recommend(277427, 5)

In [None]:
recommend(277427, 5)

In [None]:
recommend(2084, 5)

In [None]:
recommend(1519, 5)