In [29]:
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
import numpy as np

In [30]:
def load_data():
    reviews = pd.read_pickle('../Pickle/reviews.pkl')
    books = pd.read_pickle('../Pickle/books.pkl')
    read = pd.read_pickle('../Pickle/read.pkl')
    review_embeddings = pd.read_pickle('../Pickle/review_embeddings.pkl')
    return reviews, books, read, review_embeddings

In [31]:
def align_review_embeddings(reviews, review_embeddings):
    review_embeddings = pd.merge(reviews[['review_id', 'user_id', 'book_id']], review_embeddings, on='review_id', how='left')
    return review_embeddings

In [32]:
def filter_data(reviews, books, read, review_embeddings):
    reviews = reviews[reviews['book_id'].isin(books['book_id'])]
    read = read[read['book_id'].isin(books['book_id'])]
    common_users = pd.merge(read[['user_id']], reviews[['user_id']], on='user_id')['user_id'].unique()
    read = read[read['user_id'].isin(common_users)]
    reviews = reviews[reviews['user_id'].isin(common_users)]
    return reviews, books, read, review_embeddings

In [33]:
def split_data(read, reviews, review_embeddings):
    train_read, test_read = train_test_split(read, test_size=0.2, random_state=42)
    
    # Filter reviews and review_embeddings to contain only users and books from the train and test sets
    train_user_book_pairs = train_read[['user_id', 'book_id']]
    test_user_book_pairs = test_read[['user_id', 'book_id']]

    train_reviews = reviews[reviews[['user_id', 'book_id']].apply(tuple, axis=1).isin(train_user_book_pairs.apply(tuple, axis=1))]
    test_reviews = reviews[reviews[['user_id', 'book_id']].apply(tuple, axis=1).isin(test_user_book_pairs.apply(tuple, axis=1))]
    
    train_review_embeddings = review_embeddings[review_embeddings[['user_id', 'book_id']].apply(tuple, axis=1).isin(train_user_book_pairs.apply(tuple, axis=1))]
    test_review_embeddings = review_embeddings[review_embeddings[['user_id', 'book_id']].apply(tuple, axis=1).isin(test_user_book_pairs.apply(tuple, axis=1))]

    return train_read, test_read, train_reviews, test_reviews, train_review_embeddings, test_review_embeddings

In [34]:
def balance_data(train_read):
    high_ratings = train_read[train_read['rating'] >= 4]
    low_ratings = train_read[train_read['rating'] < 4]
    low_ratings_upsampled = resample(low_ratings, replace=True, n_samples=len(high_ratings), random_state=42)
    train_read = pd.concat([low_ratings_upsampled, high_ratings])
    return train_read

In [42]:
def prepare_tensors(train_read, test_read, train_reviews, test_reviews, train_review_embeddings, test_review_embeddings, books):
    train_read = balance_data(train_read)
    train_ratings_tensor = torch.tensor(train_read['rating'].values, dtype=torch.float32)
    test_ratings_tensor = torch.tensor(test_read['rating'].values, dtype=torch.float32)

    user_features = pd.concat([train_reviews[['user_id', 'Confidence Score']], test_reviews[['user_id', 'Confidence Score']]]).drop_duplicates().reset_index(drop=True)
    book_features = books[['book_id', 'filtered_genres']].drop_duplicates().reset_index(drop=True)

    merged_books_train = pd.merge(book_features, train_review_embeddings, on='book_id', how='left')
    merged_books_test = pd.merge(book_features, test_review_embeddings, on='book_id', how='left')

    # Drop rows with missing embeddings
    merged_books_train = merged_books_train.dropna(subset=['embeddings'])
    merged_books_test = merged_books_test.dropna(subset=['embeddings'])

    # Determine the embedding dimension
    if not merged_books_train['embeddings'].isnull().all():
        embedding_dim = merged_books_train['embeddings'].dropna().iloc[0].shape[0]
    elif not merged_books_test['embeddings'].isnull().all():
        embedding_dim = merged_books_test['embeddings'].dropna().iloc[0].shape[0]
    else:
        raise ValueError("All embedding values are missing in both train and test datasets.")

    genre_encoder = OneHotEncoder(sparse_output=True)
    book_genres_encoded = genre_encoder.fit_transform(books[['filtered_genres']])
    book_genres_encoded_coo = book_genres_encoded.tocoo()

    values = book_genres_encoded_coo.data
    indices = np.vstack((book_genres_encoded_coo.row, book_genres_encoded_coo.col))
    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = book_genres_encoded_coo.shape
    book_features_tensor = torch.sparse_coo_tensor(i, v, torch.Size(shape)).coalesce()

    book_embeddings_tensor_train = torch.tensor(np.stack(merged_books_train['embeddings'].values), dtype=torch.float32)
    book_embeddings_tensor_test = torch.tensor(np.stack(merged_books_test['embeddings'].values), dtype=torch.float32)

    book_features_tensor_dense = book_features_tensor.to_dense()
    book_features_tensor_train = book_features_tensor_dense[:len(book_embeddings_tensor_train)].to_sparse().coalesce()
    book_features_tensor_test = book_features_tensor_dense[:len(book_embeddings_tensor_test)].to_sparse().coalesce()

    combined_book_features_tensor_train = torch.cat([book_features_tensor_train.to_dense(), book_embeddings_tensor_train], dim=1)
    combined_book_features_tensor_test = torch.cat([book_features_tensor_test.to_dense(), book_embeddings_tensor_test], dim=1)

    user_features_tensor = torch.tensor(user_features.values, dtype=torch.float32)
    num_features_to_pad = combined_book_features_tensor_train.shape[1] - user_features_tensor.shape[1]
    padding = torch.zeros((user_features_tensor.shape[0], num_features_to_pad))
    user_features_tensor_padded = torch.cat([user_features_tensor, padding], dim=1)
    user_features_coo = user_features_tensor_padded.to_sparse().coalesce()

    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_features['user_id'].unique())}
    book_id_to_index = {book_id: idx + len(user_id_to_index) for idx, book_id in enumerate(book_features['book_id'].unique())}

    return train_read, test_read, train_ratings_tensor, test_ratings_tensor, user_features_coo, combined_book_features_tensor_train, combined_book_features_tensor_test, user_id_to_index, book_id_to_index


In [43]:
def prepare_edge_index_and_ratings(df, user_id_to_index, book_id_to_index):
    edge_index = []
    ratings = []
    for _, row in df.iterrows():
        user_idx = user_id_to_index[row['user_id']]
        book_idx = book_id_to_index[row['book_id']]
        edge_index.append([user_idx, book_idx])
        ratings.append(row['rating'])
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    ratings_tensor = torch.tensor(ratings, dtype=torch.float)
    return edge_index, ratings_tensor

In [44]:
def prepare_data_objects(train_read, test_read, user_features_coo, book_features_tensor_train, book_features_tensor_test, user_id_to_index, book_id_to_index):
    train_edge_index, train_ratings_tensor = prepare_edge_index_and_ratings(train_read, user_id_to_index, book_id_to_index)
    test_edge_index, test_ratings_tensor = prepare_edge_index_and_ratings(test_read, user_id_to_index, book_id_to_index)

    user_indices = user_features_coo.indices()
    book_indices_train = book_features_tensor_train.indices() + torch.tensor([[user_features_coo.shape[0]], [0]])
    book_indices_test = book_features_tensor_test.indices() + torch.tensor([[user_features_coo.shape[0]], [0]])

    combined_indices_train = torch.cat([user_indices, book_indices_train], dim=1)
    combined_indices_test = torch.cat([user_indices, book_indices_test], dim=1)

    train_data = Data(edge_index=combined_indices_train, y=train_ratings_tensor)
    test_data = Data(edge_index=combined_indices_test, y=test_ratings_tensor)

    return train_data, test_data, train_edge_index, test_edge_index


In [38]:
reviews, books, read, review_embeddings = load_data()

In [39]:
review_embeddings = align_review_embeddings(reviews, review_embeddings)

In [40]:
reviews, books, read, review_embeddings = filter_data(reviews, books, read, review_embeddings)
train_read, test_read, train_reviews, test_reviews, train_review_embeddings, test_review_embeddings = split_data(read, reviews, review_embeddings)
train_read = balance_data(train_read)
train_read, test_read, train_ratings_tensor, test_ratings_tensor, user_features_coo, combined_book_features_tensor_train, combined_book_features_tensor_test, user_id_to_index, book_id_to_index = prepare_tensors(train_read, test_read, train_reviews, test_reviews, train_review_embeddings, test_review_embeddings, books)
train_data, test_data, train_edge_index, test_edge_index = prepare_data_objects(train_read, test_read, user_features_coo, combined_book_features_tensor_train, combined_book_features_tensor_test, user_id_to_index, book_id_to_index) 

RuntimeError: indices expected sparse coordinate tensor layout but got Strided

In [None]:
class GATModel(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_heads):
        super(GATModel, self).__init__()
        self.gat1 = GATConv(in_channels, hidden_channels, heads=num_heads)
        self.gat2 = GATConv(hidden_channels * num_heads, hidden_channels, heads=num_heads)
        self.gat3 = GATConv(hidden_channels * num_heads, hidden_channels, heads=num_heads)
        self.gat4 = GATConv(hidden_channels * num_heads, hidden_channels, heads=num_heads)
        self.gat5 = GATConv(hidden_channels * num_heads, out_channels, heads = num_heads, concat=False)
        
    def forward(self, x, edge_index):
        x = self.gat1(x, edge_index)
        x = F.elu(x)
        x = self.gat2(x, edge_index)
        x = F.elu(x)
        x = self.gat3(x, edge_index)
        x = F.elu(x)
        x = self.gat4(x, edge_index)
        x = F.elu(x)
        x = self.gat4(x, edge_index)
        x = F.elu(x)
        x = self.gat5(x, edge_index)
    
        # Extract node embeddings and compute edge outputs
        edge_outputs = x[edge_index[0]] * x[edge_index[1]]
        edge_outputs = torch.sigmoid(edge_outputs.sum(dim=-1)) * 5  # Scale the output to the range [0, 5]
        return edge_outputs

    def predict(self, x, edge_index):
        self.eval()
        with torch.no_grad():
            return self.forward(x, edge_index)


# Define the model parameters
in_channels = train_data.x.shape[1]
hidden_channels = 16
out_channels = 1
num_heads = 8

# Initialize the model
model = GATModel(in_channels, hidden_channels, out_channels, num_heads)

# Define loss and optimizer
criterion = torch.nn.SmoothL1Loss() 
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.00001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.2)

def train(model, train_data, val_data, criterion, optimizer, num_epochs, batch_size):
    train_loader = DataLoader([train_data], batch_size=batch_size, shuffle=True)
    val_loader = DataLoader([val_data], batch_size=batch_size)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for data in train_loader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index)

            if out.shape != data.y.shape:
                raise ValueError(f'Shape mismatch: output {out.shape}, target {data.y.shape}')
            
            loss = criterion(out, data.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}")

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for data in val_loader:
                out = model(data.x, data.edge_index)
                if out.shape != data.y.shape:
                    raise ValueError(f'Shape mismatch: output {out.shape}, target {data.y.shape}')
                
                val_loss = criterion(out, data.y)
                total_val_loss += val_loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        print(f"                Val Loss: {avg_val_loss:.4f}")

    return model

# Train the model
trained_model = train(model, train_data, test_data, criterion, optimizer, num_epochs=25, batch_size=32)

In [None]:
def precision_at_k(r, k):
    """Calculates precision at k"""
    r = np.asarray(r)[:k]
    return np.mean(r)

def recall_at_k(r, k, all_positives):
    """Calculates recall at k"""
    r = np.asarray(r)[:k]
    if all_positives == 0:
        return 0
    return np.sum(r) / all_positives

def ndcg_at_k(actual_sorted, k):
    actual_sorted_padded = np.pad(actual_sorted, (0, max(0, k - len(actual_sorted))), 'constant')
    ideal_sorted = np.pad(np.sort(actual_sorted)[::-1], (0, max(0, k - len(actual_sorted))), 'constant')
    ideal_dcg = np.sum(ideal_sorted[:k] / np.log2(np.arange(2, k + 2)))
    dcg = np.sum(actual_sorted_padded[:k] / np.log2(np.arange(2, k + 2)))
    return dcg / ideal_dcg if ideal_dcg > 0 else 0

In [None]:
def evaluate(data, model, target_ratings, k):
    model.eval()
    with torch.no_grad():
        out = model.predict(data.x, data.edge_index).squeeze()
        predicted_ratings = out[data.edge_index[0]].cpu().numpy()
        actual_ratings = (target_ratings >= 4).float().cpu().numpy()

        user_ids = data.edge_index[0].cpu().numpy()
        book_ids = data.edge_index[1].cpu().numpy()

        precision_scores = []
        recall_scores = []
        ndcg_scores = []

        for user_id in np.unique(user_ids):
            user_indices = user_ids == user_id
            actual = actual_ratings[user_indices]
            predicted = predicted_ratings[user_indices]

            sorted_indices = np.argsort(predicted)[::-1]
            actual_sorted = actual[sorted_indices]

            precision = precision_at_k(actual_sorted, k)
            recall = recall_at_k(actual_sorted, k, np.sum(actual))
            ndcg = ndcg_at_k(actual_sorted, k)

            precision_scores.append(precision)
            recall_scores.append(recall)
            ndcg_scores.append(ndcg)

        mean_precision = np.mean(precision_scores)
        mean_recall = np.mean(recall_scores)
        mean_ndcg = np.mean(ndcg_scores)

        print(f'Precision@{k}: {mean_precision:.4f}')
        print(f'Recall@{k}: {mean_recall:.4f}')
        print(f'NDCG@{k}: {mean_ndcg:.4f}')

evaluate(test_data, model, test_ratings_tensor, k=10)


In [20]:
def recommend(data, model, user_id, top_n):
    model.eval()
    with torch.no_grad():
        user_index = user_id_to_index[user_id]
        book_indices = torch.arange(len(user_id_to_index), len(user_id_to_index) + len(book_id_to_index))
        user_edge_index = torch.stack([torch.full_like(book_indices, user_index), book_indices], dim=0)
        out = model.predict(data.x, user_edge_index)
        predicted_ratings = out.squeeze().cpu().numpy()
        read_books = set(read[read['user_id'] == user_id]['book_id'])
        all_books = list(book_id_to_index.keys())
        unread_books_indices = [i for i in range(len(all_books)) if all_books[i] not in read_books]
        unread_books_ratings = predicted_ratings[unread_books_indices]
        top_n_indices = np.argsort(unread_books_ratings)[::-1][:top_n]
        recommended_books = [(all_books[i], books[books['book_id'] == all_books[i]]['title'].values[0], unread_books_ratings[i]) for i in top_n_indices]
        return recommended_books


In [21]:
# Combine the edge indices for all data
combined_edge_index = torch.cat([train_edge_index, test_edge_index], dim=1) 
# Prepare PyTorch Geometric Data object for all data 
combined_data = Data(x=all_features, edge_index=combined_edge_index)

In [None]:
recommended_books = recommend(combined_data, model, user_id=1, top_n=5)
for book_id, title, rating in recommended_books:
    print(f"Book ID: {book_id}, Title: {title}, Predicted Rating: {rating:.2f}")

In [None]:
recommended_books = recommend(combined_data, model, user_id=4, top_n=5)
for book_id, title, rating in recommended_books:
    print(f"Book ID: {book_id}, Title: {title}, Predicted Rating: {rating:.2f}")

In [None]:
recommended_books = recommend(combined_data, model, user_id=15, top_n=5)
for book_id, title, rating in recommended_books:
    print(f"Book ID: {book_id}, Title: {title}, Predicted Rating: {rating:.2f}")

In [None]:
recommended_books = recommend(combined_data, model, user_id=67, top_n=5)
for book_id, title, rating in recommended_books:
    print(f"Book ID: {book_id}, Title: {title}, Predicted Rating: {rating:.2f}")