In [None]:
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [None]:
def load_data(data_dir='../Pickle'):
    """Load the five pickled inputs used by this notebook.

    Parameters
    ----------
    data_dir : str or os.PathLike, default '../Pickle'
        Directory containing ``reviews.pkl``, ``books.pkl``, ``read.pkl``,
        ``review_embeddings.pkl`` and ``user_most_common_genres.pkl``.
        The default reproduces the previously hard-coded location.

    Returns
    -------
    tuple
        ``(reviews, books, read, review_embeddings, user_genres)``, in that
        order, exactly as un-pickled.

    Notes
    -----
    SECURITY: ``pd.read_pickle`` executes arbitrary code embedded in the file.
    Only point ``data_dir`` at pickles you produced yourself or otherwise trust.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    base = Path(data_dir)
    file_stems = (
        'reviews',
        'books',
        'read',
        'review_embeddings',
        'user_most_common_genres',
    )
    reviews, books, read, review_embeddings, user_genres = (
        pd.read_pickle(base / f'{stem}.pkl') for stem in file_stems
    )
    return reviews, books, read, review_embeddings, user_genres

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

def split_data(data, book_id_to_index, test_size=0.2):
    """Split interactions into train/test sets, holding out books per user.

    Rows whose ``book_id`` is not a key of ``book_id_to_index`` are dropped
    first. Then, for every user, that user's distinct books are split with
    ``train_test_split`` (fixed ``random_state=42``) and all rows of a book go
    to the side the book landed on.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``user_id`` and ``book_id`` columns.
    book_id_to_index : dict
        Only its keys are used, as the set of admissible book ids.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` for each user.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        Both with a fresh 0..n-1 index and the same columns as ``data``.

    Notes
    -----
    A user with fewer than two distinct books cannot be split (the underlying
    ``train_test_split`` call raises for a single sample), so all of that
    user's rows are kept in the training set. Empty input yields two empty
    frames instead of the ``pd.concat`` "No objects to concatenate" error.
    """
    # Keep only books that have an index assigned.
    data = data[data['book_id'].isin(set(book_id_to_index))]
    empty_like_input = data.iloc[0:0].reset_index(drop=True)

    train_parts = []
    test_parts = []

    for _, user_rows in data.groupby('user_id'):
        user_books = list(user_rows['book_id'].unique())

        if len(user_books) < 2:
            # Nothing to hold out: a lone book stays in training so the user
            # is still seen by the model.
            train_parts.append(user_rows)
            continue

        train_books, test_books = train_test_split(
            user_books, test_size=test_size, random_state=42
        )
        train_parts.append(user_rows[user_rows['book_id'].isin(train_books)])
        test_parts.append(user_rows[user_rows['book_id'].isin(test_books)])

    train_data = (
        pd.concat(train_parts, ignore_index=True) if train_parts else empty_like_input.copy()
    )
    test_data = (
        pd.concat(test_parts, ignore_index=True) if test_parts else empty_like_input.copy()
    )

    return train_data, test_data


In [None]:
def normalize_ratings(train_data, test_data):
    """Z-score the ``rating`` column of both frames using training statistics.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames with a numeric ``rating`` column. Neither is modified; scaled
        copies are returned.

    Returns
    -------
    (train_scaled, test_scaled, mean_rating, std_rating)
        The two scaled frames plus the mean and standard deviation that were
        applied, so predictions can later be mapped back to the rating scale.

    Notes
    -----
    Only the training split is used to compute the statistics, so no test
    information leaks into the scaling. If the training standard deviation is
    zero or undefined (constant ratings, or fewer than two rows) a scale of
    ``1.0`` is used and returned, which avoids dividing by zero/NaN.
    """
    mean_rating = train_data['rating'].mean()
    std_rating = train_data['rating'].std()

    if pd.isna(std_rating) or std_rating == 0:
        std_rating = 1.0

    # ``assign`` returns new frames, leaving the caller's inputs untouched
    # (and avoiding SettingWithCopyWarning when they are slices).
    train_scaled = train_data.assign(rating=(train_data['rating'] - mean_rating) / std_rating)
    test_scaled = test_data.assign(rating=(test_data['rating'] - mean_rating) / std_rating)

    return train_scaled, test_scaled, mean_rating, std_rating

In [None]:
from sklearn.utils import resample

def upsample_lower_classes(data, target_column='rating', minority_classes=(0, 1, 2), n_samples=1000):
    """Resample each listed class to exactly ``n_samples`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``target_column``.
    target_column : str, default 'rating'
        Column holding the class label.
    minority_classes : iterable, default (0, 1, 2)
        Labels to resample. Each one is drawn with replacement
        (``random_state=42``) to ``n_samples`` rows, which *replace* that
        class's original rows; a class that already has more than
        ``n_samples`` rows therefore ends up smaller.
    n_samples : int, default 1000
        Number of rows drawn per listed class.

    Returns
    -------
    pandas.DataFrame
        Rows of all other classes followed by the resampled rows. The original
        index labels are kept, so the index can contain duplicates.

    Notes
    -----
    A listed class with no rows in ``data`` is skipped; drawing from an empty
    frame would otherwise raise inside ``resample``.
    """
    minority_classes = list(minority_classes)

    # Everything that is not being resampled passes through unchanged.
    parts = [data[~data[target_column].isin(minority_classes)]]

    for class_label in minority_classes:
        class_rows = data[data[target_column] == class_label]
        if class_rows.empty:
            continue
        parts.append(
            resample(class_rows, replace=True, n_samples=n_samples, random_state=42)
        )

    return pd.concat(parts)


In [None]:
def initialize_id_mappings(combined_data):
    """Assign a contiguous integer index to every distinct user and book id.

    Parameters
    ----------
    combined_data : pandas.DataFrame
        Frame with ``user_id`` and ``book_id`` columns.

    Returns
    -------
    (user_id_to_index, book_id_to_index) : tuple of dict
        Each maps an id to an integer in ``0..n-1``. Users and books are
        numbered independently, so both mappings start at 0.

    Notes
    -----
    Indices follow the order of first appearance in ``combined_data``.
    Enumerating a ``set`` instead would make the numbering depend on hash
    order, which for string ids changes from one Python process to the next
    and silently invalidates any saved embeddings or predictions.
    """
    user_ids = combined_data['user_id'].drop_duplicates().tolist()
    book_ids = combined_data['book_id'].drop_duplicates().tolist()

    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
    book_id_to_index = {book_id: idx for idx, book_id in enumerate(book_ids)}

    return user_id_to_index, book_id_to_index

In [None]:
def prepare_edge_index_ratings_attributes(df, user_id_to_index, book_id_to_index):
    """Turn an interaction frame into edge tensors.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``user_id``, ``book_id``, ``rating`` and ``embeddings`` columns;
        ``embeddings`` holds one vector per row and is used as the edge
        attribute.
    user_id_to_index, book_id_to_index : dict
        Id -> integer index. Rows whose user or book is missing from its
        mapping are skipped.

    Returns
    -------
    edge_index : torch.LongTensor, shape (2, E)
        Row 0 holds user indices, row 1 book indices, exactly as stored in the
        mappings (no offset is applied to the book indices here).
    ratings_tensor : torch.FloatTensor, shape (E,)
    edge_attrs_tensor : torch.FloatTensor
        Stacked embeddings of the kept rows.

    Notes
    -----
    The lookup is vectorised rather than looping with ``iterrows``. When no
    row survives, ``edge_index`` still has shape ``(2, 0)``.
    """
    # Unmapped ids become NaN, which marks the rows to drop.
    user_idx = df['user_id'].map(user_id_to_index)
    book_idx = df['book_id'].map(book_id_to_index)
    # Work on plain arrays so a non-unique frame index cannot affect masking.
    known = user_idx.notna().to_numpy() & book_idx.notna().to_numpy()

    sources = user_idx.to_numpy()[known].astype(np.int64)
    targets = book_idx.to_numpy()[known].astype(np.int64)
    ratings = np.asarray(df['rating'].to_numpy()[known], dtype=np.float32)

    # Convert edge_attrs to numpy array before tensor conversion
    edge_attrs = np.array(df['embeddings'].to_numpy()[known].tolist())

    edge_index = torch.tensor(np.vstack((sources, targets)), dtype=torch.long)
    ratings_tensor = torch.tensor(ratings, dtype=torch.float32)
    edge_attrs_tensor = torch.tensor(edge_attrs, dtype=torch.float32)

    print(f"Edge index shape: {edge_index.shape}")
    print(f"Ratings tensor shape: {ratings_tensor.shape}")
    print(f"Edge attributes tensor shape: {edge_attrs_tensor.shape}")

    return edge_index, ratings_tensor, edge_attrs_tensor

def prepare_data_objects(train_data, test_data, user_id_to_index, book_id_to_index, user_embedding_dim=64, book_embedding_dim=64, offset_book_indices=False):
    """Build the train and test ``Data`` graphs.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Interaction frames accepted by ``prepare_edge_index_ratings_attributes``.
    user_id_to_index, book_id_to_index : dict
        Id -> integer index mappings.
    user_embedding_dim, book_embedding_dim : int, default 64
        Width of the random node features. They must be equal, because the two
        feature blocks are stacked along dim 0 into a single matrix.
    offset_book_indices : bool, default False
        When True, ``len(user_id_to_index)`` is added to the book row of every
        ``edge_index`` so that book ``b`` points at row ``num_users + b`` of
        ``x``.

    Returns
    -------
    (train_data_obj, test_data_obj)
        Both share the same node feature tensor ``x`` (users first, then
        books); ``y`` holds per-edge ratings and ``edge_attr`` the per-edge
        review embeddings.

    Notes
    -----
    NOTE(review): ``x`` is laid out as ``[users; books]``, but with the default
    ``offset_book_indices=False`` the book indices in ``edge_index`` start at 0
    and therefore address *user* rows of ``x``. The default is kept because
    ``recommend`` in this notebook compares ``edge_index[1]`` against the
    unshifted ``book_id_to_index`` values; switch both together.
    Node features are drawn with ``torch.randn`` and no seed is set here, so
    they differ between runs unless the caller seeds torch.
    """
    train_edge_index, train_ratings_tensor, train_edge_attrs = prepare_edge_index_ratings_attributes(
        train_data, user_id_to_index, book_id_to_index
    )
    test_edge_index, test_ratings_tensor, test_edge_attrs = prepare_edge_index_ratings_attributes(
        test_data, user_id_to_index, book_id_to_index
    )

    num_users = len(user_id_to_index)
    num_books = len(book_id_to_index)

    if offset_book_indices:
        def _shift_books(edge_index):
            # Move book indices past the user block; empty graphs pass through.
            if edge_index.numel() == 0:
                return edge_index
            shifted = edge_index.clone()
            shifted[1] += num_users
            return shifted

        train_edge_index = _shift_books(train_edge_index)
        test_edge_index = _shift_books(test_edge_index)

    # Random initial features for users and books
    user_embeddings = torch.randn(num_users, user_embedding_dim)
    book_embeddings = torch.randn(num_books, book_embedding_dim)

    print(f"User embeddings shape: {user_embeddings.shape}")
    print(f"Book embeddings shape: {book_embeddings.shape}")

    # Stack users on top of books to form one node feature matrix
    node_embeddings = torch.cat([user_embeddings, book_embeddings], dim=0)

    print(f"Concatenated node embeddings shape: {node_embeddings.shape}")

    train_data_obj = Data(
        x=node_embeddings,
        edge_index=train_edge_index,
        y=train_ratings_tensor,
        edge_attr=train_edge_attrs
    )
    test_data_obj = Data(
        x=node_embeddings,
        edge_index=test_edge_index,
        y=test_ratings_tensor,
        edge_attr=test_edge_attrs
    )

    return train_data_obj, test_data_obj


In [None]:
# Load every pickled input, trim `books` to the columns used downstream, and
# join the read/rating log with that book metadata.
reviews, books, read, review_embeddings, user_genres = load_data()
books = books.loc[:, ['book_id', 'title', 'authors', 'filtered_genres']]
data = pd.merge(read, books, on='book_id')[
    ['user_id', 'book_id', 'rating', 'title', 'authors', 'filtered_genres']
].reset_index(drop=True)
user_genres = user_genres.reset_index()

In [None]:
# Keep only users that have at least five rows in `data`.
user_interaction_counts = data['user_id'].value_counts()
eligible_users = user_interaction_counts.loc[lambda counts: counts >= 5].index
data = data[data['user_id'].isin(eligible_users)].reset_index(drop=True)

In [None]:
# Assign integer indices to every user and book id.
# NOTE(review): `combined_data` is not assigned in any cell shown in this
# notebook (the cells above build `data`), so this raises NameError on a fresh
# kernel. Later code reads an 'embeddings' column that `data` does not select,
# so presumably `combined_data` is `data` joined with the review embeddings -
# confirm and add the cell that builds it.
user_id_to_index, book_id_to_index = initialize_id_mappings(combined_data)

In [None]:
# Per-user hold-out: 20% of each user's books go to the test frame.
# The mapping was built from this same frame, so the book filter inside
# split_data keeps every row here.
train_data, test_data = split_data(combined_data, book_id_to_index, test_size=0.2)

In [None]:
# Rebalance the training split only (the test split is left as observed):
# with the function defaults, rating values 0, 1 and 2 are each resampled to
# 1000 rows.
# NOTE(review): this overwrites `train_data`, so re-running the cell resamples
# the already-resampled frame - re-run the split cell first.
train_data = upsample_lower_classes(train_data)

In [None]:
# upsample_lower_classes concatenates without ignore_index, so the training
# index can hold duplicate labels; give both frames a clean 0..n-1 index.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [None]:
# Build the train/test graph objects with the default 64-wide random node
# features; both objects share the same node feature tensor.
train_data_obj, test_data_obj = prepare_data_objects(train_data, test_data, user_id_to_index, book_id_to_index)

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv

# Model / optimisation hyper-parameters.
# NOTE(review): hidden_channels and num_heads are only passed to the
# GATModel(...) call at the end of this cell; the GATModel class defined below
# hard-codes its layer sizes and takes neither - confirm which is intended.
in_channels = train_data_obj.x.shape[1]  # Width of the node feature matrix
out_channels = 1  # Single output for regression (e.g., predicted rating)
hidden_channels = 12
num_heads = 4
lr = 0.0001  # Learning rate handed to Adam in the next cell

from torch_geometric.nn import GATConv
import torch.nn.functional as F
import torch.nn as nn

class GATModel(nn.Module):
    """Two-layer GAT on a joint user/book node set with learned id embeddings.

    Parameters
    ----------
    num_users, num_books : int
        Sizes of the user and book embedding tables.
    num_features : int
        Embedding width, which is also the input width of the first GAT layer.

    Notes
    -----
    NOTE(review): ``forward`` casts ``data.x`` to long and uses it as embedding
    indices, so ``x`` is expected to hold integer ids (users first, then
    books). ``prepare_data_objects`` in this notebook stores random float
    features in ``x`` instead - confirm which representation is intended.
    NOTE(review): other cells construct this class with ``out_channels`` /
    ``num_heads`` keywords, call ``model(x, edge_index, edge_attr)`` and call
    ``model.predict(...)``; none of these are defined here and the interfaces
    need to be reconciled.
    """

    def __init__(self, num_users, num_books, num_features):
        super().__init__()

        # Learned embeddings for users and books
        self.user_embedding = nn.Embedding(num_users, num_features)
        self.book_embedding = nn.Embedding(num_books, num_features)

        # GAT layers: num_features -> 8 -> 1 (one value per node)
        self.gat1 = GATConv(num_features, 8)
        self.gat2 = GATConv(8, 1)

    def forward(self, data):
        """Return one value per node for the graph in ``data``.

        ``data`` must expose ``x`` and ``edge_index``. The first ``num_users``
        entries of ``x`` are looked up in the user table and the rest in the
        book table.
        """
        x, edge_index = data.x, data.edge_index

        x = x.long()  # nn.Embedding requires integer indices

        # Split at the size of the user table rather than reading the notebook
        # global `user_id_to_index`, so the module does not depend on kernel state.
        num_users = self.user_embedding.num_embeddings
        x_user = self.user_embedding(x[:num_users])
        x_book = self.book_embedding(x[num_users:])

        # Users first, then books: same layout as the node ordering
        x = torch.cat([x_user, x_book], dim=0)

        x = self.gat1(x, edge_index)
        x = F.elu(x)
        x = self.gat2(x, edge_index)  # output layer

        return x



# Model setup
# NOTE(review): GATModel.__init__ above is (num_users, num_books, num_features)
# and accepts neither `out_channels` nor `num_heads`, so this call raises
# TypeError as written. Reconcile the constructor and this call.
model = GATModel(in_channels, hidden_channels, out_channels=1, num_heads=num_heads)  # Intended: a single regression output



In [None]:
# Loss and optimizer
criterion = torch.nn.MSELoss()  # MSE loss for regression
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
# Halves the learning rate every 10 scheduler steps.
# NOTE(review): the train(...) call at the bottom of this cell is not given
# this scheduler, so as written the learning rate is never decayed - confirm.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

def train(model, train_data_obj, optimizer, criterion, num_epochs, scheduler=None):
    """Full-batch training loop.

    Each epoch runs one forward pass over the whole graph, computes
    ``criterion`` against ``train_data_obj.y`` and takes one optimizer step.

    Parameters
    ----------
    model : callable
        Called as ``model(x, edge_index, edge_attr)``.
    train_data_obj : object
        Must expose ``x``, ``edge_index``, ``edge_attr`` and ``y``.
    optimizer : torch.optim.Optimizer
    criterion : callable
        Loss taking ``(prediction, target)``.
    num_epochs : int
    scheduler : optional
        If given, ``scheduler.step()`` is called once per epoch after the
        optimizer step.

    Returns
    -------
    list of float
        Loss value of every epoch, in order.

    Raises
    ------
    ValueError
        If the model output and the target hold a different number of
        elements.

    Notes
    -----
    The target is reshaped to the output's shape. Forcing it to ``(-1, 1)``
    would silently broadcast against a 1-D output of length E into an
    ``(E, E)`` loss.
    """
    model.train()

    # The graph tensors do not change between epochs, so read them once.
    x = train_data_obj.x
    edge_index = train_data_obj.edge_index
    edge_attr = train_data_obj.edge_attr
    targets = train_data_obj.y

    losses = []
    for epoch in range(num_epochs):
        optimizer.zero_grad()

        out = model(x, edge_index, edge_attr)
        if out.numel() != targets.numel():
            raise ValueError(
                f"Model output shape {tuple(out.shape)} does not match "
                f"target shape {tuple(targets.shape)}"
            )

        loss = criterion(out, targets.reshape(out.shape))
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        losses.append(loss.item())
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

    return losses


# Validate the model
def validate(model, data):
    """Return the MSE of ``model`` on ``data`` without tracking gradients.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(data.x, data.edge_index, data.edge_attr)``; it is
        switched to eval mode and left there.
    data : object
        Must expose ``x``, ``edge_index``, ``edge_attr`` and ``y``.

    Returns
    -------
    float
        Mean squared error between the model output and ``data.y``.

    Raises
    ------
    ValueError
        If output and target hold a different number of elements.

    Notes
    -----
    The target is reshaped to the output's shape, so a 1-D output is compared
    element-wise instead of being broadcast against an ``(E, 1)`` target.
    """
    model.eval()
    criterion = torch.nn.MSELoss()
    with torch.no_grad():
        # Forward pass over the entire graph
        out = model(data.x, data.edge_index, data.edge_attr)

        if out.numel() != data.y.numel():
            raise ValueError(
                f"Model output shape {tuple(out.shape)} does not match "
                f"target shape {tuple(data.y.shape)}"
            )

        val_loss = criterion(out, data.y.reshape(out.shape))
    return val_loss.item()

# Train the model for a fixed number of full-batch epochs
num_epochs = 50
train(model, train_data_obj, optimizer, criterion, num_epochs)

# Report the MSE on the held-out graph
# NOTE(review): this is the only evaluation split visible here, so it doubles
# as validation and test - confirm no tuning is done against it.
test_loss = validate(model, test_data_obj)
print(f"Test Loss: {test_loss:.4f}")

In [None]:
def rescale_predictions(predictions, mean_rating, std_rating, min_rating=0, max_rating=5):
    """Undo z-score scaling and clip to the valid rating range.

    Parameters
    ----------
    predictions : array-like
        Model outputs on the normalised scale.
    mean_rating, std_rating : float
        Statistics that were used to normalise the ratings.
    min_rating, max_rating : float, default 0 and 5
        Bounds applied after rescaling; the defaults reproduce the previously
        hard-coded range.

    Returns
    -------
    Array of ``predictions * std_rating + mean_rating`` limited to
    ``[min_rating, max_rating]``.
    """
    rescaled = (predictions * std_rating) + mean_rating
    return np.clip(rescaled, min_rating, max_rating)

In [None]:
def precision_at_k(r, k):
    """Calculates precision at k.

    Parameters
    ----------
    r : array-like
        Relevance flags (1 = relevant, 0 = not) in ranked order.
    k : int
        Cut-off; must be positive.

    Returns
    -------
    Fraction of the top ``k`` positions that are relevant. The denominator is
    always ``k``, so a list shorter than ``k`` is treated as if it were padded
    with non-relevant items.

    Raises
    ------
    ValueError
        If ``k`` is not positive (``k == 0`` would divide by zero and a
        negative ``k`` would slice from the wrong end).
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")
    top_k = np.asarray(r)[:k]
    return np.sum(top_k) / k

def recall_at_k(r, k, all_positives):
    """Calculates recall at k.

    ``r`` holds relevance flags in ranked order and ``all_positives`` is the
    total number of relevant items. Returns hits in the top ``k`` divided by
    ``all_positives``, or 0 when there are no relevant items at all.
    """
    top_k = np.asarray(r)[:k]
    if all_positives != 0:
        return top_k.sum() / all_positives
    return 0

def ndcg_at_k(actual_sorted, k):
    """Calculates NDCG at k.

    ``actual_sorted`` holds relevance values in predicted rank order. Lists
    shorter than ``k`` are zero-padded. The ideal ordering is the same values
    sorted in descending order. Returns 0 when the ideal DCG is not positive.
    """
    shortfall = max(0, k - len(actual_sorted))
    # Position i (0-based) is discounted by log2(i + 2).
    discounts = np.log2(np.arange(2, k + 2))

    best_order = np.pad(np.sort(actual_sorted)[::-1], (0, shortfall), 'constant')
    given_order = np.pad(actual_sorted, (0, shortfall), 'constant')

    ideal_dcg = np.sum(best_order[:k] / discounts)
    dcg = np.sum(given_order[:k] / discounts)

    if ideal_dcg > 0:
        return dcg / ideal_dcg
    return 0
def evaluate(data, model, target_ratings, k, mean_rating, std_rating):
    """Print and return mean Precision@k, Recall@k and NDCG@k over users.

    Parameters
    ----------
    data : object
        Must expose ``x`` and ``edge_index``; row 0 of ``edge_index`` is used
        as the user of each edge.
    model : object
        Must provide ``eval()`` and ``predict(x, edge_index)``.
        NOTE(review): the ``GATModel`` class in this notebook defines no
        ``predict`` method - confirm which model is meant.
    target_ratings : torch.Tensor
        One rating per edge; a rating ``>= 3`` counts as relevant.
        NOTE(review): assumes these are on the raw rating scale, not
        normalised - TODO confirm.
    k : int
        Ranking cut-off.
    mean_rating, std_rating : float
        Passed to ``rescale_predictions``.

    Returns
    -------
    dict
        ``{'precision': ..., 'recall': ..., 'ndcg': ...}`` with the per-user
        means (NaN when there are no edges). The same values are printed.

    Notes
    -----
    NOTE(review): the score of an edge is ``out[edge_index[0]]``, i.e. the
    output at the edge's *user* node, so all edges of one user receive the
    same score and the within-user ranking is decided by sort tie-breaking
    only - confirm this is intended.
    """
    model.eval()
    with torch.no_grad():
        # Model prediction and rescaling
        out = model.predict(data.x, data.edge_index).squeeze()
        predicted_ratings = out[data.edge_index[0]].numpy()
        rescaled_predictions = rescale_predictions(predicted_ratings, mean_rating, std_rating)
        rounded_predictions = np.round(rescaled_predictions)

        # Convert actual ratings to binary relevance
        actual_ratings = (target_ratings >= 3).float().numpy()
        user_ids = data.edge_index[0].numpy()

    precision_scores = []
    recall_scores = []
    ndcg_scores = []

    if user_ids.size:
        # One stable sort groups the edges by user (ascending user index,
        # original edge order inside each group) instead of building a
        # boolean mask over all edges for every user.
        order = np.argsort(user_ids, kind='stable')
        boundaries = np.flatnonzero(np.diff(user_ids[order])) + 1
        user_groups = np.split(order, boundaries)
    else:
        user_groups = []

    for edge_positions in user_groups:
        actual = actual_ratings[edge_positions]
        predicted = rounded_predictions[edge_positions]

        # Rank this user's edges by predicted rating, best first
        sorted_indices = np.argsort(predicted)[::-1]
        actual_sorted = actual[sorted_indices]

        precision_scores.append(precision_at_k(actual_sorted, k))
        recall_scores.append(recall_at_k(actual_sorted, k, np.sum(actual)))
        ndcg_scores.append(ndcg_at_k(actual_sorted, k))

    def _mean(scores):
        # Average of the per-user scores; NaN (without a warning) when empty.
        return float(np.mean(scores)) if scores else float('nan')

    mean_precision = _mean(precision_scores)
    mean_recall = _mean(recall_scores)
    mean_ndcg = _mean(ndcg_scores)

    print(f'Average Precision@{k}: {mean_precision:.4f}')
    print(f'Average Recall@{k}: {mean_recall:.4f}')
    print(f'Average NDCG@{k}: {mean_ndcg:.4f}')

    return {'precision': mean_precision, 'recall': mean_recall, 'ndcg': mean_ndcg}

# Ranking metrics on the test graph at k = 10.
# NOTE(review): `test_ratings_tensor` is only a local variable inside
# prepare_data_objects, and `mean_rating` / `std_rating` are only produced by
# normalize_ratings, which no visible cell calls - all three are undefined at
# this point on a fresh kernel. The test ratings are available as
# `test_data_obj.y`.
evaluate(test_data_obj, model, test_ratings_tensor, k=10, mean_rating=mean_rating, std_rating=std_rating)

In [None]:
def recommend(data, rescaled_predictions, user_id, user_id_to_index, book_id_to_index, books, top_n):
    """Return the ``top_n`` highest-scored books the user has no edge to.

    Parameters
    ----------
    data : object
        Must expose ``edge_index`` (a 2 x E tensor: row 0 user indices, row 1
        book indices as stored in ``book_id_to_index``).
    rescaled_predictions : array-like or torch.Tensor
        Scores looked up by book index. A column vector of shape ``(N, 1)`` is
        flattened first.
        NOTE(review): assumes position ``i`` is the score of the book whose
        index is ``i`` - confirm against the layout of the model output.
    user_id : hashable
        Must be a key of ``user_id_to_index`` (``KeyError`` otherwise).
    user_id_to_index, book_id_to_index : dict
        Id -> integer index mappings.
    books : pandas.DataFrame
        Needs ``book_id`` and ``title`` columns; books without a title row are
        skipped.
    top_n : int
        Maximum number of recommendations.

    Returns
    -------
    list of (book_id, title, score)
        Sorted by score, best first; empty if the user has an edge to every
        book.
    """
    user_index = user_id_to_index[user_id]

    # Book indices this user already has an edge to; a set keeps the
    # membership test below O(1) per book.
    user_edges = data.edge_index[0] == user_index
    rated_book_indices = set(data.edge_index[1][user_edges].cpu().numpy().tolist())

    candidates = [
        (book_id, book_index)
        for book_id, book_index in book_id_to_index.items()
        if book_index not in rated_book_indices
    ]
    if not candidates:
        return []  # nothing left to recommend

    # Accept tensors as well as arrays, and flatten (N, 1) outputs so that
    # argsort ranks the candidates instead of sorting within size-1 rows.
    if hasattr(rescaled_predictions, 'detach'):
        rescaled_predictions = rescaled_predictions.detach().cpu().numpy()
    scores = np.asarray(rescaled_predictions).reshape(-1)
    candidate_scores = scores[[book_index for _, book_index in candidates]]

    # First title per book id, looked up once instead of scanning `books`
    # for every candidate.
    titles = books.drop_duplicates('book_id').set_index('book_id')['title']

    recommended_books = []
    for position in np.argsort(candidate_scores)[::-1]:
        if len(recommended_books) >= top_n:
            break
        book_id = candidates[position][0]
        if book_id in titles.index:
            recommended_books.append((book_id, titles.loc[book_id], candidate_scores[position]))

    return recommended_books


In [None]:
# Score the test graph without tracking gradients, then map the scores back
# to the rating scale for use by recommend().
with torch.no_grad():
    node_features = test_data_obj.x
    edge_indices = test_data_obj.edge_index
    # NOTE(review): the GATModel class in this notebook defines no `predict`
    # method - confirm which model provides it.
    predictions = model.predict(node_features, edge_indices)

    # NOTE(review): `mean_rating` / `std_rating` are not assigned by any
    # visible cell (normalize_ratings is never called) - confirm.
    rescaled_predictions = rescale_predictions(predictions, mean_rating, std_rating)

In [None]:
# Top-10 recommendations for user 1.
# NOTE(review): recommend() reads `data.edge_index`, so its first argument
# should be a graph object such as the ones built by prepare_data_objects;
# `combined_data` is used as a DataFrame elsewhere in this notebook - confirm.
recommend(combined_data, rescaled_predictions, user_id=1, user_id_to_index=user_id_to_index, book_id_to_index=book_id_to_index, books=books, top_n=10)

In [None]:
# Top-10 recommendations for user 9.
# NOTE(review): recommend() reads `data.edge_index`, so its first argument
# should be a graph object rather than the `combined_data` frame - confirm.
recommend(combined_data, rescaled_predictions, user_id=9, user_id_to_index=user_id_to_index, book_id_to_index=book_id_to_index, books=books, top_n=10)