In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

In [2]:
# Load the data
def load_data(pickle_dir='../Pickle'):
    """Load the four pickled objects the notebook works with.

    Args:
        pickle_dir: Directory containing ``reviews.pkl``, ``books.pkl``,
            ``read.pkl`` and ``user_most_common_genres.pkl``. Defaults to the
            relative location the notebook has always used.

    Returns:
        Tuple ``(reviews, books, read, user_genres)`` holding the unpickled
        contents of those files, in that order.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from pathlib import Path

    # NOTE(review): unpickling can execute arbitrary code - only point this
    # at files you produced yourself or otherwise trust.
    base = Path(pickle_dir)
    file_names = (
        'reviews.pkl',
        'books.pkl',
        'read.pkl',
        'user_most_common_genres.pkl',
    )
    reviews, books, read, user_genres = (
        pd.read_pickle(base / file_name) for file_name in file_names
    )
    return reviews, books, read, user_genres

In [3]:
# Initialize ID mappings for users and books
def initialize_id_mappings(combined_data):
    """Assign a contiguous integer index to every distinct user and book id.

    Indices follow the order in which ids first appear in ``combined_data``.
    Building the mapping from a ``set`` (as before) makes the assignment
    depend on hash ordering, which is not stable across interpreter runs for
    string ids, so saved tensors/models could not be matched to a mapping
    rebuilt in a fresh kernel.

    Args:
        combined_data: DataFrame with ``user_id`` and ``book_id`` columns.

    Returns:
        Tuple ``(user_id_to_index, book_id_to_index)`` of dicts mapping each
        raw id to an index in ``range(number of distinct ids)``.
    """
    def _index_by_first_appearance(column):
        # drop_duplicates keeps the first occurrence, preserving row order.
        ordered_ids = combined_data[column].drop_duplicates().tolist()
        return {raw_id: idx for idx, raw_id in enumerate(ordered_ids)}

    user_id_to_index = _index_by_first_appearance('user_id')
    book_id_to_index = _index_by_first_appearance('book_id')

    return user_id_to_index, book_id_to_index

In [4]:
def filter_and_split_data(ratings_data, user_genres, test_size=0.2, random_state=42,
                          min_book_ratings=5, min_user_ratings=5):
    """Filter sparse books/users, attach user genres, and split per user.

    Args:
        ratings_data: DataFrame with at least ``user_id`` and ``book_id``
            columns (one row per rating).
        user_genres: DataFrame with ``user_id`` and ``most_common_genres``.
        test_size: Passed to ``train_test_split`` for every user's book list.
        random_state: Passed to ``train_test_split`` for every user.
        min_book_ratings: A book is kept only if it has strictly MORE than
            this many ratings (default 5, the previously hard-coded value).
        min_user_ratings: A user is kept only if, after the book filter, they
            have strictly MORE than this many ratings (default 5).

    Returns:
        Tuple ``(train_data, test_data, user_genres, filtered_data)`` where
        ``user_genres`` is restricted to eligible users and ``filtered_data``
        is the filtered ratings merged with ``most_common_genres``. When no
        user survives the filters, ``train_data`` and ``test_data`` are empty
        frames with the columns of ``filtered_data``.
    """
    # Keep books with more than `min_book_ratings` ratings.
    book_counts = ratings_data['book_id'].value_counts()
    eligible_books = book_counts[book_counts > min_book_ratings].index
    ratings_data = ratings_data[ratings_data['book_id'].isin(eligible_books)]

    # Keep users with more than `min_user_ratings` ratings among those books.
    user_counts = ratings_data['user_id'].value_counts()
    eligible_users = user_counts[user_counts > min_user_ratings].index
    ratings_data = ratings_data[ratings_data['user_id'].isin(eligible_users)]

    # Restrict the genre table to the same users.
    user_genres = user_genres[user_genres['user_id'].isin(eligible_users)]

    # The inner merge drops ratings from users that have no genre row.
    filtered_data = ratings_data.merge(
        user_genres[['user_id', 'most_common_genres']], on='user_id', how='inner'
    )

    # Defensive re-check of the per-user row count on the merged frame,
    # vectorised instead of groupby(...).filter(lambda ...).
    rows_per_user = filtered_data['user_id'].map(filtered_data['user_id'].value_counts())
    user_data_valid = filtered_data[rows_per_user > min_user_ratings]

    train_dfs = []
    test_dfs = []

    # Split each user's distinct books so every user appears in both sets.
    for _, user_data in user_data_valid.groupby('user_id'):
        books = user_data['book_id'].unique()
        train_books, test_books = train_test_split(
            books, test_size=test_size, random_state=random_state
        )
        train_dfs.append(user_data[user_data['book_id'].isin(train_books)])
        test_dfs.append(user_data[user_data['book_id'].isin(test_books)])

    if not train_dfs:
        # pd.concat([]) raises ValueError; hand back empty frames instead.
        empty = user_data_valid.iloc[0:0]
        return empty.copy(), empty.copy(), user_genres, filtered_data

    train_data = pd.concat(train_dfs)
    test_data = pd.concat(test_dfs)

    return train_data, test_data, user_genres, filtered_data

In [5]:
# Normalize ratings
def normalize_ratings(train_data, test_data):
    """Standardise the ``rating`` column using training-set statistics.

    The inputs are left untouched; normalised copies are returned. Previously
    the caller's frames were overwritten in place, so the raw ratings were
    lost after the first call.

    Args:
        train_data: DataFrame with a numeric ``rating`` column; its mean and
            sample standard deviation define the scaling.
        test_data: DataFrame with a numeric ``rating`` column, scaled with the
            training statistics (no test information leaks into the scaling).

    Returns:
        Tuple ``(train_data, test_data, mean_rating, std_rating)``. If the
        training standard deviation is zero or undefined (constant ratings or
        fewer than two rows) ``std_rating`` falls back to ``1.0`` so ratings
        are only centred rather than turned into NaN/inf.
    """
    mean_rating = train_data['rating'].mean()
    std_rating = train_data['rating'].std()

    # Guard the division: a zero/NaN std would poison every rating.
    if not np.isfinite(std_rating) or std_rating == 0:
        std_rating = 1.0

    train_normalized = train_data.assign(
        rating=(train_data['rating'] - mean_rating) / std_rating
    )
    test_normalized = test_data.assign(
        rating=(test_data['rating'] - mean_rating) / std_rating
    )

    return train_normalized, test_normalized, mean_rating, std_rating


In [6]:
# Prepare edge index, ratings, and edge attributes (review embeddings removed)
def prepare_edge_index_ratings_attributes(df, user_id_to_index, book_id_to_index):
    """Turn a ratings frame into an edge index tensor and a ratings tensor.

    Rows whose ``user_id`` or ``book_id`` is missing from the mappings are
    dropped. ``Series.map`` marks such rows with NaN (never ``-1``), so the
    previous ``!= -1`` comparison kept them and produced NaN/float indices.

    Args:
        df: DataFrame with ``user_id``, ``book_id`` and ``rating`` columns.
        user_id_to_index: dict mapping raw user ids to integer indices.
        book_id_to_index: dict mapping raw book ids to integer indices.

    Returns:
        Tuple ``(edge_index, ratings_tensor)``: ``edge_index`` is an int64
        tensor of shape ``(2, E)`` with user indices in row 0 and book indices
        in row 1; ``ratings_tensor`` is a float32 tensor of length ``E``.
    """
    mapped_users = df['user_id'].map(user_id_to_index)
    mapped_books = df['book_id'].map(book_id_to_index)

    # Unknown ids come back as NaN from .map(); keep fully mapped rows only.
    is_known = mapped_users.notna() & mapped_books.notna()

    # The NaN rows are gone, so the (possibly float) columns cast safely.
    user_indices = mapped_users[is_known].astype('int64').to_numpy()
    book_indices = mapped_books[is_known].astype('int64').to_numpy()

    edge_index = torch.tensor(np.stack([user_indices, book_indices]), dtype=torch.long)
    ratings_tensor = torch.tensor(df.loc[is_known, 'rating'].to_numpy(), dtype=torch.float32)

    return edge_index, ratings_tensor

In [7]:
def _multi_hot_genres(genres, genre_to_column):
    """Return a float32 multi-hot vector with 1s at the columns of `genres`."""
    vector = np.zeros(len(genre_to_column))
    for genre in genres:
        column = genre_to_column.get(genre)
        if column is not None:  # genres outside the book vocabulary are ignored
            vector[column] = 1
    return torch.tensor(vector, dtype=torch.float32)


def align_user_and_book_features(filtered_data, user_id_to_index, book_id_to_index):
    """Encode user and book genres as multi-hot vectors over one vocabulary.

    The vocabulary is the sorted set of genres found by splitting the
    ``filtered_genres`` column on ``','``; user vectors use the same columns
    so both feature sets are aligned. Books whose ``filtered_genres`` value is
    missing (not a string) get an all-zero vector instead of raising.

    Args:
        filtered_data: DataFrame with ``user_id``, ``book_id``,
            ``filtered_genres`` (comma-separated string) and
            ``most_common_genres`` (an iterable of genre names) columns.
        user_id_to_index: dict mapping raw user ids to integer indices.
        book_id_to_index: dict mapping raw book ids to integer indices.

    Returns:
        Tuple ``(user_genre_features, book_genre_features)`` of dicts keyed by
        the integer index, each value a float32 tensor of vocabulary length.
    """
    # Missing values survive str.split/explode as NaN; drop them before
    # sorting, otherwise sorted() fails comparing float with str.
    book_genre_lists = filtered_data['filtered_genres'].str.split(',')
    unique_book_genres = sorted(set(book_genre_lists.explode().dropna()))
    book_genre_dict = {genre: idx for idx, genre in enumerate(unique_book_genres)}

    # One vector per user; the first row is used because the genre list is
    # taken to be identical on every row of the same user.
    user_genre_features = {}
    for user_id, group in filtered_data.groupby('user_id'):
        genres = group['most_common_genres'].iloc[0]
        user_genre_features[user_id_to_index[user_id]] = _multi_hot_genres(genres, book_genre_dict)

    # One vector per book, again taken from the book's first row.
    book_genre_features = {}
    for book_id, group in filtered_data.groupby('book_id'):
        raw_genres = group['filtered_genres'].iloc[0]
        genres = raw_genres.split(',') if isinstance(raw_genres, str) else []
        book_genre_features[book_id_to_index[book_id]] = _multi_hot_genres(genres, book_genre_dict)

    return user_genre_features, book_genre_features

In [8]:
from sklearn.decomposition import PCA

def apply_pca_on_features(user_genre_features, book_genre_features, n_components=20, random_state=42):
    """Project user and book feature vectors into a shared PCA space.

    A single PCA is fitted on the user rows stacked on top of the book rows,
    so both kinds of node end up in the same reduced coordinate system.

    Args:
        user_genre_features: dict ``index -> 1-D tensor`` for users.
        book_genre_features: dict ``index -> 1-D tensor`` for books (same
            vector length as the user vectors).
        n_components: Number of principal components to keep.
        random_state: Seed forwarded to ``PCA`` so that solver paths that use
            randomness give the same projection on every run. Defaults to 42,
            matching the seed used for the train/test split.

    Returns:
        Tuple ``(updated_user_genre_features, updated_book_genre_features)``
        with the same keys as the inputs and the reduced vectors as tensors.
    """
    user_keys = list(user_genre_features.keys())
    book_keys = list(book_genre_features.keys())

    all_user_features = torch.stack([user_genre_features[key] for key in user_keys])
    all_book_features = torch.stack([book_genre_features[key] for key in book_keys])

    # Hand scikit-learn a plain NumPy array rather than relying on implicit
    # tensor-to-array conversion.
    all_features = torch.cat([all_user_features, all_book_features], dim=0).detach().cpu().numpy()

    pca = PCA(n_components=n_components, random_state=random_state)
    reduced_features = pca.fit_transform(all_features)

    # Rows were stacked users-first, so the same boundary splits them again.
    num_users = len(user_keys)
    reduced_user_features = reduced_features[:num_users]
    reduced_book_features = reduced_features[num_users:]

    updated_user_genre_features = {
        key: torch.tensor(row) for key, row in zip(user_keys, reduced_user_features)
    }
    updated_book_genre_features = {
        key: torch.tensor(row) for key, row in zip(book_keys, reduced_book_features)
    }

    return updated_user_genre_features, updated_book_genre_features

In [9]:
# Prepare the data objects (train/test)
def prepare_data_objects(train_data, test_data, user_genre_features, book_genre_features, user_id_to_index, book_id_to_index):
    """Build the train and test ``Data`` graphs over a shared node matrix.

    Node layout: rows ``0 .. num_users-1`` of ``x`` are users and rows
    ``num_users .. num_users+num_books-1`` are books. Book indices in
    ``edge_index`` are therefore shifted by ``num_users``; without the shift a
    book index ``b`` addressed row ``b`` of ``x``, i.e. a user row (or the
    wrong book), so every edge pointed at the wrong node.

    Args:
        train_data: Ratings frame for training edges.
        test_data: Ratings frame for test edges.
        user_genre_features: dict ``user index -> 1-D feature tensor``.
        book_genre_features: dict ``book index -> 1-D feature tensor``.
        user_id_to_index: dict mapping raw user ids to indices.
        book_id_to_index: dict mapping raw book ids to indices.

    Returns:
        Tuple ``(train_data_obj, test_data_obj)`` of ``Data`` objects sharing
        the same ``x`` and carrying their own ``edge_index`` and ``y``.
    """
    train_edge_index, train_ratings_tensor = prepare_edge_index_ratings_attributes(
        train_data, user_id_to_index, book_id_to_index
    )
    test_edge_index, test_ratings_tensor = prepare_edge_index_ratings_attributes(
        test_data, user_id_to_index, book_id_to_index
    )

    num_users = len(user_id_to_index)
    num_books = len(book_id_to_index)

    # Take the feature width from any entry instead of assuming key 0 exists.
    user_feature_dim = len(next(iter(user_genre_features.values())))
    book_feature_dim = len(next(iter(book_genre_features.values())))

    # Nodes without a feature entry keep an all-zero row.
    user_embeddings = torch.zeros(num_users, user_feature_dim)
    book_embeddings = torch.zeros(num_books, book_feature_dim)

    for user_idx, user_feature in user_genre_features.items():
        user_embeddings[user_idx] = user_feature

    for book_idx, book_feature in book_genre_features.items():
        book_embeddings[book_idx] = book_feature

    node_embeddings = torch.cat([user_embeddings, book_embeddings], dim=0)

    def _to_global_node_ids(edge_index):
        # Move book indices into the book section of the stacked node matrix.
        shifted = edge_index.clone()
        shifted[1] = shifted[1] + num_users
        return shifted

    train_data_obj = Data(
        x=node_embeddings,
        edge_index=_to_global_node_ids(train_edge_index),
        y=train_ratings_tensor
    )
    test_data_obj = Data(
        x=node_embeddings,
        edge_index=_to_global_node_ids(test_edge_index),
        y=test_ratings_tensor
    )

    return train_data_obj, test_data_obj

In [10]:
# Read the pickled inputs
reviews, books, read, user_genres = load_data()

# Attach book metadata to each read/rating record and keep the modelling columns
books = books.loc[:, ['book_id', 'title', 'authors', 'filtered_genres']]
data = (
    read.merge(books, on='book_id')
    .loc[:, ['user_id', 'book_id', 'rating', 'filtered_genres']]
    .reset_index(drop=True)
)

# Turn the index of user_genres into a column, then drop users with an empty genre list
user_genres = user_genres.reset_index()
has_genres = user_genres['most_common_genres'].map(len) > 0
user_genres = user_genres[has_genres]


In [11]:
# Filter sparse books/users and split each user's books into train and test
train_data, test_data, user_genres, filtered_data = filter_and_split_data(
    ratings_data=data,
    user_genres=user_genres,
)

In [12]:
# Integer indices for every user and book that survived filtering
user_id_to_index, book_id_to_index = initialize_id_mappings(combined_data=filtered_data)

In [13]:
# Standardise ratings; the mean/std are kept so predictions can be mapped back
train_data, test_data, mean_rating, std_rating = normalize_ratings(
    train_data=train_data,
    test_data=test_data,
)

In [14]:
# Multi-hot genre vectors for users and books over a shared genre vocabulary
user_genre_features, book_genre_features = align_user_and_book_features(
    filtered_data=filtered_data,
    user_id_to_index=user_id_to_index,
    book_id_to_index=book_id_to_index,
)

In [15]:
# Reduce the genre vectors to 10 principal components
user_genre_features, book_genre_features = apply_pca_on_features(
    user_genre_features=user_genre_features,
    book_genre_features=book_genre_features,
    n_components=10,
)

In [16]:
# Build the train and test graph objects
train_data_obj, test_data_obj = prepare_data_objects(
    train_data=train_data,
    test_data=test_data,
    user_genre_features=user_genre_features,
    book_genre_features=book_genre_features,
    user_id_to_index=user_id_to_index,
    book_id_to_index=book_id_to_index,
)

In [17]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv
import torch.nn as nn
import torch.optim as optim

class GATModel(nn.Module):
    """Two stacked ``GATConv`` layers followed by a dot-product edge scorer.

    ``forward`` returns one value per column of ``data.edge_index``: the sum
    over the last dimension of the element-wise product of the two endpoint
    node outputs.
    """

    def __init__(self, in_channels, hidden_channels, out_channels=1, num_heads=1):
        super().__init__()
        # First layer: `num_heads` attention heads with concatenated outputs.
        self.gat1 = GATConv(in_channels, hidden_channels, heads=num_heads, concat=True)
        # Second layer takes the concatenated width hidden_channels * num_heads.
        self.gat2 = GATConv(hidden_channels * num_heads, out_channels)

    def forward(self, data):
        connectivity = data.edge_index

        # Node encoder: GAT -> ELU -> GAT
        node_repr = F.elu(self.gat1(data.x, connectivity))
        node_repr = self.gat2(node_repr, connectivity)

        # Score each edge from the representations of its two endpoints.
        source_nodes, target_nodes = connectivity
        return (node_repr[source_nodes] * node_repr[target_nodes]).sum(dim=-1)

from torch_geometric.loader import DataLoader

# train_data_obj / test_data_obj are expected to be torch_geometric.data.Data
# instances; each list wraps exactly one of them.
train_data_list = [train_data_obj]
test_data_list = [test_data_obj]

# Loaders over the one-element lists (adjust batch_size as needed)
train_loader = DataLoader(dataset=train_data_list, batch_size=382, shuffle=True)
test_loader = DataLoader(dataset=test_data_list, batch_size=8, shuffle=False)

def train_gat(model, train_loader, test_loader, num_epochs=20, lr=0.0001, log_freq=1):
    """Train `model` with Adam + MSE and report the loss on `test_loader`.

    Fixes over the previous version:
      * gradients are cleared before every batch (they used to be cleared
        once per epoch, so later batches stepped on accumulated gradients);
      * each batch is moved to the device of the model's parameters;
      * reported losses are averages over all batches, weighted by the number
        of targets, instead of the last batch's loss only;
      * empty loaders no longer hit an undefined ``loss``/``test_loss``.

    Args:
        model: Module called as ``model(batch)``; its squeezed output is
            compared against ``batch.y.squeeze()``.
        train_loader: Iterable of batches exposing ``.to(device)`` and ``.y``.
        test_loader: Iterable of batches of the same kind, used once at the end.
        num_epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        log_freq: Print the training loss every ``log_freq`` epochs.

    Returns:
        dict with ``'train_loss'`` (list with one mean loss per epoch, NaN for
        an epoch without batches) and ``'test_loss'`` (mean test loss, or
        ``None`` when ``test_loader`` yields nothing).
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # Run batches wherever the model's parameters live.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    history = {'train_loss': [], 'test_loss': None}

    for epoch in range(num_epochs):
        model.train()
        loss_sum, target_count = 0.0, 0

        for batch in train_loader:
            batch = batch.to(device)
            target = batch.y.squeeze()

            # Clear gradients per batch so each step uses only this batch.
            optimizer.zero_grad()
            out = model(batch).squeeze()
            loss = criterion(out, target)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item() * target.numel()
            target_count += target.numel()

        epoch_loss = loss_sum / target_count if target_count else float('nan')
        history['train_loss'].append(epoch_loss)

        # Log the training loss at the specified frequency
        if (epoch + 1) % log_freq == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}')

    # Evaluate on the test data
    model.eval()
    loss_sum, target_count = 0.0, 0
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            target = batch.y.squeeze()
            test_out = model(batch).squeeze()
            loss_sum += criterion(test_out, target).item() * target.numel()
            target_count += target.numel()

    if target_count:
        history['test_loss'] = loss_sum / target_count
        print(f"Test Loss: {history['test_loss']}")

    return history


# Model hyper-parameters
in_channels = train_data_obj.x.size(1)  # width of the node feature matrix
hidden_channels = 14
num_heads = 8
out_channels = 1

# Build the network, then train and evaluate it
model = GATModel(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
    num_heads=num_heads,
)
train_gat(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    num_epochs=50,
    lr=0.0001,
    log_freq=1,
)

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 5801050752 bytes.

In [None]:
def validate_data(data):
    """Sanity-check a graph object before it is fed to the model.

    Checks, in order: ``edge_index`` is ``(2, E)``; ``x`` is a non-empty-width
    2-D tensor; ``edge_index`` holds integer indices that all fall inside
    ``range(x.shape[0])``; and, when ``data`` has a ``y``, that it carries one
    value per edge. A summary line is printed when everything passes.

    Args:
        data: Object exposing ``x``, ``edge_index`` and optionally ``y``.

    Raises:
        AssertionError: if any check fails.
    """
    assert data.edge_index.ndim == 2 and data.edge_index.shape[0] == 2, "Invalid edge_index shape"
    assert data.x.ndim == 2, "Node features should be a 2D tensor"
    assert data.x.shape[1] > 0, "Node features are missing"

    num_nodes = data.x.shape[0]
    num_edges = data.edge_index.shape[1]

    # Float indices usually mean NaNs slipped in during id mapping.
    assert not data.edge_index.is_floating_point(), "edge_index must contain integer node indices"
    if num_edges > 0:  # min()/max() are undefined on an empty tensor
        assert int(data.edge_index.min()) >= 0, "edge_index contains negative node indices"
        assert int(data.edge_index.max()) < num_nodes, "edge_index refers to nodes that are not in x"

    labels = getattr(data, 'y', None)
    if labels is not None:
        assert labels.numel() == num_edges, "y must hold exactly one value per edge"

    print(f"Data validation passed. Num nodes: {data.x.shape[0]}, Num features: {data.x.shape[1]}, Num edges: {data.edge_index.shape[1]}")

# Sanity-check the training graph (meant to be run before the training cell)
validate_data(data=train_data_obj)