In [1]:
import pandas as pd
import pickle
import faiss
import numpy as np
import joblib
import torch
from sklearn.model_selection import train_test_split

In [None]:
def load_data(data_dir='../Pickle'):
    """Load every dataset, embedding and lookup table used by the recommenders.

    Args:
        data_dir (str or path-like, optional): Directory holding the pickled
            artefacts and the FAISS index. Defaults to '../Pickle', the
            location that was previously hard-coded.

    Returns:
        tuple: (books, interactions, read, reviews, umap_embeddings,
        faiss_index, book_id_to_index, user_id_to_index_gat,
        book_id_to_index_gat, all_embeddings, clustered_books, clusters),
        in that order.

    Raises:
        ValueError: If 'books.pkl' contains no pickled objects.
    """
    from pathlib import Path

    data_dir = Path(data_dir)

    # NOTE(security): pickle executes arbitrary code on load; only point
    # data_dir at artefacts produced by a trusted pipeline.

    # books.pkl is read as a sequence of objects pickled back-to-back, so keep
    # calling pickle.load until the stream is exhausted.
    book_chunks = []
    with open(data_dir / 'books.pkl', 'rb') as file:
        while True:
            try:
                book_chunks.append(pickle.load(file))
            except EOFError:
                break
    if not book_chunks:
        raise ValueError(f"No book chunks found in {data_dir / 'books.pkl'}")
    # Keep a single row per title (the first one encountered).
    books = pd.concat(book_chunks, ignore_index=True).drop_duplicates(subset='title', keep='first')

    # Load other datasets
    interactions = pd.read_pickle(data_dir / 'interactions.pkl')
    read = pd.read_pickle(data_dir / 'read.pkl')
    reviews = pd.read_pickle(data_dir / 'reviews.pkl')

    # Load embeddings and indexes
    umap_embeddings = pd.read_pickle(data_dir / 'umap_embeddings.pkl')
    faiss_index = faiss.read_index(str(data_dir / 'faiss_index.bin'))
    book_id_to_index = pd.read_pickle(data_dir / 'book_id_to_index.pkl')
    user_id_to_index_gat = pd.read_pickle(data_dir / 'user_id_to_index_gat.pkl')
    book_id_to_index_gat = pd.read_pickle(data_dir / 'book_id_to_index_gat.pkl')
    all_embeddings = pd.read_pickle(data_dir / 'gat_embeddings.pkl')
    clustered_books = pd.read_pickle(data_dir / 'clustered_books.pkl')
    clusters = pd.read_pickle(data_dir / 'clusters.pkl')

    # Restrict the user-level tables to books that survived de-duplication;
    # `read` additionally keeps only rows flagged is_read == 1.
    valid_book_ids = set(books['book_id'])
    read = read[(read['is_read'] == 1) & (read['book_id'].isin(valid_book_ids))]
    interactions = interactions[interactions['book_id'].isin(valid_book_ids)]
    reviews = reviews[reviews['book_id'].isin(valid_book_ids)]

    return (
        books, interactions, read, reviews,
        umap_embeddings, faiss_index, book_id_to_index,
        user_id_to_index_gat, book_id_to_index_gat, all_embeddings, clustered_books, clusters
    )


In [None]:
def denormalize_rating(log_scaled_ratings, min_rating):
    """Map log1p-scaled ratings back onto the 0-5 rating scale.

    Args:
        log_scaled_ratings (array-like): Ratings in log1p space.
        min_rating (float or None): Offset added back after inverting the
            log transform; skipped when falsy (None or 0).

    Returns:
        numpy.ndarray: Ratings after expm1, optional offset, and clipping to [0, 5].
    """
    restored = np.expm1(np.asarray(log_scaled_ratings, dtype=float))
    if not min_rating:
        return np.clip(restored, 0, 5)
    restored += min_rating
    return np.clip(restored, 0, 5)

In [None]:
def load_gat_model(model_path='../RecSysJupyter/gat_model.pth'):
    """Build the GAT model and restore its trained weights in eval mode.

    Args:
        model_path (str, optional): Path to the saved state dict. Defaults to
            the location that was previously hard-coded.

    Returns:
        GATModel: The model with weights loaded and `eval()` applied.
    """
    from Gatv2Conv import GATModel

    # NOTE(review): these hyper-parameters presumably have to match the ones
    # used when the checkpoint was trained - confirm against the training run.
    model = GATModel(
        in_channels=32,
        hidden_channels=25,
        out_channels=1,
        num_heads=25,
        edge_feature_dim=386
    )
    # map_location='cpu' lets a checkpoint saved on a GPU be restored on a
    # CPU-only machine; load_state_dict copies the values into the model's
    # own (CPU) parameters either way.
    state_dict = torch.load(model_path, map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()
    return model

In [None]:
def _hdbscan_cluster_candidates(soft_clusters, book_idx, membership_threshold):
    """Return (row indices, distance weights) of books sharing a cluster with the query book."""
    no_candidates = (np.array([], dtype=int), np.array([], dtype=float))

    if soft_clusters.ndim == 1:
        # Hard labels: negative labels are treated as noise.
        label = soft_clusters[book_idx]
        if label < 0:
            return no_candidates
        shares_cluster = soft_clusters == label
    else:
        membership = soft_clusters[book_idx]
        if not (np.max(membership) > 0):
            return no_candidates
        # Clusters (columns) in which the query book is a significant member.
        significant = np.where(membership > membership_threshold)[0]
        if len(significant) == 0:
            return no_candidates
        # Books (rows) that are significant members of at least one of them.
        shares_cluster = (soft_clusters[:, significant] > membership_threshold).any(axis=1)

    shares_cluster[book_idx] = False  # never recommend the query book itself
    candidate_indices = np.where(shares_cluster)[0]

    if soft_clusters.ndim == 1:
        weights = np.ones(len(candidate_indices))
    else:
        # Same weighting the original loop used: total membership mass of the candidate.
        weights = soft_clusters[candidate_indices].sum(axis=1)
    return candidate_indices, weights


def recommend_HDBSCAN(book_id, book_id_to_index, soft_clusters, umap_embeddings, clustered_books, faiss_index, top_n=5, membership_threshold=0.1):
    """
    Recommend books similar to `book_id` using HDBSCAN cluster membership, falling back to FAISS.

    The query book's significant clusters are looked up first. Every other book that is
    also a significant member of one of those clusters becomes a candidate, and candidates
    are ranked by their Euclidean distance to the query embedding multiplied by the
    candidate's total membership mass. If the query book is an outlier, or no other book
    shares a cluster with it, a global nearest-neighbour search on the FAISS index is used.

    Args:
        book_id (int): The ID of the book for which recommendations are to be made.
        book_id_to_index (dict): Maps book IDs to their row index in the embedding and
            clustering arrays.
        soft_clusters (array-like): Either a 2-D array of per-cluster membership values
            (one row per book), or a 1-D array of hard cluster labels in which negative
            labels are treated as noise.
        umap_embeddings (array-like): Embeddings for all books, one row per book.
        clustered_books (pandas.DataFrame): Books in the same row order as the embeddings,
            with at least a `book_id` column.
        faiss_index: Object exposing `search(query, k)` returning `(distances, indices)`.
        top_n (int, optional): The number of recommended books to return. Default is 5.
        membership_threshold (float, optional): Minimum membership for a cluster to count
            as significant. Default is 0.1.

    Returns:
        list: `(book_id, similarity)` tuples, best first, where similarity is
        `1 / (1 + distance)`. Empty if `book_id` is unknown.
    """
    # Check if the book ID exists
    if book_id not in book_id_to_index:
        print(f"Book ID {book_id} not found.")
        return []

    soft_clusters = np.asarray(soft_clusters)
    embeddings = np.asarray(umap_embeddings)
    book_ids = clustered_books['book_id']

    book_idx = book_id_to_index[book_id]
    query_embedding = embeddings[book_idx].reshape(1, -1).astype('float32')

    candidate_indices, weights = _hdbscan_cluster_candidates(soft_clusters, book_idx, membership_threshold)

    if len(candidate_indices) > 0:
        distances = np.linalg.norm(embeddings[candidate_indices] - query_embedding, axis=1)
        # NOTE(review): multiplying by membership mass ranks weakly-attached books closer;
        # kept as in the original formula - confirm this is the intended weighting.
        weighted_distances = weights * distances
        order = np.argsort(weighted_distances, kind='stable')[:top_n]
        return [
            (book_ids.iloc[candidate_indices[pos]], float(1 / (1 + weighted_distances[pos])))
            for pos in order
        ]

    # If outlier or no significant membership in soft clusters, perform global FAISS search
    print("Book is an outlier or has no significant cluster neighbors, using global search")

    D, I = faiss_index.search(query_embedding, top_n + 1)  # +1 because it includes itself as the closest neighbor
    results = []
    # Walk distances and indices together so each neighbour keeps its own distance
    # even after the query book is skipped.
    for distance, idx in zip(D[0], I[0]):
        if len(results) >= top_n:
            break
        if idx == book_idx or idx < 0:  # FAISS pads missing neighbours with -1
            continue
        results.append((book_ids.iloc[idx], float(1 / (1 + distance))))

    return results


In [None]:
def recommend_nmf(
    nmf_model,
    interactions,
    user_id,
    books_read,
    books,
    min_rating,
    n_recommendations=5,
    top_n_factors=5
):
    """
    Recommend unread books to a user with a trained NMF model and explain each pick.

    Every book in `interactions` that the user has not read is scored with
    `nmf_model.predict`. Books are then visited from highest to lowest estimate; books the
    model has no item factors for are skipped, and the walk stops once
    `n_recommendations` explainable books have been collected. For each pick the
    element-wise product of the user and item factor vectors gives the per-factor
    contributions; their sum is denormalised and reported as the predicted rating.

    Args:
        nmf_model: Trained model exposing `predict(uid=, iid=).est`, `pu`, `qi` and
            `trainset.to_inner_uid` / `trainset.to_inner_iid` (e.g. `surprise.NMF`).
        interactions (pandas.DataFrame): User-item interactions with a 'book_id' column.
        user_id (int): The ID of the user for whom recommendations are being generated.
        books_read (iterable of int or None): Book IDs the user has already read.
        books (pandas.DataFrame): Book details, including 'book_id' and 'title'.
        min_rating (float or None): Offset passed to `denormalize_rating`.
        n_recommendations (int, optional): The number of recommendations to return. Default is 5.
        top_n_factors (int, optional): How many of the largest-magnitude latent factors to
            include in each explanation. Default is 5.

    Returns:
        list: One dict per recommendation with 'book_id', 'title' (None when the book is
        missing from `books`), 'predicted_rating' and 'top_latent_factors'.

    Raises:
        ValueError: If `user_id` is not part of the model's training set.
    """
    # Resolve the user first so an unknown user fails before any scoring work is done.
    user_inner_id = nmf_model.trainset.to_inner_uid(user_id)
    user_factors = nmf_model.pu[user_inner_id]
    item_factor_matrix = nmf_model.qi

    # A set gives constant-time membership tests and checks values (not the index)
    # when books_read happens to be a pandas Series.
    already_read = set(books_read) if books_read is not None else set()
    candidate_books = [
        book_id for book_id in interactions['book_id'].unique()
        if book_id not in already_read
    ]

    # Generate predicted ratings for each candidate book
    user_predictions = [
        (book_id, nmf_model.predict(uid=user_id, iid=book_id).est)
        for book_id in candidate_books
    ]

    # Rank everything; the loop below stops early once enough books are collected.
    ranked_books = sorted(user_predictions, key=lambda x: x[1], reverse=True)

    recommendations_with_explanations = []

    for book_id, _ in ranked_books:
        if len(recommendations_with_explanations) >= n_recommendations:
            break

        # Get internal item index for the model
        try:
            item_inner_id = nmf_model.trainset.to_inner_iid(book_id)
        except ValueError:
            continue  # Item not in the model, cannot be explained

        item_factors = item_factor_matrix[item_inner_id]

        # Contributions from each latent factor
        contributions = user_factors * item_factors
        predicted_rating = contributions.sum()
        denormed_rating = denormalize_rating([predicted_rating], min_rating)[0]

        # Get indices of top contributing latent factors
        top_factors_idx = np.argsort(np.abs(contributions))[::-1][:top_n_factors]

        # Book title lookup (first match; None if the book is absent from `books`)
        title_matches = books.loc[books['book_id'] == book_id, 'title']
        book_title = title_matches.iloc[0] if not title_matches.empty else None

        # Format the factor explanations
        explanations = [
            {
                'latent_factor': int(i + 1),
                'user_affinity': round(float(user_factors[i]), 3),
                'item_relevance': round(float(item_factors[i]), 3),
                'contribution': round(float(contributions[i]), 3)
            }
            for i in top_factors_idx
        ]

        # Append the final recommendation and explanation
        recommendations_with_explanations.append({
            'book_id': book_id,
            'title': book_title,
            'predicted_rating': round(float(denormed_rating), 2),
            'top_latent_factors': explanations
        })

    return recommendations_with_explanations


In [None]:
def recommend_GAT(user_id, unread_book_ids, all_embeddings, user_id_to_index, book_id_to_index, books_df, min_rating, top_n=5):
    """
    Rank candidate books for a user by the dot product of their GAT embeddings.

    The raw score for each candidate is the dot product between the user's and the book's
    embedding rows. Each raw score is passed through `denormalize_rating` exactly once
    (expm1, optional `min_rating` offset, clip to [0, 5]) regardless of whether
    `min_rating` is set, so the output is on the same scale in both cases.

    Args:
        user_id (int): The ID of the user for whom recommendations are being generated.
        unread_book_ids (iterable of int): Candidate book IDs to score.
        all_embeddings (array-like): Embedding matrix indexed by the values of both
            index mappings.
        user_id_to_index (dict): Maps user IDs to rows of `all_embeddings`.
        book_id_to_index (dict): Maps book IDs to rows of `all_embeddings`.
        books_df (pandas.DataFrame): Accepted for interface compatibility; not used here.
        min_rating (float or None): Offset passed to `denormalize_rating`.
        top_n (int, optional): The number of recommendations to return. Default is 5.

    Returns:
        list: Up to `top_n` dicts with 'book_id' and 'predicted_rating', best first.

    Raises:
        ValueError: If `user_id` is missing from `user_id_to_index`.
    """
    # Get the user's index in the embedding matrix
    user_index = user_id_to_index.get(user_id)
    if user_index is None:
        raise ValueError(f"User ID {user_id} not found in index mappings.")

    # Keep only candidates that have an embedding row.
    known_ids, known_rows = [], []
    for book_id in unread_book_ids:
        book_index = book_id_to_index.get(book_id)
        if book_index is None:
            continue  # Skip books not in the book_id_to_index
        known_ids.append(book_id)
        known_rows.append(book_index)

    if not known_ids:
        return []

    embeddings = np.asarray(all_embeddings)
    user_embedding = embeddings[user_index]

    # One matrix-vector product scores every candidate at once.
    raw_scores = embeddings[known_rows] @ user_embedding

    # Single inverse transform; previously expm1 was applied here and again inside
    # denormalize_rating whenever min_rating was set.
    ratings = denormalize_rating(raw_scores, min_rating)

    predictions = [
        {"book_id": book_id, "predicted_rating": float(rating)}
        for book_id, rating in zip(known_ids, ratings)
    ]

    # Sort the predictions by predicted rating in descending order
    predictions.sort(key=lambda x: x["predicted_rating"], reverse=True)

    # Return the top_n recommendations
    return predictions[:top_n]


In [None]:
def merge_recommendations_weighted(gat_recs, nmf_recs, final_size=5, gat_weight=0.6, nmf_weight=0.4):
    """
    Blend GAT and NMF recommendation lists into one ranked list of weighted scores.

    Each recommendation's 'predicted_rating' is multiplied by its model's weight. GAT
    entries are processed first; a book that has already been seen (in either list) is
    ignored on later occurrences, so a book recommended by both models keeps only its
    GAT-weighted score. The pooled entries are sorted by score, highest first, and the
    leading `final_size` entries are returned.

    Args:
        gat_recs (list of dict): GAT recommendations with 'book_id' and 'predicted_rating'.
        nmf_recs (list of dict): NMF recommendations with 'book_id' and 'predicted_rating'.
        final_size (int, optional): Maximum number of entries to return. Default is 5.
        gat_weight (float, optional): Multiplier for GAT ratings. Default is 0.6.
        nmf_weight (float, optional): Multiplier for NMF ratings. Default is 0.4.

    Returns:
        list: Dicts with 'book_id' and 'score', ordered by descending score.
    """
    pooled = []
    already_added = set()

    # GAT goes first so it wins whenever both models propose the same book.
    for source_recs, weight in ((gat_recs, gat_weight), (nmf_recs, nmf_weight)):
        for entry in source_recs:
            book = entry['book_id']
            if book in already_added:
                continue
            already_added.add(book)
            pooled.append({'book_id': book, 'score': weight * entry['predicted_rating']})

    ranked = sorted(pooled, key=lambda item: item['score'], reverse=True)

    # Keep entries whose rank position is below final_size.
    return [item for position, item in enumerate(ranked) if position < final_size]


In [None]:
# Load every dataset, embedding and lookup table once; later cells read these names.
(
    books,
    interactions,
    read,
    reviews,
    umap_embeddings,
    faiss_index,
    book_id_to_index,
    user_id_to_index_gat,
    book_id_to_index_gat,
    all_embeddings,
    clustered_books,
    clusters,
) = load_data()

In [None]:
def get_user_stats(user_id, interactions, reviews):
    """
    Summarise how active a user is.

    Args:
        user_id (int): The user to look up.
        interactions (DataFrame): Interaction rows with 'user_id' and 'book_id' columns.
        reviews (DataFrame): Review rows with a 'user_id' column.

    Returns:
        tuple: (books_read, reviews_written) - the number of distinct book IDs in the
        user's interaction rows, and the number of review rows belonging to the user.
    """
    # Distinct books among this user's interaction rows.
    user_book_ids = interactions.loc[interactions['user_id'] == user_id, 'book_id']
    distinct_books = user_book_ids.nunique()

    # Each matching row in the reviews table counts as one written review.
    is_users_review = reviews['user_id'] == user_id
    review_count = int(is_users_review.sum())

    return distinct_books, review_count


def recommend_for_user(
    user_id, interactions, reviews, umap_embeddings, faiss_index, book_id_to_index,
    clustered_books, clusters, nmf_model, all_embeddings, user_id_to_index_gat,
    book_id_to_index_gat, books, min_rating=None, top_n=5, final_size=5,
    gat_weight=0.6, nmf_weight=0.4
):
    """
    Recommend books for a user, picking the strategy from the user's activity level.

    Strategy selection (based on `get_user_stats`):
    - Fewer than 10 books read: content-based HDBSCAN recommendations, seeded with each
      book the user has interacted with; the best similarity per recommended book is
      kept and books the user already has are removed.
    - 10 or more books read and at most 5 reviews written: NMF recommendations.
    - 10 or more books read and more than 5 reviews written: NMF and GAT recommendations
      combined with `merge_recommendations_weighted`. GAT scores only books that have a
      GAT embedding and that the user has not interacted with.

    Args:
        user_id (int): The user ID for whom to generate recommendations.
        interactions (DataFrame): User-book interactions with 'user_id' and 'book_id'.
        reviews (DataFrame): User reviews with a 'user_id' column.
        umap_embeddings (ndarray): Book embeddings used by the HDBSCAN recommender.
        faiss_index (Index): FAISS index used for the HDBSCAN fallback search.
        book_id_to_index (dict): Maps book IDs to rows of `umap_embeddings`.
        clustered_books (DataFrame): Books in embedding row order.
        clusters (ndarray): Cluster information forwarded to `recommend_HDBSCAN`.
        nmf_model: Trained NMF model forwarded to `recommend_nmf`.
        all_embeddings (ndarray): Embedding matrix forwarded to `recommend_GAT`.
        user_id_to_index_gat (dict): Maps user IDs to rows of `all_embeddings`.
        book_id_to_index_gat (dict): Maps book IDs to rows of `all_embeddings`.
        books (DataFrame): Book information, used for title lookup.
        min_rating (float, optional): Offset for rating denormalisation. Default is None.
        top_n (int, optional): Number of recommendations requested from each model. Default is 5.
        final_size (int, optional): Number of recommendations kept after merging. Default is 5.
        gat_weight (float, optional): Weight for GAT recommendations. Default is 0.6.
        nmf_weight (float, optional): Weight for NMF recommendations. Default is 0.4.

    Returns:
        list: The recommendations. The element shape depends on the strategy:
        `(book_id, similarity)` tuples for HDBSCAN, the dicts produced by `recommend_nmf`
        for NMF, and the dicts produced by `merge_recommendations_weighted` for the
        combined strategy.
    """
    # Get user statistics (books read and reviews written)
    books_read, reviews_written = get_user_stats(user_id, interactions, reviews)

    # Books the user has already interacted with; never recommend these again.
    read_book_ids = interactions.loc[interactions['user_id'] == user_id, 'book_id'].unique()
    read_set = set(read_book_ids)

    if books_read < 10:
        # Use HDBSCAN-based recommendation if user has read fewer than 10 books
        print(f"User {user_id}: Less than 10 books read, using HDBSCAN.")

        # recommend_HDBSCAN works per book, so seed it with each of the user's books
        # and keep the best similarity seen for every suggested book.
        per_seed_n = top_n + len(read_set)  # headroom for suggestions filtered out below
        best_similarity = {}
        for seed_book_id in read_book_ids:
            if seed_book_id not in book_id_to_index:
                continue
            seed_recs = recommend_HDBSCAN(
                seed_book_id, book_id_to_index, clusters, umap_embeddings,
                clustered_books, faiss_index, per_seed_n
            )
            for rec_book_id, similarity in seed_recs:
                if rec_book_id in read_set:
                    continue
                if rec_book_id not in best_similarity or similarity > best_similarity[rec_book_id]:
                    best_similarity[rec_book_id] = similarity

        ranked = sorted(best_similarity.items(), key=lambda item: item[1], reverse=True)
        recommendations = ranked[:top_n]

    elif reviews_written <= 5:
        # books_read >= 10 here, so a user with exactly 10 books is handled too.
        print(f"User {user_id}: At least 10 books read but less than 6 reviews, using NMF.")
        recommendations = recommend_nmf(nmf_model, interactions, user_id, read_book_ids, books, min_rating, top_n)

    else:
        # books_read >= 10 and reviews_written > 5
        print(f"User {user_id}: At least 10 books read and more than 5 reviews, using combined NMF + GAT.")

        # Get NMF recommendations
        nmf_recs = recommend_nmf(nmf_model, interactions, user_id, read_book_ids, books, min_rating, top_n)

        # GAT must score books the user has NOT read: every book with a GAT embedding
        # minus the user's own books.
        gat_candidates = [
            book_id for book_id in book_id_to_index_gat.keys()
            if book_id not in read_set
        ]
        gat_recs = recommend_GAT(user_id, gat_candidates, all_embeddings, user_id_to_index_gat, book_id_to_index_gat, books, min_rating, top_n)

        # Merge NMF and GAT recommendations
        recommendations = merge_recommendations_weighted(gat_recs, nmf_recs, final_size, gat_weight, nmf_weight)

    return recommendations
