In [None]:
import pandas as pd
import pickle
import faiss
import numpy as np
import joblib
from torch_geometric.nn import GATv2Conv
import torch

In [None]:
# Load the necessary data
def load_data():
    """Load every artefact the recommenders need from ``../Pickle``.

    ``books.pkl`` is read as a stream of consecutively pickled objects
    (until EOF), concatenated, and de-duplicated on ``title`` keeping the
    first occurrence. ``read`` is restricted to rows with ``is_read == 1``,
    and ``interactions``, ``read`` and ``reviews`` are restricted to book ids
    present in the de-duplicated catalogue.

    NOTE: ``pickle.load`` can execute arbitrary code; only point this at
    trusted files.

    Returns:
        An 11-tuple ``(books, interactions, read, reviews, umap_embeddings,
        faiss_index, book_id_to_index, user_id_to_index_gat,
        book_id_to_index_gat, all_embeddings, reviews)``. ``reviews`` appears
        twice (4th and last position); callers unpack all eleven items.
    """
    def _unpickle(path):
        # Small helper: load a single pickled object from ``path``.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # The catalogue file contains several pickled chunks back to back.
    book_chunks = []
    with open('../Pickle/books.pkl', 'rb') as handle:
        while True:
            try:
                book_chunks.append(pickle.load(handle))
            except EOFError:
                break
    books = pd.concat(book_chunks, ignore_index=True)
    books = books.drop_duplicates(subset='title', keep='first')

    interactions = pd.read_pickle('../Pickle/interactions.pkl')
    read = pd.read_pickle('../Pickle/read.pkl')
    reviews = pd.read_pickle('../Pickle/reviews.pkl')

    umap_embeddings = _unpickle('../Pickle/umap_embeddings.pkl')
    faiss_index = faiss.read_index('../Pickle/faiss_index.bin')

    book_id_to_index = _unpickle('../Pickle/book_id_to_index.pkl')
    user_id_to_index_gat = _unpickle('../Pickle/user_id_to_index_gat.pkl')
    book_id_to_index_gat = _unpickle('../Pickle/book_id_to_index_gat.pkl')
    all_embeddings = _unpickle('../Pickle/gat_embeddings.pkl')

    # Keep only finished reads, and only rows that point at a catalogued book.
    known_book_ids = set(books['book_id'])
    read = read[read['is_read'] == 1]
    interactions = interactions[interactions['book_id'].isin(known_book_ids)]
    read = read[read['book_id'].isin(known_book_ids)]
    reviews = reviews[reviews['book_id'].isin(known_book_ids)]

    return (
        books,
        interactions,
        read,
        reviews,
        umap_embeddings,
        faiss_index,
        book_id_to_index,
        user_id_to_index_gat,
        book_id_to_index_gat,
        all_embeddings,
        reviews,
    )

In [None]:
# Raw, unfiltered frames; both names are rebound when load_data() is unpacked later on.
read, interactions = map(
    pd.read_pickle,
    ('../Pickle/read.pkl', '../Pickle/interactions.pkl'),
)

In [None]:
# Load GAT model
def load_gat_model():
    """Build the GAT model and restore its trained weights.

    The architecture hyperparameters are fixed here and must match the ones
    used when ``gat_model.pth`` was saved, otherwise ``load_state_dict``
    raises on mismatched keys/shapes.

    Returns:
        The ``GATModel`` instance in evaluation mode.
    """
    from Gatv2Conv import GATModel  # project-local module defining the architecture

    model = GATModel(
        in_channels=32,  # Input features per node
        hidden_channels=25,
        out_channels=1,
        num_heads=25,
        edge_feature_dim=386  # Edge feature dimension
    )
    # map_location='cpu' lets a checkpoint saved from a CUDA device be
    # deserialised on a CPU-only machine; the freshly built model lives on the
    # CPU anyway, so the loaded tensors are copied into CPU parameters.
    state_dict = torch.load('../RecSysJupyter/gat_model.pth', map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()
    return model

In [None]:
# Recommendation function for HDBSCAN (Content-based)
def recommend_books_HDBSCAN(book_id, books, umap_embeddings, faiss_index, book_id_to_index, top_n=5):
    """Return up to ``top_n`` nearest-neighbour books for ``book_id``.

    Args:
        book_id: Seed book to find neighbours for.
        books: Catalogue DataFrame; FAISS result ids are used as positional
            row numbers into it (``books.iloc``).
        umap_embeddings: Indexable collection of embedding vectors, addressed
            by the values of ``book_id_to_index``.
        faiss_index: Object exposing ``search(queries, k)`` returning
            ``(distances, indices)``.
        book_id_to_index: Mapping from book id to embedding row.
        top_n: Maximum number of recommendations.

    Returns:
        A list of dicts with keys ``book_id``, ``title``, ``authors``,
        ``predicted_rating`` (always ``"N/A"``) and ``explanation``. Empty if
        ``book_id`` is not in ``book_id_to_index``.
    """
    if book_id not in book_id_to_index:
        return []

    book_idx = book_id_to_index[book_id]
    query = np.array([umap_embeddings[book_idx]])
    # Ask for one extra neighbour: the seed book is normally among its own hits.
    distances, indices = faiss_index.search(query, top_n + 1)

    recommendations = []
    for idx, dist in zip(indices[0], distances[0]):
        if len(recommendations) >= top_n:
            break
        # Exclude the seed by id rather than by position: with tied distances
        # the seed is not guaranteed to be the first hit.
        if idx == book_idx:
            continue
        # FAISS pads missing neighbours with -1; a negative id would otherwise
        # silently wrap around to the last catalogue row via iloc.
        if idx < 0 or idx >= len(books):
            continue

        recommended_book = books.iloc[idx]
        authors = recommended_book["authors"]
        recommendations.append({
            "book_id": recommended_book["book_id"],
            "title": recommended_book["title"],
            "authors": ', '.join(authors) if isinstance(authors, list) else authors,
            "predicted_rating": "N/A",
            "explanation": f"Similarity Score: {round(1 / (1 + dist), 3)}"
        })

    return recommendations

In [None]:
def recommend_books_NMF(nmf_model, interactions, user_id, books_read, books, n_recommendations=5):
    """Rank every book seen in ``interactions`` by the NMF-predicted rating.

    Args:
        nmf_model: Object whose ``predict(uid=..., iid=...)`` returns a value
            with an ``est`` attribute (the estimated rating).
        interactions: DataFrame with a ``book_id`` column; its unique values
            are the candidate set.
        user_id: User to predict for.
        books_read: Accepted for interface compatibility; not used, so books
            the user already read are not excluded.
        books: Catalogue DataFrame with ``book_id``, ``title``, ``authors``.
        n_recommendations: Maximum number of recommendations.

    Returns:
        A list of dicts (``book_id``, ``title``, ``authors``,
        ``predicted_rating``, ``explanation``) sorted by predicted rating,
        highest first. Candidates without a catalogue row are skipped and the
        next-best candidate is used instead.
    """
    all_books = interactions['book_id'].unique()  # Include all books, no exclusions

    # Predict ratings for all books, best first
    scored = [
        (book_id, nmf_model.predict(uid=user_id, iid=book_id).est) for book_id in all_books
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    recommendations = []
    for book_id, rating in scored:
        if len(recommendations) >= n_recommendations:
            break
        match = books.loc[books['book_id'] == book_id, ['title', 'authors']]
        if match.empty:
            # No metadata for this id: skip it instead of crashing on .values[0].
            continue
        title, authors = match.values[0]
        recommendations.append({
            "book_id": book_id,
            "title": title,
            "authors": ', '.join(authors) if isinstance(authors, list) else authors,
            "predicted_rating": rating,
            "explanation": "N/A"  # Add an explanation if needed
        })

    return recommendations


In [None]:
def recommend_books_GAT(user_id, unread_book_ids, all_embeddings, user_id_to_index, book_id_to_index, books_df, top_n=5):
    """Rank candidate books by the dot product of user and book embeddings.

    Args:
        user_id: User to recommend for.
        unread_book_ids: Iterable of candidate book ids.
        all_embeddings: Indexable collection holding both user and book
            embeddings, addressed through the two index mappings.
        user_id_to_index: Mapping from user id to a row of ``all_embeddings``.
        book_id_to_index: Mapping from book id to a row of ``all_embeddings``.
        books_df: Catalogue DataFrame with ``book_id``, ``title``, ``authors``.
        top_n: Maximum number of recommendations.

    Returns:
        Up to ``top_n`` dicts (``book_id``, ``title``, ``authors``,
        ``predicted_rating``, ``explanation``), highest predicted rating first.
        Candidates without an embedding or without a catalogue row are skipped.

    Raises:
        ValueError: If ``user_id`` has no entry in ``user_id_to_index``.
    """
    user_index = user_id_to_index.get(user_id)
    if user_index is None:
        raise ValueError(f"User ID {user_id} not found in index mappings.")

    user_embedding = all_embeddings[user_index]

    # Score every candidate first; catalogue lookups are deferred to the few
    # books that actually make the cut, instead of scanning books_df per candidate.
    scored = []
    for book_id in unread_book_ids:
        book_index = book_id_to_index.get(book_id)
        if book_index is None:
            continue
        raw_score = np.dot(user_embedding, all_embeddings[book_index])
        # Denormalize (presumably the training target was log1p-scaled -- confirm)
        scored.append((book_id, np.expm1(raw_score)))

    scored.sort(key=lambda pair: pair[1], reverse=True)

    top_recommendations = []
    for book_id, predicted_rating in scored:
        if len(top_recommendations) >= top_n:
            break
        match = books_df.loc[books_df['book_id'] == book_id, ['title', 'authors']]
        if match.empty:
            # No metadata for this id: skip it instead of crashing on .values[0].
            continue
        title, authors = match.values[0]
        top_recommendations.append({
            "book_id": book_id,
            "title": title,
            "authors": ', '.join(authors) if isinstance(authors, list) else authors,
            "predicted_rating": predicted_rating,
            "explanation": "N/A"  # Add an explanation if needed
        })

    return top_recommendations


In [None]:
# Helper function for weighted merge
def merge_recommendations_weighted(gat_recommendations, nmf_recommendations, final_size=5, gat_priority=3):
    """Merge two ranked recommendation lists, giving GAT the first slots.

    Args:
        gat_recommendations: Ranked list of recommendation dicts from GAT.
        nmf_recommendations: Ranked list of recommendation dicts from NMF.
        final_size: Maximum length of the merged list.
        gat_priority: How many leading slots are reserved for GAT results.

    Returns:
        A list of at most ``final_size`` recommendation dicts with no repeated
        ``book_id``: first up to ``gat_priority`` GAT entries, then NMF
        entries, then any leftovers from either list in their original order.
    """
    combined = []
    seen_book_ids = set()

    def _take(candidates, limit):
        # Append unseen candidates until ``combined`` holds ``limit`` items.
        for rec in candidates:
            if len(combined) >= limit:
                break
            # Recommendation dicts are unhashable, so de-duplicate on book id.
            key = rec["book_id"]
            if key in seen_book_ids:
                continue
            seen_book_ids.add(key)
            combined.append(rec)

    # Step 1: Add top GAT recommendations (prioritised)
    _take(gat_recommendations, min(gat_priority, final_size))
    # Step 2: Add top NMF recommendations to balance
    _take(nmf_recommendations, final_size)
    # Step 3: If still not enough, fill from remaining unique recommendations
    _take(list(gat_recommendations) + list(nmf_recommendations), final_size)

    return combined


In [None]:
# Main recommendation logic with user validity checks
def recommend_for_user(user_id, books, interactions, read, umap_embeddings, faiss_index, book_id_to_index, user_id_to_index_gat, book_id_to_index_gat, all_embeddings, reviews):
    """Pick a recommendation strategy from the user's activity and run it.

    Strategy selection:
        * fewer than 5 distinct books read -> content-based (FAISS neighbours
          of each book read);
        * at least 5 books but fewer than 5 reviews -> NMF;
        * at least 5 books and at least 5 reviews -> NMF merged with GAT.

    Args:
        user_id: User to recommend for.
        books: Catalogue DataFrame.
        interactions: DataFrame with a ``book_id`` column (candidate pool).
        read: DataFrame with ``user_id`` and ``book_id`` columns.
        umap_embeddings, faiss_index, book_id_to_index: Inputs forwarded to
            ``recommend_books_HDBSCAN``.
        user_id_to_index_gat, book_id_to_index_gat, all_embeddings: Inputs
            forwarded to ``recommend_books_GAT``.
        reviews: DataFrame with a ``user_id`` column.

    Returns:
        A list of at most 5 recommendation dicts.
    """
    user_books_read = read[read['user_id'] == user_id]
    user_num_books = user_books_read['book_id'].nunique()
    user_num_reviews = reviews[reviews['user_id'] == user_id].shape[0]

    # --- Case 1: Content-Based Filtering (HDBSCAN) ---
    if user_num_books < 5:
        print(f"User {user_id} has read {user_num_books} books. Using content-based filtering.")
        recommendations = []
        for book_id in user_books_read['book_id']:
            recommendations.extend(
                recommend_books_HDBSCAN(book_id, books, umap_embeddings, faiss_index, book_id_to_index)
            )

        # Remove duplicates by title and return
        unique_recommendations = {rec['title']: rec for rec in recommendations}.values()
        return list(unique_recommendations)[:5]

    use_hybrid = user_num_reviews >= 5
    if use_hybrid:
        print(f"User {user_id} has more than 5 books and reviews. Using hybrid filtering (NMF + GAT).")
    else:
        print(f"User {user_id} has fewer reviews. Using collaborative filtering (NMF).")

    # Both remaining strategies need the NMF model, so load it before branching;
    # previously only the NMF-only branch loaded it and the hybrid branch
    # failed on an unassigned ``best_nmf``.
    best_nmf = joblib.load('../Pickle/best_nmf_model.pkl')
    nmf_recommendations = recommend_books_NMF(best_nmf, interactions, user_id, user_books_read['book_id'], books)

    # --- Case 2: Collaborative Filtering (NMF) ---
    if not use_hybrid:
        return nmf_recommendations[:5]

    # --- Case 3: Hybrid Filtering (NMF + GAT) ---
    # The unread pool is only needed by GAT, so it is built here rather than up front.
    all_books = interactions['book_id'].unique()
    unread_books = list(set(all_books) - set(user_books_read['book_id']))
    gat_recommendations = recommend_books_GAT(user_id, unread_books, all_embeddings, user_id_to_index_gat, book_id_to_index_gat, books)

    # Merge the recommendations giving more weight to GAT
    return merge_recommendations_weighted(gat_recommendations, nmf_recommendations, final_size=5, gat_priority=3)




In [None]:
# load_data() returns eleven items; `reviews` is both the 4th and the last one.
(
    books,
    interactions,
    read,
    reviews,
    umap_embeddings,
    faiss_index,
    book_id_to_index,
    user_id_to_index_gat,
    book_id_to_index_gat,
    all_embeddings,
    reviews,
) = load_data()

In [None]:
from sklearn.model_selection import train_test_split

def split_data(ratings_data, test_size=0.1, val_size=0.15, random_state=42):
    """Split interactions into train / validation / test sets.

    Rows whose user or whose book occurs only once in ``ratings_data`` are
    never placed in validation or test: they are held out of the random split
    and appended to the training set.

    Args:
        ratings_data: DataFrame with ``user_id`` and ``book_id`` columns.
        test_size: Fraction of the splittable rows for the test set.
        val_size: Fraction of the splittable rows for the validation set.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(train_data, val_data, test_data)``. ``train_data`` has a fresh
        0..n-1 index; the other two keep their original index labels.
    """
    # Identify users and books that appear only once in the dataset
    user_counts = ratings_data['user_id'].value_counts()
    book_counts = ratings_data['book_id'].value_counts()

    # Positional boolean mask of rows where the user or the book is a singleton.
    # Selecting by position (not by index label) keeps the partition exact even
    # when ratings_data carries duplicate index labels.
    single_mask = (
        ratings_data['user_id'].isin(user_counts[user_counts == 1].index)
        | ratings_data['book_id'].isin(book_counts[book_counts == 1].index)
    ).to_numpy()

    single_interactions = ratings_data[single_mask]
    remaining_interactions = ratings_data[~single_mask]

    # Split the remaining interactions into train, validation, and test
    holdout_size = test_size + val_size
    train_df, temp_data = train_test_split(remaining_interactions, test_size=holdout_size, random_state=random_state)
    val_data, test_data = train_test_split(temp_data, test_size=test_size / holdout_size, random_state=random_state)

    # Add the single interactions to the training set
    train_data = pd.concat([train_df, single_interactions], ignore_index=True)

    return train_data, val_data, test_data


In [None]:
# split_data returns (train, validation, test), so unpack all three parts.
train_data_inter, val_data_inter, test_data_inter = split_data(interactions)
train_data_rev, val_data_rev, test_data_rev = split_data(reviews)

# Plan:
#   - using train_data_inter, test_data_inter: call case 2 for valid users and evaluate
#   - using train_data_rev, test_data_rev: call case 3 for valid users and evaluate


In [None]:
# recommend_for_user requires `reviews` as its final argument to count the user's reviews.
recommendations = recommend_for_user(
    1,
    books,
    interactions,
    read,
    umap_embeddings,
    faiss_index,
    book_id_to_index,
    user_id_to_index_gat,
    book_id_to_index_gat,
    all_embeddings,
    reviews,
)

In [None]:
for rec in recommendations:
    print(f"Book ID: {rec['book_id']}")
    print(f"Title: {rec['title']}")

    # 'authors' is already a display string (lists are joined by the recommenders)
    print(f"Authors: {rec['authors']}")

    # Content-based results carry the placeholder "N/A" instead of a number,
    # which cannot be formatted with ':.4f'; fall back to printing it verbatim.
    rating = rec['predicted_rating']
    try:
        rating_text = f"{rating:.4f}"
    except (TypeError, ValueError):
        rating_text = str(rating)
    print(f"Predicted Rating: {rating_text}")
    print("-" * 40)


In [None]:
from sklearn.metrics import precision_score, recall_score, ndcg_score, mean_reciprocal_rank
import numpy as np

# Helper function to calculate NDCG
def ndcg_at_k(recommended, relevant, k=5):
    """
    Calculates NDCG (Normalized Discounted Cumulative Gain) at rank k.

    Relevance is binary: a recommended book gains 1 if it is in ``relevant``
    and 0 otherwise. The gain of each hit is discounted by the position it
    occupies in the *recommended* ranking.

    recommended: ranked list of recommended book ids (best first)
    relevant: collection of relevant book ids (from ground truth)
    k: rank at which NDCG is calculated

    Returns a value in [0, 1]; 0 when there are no relevant books.
    """
    relevant_set = set(relevant)
    # Gains follow the order of the recommendation list, so a hit at rank 1
    # counts more than the same hit at rank k.
    gains = [1 if book in relevant_set else 0 for book in recommended[:k]]

    dcg = sum(gain / np.log2(position + 2) for position, gain in enumerate(gains))
    # Ideal DCG: all relevant books (at most k of them) ranked at the top.
    idcg = sum(1 / np.log2(position + 2) for position in range(min(k, len(relevant_set))))
    return dcg / idcg if idcg > 0 else 0

# Metrics calculation function
def _mean_of_available(values):
    """Average the numeric entries of ``values``, ignoring None/NaN; NaN if none remain."""
    usable = [value for value in values if value is not None and not np.isnan(value)]
    return float(np.mean(usable)) if usable else float('nan')


def evaluate_recommendations(test_data, recommendations, k=5):
    """
    Evaluates recommendations using Precision, Recall, NDCG, and MRR, plus the
    diversity / surprise / novelty helpers.

    test_data: The ground truth DataFrame with 'user_id' and 'book_id' columns
    recommendations: dict mapping user_id -> list of recommendation dicts
        (each with a 'book_id' key), best first
    k: cut-off rank forwarded to ndcg_at_k

    Returns a dict with the per-user average of each metric. A metric whose
    helper yields no numeric value (e.g. it returns None) is reported as NaN.
    Users with no recommendations or no ground truth score 0 for precision /
    recall instead of raising a division error.
    """
    precision_scores = []
    recall_scores = []
    ndcg_scores = []
    mrr_scores = []
    diversity_scores = []
    surprise_scores = []
    novelty_scores = []

    for user_id, recs in recommendations.items():
        # Get the ground truth for the user from test data
        relevant_books = set(test_data.loc[test_data['user_id'] == user_id, 'book_id'])

        # Recommended books, in ranked order
        recommended_books = [rec['book_id'] for rec in (recs or [])]
        hits = relevant_books & set(recommended_books)

        # Precision / Recall (guarded against empty denominators)
        precision_scores.append(len(hits) / len(recommended_books) if recommended_books else 0.0)
        recall_scores.append(len(hits) / len(relevant_books) if relevant_books else 0.0)

        # NDCG
        ndcg_scores.append(ndcg_at_k(recommended_books, list(relevant_books), k))

        # Reciprocal rank: 1 / rank of the FIRST relevant recommendation
        # (the mean over users is taken below, giving MRR).
        first_hit_rank = next(
            (rank for rank, book in enumerate(recommended_books, start=1) if book in relevant_books),
            None,
        )
        mrr_scores.append(1 / first_hit_rank if first_hit_rank else 0)

        # Diversity (intra-list similarity), surprise and novelty
        diversity_scores.append(calculate_diversity(recommended_books))
        surprise_scores.append(calculate_surprise(recommended_books))
        novelty_scores.append(calculate_novelty(recommended_books))

    # Average the metrics
    return {
        "precision": _mean_of_available(precision_scores),
        "recall": _mean_of_available(recall_scores),
        "ndcg": _mean_of_available(ndcg_scores),
        "mrr": _mean_of_available(mrr_scores),
        "diversity": _mean_of_available(diversity_scores),
        "surprise": _mean_of_available(surprise_scores),
        "novelty": _mean_of_available(novelty_scores),
    }

# Helper function to calculate diversity, surprise, and novelty
def calculate_diversity(recommended_books):
    """Placeholder for a diversity score of a recommendation list.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    The intended idea is to compare the recommended books through their
    embeddings (e.g. cosine similarity).
    """
    # TODO: define how to measure diversity between the recommended books
    return None

def calculate_surprise(recommended_books):
    """Placeholder for a surprise score of a recommendation list.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    The intended idea is to reward rare items (e.g. inverse frequency).
    """
    # TODO: define how to calculate surprise
    return None

def calculate_novelty(recommended_books):
    """Placeholder for a novelty score of a recommendation list.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    The intended idea is to base the score on book popularity.
    """
    # TODO: define how to calculate novelty
    return None



In [None]:
# Evaluation with early check for user validity before calling recommend_for_user
def evaluate_case_2_and_3(train_data_inter, test_data_inter, train_data_rev, test_data_rev, books, interactions, read, umap_embeddings, faiss_index, book_id_to_index, user_id_to_index_gat, book_id_to_index_gat, all_embeddings, reviews):
    """Evaluate the NMF (case 2) and hybrid NMF+GAT (case 3) recommenders per user.

    Eligibility mirrors the routing in ``recommend_for_user``:
        * case 2: at least 5 distinct books read and fewer than 5 reviews;
        * case 3: at least 5 distinct books read and at least 5 reviews.

    Args:
        train_data_inter, train_data_rev: Accepted for interface
            compatibility; not used here.
        test_data_inter: Ground truth (``user_id``, ``book_id``) for case 2.
        test_data_rev: Ground truth (``user_id``, ``book_id``) for case 3.
        books, interactions, read, umap_embeddings, faiss_index,
        book_id_to_index, user_id_to_index_gat, book_id_to_index_gat,
        all_embeddings, reviews: Forwarded to ``recommend_for_user``; ``read``
            and ``reviews`` are also used for the eligibility counts.

    Returns:
        ``{'case_2_results': [...], 'case_3_results': [...]}`` where each list
        holds one dict per evaluated user with keys ``user_id``, ``precision``,
        ``recall``, ``ndcg``, ``mrr``, ``diversity``, ``surprise``, ``novelty``.
    """
    metric_names = ('precision', 'recall', 'ndcg', 'mrr', 'diversity', 'surprise', 'novelty')

    # Activity counts computed once, instead of re-filtering both frames per user.
    books_per_user = read.groupby('user_id')['book_id'].nunique()
    reviews_per_user = reviews['user_id'].value_counts()

    def _evaluate_users(test_data, is_eligible):
        # Recommend for, and score, every eligible user that appears in test_data.
        results = []
        for user_id in test_data['user_id'].unique():
            user_num_books = books_per_user.get(user_id, 0)
            user_num_reviews = reviews_per_user.get(user_id, 0)
            if not is_eligible(user_num_books, user_num_reviews):
                continue

            recommendations = recommend_for_user(
                user_id, books, interactions, read, umap_embeddings, faiss_index,
                book_id_to_index, user_id_to_index_gat, book_id_to_index_gat, all_embeddings, reviews
            )
            # evaluate_recommendations takes (ground-truth frame, {user_id: recs})
            # and returns a dict of metrics keyed by name.
            metrics = evaluate_recommendations(test_data, {user_id: recommendations})

            row = {'user_id': user_id}
            row.update((name, metrics[name]) for name in metric_names)
            results.append(row)
        return results

    # Case 2: Collaborative Filtering (NMF) evaluation
    case_2_results = _evaluate_users(
        test_data_inter,
        lambda n_books, n_reviews: n_books >= 5 and n_reviews < 5,
    )

    # Case 3: Hybrid Filtering (NMF + GAT) evaluation
    case_3_results = _evaluate_users(
        test_data_rev,
        lambda n_books, n_reviews: n_books >= 5 and n_reviews >= 5,
    )

    # Return both cases results
    return {
        'case_2_results': case_2_results,
        'case_3_results': case_3_results
    }

# Run the evaluation for both cases and keep the per-user metrics
results = evaluate_case_2_and_3(
    train_data_inter=train_data_inter,
    test_data_inter=test_data_inter,
    train_data_rev=train_data_rev,
    test_data_rev=test_data_rev,
    books=books,
    interactions=interactions,
    read=read,
    umap_embeddings=umap_embeddings,
    faiss_index=faiss_index,
    book_id_to_index=book_id_to_index,
    user_id_to_index_gat=user_id_to_index_gat,
    book_id_to_index_gat=book_id_to_index_gat,
    all_embeddings=all_embeddings,
    reviews=reviews,
)

# Quick sanity check: show only the first five entries of each result list
print("Case 2 Results:", results['case_2_results'][:5])
print("Case 3 Results:", results['case_3_results'][:5])
