In [None]:
import pandas as pd
import pickle
import faiss
import numpy as np
import joblib
import torch
from torch_geometric.nn import GATv2Conv
from sklearn.model_selection import train_test_split

In [None]:
def load_data():
    """Load every dataset, embedding and index mapping used by the notebook.

    The books pickle is read as a sequence of consecutively pickled frames
    (one ``pickle.load`` per chunk until end of file), concatenated, and
    de-duplicated on ``title`` keeping the first occurrence. ``read``,
    ``interactions`` and ``reviews`` are then restricted to book ids that
    survive that de-duplication; ``read`` is additionally restricted to rows
    with ``is_read == 1``.

    NOTE: all inputs are pickle files; they must come from a trusted source,
    because unpickling can execute arbitrary code.

    Returns:
        tuple: ``(books, interactions, read, reviews, umap_embeddings,
        faiss_index, book_id_to_index, user_id_to_index_gat,
        book_id_to_index_gat, all_embeddings)`` in exactly this order.
    """
    # Books are stored as several pickled chunks appended to one file.
    book_chunks = []
    with open('../Pickle/books.pkl', 'rb') as handle:
        try:
            while True:
                book_chunks.append(pickle.load(handle))
        except EOFError:
            # End of file reached: every chunk has been read.
            pass
    all_books = pd.concat(book_chunks, ignore_index=True)
    books = all_books.drop_duplicates(subset='title', keep='first')

    # Interaction-style tables
    interactions = pd.read_pickle('../Pickle/interactions.pkl')
    read = pd.read_pickle('../Pickle/read.pkl')
    reviews = pd.read_pickle('../Pickle/reviews.pkl')

    # Embeddings, the FAISS index and the id -> position mappings
    umap_embeddings = pd.read_pickle('../Pickle/umap_embeddings.pkl')
    faiss_index = faiss.read_index('../Pickle/faiss_index.bin')
    book_id_to_index = pd.read_pickle('../Pickle/book_id_to_index.pkl')
    user_id_to_index_gat = pd.read_pickle('../Pickle/user_id_to_index_gat.pkl')
    book_id_to_index_gat = pd.read_pickle('../Pickle/book_id_to_index_gat.pkl')
    all_embeddings = pd.read_pickle('../Pickle/gat_embeddings.pkl')

    # Drop rows that refer to books removed by the title de-duplication.
    valid_book_ids = set(books['book_id'])
    marked_as_read = read['is_read'] == 1
    read_in_catalogue = read['book_id'].isin(valid_book_ids)
    read = read[marked_as_read & read_in_catalogue]
    interactions = interactions[interactions['book_id'].isin(valid_book_ids)]
    reviews = reviews[reviews['book_id'].isin(valid_book_ids)]

    loaded = (
        books, interactions, read, reviews,
        umap_embeddings, faiss_index, book_id_to_index,
        user_id_to_index_gat, book_id_to_index_gat, all_embeddings
    )
    return loaded


In [None]:
def denormalize_rating(log_scaled_ratings, min_rating):
    """Map log1p-scaled ratings back to the 0-5 rating scale.

    Args:
        log_scaled_ratings: Scalar or sequence of log1p-scaled values; it is
            converted to a float array.
        min_rating: Offset added after the inverse transform. Any falsy value
            (``None``, ``0``) means "no offset".

    Returns:
        The ratings after ``expm1`` and the optional offset, clipped to
        the closed interval [0, 5].
    """
    # Invert log1p.
    ratings = np.expm1(np.asarray(log_scaled_ratings, dtype=float))

    # Shift by the minimum rating when one was supplied.
    if min_rating:
        ratings += min_rating

    # Keep the result inside the valid rating range.
    return np.clip(ratings, 0, 5)

In [None]:
def load_gat_model():
    """Build the GAT model and load its trained weights in evaluation mode.

    The architecture hyper-parameters below must match the ones used when
    ``gat_model.pth`` was saved, otherwise ``load_state_dict`` raises.

    Returns:
        GATModel: the model with weights loaded and ``eval()`` applied.
    """
    # Project-local module; imported lazily so the notebook's import cell
    # does not depend on it.
    from Gatv2Conv import GATModel
    model = GATModel(
        in_channels=32,
        hidden_channels=25,
        out_channels=1,
        num_heads=25,
        edge_feature_dim=386
    )
    # The model is built on the CPU, so map the checkpoint to the CPU as well.
    # Without map_location, a checkpoint saved from a CUDA device cannot be
    # deserialized on a machine that has no GPU.
    state_dict = torch.load('../RecSysJupyter/gat_model.pth', map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()
    return model



In [None]:
def recommend_books_HDBSCAN(book_id, books, umap_embeddings, faiss_index, book_id_to_index, predicted_ratings, top_n=5):
    """Return the nearest neighbours of ``book_id`` in the FAISS index.

    Args:
        book_id: Id of the query book.
        books: DataFrame with a ``book_id`` column; neighbour positions
            returned by the index are looked up positionally in it.
        umap_embeddings: Embeddings indexable by the position stored in
            ``book_id_to_index``.
        faiss_index: Object exposing ``search(queries, k)`` that returns
            ``(distances, indices)``.
        book_id_to_index: Mapping from book id to embedding position.
        predicted_ratings: Mapping from book id to a predicted rating.
        top_n: Number of neighbours requested (besides the query itself).

    Returns:
        list[dict]: One ``{'book_id', 'rating'}`` dict per valid neighbour.
        ``rating`` is the predicted rating of the *query* book (``None`` if
        unknown), not of the neighbour. Empty list when ``book_id`` is not in
        ``book_id_to_index``.
    """
    if book_id not in book_id_to_index:
        return []

    book_idx = book_id_to_index[book_id]
    # FAISS expects float32 query vectors.
    query = np.asarray([umap_embeddings[book_idx]], dtype=np.float32)
    distances, indices = faiss_index.search(query, top_n + 1)

    # Loop-invariant: every neighbour inherits the query book's rating.
    rating = predicted_ratings.get(book_id, None)
    n_books = len(books)

    recommendations = []
    # The first hit is skipped on the assumption that it is the query book itself.
    for idx in indices[0][1:]:
        # FAISS pads missing neighbours with -1; a negative position would make
        # ``iloc`` silently return a row counted from the end of the frame.
        if idx < 0 or idx >= n_books:
            continue
        recommendations.append({
            'book_id': books.iloc[idx]["book_id"],
            'rating': rating  # Same predicted rating as the query book
        })

    return recommendations


Open question: should all recommendations be passed through the HDBSCAN step?

If HDBSCAN is not applied, this function behaves as usual and yields the same metrics as plain NMF.

In [None]:
def recommend_books_NMF(nmf_model, interactions, user_id, books_read, books, min_rating=None, n_recommendations=5):
    """Rank every book in ``interactions`` for a user with the NMF model.

    Args:
        nmf_model: Model exposing ``predict(uid=..., iid=...)`` whose result
            has an ``est`` attribute.
        interactions: DataFrame with a ``book_id`` column; its unique values
            are the candidate set.
        user_id: User to score the candidates for.
        books_read: Not used by this function.
        books: Not used by this function.
        min_rating: Offset forwarded to ``denormalize_rating``.
        n_recommendations: Number of top-scored books to return.

    Returns:
        list[dict]: ``{'book_id', 'predicted_rating'}`` dicts ordered by the
        model estimate, highest first, with the rating denormalized.
    """
    candidate_ids = interactions['book_id'].unique()

    # Score every candidate book for this user.
    scored = []
    for candidate in candidate_ids:
        estimate = nmf_model.predict(uid=user_id, iid=candidate).est
        scored.append((candidate, estimate))

    # Highest estimate first, then keep the requested number of books.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    best = scored[:n_recommendations]

    # Convert the model's scale back to ratings.
    recommendations = []
    for candidate, estimate in best:
        recommendations.append({
            'book_id': candidate,
            'predicted_rating': denormalize_rating([estimate], min_rating)[0],
        })
    return recommendations

In [None]:
def recommend_books_GAT(user_id, unread_book_ids, all_embeddings, user_id_to_index, book_id_to_index, books_df, min_rating=None, top_n=5):
    """Rank unread books for a user by the dot product of GAT embeddings.

    Args:
        user_id: User to recommend for.
        unread_book_ids: Iterable of candidate book ids.
        all_embeddings: Embedding container indexed by the positions stored
            in both ``user_id_to_index`` and ``book_id_to_index``.
        user_id_to_index: Mapping from user id to embedding position.
        book_id_to_index: Mapping from book id to embedding position.
        books_df: Not used by this function.
        min_rating: Offset forwarded to ``denormalize_rating``.
        top_n: Number of recommendations to return.

    Returns:
        list[dict]: Up to ``top_n`` ``{'book_id', 'predicted_rating'}`` dicts,
        highest predicted rating first. Books without an embedding position
        are skipped.

    Raises:
        ValueError: If ``user_id`` has no entry in ``user_id_to_index``.
    """
    user_index = user_id_to_index.get(user_id)
    if user_index is None:
        raise ValueError(f"User ID {user_id} not found in index mappings.")

    user_embedding = all_embeddings[user_index]

    predictions = []
    for book_id in unread_book_ids:
        book_index = book_id_to_index.get(book_id)
        if book_index is None:
            continue

        # Raw score on the log1p scale. denormalize_rating already applies
        # expm1, so it must not be applied here as well: inverting the
        # transform twice inflates the scores and pushes most of them to the
        # clipping bound, which destroys the ranking.
        log_scaled_score = np.dot(user_embedding, all_embeddings[book_index])
        denormed_rating = denormalize_rating([log_scaled_score], min_rating)[0]

        predictions.append({
            "book_id": book_id,
            "predicted_rating": denormed_rating
        })

    # Sort by predicted rating in descending order
    sorted_books = sorted(predictions, key=lambda x: x["predicted_rating"], reverse=True)

    # Return top_n recommendations, each with book_id and predicted_rating
    return sorted_books[:top_n]

In [None]:
def merge_recommendations_weighted(gat_recs, nmf_recs, final_size=5, gat_weight=0.6, nmf_weight=0.4):
    """Merge GAT and NMF recommendations into one list ranked by weighted score.

    Each recommendation is scored as ``weight * predicted_rating`` with the
    weight of the list it comes from. A book is scored only the first time it
    is seen: GAT entries are processed before NMF entries, so a book present
    in both lists keeps its GAT score and the NMF entry is ignored.

    Args:
        gat_recs: List of ``{'book_id', 'predicted_rating'}`` dicts from GAT.
        nmf_recs: List of ``{'book_id', 'predicted_rating'}`` dicts from NMF.
        final_size: Maximum number of merged recommendations returned.
        gat_weight: Multiplier for GAT ratings.
        nmf_weight: Multiplier for NMF ratings.

    Returns:
        list[dict]: Up to ``final_size`` ``{'book_id', 'score'}`` dicts,
        highest score first.
    """
    # book_id -> weighted score; insertion order records first appearance.
    score_by_book = {}
    for weight, recs in ((gat_weight, gat_recs), (nmf_weight, nmf_recs)):
        for rec in recs:
            book = rec['book_id']
            if book in score_by_book:
                continue
            score_by_book[book] = weight * rec['predicted_rating']

    # Stable sort: equal scores keep their first-appearance order.
    ranked = sorted(
        ({'book_id': book, 'score': score} for book, score in score_by_book.items()),
        key=lambda entry: entry['score'],
        reverse=True,
    )

    # Take entries from the top until the requested size is reached.
    combined = []
    for entry in ranked:
        if len(combined) >= final_size:
            break
        combined.append(entry)
    return combined


In [None]:
# Load everything once; the names mirror the order of load_data's return tuple.
(
    books, interactions, read, reviews,
    umap_embeddings, faiss_index, book_id_to_index,
    user_id_to_index_gat, book_id_to_index_gat, all_embeddings,
) = load_data()

In [None]:
# Per-user activity: number of distinct books marked as read and number of review rows
user_stats = read.groupby('user_id')['book_id'].nunique()
user_reviews_count = reviews.groupby('user_id').size()

# Merge the stats into a single DataFrame aligned on user_id;
# a user missing from one of the two sources gets 0 for that column
user_data = pd.DataFrame({
    'num_books': user_stats,
    'num_reviews': user_reviews_count
}).fillna(0)

# Define conditions for case_2 and case_3
# case_2_users = user_data[(user_data['num_books'] >= 10) & (user_data['num_reviews'] < 5)].index
case_3_users = user_data[(user_data['num_books'] >= 10) & (user_data['num_reviews'] >= 5)].index

# Filter out users without reviews or without a GAT embedding.
# Membership must be tested against the user_id *column*: `reviews.index`
# holds the row labels of the reviews frame, not user ids.
# valid_case_2_users = [user_id for user_id in case_2_users if user_id in interactions.index]
reviewer_ids = set(reviews['user_id'])
valid_case_3_users = [
    user_id for user_id in case_3_users
    if user_id in reviewer_ids and user_id in user_id_to_index_gat
]

In [None]:
# len(valid_case_2_users)

In [None]:
# Number of case-3 users that passed the validity filter
len(valid_case_3_users)

In [None]:
# train_2, test_2 = train_test_split(valid_case_2_users)
# Fix the seed so the same users land in train/test on every run.
train_3, test_3 = train_test_split(valid_case_3_users, random_state=42)

In [None]:
def evaluate_case_3_users(case_3_users, interactions, books, read, reviews,
                          umap_embeddings, faiss_index, book_id_to_index,
                          user_id_to_index_gat, book_id_to_index_gat, all_embeddings,
                          final_size=5, gat_weight=0.7, nmf_weight=0.3):
    """Produce hybrid (GAT + NMF) recommendations for each given user.

    For every user that has at least one row in ``read`` and in ``reviews``
    and an entry in ``user_id_to_index_gat``, ``2 * final_size`` candidates
    are requested from each model and merged with
    ``merge_recommendations_weighted``.

    Args:
        case_3_users: Iterable of user ids to process.
        interactions: DataFrame with a ``book_id`` column (candidate books).
        books: Books DataFrame, forwarded to the recommenders.
        read: DataFrame with ``user_id`` and ``book_id`` columns.
        reviews: DataFrame with a ``user_id`` column.
        umap_embeddings, faiss_index, book_id_to_index: Not used by this
            function.
        user_id_to_index_gat: Mapping from user id to GAT embedding position.
        book_id_to_index_gat: Mapping from book id to GAT embedding position.
        all_embeddings: GAT embeddings.
        final_size: Number of merged recommendations kept per user.
        gat_weight: Weight of GAT ratings in the merge.
        nmf_weight: Weight of NMF ratings in the merge.

    Returns:
        list[dict]: One ``{'user_id', 'recommended_books'}`` dict per
        processed user; ``recommended_books`` holds ``{'book_id', 'score'}``
        dicts.
    """
    best_nmf = joblib.load('../Pickle/best_nmf_model.pkl')

    # The candidate catalogue does not depend on the user, so build it once
    # instead of recomputing unique() and the set for every user.
    all_book_ids = set(interactions['book_id'].unique())

    results = []
    for user_id in case_3_users:
        user_books_read = read[read['user_id'] == user_id]
        user_reviews = reviews[reviews['user_id'] == user_id]

        if user_books_read.empty or user_reviews.empty:
            continue

        # Validate user in GAT index mapping
        if user_id not in user_id_to_index_gat:
            print(f"Skipping user {user_id}: not in GAT user index.")
            continue

        # Get books the user hasn't read yet
        unread_books = list(all_book_ids - set(user_books_read['book_id']))

        # Get NMF recommendations
        nmf_recs = recommend_books_NMF(
            nmf_model=best_nmf,
            interactions=interactions,
            user_id=user_id,
            books_read=user_books_read['book_id'],
            books=books,
            n_recommendations=final_size * 2
        )

        # Get GAT recommendations
        gat_recs = recommend_books_GAT(
            user_id=user_id,
            unread_book_ids=unread_books,
            all_embeddings=all_embeddings,
            user_id_to_index=user_id_to_index_gat,
            book_id_to_index=book_id_to_index_gat,
            books_df=books,
            top_n=final_size * 2
        )

        # Merge recommendations with weighted approach
        merged_recs = merge_recommendations_weighted(
            gat_recs=gat_recs,
            nmf_recs=nmf_recs,
            final_size=final_size,
            gat_weight=gat_weight,
            nmf_weight=nmf_weight
        )

        results.append({
            'user_id': user_id,
            'recommended_books': merged_recs  # Contains book_id + weighted score
        })

        print(f"User {user_id} hybrid weighted recommendations: {merged_recs}")

    return results

# Run the hybrid recommender for the case-3 users.
# Data arguments are passed positionally in the order of the function
# signature; only the model weights are spelled out by name.
hybrid_results = evaluate_case_3_users(
    case_3_users, interactions, books, read, reviews,
    umap_embeddings, faiss_index, book_id_to_index,
    user_id_to_index_gat, book_id_to_index_gat, all_embeddings,
    gat_weight=0.7,  # share of the GAT model in the merged score
    nmf_weight=0.3,  # share of the NMF model in the merged score
)


In [None]:
def eval_at_k(results, ground_truth, k=5):
    """Compute and print the mean precision@k over all users in ``results``.

    Args:
        results: List of ``{'user_id', 'recommended_books'}`` dicts, where
            ``recommended_books`` is a list of dicts with a ``book_id`` key.
        ground_truth: Mapping from user id to the collection of relevant
            book ids; users missing from it count as having none.
        k: Cut-off. Only the first ``k`` recommendations are considered and
            the hit count is always divided by ``k``.

    Returns:
        The mean of the per-user precisions (a single value).
    """
    def _precision_for(entry):
        # Relevant books for this user (empty when the user is unknown).
        relevant = set(ground_truth.get(entry['user_id'], []))
        top_ids = {rec['book_id'] for rec in entry['recommended_books'][:k]}
        return len(top_ids & relevant) / k

    per_user = [_precision_for(entry) for entry in results]

    avg_precision = np.mean(per_user)
    print(f"Precision@{k}: {avg_precision:.4f}")
    return avg_precision


In [None]:
# Review rows written by the valid case-3 users
valid_case_3_reads = reviews.loc[reviews['user_id'].isin(valid_case_3_users)]

# Ground truth for evaluation: user_id -> set of book ids that user reviewed
valid_case_3_user_read_books = (
    valid_case_3_reads
    .groupby('user_id')['book_id']
    .apply(set)
    .to_dict()
)


In [None]:
# eval_at_k returns a single value (the mean precision), so bind one name;
# unpacking it into two names raises TypeError.
# NOTE(review): hybrid_results is built with the default final_size=5, so each
# user has at most 5 recommendations and precision@10 cannot exceed 0.5 —
# confirm that k=10 is intended.
precision = eval_at_k(hybrid_results, valid_case_3_user_read_books, k=10)


In [None]:
# def evaluate_case_2_users(case_2_users, interactions, books, read):
#     best_nmf = joblib.load('../Pickle/best_nmf_model.pkl')
#     results = []

#     predicted_ratings = {} 

#     for user_id in case_2_users:
#         user_books_read = read[read['user_id'] == user_id]
#         if user_books_read.empty:
#             continue  # Skip if the user hasn't read any books

#         user_books = user_books_read['book_id'].tolist()  # List of books user has read

#         recs = recommend_books_NMF(
#             nmf_model=best_nmf,
#             interactions=interactions,
#             user_id=user_id,
#             books_read=user_books,
#             books=books
#         )

#         nmf_recommended_book_ids = [rec['book_id'] for rec in recs]
#         nmf_predicted_ratings = {rec['book_id']: rec['predicted_rating'] for rec in recs}

#         predicted_ratings.update(nmf_predicted_ratings)

#         final_recommended_books = {
#             book_id: predicted_ratings.get(book_id, 0) for book_id in nmf_recommended_book_ids
#         }

#         results.append({
#             'user_id': user_id,
#             'recommended_books': [
#                 {'book_id': book_id, 'predicted_rating': predicted_rating}
#                 for book_id, predicted_rating in final_recommended_books.items()
#             ][:5]  # Limit to final 5 recommendations
#         })

#         print(f"User {user_id} recommended books and predicted ratings: {list(final_recommended_books.items())[:5]}")

#     return results

# nmf_results = evaluate_case_2_users(case_2_users, interactions, books, read)
