In [None]:
import pandas as pd
import pickle
import faiss
import numpy as np
import joblib
from torch_geometric.nn import GATv2Conv
import torch

In [None]:
# Collect every object that was pickled back-to-back into books.pkl.
books_list = []

with open('../Pickle/books.pkl', 'rb') as file:
    try:
        # Each pickle.load call consumes one chunk and advances the file position.
        while True:
            chunk = pickle.load(file)
            books_list.append(chunk)
    except EOFError:
        pass  # Expected: raised once the end of the file is reached
# Stitch the chunks into one frame, then keep only the first row seen for each title.
books = pd.concat(books_list, ignore_index=True).drop_duplicates(subset='title', keep='first')

In [None]:
# Load the pickled interactions, read-status and reviews tables from ../Pickle.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
interactions = pd.read_pickle('../Pickle/interactions.pkl')
read = pd.read_pickle('../Pickle/read.pkl')
reviews = pd.read_pickle('../Pickle/reviews.pkl')

In [None]:
def _load_pickle(path):
    """Unpickle and return the single object stored at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Embeddings and FAISS index that recommend_books_HDBSCAN searches together.
umap_embeddings = _load_pickle('../Pickle/umap_embeddings.pkl')

faiss_index = faiss.read_index('../Pickle/faiss_index.bin')

# book_id -> position used to pick a row of umap_embeddings.
book_id_to_index = _load_pickle('../Pickle/book_id_to_index.pkl')

# Lookups and embeddings that are passed to recommend_books_GAT.
user_id_to_index_gat = _load_pickle('../Pickle/user_id_to_index_gat.pkl')

book_id_to_index_gat = _load_pickle('../Pickle/book_id_to_index_gat.pkl')

all_embeddings = _load_pickle('../Pickle/gat_embeddings.pkl')

In [None]:
# Inspect the user-id -> index lookup that is later passed to recommend_books_GAT.
# NOTE(review): this displays the whole object; presumably a large mapping — consider showing only its size.
user_id_to_index_gat

In [None]:
# Inspect the book-id -> index lookup that is later passed to recommend_books_GAT.
# NOTE(review): this displays the whole object; presumably a large mapping — consider showing only its size.
book_id_to_index_gat

In [None]:
# Keep only the rows whose is_read flag equals 1.
read = read[read['is_read']== 1]

In [None]:
# Restrict every table to book ids that are still present in `books`
# (which has already been de-duplicated by title).
valid_book_ids = set(books['book_id'])
interactions = interactions[interactions['book_id'].isin(valid_book_ids)]
read = read[read['book_id'].isin(valid_book_ids)]
reviews = reviews[reviews['book_id'].isin(valid_book_ids)]

In [None]:
import torch
from Gatv2Conv import GATModel  # Import the model class from the other file

# Define the model architecture
# NOTE(review): these hyperparameters presumably have to match the ones used when
# gat_model.pth was trained, otherwise load_state_dict will reject the weights — confirm.
model = GATModel(
    in_channels=32,  # Input features per node
    hidden_channels=25,
    out_channels=1,
    num_heads=25,
    edge_feature_dim=386  # Correct edge feature dimension
)

# Load the saved model weights.
# map_location='cpu' remaps tensors that were saved from a GPU, so the checkpoint
# also loads on a machine without CUDA; the freshly built model lives on the CPU anyway.
state_dict = torch.load('../RecSysJupyter/gat_model.pth', map_location='cpu')
model.load_state_dict(state_dict)

# Set the model to evaluation mode
model.eval()

In [None]:
# User whose recommendations are computed in the cells below.
user_id = 8193

In [None]:
# Rows of the read table that belong to this user, and how many distinct books they cover.
user_books_read = read[read['user_id'] == user_id ]
user_num_books = user_books_read['book_id'].nunique()
# NOTE(review): the "reviews" counted here are this user's rows of `interactions`,
# not of the `reviews` table — confirm this is intended.
user_reviews = interactions[interactions['user_id']== user_id]
user_num_reviews = user_reviews['book_id'].nunique()

In [None]:
# Every distinct book id that appears in the interactions table.
all_books = interactions['book_id'].unique()
# Candidate books = all books minus the ones this user has read.
# The 'book_id' column is selected explicitly: calling set() on the DataFrame itself
# yields its column labels, so nothing the user had read would be removed.
unread_books = list(set(all_books) - set(user_books_read['book_id']))

In [None]:
# Show how many candidates there are plus a short preview instead of dumping the full list.
len(unread_books), unread_books[:10]

In [None]:
def recommend_books_HDBSCAN(book_id, books=books, umap_embeddings=umap_embeddings, top_n=5, book_id_to_index=book_id_to_index):
    """Return up to ``top_n`` nearest neighbours of ``book_id`` from the FAISS index.

    Parameters
    ----------
    book_id : key looked up in ``book_id_to_index``.
    books : DataFrame read positionally (``iloc``) with the indices FAISS returns;
        must provide 'title' and 'authors' ('cluster' is optional).
    umap_embeddings : indexable collection of embedding vectors.
    top_n : number of neighbours requested (one extra is fetched and dropped).
    book_id_to_index : mapping from book id to a position in ``umap_embeddings``.

    Returns
    -------
    list[dict]
        One dict per neighbour with keys 'title', 'authors', 'cluster' and
        'explanation'. The list is empty when ``book_id`` is not in the mapping,
        so callers can always ``extend`` with or index into the result.

    Notes
    -----
    Uses the module-level ``faiss_index``.
    NOTE(review): assumes the positions stored in the FAISS index line up with the
    row order of ``books`` — confirm against the code that built the index.
    """
    if book_id not in book_id_to_index:
        print(f"Warning: {book_id} is not in book_id_to_index.")  # Print missing book_id
        # An empty list (not a string) keeps the return type uniform for callers.
        return []

    book_idx = book_id_to_index[book_id]

    # Search for nearest neighbors using FAISS; ask for one extra hit because the
    # first result is expected to be the query book itself.
    query = np.array([umap_embeddings[book_idx]])
    distances, indices = faiss_index.search(query, top_n + 1)

    recommendations = []
    for idx, dist in zip(indices[0][1:], distances[0][1:]):  # Exclude the book itself
        # FAISS pads missing results with -1, which iloc would silently read as
        # "last row"; treat negative positions as out of bounds as well.
        if idx < 0 or idx >= len(books):
            print(f"Warning: Index {idx} is out of bounds for books DataFrame.")
            continue  # Skip if the index is out of bounds

        recommended_book = books.iloc[idx]
        explanation = f"Similarity Score: {round(1 / (1 + dist), 3)}"
        recommendations.append({
            "title": recommended_book["title"],
            "authors": recommended_book["authors"],
            "cluster": recommended_book.get("cluster", "N/A"),
            "explanation": explanation
        })

    return recommendations


In [None]:
def recommend_books_NMF(model, interactions, user_id, books_read, books, n_recommendations=5):
    """Rank the books a user has not read by the model's estimated rating.

    Parameters
    ----------
    model : object exposing ``predict(uid=..., iid=...)`` whose result has an ``est`` attribute.
    interactions : DataFrame with a 'book_id' column; defines the candidate pool.
    user_id : id forwarded to ``model.predict`` and used as the key of the result.
    books_read : iterable of book ids to exclude from the candidates.
    books : DataFrame with 'book_id', 'title' and 'authors' columns.
    n_recommendations : how many of the best-scored books to return.

    Returns
    -------
    dict
        ``{user_id: [(book_id, title, authors, predicted_rating), ...]}`` ordered
        from highest to lowest estimate.
    """
    # Candidate pool: every book seen in the interactions minus the ones already read.
    candidates = list(set(interactions['book_id'].unique()) - set(books_read))

    # Score each candidate with the model.
    scored = []
    for candidate_id in candidates:
        estimate = model.predict(uid=user_id, iid=candidate_id).est
        scored.append((candidate_id, estimate))

    # Best estimate first.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    # Attach title and authors to the leading entries.
    detailed = []
    for candidate_id, estimate in ranked[:n_recommendations]:
        title, authors = books.loc[books['book_id'] == candidate_id, ['title', 'authors']].values[0]
        detailed.append((candidate_id, title, authors, estimate))

    return {user_id: detailed}


In [None]:
import numpy as np
import pandas as pd

def recommend_books_GAT(user_id, unread_book_ids, all_embeddings, user_id_to_index, book_id_to_index, books_df, top_n=5):
    """Score unread books for a user with precomputed embeddings and return the best ones.

    Parameters
    ----------
    user_id : key looked up in ``user_id_to_index``.
    unread_book_ids : iterable of candidate book ids; ids missing from
        ``book_id_to_index`` are skipped.
    all_embeddings : indexable collection holding both user and book embeddings.
    user_id_to_index, book_id_to_index : mappings from id to a position in ``all_embeddings``.
    books_df : DataFrame with 'book_id' and 'title' columns.
    top_n : number of recommendations to return (default 5).

    Returns
    -------
    list[tuple]
        ``(book_id, title, predicted_rating)`` tuples, highest rating first. The
        title is "Unknown Title" when the book is not present in ``books_df``.

    Raises
    ------
    ValueError
        If ``user_id`` is not in ``user_id_to_index``.
    """
    # Map user ID to index
    user_index = user_id_to_index.get(user_id, None)
    if user_index is None:
        raise ValueError(f"User ID {user_id} not found in index mappings.")

    # Get user embedding
    user_embedding = all_embeddings[user_index]

    # Build the id -> title lookup once instead of scanning books_df for every
    # candidate; setdefault keeps the first title seen for a duplicated id.
    title_by_id = {}
    for known_id, known_title in zip(books_df['book_id'], books_df['title']):
        title_by_id.setdefault(known_id, known_title)

    predictions = []  # Store (book_id, title, predicted_rating)

    for book_id in unread_book_ids:
        book_index = book_id_to_index.get(book_id, None)
        if book_index is None:
            continue  # Skip books not found in mapping

        # Raw score is the dot product of the user and book embeddings.
        raw_score = np.dot(user_embedding, all_embeddings[book_index])

        # Undo the transform with expm1 (inverse of log1p).
        # NOTE(review): presumably the training targets were log1p-scaled — confirm.
        predicted_rating = np.expm1(raw_score)

        book_title = title_by_id.get(book_id, "Unknown Title")
        predictions.append((book_id, book_title, predicted_rating))

    # Sort by predicted rating (descending) and keep the leading top_n entries.
    top_recommendations = sorted(predictions, key=lambda x: x[2], reverse=True)[:top_n]

    return top_recommendations  # Returns [(book_id, title, predicted_rating), ...]


In [None]:
# Example User and Book IDs
# NOTE(review): book_id is not passed to the call below and is rebound by the loop variable.
user_id = 8193
book_id = 24584


# Score the unread books for this user with the GAT embeddings.
top_books = recommend_books_GAT(user_id, unread_books, all_embeddings, user_id_to_index_gat, book_id_to_index_gat, books)

print("Top 5 recommended books:")
# Each entry unpacks as (book_id, title, predicted_rating).
for book_id, title, rating in top_books:
    print(f" {title} (Book ID: {book_id}) - Predicted Rating: {rating:.2f}")



In [None]:
# Distinct ids of the books this user has read. Iterating (or calling set() on) the
# DataFrame itself yields column labels, so the 'book_id' column is taken explicitly.
read_book_ids = user_books_read['book_id'].unique()

if user_num_books < 5:
    # Few books read: fall back to content-based neighbours of each read book.
    print(f"User {user_id} has read {user_num_books} books. Using content-based filtering.")
    recommendations = []

    # For each book the user has read, get recommendations
    for book_id in read_book_ids:
        book_recommendations = recommend_books_HDBSCAN(book_id)
        # Only accept a list of recommendation dicts; anything else (for example a
        # "not found" marker) would corrupt the combined list.
        if isinstance(book_recommendations, list):
            recommendations.extend(book_recommendations)

    # Remove duplicate recommendations (keyed by title) and convert to a list
    unique_recommendations = list({rec['title']: rec for rec in recommendations}.values())

    # Display the recommendations
    for rec in unique_recommendations:
        print(f"Title: {rec['title']}")
        print(f"Authors: {rec['authors']}")
        print(f"Cluster: {rec['cluster']}")
        print(f"Explanation: {rec['explanation']}")
        print()

elif user_num_reviews < 5:
    # Enough books but few reviews. Using elif/else (instead of separate "> 5"
    # tests) means users with exactly 5 books or 5 reviews are handled too.
    print(f"User {user_id} has read {user_num_books} books. Using collaborative filtering.")
    best_nmf = joblib.load('../Pickle/best_nmf_model.pkl')
    book_recommendations = recommend_books_NMF(best_nmf, interactions, user_id, read_book_ids, books)
    user_predictions = book_recommendations.get(user_id, [])
    print(user_predictions)

    # Swap the 5th NMF pick for its closest content-based neighbour, when both exist.
    if len(user_predictions) >= 5:
        fifth_rec = user_predictions[4][0]  # book_id of the 5th recommendation
        new_recommendations = recommend_books_HDBSCAN(fifth_rec)
        if isinstance(new_recommendations, list) and new_recommendations:
            # Take the 1st book from the new recommendations
            new_first_rec = new_recommendations[0]
            # Entries are (book_id, title, authors, rating): the rating is element 3.
            # NOTE(review): the replacement is a (recommendation dict, rating) pair,
            # which differs in shape from the other four entries.
            user_predictions[4] = (new_first_rec, user_predictions[4][3])  # Keep the original rating

    # Store the updated recommendations for the user
    book_recommendations[user_id] = user_predictions[:5]
    print(user_predictions)

else:
    print("using nmf and gat2vconv")
    print(f"User {user_id} has read {user_num_books} books. Using collaborative filtering.")
    best_nmf = joblib.load('../Pickle/best_nmf_model.pkl')
    book_recommendations = recommend_books_NMF(best_nmf, interactions, user_id, read_book_ids, books)
    user_predictions = book_recommendations.get(user_id, [])
    # TODO: call recommend_books_GAT and combine its books with user_predictions
    

In [1]:
import pandas as pd
import pickle
import faiss
import numpy as np
import joblib
from torch_geometric.nn import GATv2Conv
import torch

# Load the necessary data
def load_data():
    """Load every artefact the recommenders need from disk.

    Returns
    -------
    tuple
        ``(books, interactions, read, reviews, umap_embeddings, faiss_index,
        book_id_to_index, user_id_to_index_gat, book_id_to_index_gat,
        all_embeddings)`` — in exactly this order.
    """
    def _unpickle(path):
        # Return the single object pickled at ``path``.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # books.pkl holds several objects pickled back to back; read until EOF.
    chunks = []
    with open('../Pickle/books.pkl', 'rb') as file:
        try:
            while True:
                chunks.append(pickle.load(file))
        except EOFError:
            pass  # Expected once every chunk has been consumed
    books = pd.concat(chunks, ignore_index=True)
    books = books.drop_duplicates(subset='title', keep='first')

    interactions = pd.read_pickle('../Pickle/interactions.pkl')
    read = pd.read_pickle('../Pickle/read.pkl')
    reviews = pd.read_pickle('../Pickle/reviews.pkl')

    umap_embeddings = _unpickle('../Pickle/umap_embeddings.pkl')
    faiss_index = faiss.read_index('../Pickle/faiss_index.bin')

    book_id_to_index = _unpickle('../Pickle/book_id_to_index.pkl')
    user_id_to_index_gat = _unpickle('../Pickle/user_id_to_index_gat.pkl')
    book_id_to_index_gat = _unpickle('../Pickle/book_id_to_index_gat.pkl')
    all_embeddings = _unpickle('../Pickle/gat_embeddings.pkl')

    # Keep only rows flagged as read, then restrict every table to the book ids
    # that are still present in the de-duplicated books frame.
    valid_book_ids = set(books['book_id'])
    read = read[read['is_read'] == 1]
    read = read[read['book_id'].isin(valid_book_ids)]
    interactions = interactions[interactions['book_id'].isin(valid_book_ids)]
    reviews = reviews[reviews['book_id'].isin(valid_book_ids)]

    return (
        books,
        interactions,
        read,
        reviews,
        umap_embeddings,
        faiss_index,
        book_id_to_index,
        user_id_to_index_gat,
        book_id_to_index_gat,
        all_embeddings,
    )

# Load GAT model
def load_gat_model():
    """Build the GATModel, load its saved weights and return it in evaluation mode.

    Returns
    -------
    GATModel
        Model with the weights from '../RecSysJupyter/gat_model.pth' loaded and
        ``eval()`` already called.

    NOTE(review): the hyperparameters below presumably have to match the ones used
    when the checkpoint was trained — confirm.
    """
    from Gatv2Conv import GATModel
    model = GATModel(
        in_channels=32,  # Input features per node
        hidden_channels=25,
        out_channels=1,
        num_heads=25,
        edge_feature_dim=386  # Edge feature dimension
    )
    # map_location='cpu' remaps tensors that were saved from a GPU, so the checkpoint
    # also loads on a machine without CUDA; the freshly built model lives on the CPU.
    state_dict = torch.load('../RecSysJupyter/gat_model.pth', map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()
    return model

# Recommendation function for HDBSCAN (Content-based)
def recommend_books_HDBSCAN(book_id, books, umap_embeddings, faiss_index, book_id_to_index, top_n=5):
    """Return up to ``top_n`` nearest neighbours of ``book_id`` from a FAISS index.

    Parameters
    ----------
    book_id : key looked up in ``book_id_to_index``.
    books : DataFrame read positionally (``iloc``) with the indices the search
        returns; must provide 'book_id', 'title' and 'authors'.
    umap_embeddings : indexable collection of embedding vectors.
    faiss_index : object exposing ``search(queries, k) -> (distances, indices)``.
    book_id_to_index : mapping from book id to a position in ``umap_embeddings``.
    top_n : number of neighbours requested (one extra is fetched and dropped).

    Returns
    -------
    list[dict]
        Dicts with keys 'book_id', 'title', 'authors', 'predicted_rating'
        (always "N/A") and 'explanation'. Empty when ``book_id`` is unknown.

    NOTE(review): assumes the positions stored in the index line up with the row
    order of ``books`` — confirm against the code that built the index.
    """
    if book_id not in book_id_to_index:
        return []

    book_idx = book_id_to_index[book_id]
    # Ask for one extra hit: the first result is expected to be the query book itself.
    query = np.array([umap_embeddings[book_idx]])
    distances, indices = faiss_index.search(query, top_n + 1)
    recommendations = []

    for idx, dist in zip(indices[0][1:], distances[0][1:]):  # Exclude the book itself
        # FAISS pads missing results with -1, which iloc would silently read as
        # "last row"; reject negative positions together with too-large ones.
        if idx < 0 or idx >= len(books):
            continue  # Skip out-of-bounds indices

        recommended_book = books.iloc[idx]
        authors = recommended_book["authors"]
        if isinstance(authors, list):
            authors = ', '.join(authors)
        explanation = f"Similarity Score: {round(1 / (1 + dist), 3)}"
        recommendations.append({
            "book_id": recommended_book["book_id"],
            "title": recommended_book["title"],
            "authors": authors,
            "predicted_rating": "N/A",
            "explanation": explanation
        })

    return recommendations

# Recommendation function for NMF (Collaborative filtering)
def recommend_books_NMF(nmf_model, interactions, user_id, books_read, books, n_recommendations=5):
    """Rank the books a user has not read by the model's estimated rating.

    Parameters
    ----------
    nmf_model : object exposing ``predict(uid=..., iid=...)`` whose result has an ``est`` attribute.
    interactions : DataFrame with a 'book_id' column; defines the candidate pool.
    user_id : id forwarded to ``nmf_model.predict``.
    books_read : iterable of book ids to exclude from the candidates.
    books : DataFrame with 'book_id', 'title' and 'authors' columns.
    n_recommendations : how many of the best-scored books to return.

    Returns
    -------
    list[dict]
        Dicts with keys 'book_id', 'title', 'authors', 'predicted_rating' and
        'explanation' (always "N/A"), ordered from highest to lowest estimate.
    """
    # Candidate pool: every book seen in the interactions minus the ones already read.
    candidates = list(set(interactions['book_id'].unique()) - set(books_read))

    # Score each candidate with the model.
    scored = []
    for candidate_id in candidates:
        estimate = nmf_model.predict(uid=user_id, iid=candidate_id).est
        scored.append((candidate_id, estimate))

    # Best estimate first.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    recommendations = []
    for candidate_id, estimate in ranked[:n_recommendations]:
        title, authors = books.loc[books['book_id'] == candidate_id, ['title', 'authors']].values[0]
        # A list of author names is flattened to one comma-separated string.
        if isinstance(authors, list):
            authors = ', '.join(authors)
        recommendations.append({
            "book_id": candidate_id,
            "title": title,
            "authors": authors,
            "predicted_rating": estimate,
            "explanation": "N/A"
        })

    return recommendations

# Recommendation function for GAT (Hybrid filtering)
def recommend_books_GAT(user_id, unread_book_ids, all_embeddings, user_id_to_index, book_id_to_index, books_df, top_n=5):
    """Score unread books for a user with precomputed embeddings and return the best ones.

    Parameters
    ----------
    user_id : key looked up in ``user_id_to_index``.
    unread_book_ids : iterable of candidate book ids; ids missing from
        ``book_id_to_index`` are skipped.
    all_embeddings : indexable collection holding both user and book embeddings.
    user_id_to_index, book_id_to_index : mappings from id to a position in ``all_embeddings``.
    books_df : DataFrame with 'book_id', 'title' and 'authors' columns.
    top_n : number of recommendations to return.

    Returns
    -------
    list[dict]
        Dicts with keys 'book_id', 'title', 'authors', 'predicted_rating' and
        'explanation' (always "N/A"), highest rating first. A book that is not in
        ``books_df`` gets the placeholders "Unknown Title" / "Unknown Author".

    Raises
    ------
    ValueError
        If ``user_id`` is not in ``user_id_to_index``.
    """
    user_index = user_id_to_index.get(user_id)
    if user_index is None:
        raise ValueError(f"User ID {user_id} not found in index mappings.")

    user_embedding = all_embeddings[user_index]

    # Pass 1: score every candidate. No DataFrame access here, so the cost per
    # candidate is a single dot product.
    scored = []
    for book_id in unread_book_ids:
        book_index = book_id_to_index.get(book_id)
        if book_index is None:
            continue

        raw_score = np.dot(user_embedding, all_embeddings[book_index])
        # Denormalise with expm1 (inverse of log1p).
        # NOTE(review): presumably the training targets were log1p-scaled — confirm.
        scored.append((book_id, np.expm1(raw_score)))

    best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]

    # Pass 2: look up title/authors only for the books that are actually returned.
    top_recommendations = []
    for book_id, predicted_rating in best:
        match = books_df.loc[books_df['book_id'] == book_id, ['title', 'authors']]
        if len(match) > 0:
            title = match['title'].iloc[0]
            authors = match['authors'].iloc[0]
            if isinstance(authors, list):
                authors = ', '.join(authors)
        else:
            # A scored book without metadata should not abort the whole request.
            title, authors = "Unknown Title", "Unknown Author"

        top_recommendations.append({
            "book_id": book_id,
            "title": title,
            "authors": authors,
            "predicted_rating": predicted_rating,
            "explanation": "N/A"  # Add an explanation if needed
        })

    return top_recommendations



# Main recommendation logic
def recommend_for_user(user_id, books, interactions, read, umap_embeddings, faiss_index, book_id_to_index, user_id_to_index_gat, book_id_to_index_gat, all_embeddings, reviews=None):
    """Pick a recommendation strategy from the user's activity and run it.

    Strategy:
      * fewer than 5 distinct books read -> content-based (recommend_books_HDBSCAN);
      * at least 5 books but fewer than 5 reviews -> collaborative (recommend_books_NMF);
      * otherwise -> NMF results followed by recommend_books_GAT results.

    Parameters
    ----------
    user_id : id of the user to recommend for.
    books, interactions, read : DataFrames; ``read`` and ``interactions`` need
        'user_id'/'book_id' columns as used below.
    umap_embeddings, faiss_index, book_id_to_index : inputs of the content-based recommender.
    user_id_to_index_gat, book_id_to_index_gat, all_embeddings : inputs of the GAT recommender.
    reviews : optional DataFrame with a 'user_id' column used to count the user's
        reviews. When omitted, a module-level ``reviews`` is used, which is what
        this function silently relied on before the parameter existed.

    Returns
    -------
    list[dict]
        Recommendation dicts as produced by the recommend_books_* helpers.

    Raises
    ------
    NameError
        If ``reviews`` is neither passed nor defined at module level.
    """
    if reviews is None:
        reviews = globals().get('reviews')
        if reviews is None:
            raise NameError("name 'reviews' is not defined: pass reviews= or define it at module level")

    user_books_read = read[read['user_id'] == user_id]
    read_book_ids = user_books_read['book_id']
    user_num_books = read_book_ids.nunique()
    user_num_reviews = reviews[reviews['user_id'] == user_id].shape[0]  # Number of reviews for the user

    if user_num_books < 5:
        print(f"User {user_id} has read {user_num_books} books. Using content-based filtering.")
        recommendations = []
        for book_id in read_book_ids:
            book_recommendations = recommend_books_HDBSCAN(book_id, books, umap_embeddings, faiss_index, book_id_to_index)
            recommendations.extend(book_recommendations)

        # De-duplicate by title; a later entry replaces an earlier one with the same title.
        unique_recommendations = {rec['title']: rec for rec in recommendations}.values()
        return list(unique_recommendations)

    # From here on the user has read at least 5 books. The previous "> 5" test
    # left users with exactly 5 books without any branch (the function returned None).
    best_nmf = joblib.load('../Pickle/best_nmf_model.pkl')

    # Collaborative filtering if reviews < 5
    if user_num_reviews < 5:
        print(f"User {user_id} has fewer reviews. Using collaborative filtering (NMF).")
        return recommend_books_NMF(best_nmf, interactions, user_id, read_book_ids, books)

    # Hybrid recommendation (NMF + GAT)
    print(f"User {user_id} has more than 5 books and reviews. Using hybrid filtering (NMF + GAT).")
    book_recommendations = recommend_books_NMF(best_nmf, interactions, user_id, read_book_ids, books)

    # GAT candidates: every book in the interactions that the user has not read.
    unread_books = list(set(interactions['book_id'].unique()) - set(read_book_ids))
    gat_recommendations = recommend_books_GAT(user_id, unread_books, all_embeddings, user_id_to_index_gat, book_id_to_index_gat, books)

    return book_recommendations + gat_recommendations

# Example usage
# load_data() returns a 10-tuple; the unpacking order must match its return statement.
books, interactions, read, reviews, umap_embeddings, faiss_index, book_id_to_index, user_id_to_index_gat, book_id_to_index_gat, all_embeddings = load_data()
user_id = 8193
# NOTE(review): `reviews` is not passed here; recommend_for_user picks it up from module scope.
recommendations = recommend_for_user(user_id, books, interactions, read, umap_embeddings, faiss_index, book_id_to_index, user_id_to_index_gat, book_id_to_index_gat, all_embeddings)


User 8193 has more than 5 books and reviews. Using hybrid filtering (NMF + GAT).


In [3]:
# Display the recommendation dicts returned by recommend_for_user.
recommendations

[{'book_id': 131072,
  'title': 'An Introduction to Ancient Egyptian Literature',
  'authors': 'E.A. Wallis Budge',
  'predicted_rating': 3.0476281670720264,
  'explanation': 'N/A'},
 {'book_id': 262151,
  'title': 'Socrates In Love',
  'authors': 'Kyoichi Katayama',
  'predicted_rating': 3.0476281670720264,
  'explanation': 'N/A'},
 {'book_id': 131081,
  'title': 'Dark Horse',
  'authors': 'Fletcher Knebel',
  'predicted_rating': 3.0476281670720264,
  'explanation': 'N/A'},
 {'book_id': 131088,
  'title': 'The Cold Blue Blood (Berger and Mitry, #1)',
  'authors': 'David Handler',
  'predicted_rating': 3.0476281670720264,
  'explanation': 'N/A'},
 {'book_id': 65557,
  'title': "The Bride's Necklace (Necklace Trilogy, #1)",
  'authors': 'Kat Martin',
  'predicted_rating': 3.0476281670720264,
  'explanation': 'N/A'},
 {'book_id': 33837,
  'title': 'When Night Falls',
  'authors': 'Linda  Anderson',
  'predicted_rating': 3.6996765,
  'explanation': 'N/A'},
 {'book_id': 42716,
  'title': '

In [4]:
# Print every recommendation as a small text card.
for rec in recommendations:
    print(f"Book ID: {rec['book_id']}")
    print(f"Title: {rec['title']}")

    # The recommend_* helpers already join a list of authors into one string.
    print(f"Authors: {rec['authors']}")

    # Content-based entries carry the placeholder "N/A" instead of a number, and a
    # float format spec applied to a string raises ValueError — fall back to plain text.
    rating = rec['predicted_rating']
    try:
        rating_text = f"{rating:.4f}"
    except (TypeError, ValueError):
        rating_text = str(rating)
    print(f"Predicted Rating: {rating_text}")
    print("-" * 40)


Book ID: 131072
Title: An Introduction to Ancient Egyptian Literature
Authors: E.A. Wallis Budge
Predicted Rating: 3.0476
----------------------------------------
Book ID: 262151
Title: Socrates In Love
Authors: Kyoichi Katayama
Predicted Rating: 3.0476
----------------------------------------
Book ID: 131081
Title: Dark Horse
Authors: Fletcher Knebel
Predicted Rating: 3.0476
----------------------------------------
Book ID: 131088
Title: The Cold Blue Blood (Berger and Mitry, #1)
Authors: David Handler
Predicted Rating: 3.0476
----------------------------------------
Book ID: 65557
Title: The Bride's Necklace (Necklace Trilogy, #1)
Authors: Kat Martin
Predicted Rating: 3.0476
----------------------------------------
Book ID: 33837
Title: When Night Falls
Authors: Linda  Anderson
Predicted Rating: 3.6997
----------------------------------------
Book ID: 42716
Title: Not Just For Christmas
Authors: Roddy Doyle
Predicted Rating: 3.6997
----------------------------------------
Book ID: 60