In [14]:
from sklearn.metrics import pairwise_distances
import pandas as pd
import json

In [15]:
with open('books_data_with_recommendations.json', 'r') as f:
    books_data = json.load(f)

In [16]:
books_df = pd.DataFrame(books_data)
books_df.head()

Unnamed: 0,title_complete,description,image_url,publisher,authors,genres,publish_date,num_pages,isbn,isbn13,genres_vector,recommended_books,price
0,Never Let Me Go,Hailsham seems like a pleasant English boardin...,https://images-na.ssl-images-amazon.com/images...,Vintage Books,Kazuo Ishiguro,"[Adult, Science Fiction, Fiction, Novels, Audi...",2010-08-31T14:00:00,288.0,,,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[Something Borrowed (Darcy & Rachel, #1), Repl...",38
1,Uncle Tom’s Cabin,The narrative drive of Stowe's classic novel i...,https://images-na.ssl-images-amazon.com/images...,Wordsworth Classics,Harriet Beecher Stowe,"[Classic Literature, American, Fiction, Litera...",1999-08-05T14:00:00,438.0,,,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...","[Dead Souls, An American Tragedy, Les Liaisons...",77
2,The Poisonwood Bible,The Poisonwood Bible is a story told by the wi...,https://images-na.ssl-images-amazon.com/images...,Harper Perennial Modern Classics,Barbara Kingsolver,"[Adult Fiction, Fiction, Literature, Novels, R...",2005-05-31T14:00:00,546.0,,,"[0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...","[Ahab's Wife, or The Star-Gazer, Uncle Tom’s C...",53
3,Blood River: A Journey to Africa’s Broken Heart,A compulsively readable account of a journey t...,https://images-na.ssl-images-amazon.com/images...,Vintage,Tim Butcher,"[Nonfiction, Memoir, Politics, Africa, Travel,...",2008-05-27T14:00:00,363.0,99494280.0,9780099494287.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[Dear Barack: The Extraordinary Partnership of...,39
4,"Ender's Shadow (The Shadow Series, #1)",Welcome to Battleschool.Growing up is never ea...,https://images-na.ssl-images-amazon.com/images...,Starscape,Orson Scott Card,"[Science Fiction Fantasy, Fiction, Science Fic...",2002-05-19T14:00:00,469.0,765342405.0,9780765342409.0,"[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Monsters of Men (Chaos Walking, #3), Replay, ...",70


In [17]:
# Step 1: Create a set of all unique genres
unique_genres = sorted({genre for genres in books_df["genres"] for genre in genres})
unique_genres

['15th Century',
 '17th Century',
 '18th Century',
 '19th Century',
 '20th Century',
 'Abuse',
 'Action',
 'Adult',
 'Adult Fiction',
 'Adventure',
 'Africa',
 'Aliens',
 'American',
 'American History',
 'Angels',
 'Animals',
 'Art',
 'Arthurian',
 'Asia',
 'Audiobook',
 'Autobiography',
 'Banned Books',
 'Biography',
 'Biography Memoir',
 'Birds',
 'Book Club',
 'Books About Books',
 'British Literature',
 'Buddhism',
 'Business',
 'Chapter Books',
 'Chick Lit',
 'Childrens',
 'Classic Literature',
 'Classics',
 'College',
 'Coming Of Age',
 'Contemporary',
 'Contemporary Romance',
 'Cozy Mystery',
 'Crime',
 'Cultural',
 'Dark',
 'Detective',
 'Dragons',
 'Drama',
 'Dystopia',
 'Education',
 'Egypt',
 'English Literature',
 'Entrepreneurship',
 'Epic',
 'Epic Fantasy',
 'Erotica',
 'Espionage',
 'European History',
 'Fairy Tales',
 'Fantasy',
 'Fiction',
 'Finance',
 'Food',
 'France',
 'French Literature',
 'Gay',
 'Germany',
 'Ghost Stories',
 'Ghosts',
 'Greek Mythology',
 'Hallo

In [18]:
# Step 2: Create a binary matrix for genres
def encode_genres(genres, unique_genres):
    return [int(genre in genres) for genre in unique_genres]

In [19]:
books_df["binary_vector"] = books_df["genres"].apply(lambda x: encode_genres(x, unique_genres))


In [20]:
# Step 3: Compute Jaccard distances
import numpy as np
binary_matrix = np.array(books_df["binary_vector"].tolist())
binary_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [21]:
jaccard_distances = pairwise_distances(binary_matrix, metric="jaccard")



In [22]:
# Step 2: Convert distances into a DataFrame
distance_df = pd.DataFrame(jaccard_distances, index=books_df.index, columns=books_df.index)
distance_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.823529,0.75,1.0,0.75,0.75,0.823529,0.888889,0.75,0.75,...,0.888889,1.0,0.888889,0.888889,0.888889,0.666667,0.888889,0.571429,0.823529,0.75
1,0.823529,0.0,0.571429,1.0,0.888889,0.947368,0.571429,0.888889,0.947368,0.947368,...,0.823529,1.0,0.888889,0.947368,0.947368,0.947368,0.75,0.888889,0.888889,0.571429
2,0.75,0.571429,0.0,0.947368,0.888889,0.947368,0.571429,0.888889,0.947368,0.947368,...,0.823529,1.0,0.888889,0.947368,0.888889,0.947368,0.75,0.75,0.888889,0.571429
3,1.0,1.0,0.947368,0.0,0.947368,1.0,1.0,0.75,1.0,1.0,...,0.947368,1.0,0.947368,0.947368,0.947368,1.0,0.947368,1.0,1.0,1.0
4,0.75,0.888889,0.888889,0.947368,0.0,0.888889,0.823529,0.947368,0.947368,0.888889,...,0.823529,1.0,0.823529,0.823529,0.823529,0.666667,0.75,0.888889,0.823529,0.888889


In [23]:
# Step 3: Find top 15 similar books for each book
def find_top_similar_books(book_idx, distance_df, top_n=15):
    distances = distance_df.loc[book_idx]
    similar_books = (
        distances[distances.index != book_idx]
        .sort_values()
        .head(top_n)
        .index.tolist()
    )
    return similar_books

In [24]:
# Store Jaccard results (label) for each book
jaccard_results = {
    idx: find_top_similar_books(idx, distance_df, top_n=15)
    for idx in books_df.index
}

In [28]:
# Step 4: Calculate precision, recall, and accuracy
def calculate_metrics(recommended_books, jaccard_results):
    metrics = []
    total_true_positives = 0
    total_false_positives = 0
    total_false_negatives = 0

    for idx, recommended in enumerate(recommended_books):
        # Get Jaccard result indices
        jaccard_result = jaccard_results[idx]
        recommended_idx = [books_df.index[books_df['title_complete'] == book].tolist()[0]
                           for book in recommended if book in books_df['title_complete'].tolist()]

        # Intersection of recommended and Jaccard results
        true_positives = len(set(jaccard_result) & set(recommended_idx))
        false_positives = len(set(jaccard_result) - set(recommended_idx))
        false_negatives = len(set(recommended_idx) - set(jaccard_result))

        # Update global counts
        total_true_positives += true_positives
        total_false_positives += false_positives
        total_false_negatives += false_negatives

        # Precision: TP / (TP + FP)
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0

        # Recall: TP / (TP + FN)
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

        # Accuracy: TP / Total
        total = len(jaccard_result) + len(set(recommended_idx))
        accuracy = true_positives / total if total > 0 else 0

        metrics.append({"book_idx": idx, "precision": precision, "recall": recall, "accuracy": accuracy})

    # Calculate overall metrics
    overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0
    overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0
    overall_accuracy = total_true_positives / (total_true_positives + total_false_positives + total_false_negatives) if (total_true_positives + total_false_positives + total_false_negatives) > 0 else 0

    return pd.DataFrame(metrics), overall_precision, overall_recall, overall_accuracy


In [29]:
# Calculate metrics
metrics_df, overall_precision, overall_recall, overall_accuracy = calculate_metrics(books_df["recommended_books"], jaccard_results)



In [30]:
# Display individual metrics
print("Individual Metrics:")
print(metrics_df)

# Display overall metrics
print("\nOverall Metrics:")
print(f"Overall Precision: {overall_precision:.2f}")
print(f"Overall Recall: {overall_recall:.2f}")
print(f"Overall Accuracy: {overall_accuracy:.2f}")

Individual Metrics:
    book_idx  precision    recall  accuracy
0          0   1.000000  1.000000  0.500000
1          1   1.000000  1.000000  0.500000
2          2   1.000000  1.000000  0.500000
3          3   0.733333  0.733333  0.366667
4          4   0.933333  0.933333  0.466667
..       ...        ...       ...       ...
95        95   0.800000  0.800000  0.400000
96        96   0.800000  0.800000  0.400000
97        97   0.866667  0.866667  0.433333
98        98   0.800000  0.800000  0.400000
99        99   0.800000  0.800000  0.400000

[100 rows x 4 columns]

Overall Metrics:
Overall Precision: 0.83
Overall Recall: 0.83
Overall Accuracy: 0.72
