**Introduction:** This Python notebook presents a book recommendation system that provides personalized reading suggestions based on the named entities extracted from book summaries: given a title, it recommends the books whose summaries share the most named entities with it.

**1. Import necessary libraries**

In [1]:
import ast
import csv
import re

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm  # for progress tracking

**2. Data Acquisition:** For our book recommendation system, we used the CMU Book Summary Dataset from Kaggle (https://www.kaggle.com/datasets/ymaricar/cmu-book-summary-dataset). It contains plot summaries for 16 559 books extracted from Wikipedia, along with their metadata.

In [2]:
# Read the tab-separated dump into `data`: one list of string fields per book.
# The encoding is pinned instead of relying on the platform default
# (NOTE(review): the dump is presumably UTF-8 - confirm against the dataset),
# and newline="" is passed as the csv module asks for, so that line endings
# inside fields are interpreted by the csv reader rather than by open().
with open("booksummaries.txt", "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f, dialect="excel-tab")
    # tqdm only reports progress; list() drains the reader row by row
    data = list(tqdm(reader))

16559it [00:00, 25602.18it/s]


In [3]:
# Keep only the fields we need from every raw row and assemble them into a DataFrame.
# Raw field positions used: 0 -> ID, 2 -> title, 3 -> author, 5 -> genre, 6 -> summary.
records = [
    {
        "Index": position,  # 1-based running number of the row
        "ID": row[0],
        "BookTitle": row[2],
        "Author": row[3],
        "Genre": row[5],
        "Summary": row[6],
    }
    for position, row in enumerate(tqdm(data), start=1)
]

book_df = pd.DataFrame(
    records,
    columns=["Index", "ID", "BookTitle", "Author", "Genre", "Summary"],
)
book_df.head()

100%|█████████████████████████████████| 16559/16559 [00:00<00:00, 502867.03it/s]


Unnamed: 0,Index,ID,BookTitle,Author,Genre,Summary
0,1,620,Animal Farm,George Orwell,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,2,843,A Clockwork Orange,Anthony Burgess,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,3,986,The Plague,Albert Camus,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,4,1756,An Enquiry Concerning Human Understanding,David Hume,,The argument of the Enquiry proceeds by a ser...
4,5,2080,A Fire Upon the Deep,Vernor Vinge,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...


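**3. Text Preprocessing:** Plot summaries are tokenized with the spaCy library, and only alphabetic tokens are kept (punctuation, numbers and other non-alphabetic tokens are dropped).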

In [4]:
# Name of the English spaCy pipeline; the loaded `nlp` object is used below both
# for tokenizing the summaries and for named-entity recognition.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

In [5]:
# Clean the summary text using spaCy
def clean_summary(text):
    """Return ``text`` reduced to its alphabetic tokens, joined by single spaces.

    Only the tokenizer is run (``nlp.make_doc``) rather than the full pipeline:
    the function reads nothing but ``token.text`` and ``token.is_alpha``, both of
    which are available straight after tokenization, so running the remaining
    pipeline components for every summary would be wasted work.

    Parameters
    ----------
    text : str
        Raw plot summary.

    Returns
    -------
    str
        The alphabetic tokens of ``text`` in their original order, separated by
        one space each; an empty string when there are none.
    """
    doc = nlp.make_doc(text)  # tokenization only
    # Keep purely alphabetic tokens; punctuation, digits and whitespace tokens are dropped
    return " ".join(token.text for token in doc if token.is_alpha)

# Store the cleaned text in a new "NER_summary" column; the raw "Summary" column is kept
book_df["NER_summary"] = book_df["Summary"].apply(clean_summary)
book_df.head(1)

Unnamed: 0,Index,ID,BookTitle,Author,Genre,Summary,NER_summary
0,1,620,Animal Farm,George Orwell,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...",Old Major the old boar on the Manor Farm calls...


In [6]:
# Force the cleaned summaries to plain strings, then build a lookup Series that maps
# each book title to its cleaned summary.
ner_text = book_df["NER_summary"].astype("str")
book_df["NER_summary"] = ner_text
books = book_df.set_index("BookTitle")["NER_summary"]

**4. Extract entities**

In [7]:
# Function to extract named entities from text
def extract_named_entities(text):
    """Collect the distinct named-entity strings that spaCy finds in ``text``.

    Only entities carrying one of the labels listed in ``kept_labels`` are
    retained; every other label is ignored.

    Parameters
    ----------
    text : str
        Text to run through the ``nlp`` pipeline.

    Returns
    -------
    set of str
        The surface text of each retained entity, without duplicates.
    """
    kept_labels = {
        "PERSON",
        "LOC",
        "ORG",
        "GPE",
        "EVENT",
        "WORK_OF_ART",
        "LANGUAGE",
        "LAW",
        "NORP",
        "FAC",
    }
    found = set()
    for ent in nlp(text).ents:
        if ent.label_ in kept_labels:
            found.add(ent.text)
    return found

In [8]:
# Run entity extraction over every cleaned summary and keep one set of entities per book.
# NOTE(review): extraction runs on the punctuation-stripped "NER_summary" text, so
# sentence boundaries are gone; the recorded output contains spans such as
# "Animalism Soon" and "Animal Farm They". Consider extracting from the raw
# "Summary" column instead - confirm before changing, as recommendations will shift.
book_df["entities"] = book_df["NER_summary"].apply(extract_named_entities)
# Save title + entities to a CSV file for future use
book_df.loc[:, ["BookTitle", "entities"]].to_csv("book_entities.csv")
book_df.head(2)  # Display the first two rows of the DataFrame

Unnamed: 0,Index,ID,BookTitle,Author,Genre,Summary,NER_summary,entities
0,1,620,Animal Farm,George Orwell,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...",Old Major the old boar on the Manor Farm calls...,"{Animalism Soon, Animal Farm They, Pilkington,..."
1,2,843,A Clockwork Orange,Anthony Burgess,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...",Alex a teenager living in near future England ...,"{Sentenced, the Korova Milkbar, Alexander, Ale..."


**5. Compute recommendations:**

In [9]:
# Reduce book_df to a Series of entity sets indexed by book title; the functions
# below look books up in this Series by position and by title.
# The isinstance guard makes the cell safe to re-run: after the first run book_df
# is already a Series, which has no set_index(), so an unguarded second run fails.
if isinstance(book_df, pd.DataFrame):
    book_df = book_df.set_index("BookTitle")["entities"]  # Set BookTitle as index for easier access

In [15]:
# Function to convert entity strings to sets
def convert_to_set(entities):
    """Return the entities of one book as a new ``set``.

    Accepts either an in-memory collection of entity strings or the textual form
    of such a collection (e.g. ``"{'Alex', 'England'}"``, which is how a set ends
    up in a CSV file). A string is parsed with ``ast.literal_eval``; calling
    ``set()`` on it directly would produce a set of single characters.

    Parameters
    ----------
    entities : set, list, tuple or str
        Entity collection, or its Python-literal text representation.

    Returns
    -------
    set
        A fresh set; the input object is never returned itself.

    Raises
    ------
    ValueError
        If ``entities`` is a non-empty string that is not a valid Python literal.
    """
    if not isinstance(entities, str):
        return set(entities)

    text = entities.strip()
    # "set()" is how Python prints an empty set; handled explicitly because it is
    # a call expression rather than a literal
    if text in ("", "set()"):
        return set()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError) as exc:
        raise ValueError(f"Cannot parse entities from {entities!r}") from exc
    if isinstance(parsed, str):
        # A single quoted string is one entity, not a collection of characters
        return {parsed}
    return set(parsed)

In [16]:
# Function to compute entity similarity between two books
def compute_entity_similarity(id1, id2):
    """Measure how strongly the entity sets of two books overlap.

    The books are addressed by their integer position in ``book_df``. The score
    is the overlap coefficient: the number of shared entities divided by the
    size of the smaller of the two entity sets.

    Parameters
    ----------
    id1, id2 : int
        Positions (``iloc``) of the two books in ``book_df``.

    Returns
    -------
    dict
        ``"intersecting_entities"``: list of the shared entities (arbitrary order),
        ``"num_intersecting"``: how many entities are shared,
        ``"overlap_ratio"``: shared / size of the smaller set, or ``0`` when
        either book has no entities at all.
    """
    first, second = (convert_to_set(book_df.iloc[pos]) for pos in (id1, id2))
    shared = first & second
    smaller = min(len(first), len(second))
    # An empty entity set would mean dividing by zero, so the ratio stays at 0
    ratio = len(shared) / smaller if smaller > 0 else 0
    return {
        "intersecting_entities": list(shared),
        "num_intersecting": len(shared),
        "overlap_ratio": ratio,
    }

In [17]:
# Function to get nearest books based on entity similarity
def get_nearest(title, top_n=3):
    """Print the ``top_n`` books whose entities overlap most with those of ``title``.

    Every book in ``book_df`` is scored against the query book with
    ``compute_entity_similarity`` and ranked by descending ``overlap_ratio``.

    Parameters
    ----------
    title : str
        Exact title to look up in the index of ``book_df``. If several books
        share the title, the first one is used.
    top_n : int, default 3
        Maximum number of recommendations to print; values below 0 are treated as 0.

    Returns
    -------
    str or None
        The "not found" message (which is also printed) when ``title`` is not in
        ``book_df``; otherwise ``None`` after the recommendations were printed.
    """
    # Positions of all books carrying this exact title (empty when there is none).
    # Checking the match array replaces a bare ``except:`` around the lookup,
    # which would also have hidden unrelated errors.
    matches = np.where(np.array(book_df.index) == title)[0]
    if len(matches) == 0:
        message = f"Book {title} not found. Try again :)"
        print(message)
        return message
    idx = matches[0]

    # Negated ratios, so that an ascending argsort puts the most similar books first
    similarity_distance = np.array(
        [
            -compute_entity_similarity(idx, i)["overlap_ratio"]
            for i in range(len(book_df))
        ]
    )
    ranked = similarity_distance.argsort()
    # Remove the query book BEFORE truncating. Slicing top_n + 1 first and
    # filtering afterwards yields top_n + 1 results whenever ties push the query
    # book out of that window.
    result_indices = ranked[ranked != idx][: max(top_n, 0)]

    print("You might want to read:")
    # Display recommended book titles
    for book_title in book_df.index[result_indices].values:
        print(f"- {book_title}")

In [18]:
# No top_n given, so the default of 3 recommendations is requested
get_nearest("Dune")

You might want to read:
- The Winds of Dune
- Children of Dune
- Dune: House Corrino


In [19]:
# Same list length as the default, passed explicitly
get_nearest("Into the Wild", top_n=3)

You might want to read:
- Into Thin Air: A Personal Account of the Mt. Everest Disaster
- Darkness Descends
- The Royal Mess


In [26]:
# Ask for a longer list of five recommendations
get_nearest("James Bond: The Authorised Biography of 007", top_n=5)

You might want to read:
- The Intuitionist
- Here Comes the Sun
- A Cage of Eagles
- The Moneypenny Diaries: Guardian Angel
- All-Consuming Fire


In [22]:
# NOTE(review): the recorded top 3 contains no other Harry Potter title
# (e.g. "Harry the Dirty Dog" ranks first). Presumably a book with very few
# entities reaches a high overlap ratio by sharing just "Harry", because the
# ratio divides by the smaller entity set - confirm.
get_nearest("Harry Potter and the Half-Blood Prince", top_n=3)

You might want to read:
- Harry the Dirty Dog
- Man and Boy
- The Drop


In [23]:
# Same query widened to 10 results; in the recorded output the other Harry Potter
# titles only show up from the fifth position onwards.
get_nearest("Harry Potter and the Half-Blood Prince", top_n=10)

You might want to read:
- Harry the Dirty Dog
- Man and Boy
- The Drop
- A Dedicated Man
- Harry Potter and the Philosopher's Stone
- Harry Potter and the Deathly Hallows
- Harry Potter and the Chamber of Secrets
- Harry Potter and the Order of the Phoenix
- Harry Potter and the Goblet of Fire
- Dragon Tears


In [24]:
# Three recommendations for "War and Peace"
get_nearest("War and Peace", top_n=3)

You might want to read:
- By the River Piedra I Sat Down and Wept
- G.O.G. 666
- Le gang des gaffeurs


In [25]:
# The title must match the index entry exactly, hence the full subtitle
get_nearest("iWoz: Computer Geek to Cult Icon - How I Invented the Personal Computer, Co-Founded Apple, and Had Fun Doing It", top_n=3)

You might want to read:
- The Sentimentalists
- Crazy Love: Overwhelmed by a Relentless God
- Absolutely American
