In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Import dataset
df = pd.read_csv('Resources/goodreads_dataset.csv')
df.head()

Unnamed: 0,title,titleComplete,description,genres,isbn,publisher,author,characters,places,ratingHistogram,ratingsCount,reviewsCount,numPages,language
0,Project Hail Mary,Project Hail Mary,Ryland Grace is the sole survivor on a despera...,"['Science Fiction Fantasy', 'Audiobook', 'Fant...",593135202.0,Ballantine Books,['Andy Weir'],"['Ryland Grace', 'Rocky']","['Tau Ceti System', 'Outer Space']","[1917, 5775, 29742, 116572, 266669]",420675.0,53538.0,476.0,English
1,The Talented Mr. Ripley,"The Talented Mr. Ripley (Ripley, #1)","Since his debut in 1955, Tom Ripley has evolve...","['Novels', 'Noir', 'Classics', 'Italy', 'Suspe...",393332144.0,W. W. Norton Company,['Patricia Highsmith'],"['Freddie Miles', 'Tom Ripley', 'Dickie Greenl...","['Italy', 'New York City, New York', 'Italian ...","[1483, 3902, 17161, 34467, 24270]",81283.0,5146.0,288.0,English
2,More Than This,More Than This,"A boy drowns, desperate and alone in his final...","['Queer', 'Fantasy', 'Contemporary', 'LGBT', '...",1406350486.0,Walker Books Ltd,['Patrick Ness'],['Seth Wearing'],,"[1441, 3672, 12295, 23873, 21208]",62489.0,8194.0,480.0,English
3,After Forever Ends,After Forever Ends,Orphaned by her mother and brushed off by her ...,"['Chick Lit', 'Fantasy', 'Coming Of Age', 'Con...",,Gingersnap Press,['Melodie Ramone'],,,"[81, 119, 205, 365, 750]",1520.0,241.0,564.0,English
4,A Bird Without Wings,A Bird Without Wings,"After an impoverished and indigent childhood, ...","['Contemporary', 'Contemporary Romance', 'Roma...",,Smashwords,['Roberta Pearce'],,,"[7, 6, 26, 49, 91]",179.0,31.0,,English


In [3]:
# Drop books that are written in other languages
df = df[df['language'] == 'English']
# Drop duplicate books by checking the isbn numbers
df = df.drop_duplicates(subset='isbn', keep='first')
# Replace NaN values with empty string
df['author'] = df['author'].fillna(' ')
# Drop columns that we won't be using
df.drop(columns=['titleComplete', 'characters', 'ratingHistogram', 'ratingsCount', 'reviewsCount', 'numPages', 'isbn', 'publisher', 'language', 'places'], inplace=True)
df.head()

Unnamed: 0,title,description,genres,author
0,Project Hail Mary,Ryland Grace is the sole survivor on a despera...,"['Science Fiction Fantasy', 'Audiobook', 'Fant...",['Andy Weir']
1,The Talented Mr. Ripley,"Since his debut in 1955, Tom Ripley has evolve...","['Novels', 'Noir', 'Classics', 'Italy', 'Suspe...",['Patricia Highsmith']
2,More Than This,"A boy drowns, desperate and alone in his final...","['Queer', 'Fantasy', 'Contemporary', 'LGBT', '...",['Patrick Ness']
3,After Forever Ends,Orphaned by her mother and brushed off by her ...,"['Chick Lit', 'Fantasy', 'Coming Of Age', 'Con...",['Melodie Ramone']
5,The Selected Writings of Edgar Allan Poe,"Enormously popular and widely admired, Edgar A...","['Poetry', 'Gothic', 'Essays', 'Literature', '...","['G.R. Thompson', 'Edgar Allan Poe']"


In [4]:
# Create function to clean strings that look like lists
def clean_list(list):
    return ', '.join(list.strip('[]').replace("'", "").split(", "))

In [5]:
# Convert list values into string values
df['genres'] = df['genres'].apply(clean_list)
df['author'] = df['author'].apply(clean_list)
df.head()

Unnamed: 0,title,description,genres,author
0,Project Hail Mary,Ryland Grace is the sole survivor on a despera...,"Science Fiction Fantasy, Audiobook, Fantasy, A...",Andy Weir
1,The Talented Mr. Ripley,"Since his debut in 1955, Tom Ripley has evolve...","Novels, Noir, Classics, Italy, Suspense, Myste...",Patricia Highsmith
2,More Than This,"A boy drowns, desperate and alone in his final...","Queer, Fantasy, Contemporary, LGBT, Young Adul...",Patrick Ness
3,After Forever Ends,Orphaned by her mother and brushed off by her ...,"Chick Lit, Fantasy, Coming Of Age, Contemporar...",Melodie Ramone
5,The Selected Writings of Edgar Allan Poe,"Enormously popular and widely admired, Edgar A...","Poetry, Gothic, Essays, Literature, Classics, ...","G.R. Thompson, Edgar Allan Poe"


In [6]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/syyun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Create a function to clean the long texts in the datasets
def clean_func(txt):
    # Clean the texts by removing punctuations and changing to lowercase
    txt = txt.lower()
    txt = re.sub(r'[^a-z\s]', '', txt)
    # Tokenize the text and remove stopwords
    txt_tokenized = txt.split()
    txt_tokenized = [word for word in txt_tokenized if word not in stop_words]
    result = ' '.join(txt_tokenized)
    return result

In [8]:
# Clean both description and review/text columns with the function
df['description'] = df['description'].apply(clean_func)
df.head()

Unnamed: 0,title,description,genres,author
0,Project Hail Mary,ryland grace sole survivor desperate lastchanc...,"Science Fiction Fantasy, Audiobook, Fantasy, A...",Andy Weir
1,The Talented Mr. Ripley,since debut tom ripley evolved ultimate bad bo...,"Novels, Noir, Classics, Italy, Suspense, Myste...",Patricia Highsmith
2,More Than This,boy drowns desperate alone final moments dies ...,"Queer, Fantasy, Contemporary, LGBT, Young Adul...",Patrick Ness
3,After Forever Ends,orphaned mother brushed dad fifteen year old s...,"Chick Lit, Fantasy, Coming Of Age, Contemporar...",Melodie Ramone
5,The Selected Writings of Edgar Allan Poe,enormously popular widely admired edgar allan ...,"Poetry, Gothic, Essays, Literature, Classics, ...","G.R. Thompson, Edgar Allan Poe"


In [9]:
def ask_user():
    user_titles = []
    user_reviews = []
    count = 0

    while count < 3:
        # Ask user for the book title
        title = input(f"What book did you read recently? Please enter the title: ")
        # Standardize the book title input
        title_clean = title.strip().lower()
        # Check if the book title is in our dataset
        find_book = df['title'].apply(lambda x: x.strip().lower() == title_clean)
        # Repeat if the book doesn't exist
        if not find_book.any():
            print(f"{title} doesn't exist in our database.")
        else:
            # Find the book in the database
            book = df['title'][find_book].iloc[0]
            # Ask user for the book review
            review = input(f"Please enter your review of {book}")
            # Save user inputs to lists
            user_titles.append(book)
            user_reviews.append(review)
            count += 1
    
    return user_titles, user_reviews

In [10]:
user_titles, user_reviews = ask_user()

In [11]:
print(user_titles)
print(user_reviews)

['Tomorrow, and Tomorrow, and Tomorrow', 'Stoner', 'Dark Matter']
['this is my favorite book of all time! i loved it so much. i will read it 2 more times.', "I didn't like this book. I was disappointed after reading it. I found this book very boring.", 'this book was very interesting. i loved the suspense and mystery happening as well as the sci-fi factors.']


In [12]:
# Clean the user reviews by removing stopwords
clean_reviews = [clean_func(review) for review in user_reviews]

analyzer = SentimentIntensityAnalyzer()
# Create a function that analyzes the sentiment of reviews
def analyze_review(review):
    sent = analyzer.polarity_scores(review)
    return sent['compound']

# Analyze sentiments of user reviews
user_sentiments = [analyze_review(review) for review in clean_reviews]
print(user_sentiments)

[0.7845, -0.2158, 0.8271]


In [13]:
# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english")
# Remove books that user already read from the dataset for recommendation
df_filtered = df[~df['title'].isin(user_titles)]
# Combine all description and genres of other books
X_books = df_filtered['description'] + ' ' + df_filtered['genres']

# Fit the vectorizer with the book dataset
tf_books = vectorizer.fit_transform(X_books)

In [14]:
recommendations = []
check_idx = set()

for i in range(3):
    title = user_titles[i]
    review = clean_reviews[i]
    sentiment = user_sentiments[i]

    # Find the book that user selected
    selected = df[df['title'] == title]
    # Combine all related data of the selected book
    user_combined = selected['description'] + ' ' + selected['genres'] + ' ' + review

    # Use the selected book's data as the target for vectorizer
    tf_user_input = vectorizer.transform(user_combined)
    # Get the cosine similarity of books compared to the user reviews
    similarity = cosine_similarity(tf_user_input, tf_books).flatten()
    # Apply user review's sentiment score to the similarity
    updated_similarity = similarity * (sentiment + 1)

    # Sort to select the top 2 similar book
    top2_idx = updated_similarity.argsort()[-2::][::-1]
    # Save the top 2 books into the list
    for idx in top2_idx:
        # Check duplicate and only add new recommendation to the list
        if idx not in check_idx:
            check_idx.add(idx)
            book = df_filtered.iloc[idx].to_dict()
            recommendations.append(book)

In [15]:
print("BOOK RECOMMENDATIONS FOR YOU: \n")
# Display all recommended books to the user
for i in range(len(recommendations)):
    book = recommendations[i]
    print(f"{i+1}. [{book['title']}] by {book['author']}")

BOOK RECOMMENDATIONS FOR YOU: 

1. [The Red Pyramid] by Rick Riordan
2. [The Book of Tomorrow] by Cecelia Ahern
3. [Women] by Charles Bukowski
4. [Forty Stories] by Dave Eggers, Donald Barthelme
5. [Seeds of Rebellion] by Brandon Mull
6. [The Bourne Supremacy] by Robert Ludlum
