In [1]:
import ast
import re

import gradio as gr
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Load the book metadata and the review data from CSV, printing each frame's
# column index right after it is read
BOOKS_CSV = 'Resources/goodreads_dataset.csv'
REVIEWS_CSV = 'Resources/book_reviews.csv'

books = pd.read_csv(BOOKS_CSV)
print(books.columns)
reviews = pd.read_csv(REVIEWS_CSV)
print(reviews.columns)

Index(['title', 'titleComplete', 'description', 'genres', 'isbn', 'publisher',
       'author', 'characters', 'places', 'ratingHistogram', 'ratingsCount',
       'reviewsCount', 'numPages', 'language'],
      dtype='object')
Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object')


In [3]:
# Keep just the columns that are used further down
books = books.loc[:, ['isbn', 'description', 'genres', 'author', 'title']]
reviews = reviews.loc[:, ['Id', 'review/score', 'review/text']]
# Join each review to its book (review 'Id' matched against book 'isbn', inner
# join by default), then drop the 'Id' key since 'isbn' carries the same value.
# NOTE(review): the preview below shows each review repeated with different
# genre lists, which suggests `books` has several rows per isbn - confirm
# whether that duplication is intended.
merged_df = reviews.merge(books, left_on='Id', right_on='isbn').drop(columns=['Id'])
merged_df.head()

Unnamed: 0,review/score,review/text,isbn,description,genres,author,title
0,5.0,This is 1 of da bst books dat i have EVER read...,671551345,"Three sisters, teenage vampires, attempt to es...","['Paranormal Romance', 'Witches', 'Romance', '...",['L.J. Smith'],Daughters of Darkness
1,5.0,This is 1 of da bst books dat i have EVER read...,671551345,"Three sisters, teenage vampires, attempt to es...","['Vampires', 'Supernatural', 'Young Adult', 'U...",['L.J. Smith'],Daughters of Darkness
2,5.0,first of all i thought that this was one of lj...,671551345,"Three sisters, teenage vampires, attempt to es...","['Paranormal Romance', 'Witches', 'Romance', '...",['L.J. Smith'],Daughters of Darkness
3,5.0,first of all i thought that this was one of lj...,671551345,"Three sisters, teenage vampires, attempt to es...","['Vampires', 'Supernatural', 'Young Adult', 'U...",['L.J. Smith'],Daughters of Darkness
4,3.0,"Once started I couldn't put it down, literally...",671551345,"Three sisters, teenage vampires, attempt to es...","['Paranormal Romance', 'Witches', 'Romance', '...",['L.J. Smith'],Daughters of Darkness


In [4]:
# Container types a parsed value may have. Captured at module level because the
# parameter of clean_list is named `list` and shadows the built-in inside it.
_LIST_LIKE_TYPES = (list, tuple)


# Create function to clean strings that look like lists
def clean_list(list):
    """Turn a stringified Python list into a plain comma-separated string.

    The value is parsed with ``ast.literal_eval`` so that items keep their own
    apostrophes (``["Children's", 'Fiction']`` -> ``Children's, Fiction``)
    instead of having every quote character stripped out.

    Parameters
    ----------
    list : str or list or tuple or any
        Usually text such as ``"['Romance', 'Fiction']"``. The name shadows
        the built-in but is kept so existing callers are unaffected.

    Returns
    -------
    str
        The items joined by ``', '``. Non-string, non-sequence input (for
        example a NaN cell) gives ``''``. Text that is not a valid list
        literal is cleaned by stripping brackets and single quotes.
    """
    # Already a real sequence: nothing to parse.
    if isinstance(list, _LIST_LIKE_TYPES):
        return ', '.join(str(item) for item in list)
    # Missing values (NaN floats) and other non-text cells have nothing to join.
    if not isinstance(list, str):
        return ''
    try:
        parsed = ast.literal_eval(list)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, _LIST_LIKE_TYPES):
        return ', '.join(str(item) for item in parsed)
    # Fallback for text that is not a list literal: strip brackets and quotes.
    return ', '.join(list.strip('[]').replace("'", "").split(", "))

In [5]:
# Flatten the list-looking 'genres' and 'author' strings into plain text
for column in ('genres', 'author'):
    merged_df[column] = merged_df[column].map(clean_list)
merged_df.head()

Unnamed: 0,review/score,review/text,isbn,description,genres,author,title
0,5.0,This is 1 of da bst books dat i have EVER read...,671551345,"Three sisters, teenage vampires, attempt to es...","Paranormal Romance, Witches, Romance, Fiction,...",L.J. Smith,Daughters of Darkness
1,5.0,This is 1 of da bst books dat i have EVER read...,671551345,"Three sisters, teenage vampires, attempt to es...","Vampires, Supernatural, Young Adult, Urban Fan...",L.J. Smith,Daughters of Darkness
2,5.0,first of all i thought that this was one of lj...,671551345,"Three sisters, teenage vampires, attempt to es...","Paranormal Romance, Witches, Romance, Fiction,...",L.J. Smith,Daughters of Darkness
3,5.0,first of all i thought that this was one of lj...,671551345,"Three sisters, teenage vampires, attempt to es...","Vampires, Supernatural, Young Adult, Urban Fan...",L.J. Smith,Daughters of Darkness
4,3.0,"Once started I couldn't put it down, literally...",671551345,"Three sisters, teenage vampires, attempt to es...","Paranormal Romance, Witches, Romance, Fiction,...",L.J. Smith,Daughters of Darkness


In [6]:
# Combine all text information about a book into a new column.
# Missing pieces are replaced with empty strings first: concatenating a string
# with NaN yields NaN, which would discard the row's remaining text and make
# the later str-based cleaning fail on a float.
merged_df['book_info'] = (
    merged_df['review/text'].fillna('')
    + ' ' + merged_df['description'].fillna('')
    + ' ' + merged_df['genres'].fillna('')
)

In [7]:
# Make sure the NLTK 'stopwords' corpus is available locally
nltk.download('stopwords')
# Set of English stopwords; clean_func checks every token against it
stop_words = set(stopwords.words('english'))

# Text normalizer used for the long free-text fields
def clean_func(txt):
    """Lowercase `txt`, keep only a-z and whitespace, and drop stopwords.

    Every character other than a lowercase ASCII letter or whitespace is
    deleted (not replaced by a space), the text is split on whitespace, tokens
    found in the module-level `stop_words` set are removed, and the remaining
    tokens are joined with single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', txt.lower())
    kept_tokens = filter(lambda token: token not in stop_words, letters_only.split())
    return ' '.join(kept_tokens)

[nltk_data] Downloading package stopwords to /Users/syyun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Normalize every combined book_info string with clean_func
merged_df['book_info'] = merged_df['book_info'].map(clean_func)

In [9]:
# Single shared VADER analyzer, reused by every call to analyze_review
analyzer = SentimentIntensityAnalyzer()


def analyze_review(review):
    """Return the 'compound' entry of the analyzer's polarity scores for `review`."""
    return analyzer.polarity_scores(review)['compound']

In [10]:
# Create a function to recommend books
def recommend_book(title, review):
    """Recommend up to five books whose review/description text matches a review.

    Parameters
    ----------
    title : str
        Title of the book the user just read. It is excluded from the
        recommendations (compared case-insensitively, ignoring outer spaces).
    review : str
        The user's free-text review of that book.

    Returns
    -------
    str
        One line per recommendation, formatted ``[title] by author``, ordered
        from most to least similar; or a short message when nothing matches.

    Notes
    -----
    The review's sentiment decides which rows of `merged_df` are candidates:
    a non-negative compound score keeps rows with 'review/score' >= 4,
    otherwise rows with 'review/score' < 4.
    """
    max_results = 5
    no_match_message = "No similar books found. Try writing a more detailed review."

    review = review or ""
    # Score sentiment on the raw text: clean_func removes stopwords (which
    # include negations such as "not"), punctuation and casing, so scoring the
    # cleaned text can flip or flatten the sentiment.
    user_sentiment = analyze_review(review)
    # The cleaned form is only used for TF-IDF matching against book_info,
    # which was normalized by the same function.
    user_review = clean_func(review)

    # Use the sentiment score to filter on the ratings score
    if user_sentiment >= 0:
        filtered_df = merged_df[merged_df['review/score'] >= 4]
    else:
        filtered_df = merged_df[merged_df['review/score'] < 4]

    # fit_transform raises on an empty corpus, so bail out early.
    if filtered_df.empty:
        return no_match_message

    # Vectorize the candidate texts and the user review in the same space
    vectorizer = TfidfVectorizer(stop_words="english")
    tf_books = vectorizer.fit_transform(filtered_df['book_info'])
    tf_user = vectorizer.transform([user_review])

    # Similarity of the user review to every candidate row, best first
    scores = cosine_similarity(tf_user, tf_books)[0]
    ranked_positions = scores.argsort()[::-1]

    # These positions index rows of filtered_df (the frame that was
    # vectorized), so titles/authors must be read from filtered_df as well.
    titles = filtered_df['title'].to_numpy()
    authors = filtered_df['author'].to_numpy()

    # Seed the seen-set with the user's own book so it is never suggested back.
    seen_titles = {str(title or "").strip().casefold()}
    lines = []
    for position in ranked_positions:
        # Rows sharing no vocabulary with the review are not real matches;
        # the ranking is descending, so everything after is zero as well.
        if scores[position] <= 0:
            break
        title_key = str(titles[position]).strip().casefold()
        if title_key in seen_titles:
            continue
        seen_titles.add(title_key)
        lines.append(f"[{titles[position]}] by {authors[position]}\n")
        # Walk the ranking until enough distinct titles are collected, rather
        # than deduplicating only the top five rows.
        if len(lines) == max_results:
            break

    if not lines:
        return no_match_message
    return "".join(lines)

In [11]:
# HTML snippet passed to gr.Interface as its `description` argument
instruction = """
    <h1>Book Recommendations</h1>
    <p>What book did you read recently?</p>
    <p>Enter the title and write a brief review of the book.</p>
    <p>Click 'Submit' to get new book recommendations!</p>
    <br/>
"""

In [12]:
# Input and output widgets for the recommender UI
title_box = gr.Textbox(label="Title")
review_box = gr.Textbox(label="Your Review")
result_box = gr.Textbox(label="Recommended Books")

# Wire recommend_book(title, review) to the two inputs and the single output
gr_interface = gr.Interface(
    fn=recommend_book,
    inputs=[title_box, review_box],
    outputs=result_box,
    description=instruction,
)
# share=True also requests a public URL in addition to the local one
gr_interface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://72ad14a1afb77ead2c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


