In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Import book reviews dataset
reviews = pd.read_csv('Resources/book_reviews.csv')
reviews.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,671551345,Night World: Daughters Of Darkness,,ADB0JID2XRFYR,Harmony-Faith Charisma Izabela Jazmyn McDonague,1/3,5.0,1076457600,BEST BOOK EVER!!,This is 1 of da bst books dat i have EVER read...
1,671551345,Night World: Daughters Of Darkness,,,,1/3,5.0,1043971200,one of the best night world books!!!!,first of all i thought that this was one of lj...
2,671551345,Night World: Daughters Of Darkness,,,,1/3,3.0,960422400,three sisters to die for.......,"Once started I couldn't put it down, literally..."
3,671551345,Night World: Daughters Of Darkness,,A1V0SFB3AXM8JK,"K. Davis ""The Rose Bride""",0/2,1.0,1177718400,Disappointing to say the least,"This book is probably, in my opinion, one of (..."
4,671551345,Night World: Daughters Of Darkness,,,,0/0,5.0,889920000,"The most charming, captivating work from LJ Sm...",The plot and characters are incredible. Everyo...


In [3]:
# Drop columns that will not be used
reviews = reviews.drop(columns=['Price', 'User_id', 'profileName', 'review/helpfulness', 'review/time', 'review/summary'])
reviews.head()

Unnamed: 0,Id,Title,review/score,review/text
0,671551345,Night World: Daughters Of Darkness,5.0,This is 1 of da bst books dat i have EVER read...
1,671551345,Night World: Daughters Of Darkness,5.0,first of all i thought that this was one of lj...
2,671551345,Night World: Daughters Of Darkness,3.0,"Once started I couldn't put it down, literally..."
3,671551345,Night World: Daughters Of Darkness,1.0,"This book is probably, in my opinion, one of (..."
4,671551345,Night World: Daughters Of Darkness,5.0,The plot and characters are incredible. Everyo...


In [4]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Create a function to clean the long texts
def clean_func(txt):
    # Clean the texts by removing punctuations and changing to lowercase
    txt = txt.lower()
    txt = re.sub(r'[^a-z\s]', '', txt)
    # Tokenize the text and remove stopwords
    txt_tokenized = txt.split()
    txt_tokenized = [word for word in txt_tokenized if word not in stop_words]
    result = ' '.join(txt_tokenized)
    return result

[nltk_data] Downloading package stopwords to /Users/syyun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Clean review/text columns with the function
reviews['review/text'] = reviews['review/text'].apply(clean_func)

In [6]:
# Get user's review
user_review = 'I loved this book so much! I especially liked how the author set up the fantasy world. Everything was so magical and the imaginary world seemed very believable. All characters had great personality, even the side characters had their own story. I was moved by the friendships of the main characters that grew during their adventure.'

In [7]:
analyzer = SentimentIntensityAnalyzer()
sentiment = analyzer.polarity_scores(user_review)['compound']
print(f"User review's sentiment score is : {sentiment}")

User review's sentiment score is : 0.9458


In [8]:
# Filter the reviews based on ratings according to user review's sentiment
if sentiment >= 0:
    reviews_filtered = reviews[reviews['review/score'] >= 4]
else:
    reviews_filtered = reviews[reviews['review/score'] < 4]

In [9]:
vectorizer = TfidfVectorizer(stop_words="english")
# Vectorize filtered reviews
tf_reviews = vectorizer.fit_transform(reviews_filtered['review/text'])
# Vectorize user's review
tf_user = vectorizer.transform([user_review])

In [10]:
# Calculate the cosine similarity
similarity = cosine_similarity(tf_user, tf_reviews)

In [11]:
print("Similar books recommended based on your review: \n")
top5_idx = similarity[0].argsort()[-5:][::-1]
for idx in top5_idx:
    book = reviews.iloc[idx]
    sim_value = similarity[0][idx]
    print(f"<{book['Title']}>")

Similar books recommended based on your review: 

<Night World: Daughters Of Darkness>
<A Dance Through Time (Time Passages Romance)>
<A People's History of the United States: 1492 to Present>
<Red Is for Remembrance (Stolarz Series)>
<The New Drawing on the Right Side of the Brain>
