# **Goodreads Book Reviews Sentiment Analysis**

In [3]:
# File: imports.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
# Download the VADER lexicon
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/HP/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [5]:
# Load the dataset
works = pd.read_csv('/Users/HP/Documents/Data Analytics/CodeBasics/Projects Portfolio/Goodreads+Book+Reviews/goodreads_works.csv')
reviews = pd.read_csv('/Users/HP/Documents/Data Analytics/CodeBasics/Projects Portfolio/Goodreads+Book+Reviews/goodreads_reviews.csv')

  reviews = pd.read_csv('/Users/HP/Documents/Data Analytics/CodeBasics/Projects Portfolio/Goodreads+Book+Reviews/goodreads_reviews.csv')


In [6]:
# Display the first few rows of the datasets
works_df = pd.DataFrame(works)
reviews_df = pd.DataFrame(reviews)

print("Works Dataset:")
print(works_df.head())

print("\nReviews Dataset:")
print(reviews_df.head())

# Display the shape of the datasets 
print("\nShape of Works Dataset:", works_df.shape)
print("Shape of Reviews Dataset:", reviews_df.shape)    

Works Dataset:
    work_id        isbn        isbn13         original_title  \
0   2919130  1416534601  9.781417e+12              Nocturnes   
1  52087333         NaN           NaN              Draw Play   
2   1649583  1416505520  9.781417e+12  Citizen of the Galaxy   
3    688299  0060541830  9.780061e+12                  Congo   
4   3464264  0451528824  9.780452e+12   Anne of Green Gables   

               author  original_publication_year  num_pages  \
0       John Connolly                     2004.0        NaN   
1           Tia Lewis                     2016.0        NaN   
2  Robert A. Heinlein                     1957.0        NaN   
3    Michael Crichton                     1980.0        NaN   
4     L.M. Montgomery                     1908.0        NaN   

                                         description  \
0                                                NaN   
1  Jake:\nI can't believe my coach assigned me a ...   
2  In a distant galaxy, the atrocity of slavery w... 

In [7]:
works.head(5)

Unnamed: 0,work_id,isbn,isbn13,original_title,author,original_publication_year,num_pages,description,genres,image_url,reviews_count,text_reviews_count,5_star_ratings,4_star_ratings,3_star_ratings,2_star_ratings,1_star_ratings,ratings_count,avg_rating,similar_books
0,2919130,1416534601.0,9781417000000.0,Nocturnes,John Connolly,2004.0,,,"fiction, fantasy, paranormal, mystery, thrille...",https://s.gr-assets.com/assets/nophoto/book/11...,8820,338,1118,1601,1029,190,58,3996,3.9,
1,52087333,,,Draw Play,Tia Lewis,2016.0,,Jake:\nI can't believe my coach assigned me a ...,"romance, fiction",https://s.gr-assets.com/assets/nophoto/book/11...,2482,204,204,353,274,77,29,937,3.7,
2,1649583,1416505520.0,9781417000000.0,Citizen of the Galaxy,Robert A. Heinlein,1957.0,,"In a distant galaxy, the atrocity of slavery w...","fiction, young-adult, fantasy, paranormal, chi...",https://s.gr-assets.com/assets/nophoto/book/11...,16506,447,3539,4351,2863,444,53,11250,4.0,
3,688299,60541830.0,9780061000000.0,Congo,Michael Crichton,1980.0,,"Deep in the African rain forest, near the lege...","fiction, mystery, thriller, crime, fantasy, pa...",https://s.gr-assets.com/assets/nophoto/book/11...,170916,1633,25081,45775,48505,14001,2926,136288,3.6,
4,3464264,451528824.0,9780452000000.0,Anne of Green Gables,L.M. Montgomery,1908.0,,"Everyone's favorite redhead, the spunky Anne S...","fiction, young-adult, children, history, histo...",https://s.gr-assets.com/assets/nophoto/book/11...,743392,14586,272952,161856,81578,19933,9099,545418,4.2,


In [8]:
reviews.head(5)

Unnamed: 0,review_id,user_id,work_id,started_at,read_at,date_added,rating,review_text,n_votes,n_comments
0,fa7a00c01296e3b2b2e857d79c51ea77,3693bb4f1062b659a354848cf11ca313,6128277,,,2013-12-21 00:00:00.000,5.0,Fire is half-human and half monster. Monsters ...,0,0
1,de0f7c8d15e247443e51969becf2878e,3693bb4f1062b659a354848cf11ca313,3270810,,,2013-12-21 00:00:00.000,5.0,Katsa is a graceling - blessed with an ability...,0,0
2,e79b49504ef58b2defcdc8b79e2ec3fb,3693bb4f1062b659a354848cf11ca313,4768235,,,2013-12-19 00:00:00.000,5.0,"This is a fun, light-hearted read. Tammy Jo is...",0,0
3,495c1210a9dbd819cbf7717dfb8b217f,3693bb4f1062b659a354848cf11ca313,6264661,,,2013-12-16 00:00:00.000,5.0,"Cassel, was a character that I was looking for...",0,0
4,54fcd8963c9dd56485d2a4ad152699c2,3693bb4f1062b659a354848cf11ca313,3429537,,,2013-12-16 00:00:00.000,5.0,This is kind of one of those books that I saw ...,0,0


In [9]:
# Clean works dataset
works_df['primary_genre'] = works_df['genres'].apply(lambda x: x.split(',')[0] if isinstance(x, str) else np.nan)

In [10]:
# Standardize the 'original_publication_year' column by removing the decimal point and converting to numeric
works_df['original_publication_year'] = works_df['original_publication_year'].astype(str).str.split('.').str[0]


In [11]:
# Standardize the authors' names 
works_df['author'] = works_df['author'].apply(lambda x: x.split(',')[0] if isinstance(x, str) else np.nan)

In [12]:
# Filtering the relevant columns for analysis 
works_clean = works_df[['work_id', 'original_title', 'author', 
                              'original_publication_year', 
                              'primary_genre', 'num_pages', 
                              'ratings_count', 'avg_rating', 
                              '5_star_ratings','4_star_ratings', 
                              '3_star_ratings', '2_star_ratings', 
                              '1_star_ratings', 'text_reviews_count', 
                              'image_url']]

In [13]:
works_clean.head()

Unnamed: 0,work_id,original_title,author,original_publication_year,primary_genre,num_pages,ratings_count,avg_rating,5_star_ratings,4_star_ratings,3_star_ratings,2_star_ratings,1_star_ratings,text_reviews_count,image_url
0,2919130,Nocturnes,John Connolly,2004,fiction,,3996,3.9,1118,1601,1029,190,58,338,https://s.gr-assets.com/assets/nophoto/book/11...
1,52087333,Draw Play,Tia Lewis,2016,romance,,937,3.7,204,353,274,77,29,204,https://s.gr-assets.com/assets/nophoto/book/11...
2,1649583,Citizen of the Galaxy,Robert A. Heinlein,1957,fiction,,11250,4.0,3539,4351,2863,444,53,447,https://s.gr-assets.com/assets/nophoto/book/11...
3,688299,Congo,Michael Crichton,1980,fiction,,136288,3.6,25081,45775,48505,14001,2926,1633,https://s.gr-assets.com/assets/nophoto/book/11...
4,3464264,Anne of Green Gables,L.M. Montgomery,1908,fiction,,545418,4.2,272952,161856,81578,19933,9099,14586,https://s.gr-assets.com/assets/nophoto/book/11...


In [15]:
# Filtering the relevant columns in the reviews dataset
reviews_clean = reviews_df[['review_id', 'work_id', 'started_at', 
                            'read_at', 'review_text', 'rating']]

In [23]:
# Processing the reviews dataset
reviews_clean = reviews_clean[reviews_clean['review_text'].notna()]

In [26]:
# Sentiment scoring function
def get_sentiment_score(text):
        score = sid.polarity_scores(str(text))
        return score['compound']

# Apply sentiment scoring to the reviews
reviews_clean['sentiment_score'] = reviews_clean['review_text'].apply(get_sentiment_score)

In [27]:
# Map sentinment scores to sentiment categories
def sentiment_label(score):
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'
reviews_clean['sentiment_category'] = reviews_clean['sentiment_score'].apply(sentiment_label)

In [28]:
# Export the cleaned datasets to CSV files
works_clean.to_csv('/Users/HP/Documents/Data Analytics/CodeBasics/Projects Portfolio/Goodreads+Book+Reviews/works_clean.csv', index=False)
reviews_clean.to_csv('/Users/HP/Documents/Data Analytics/CodeBasics/Projects Portfolio/Goodreads+Book+Reviews/reviews_clean.csv', index=False)

print("\nCleaned datasets exported successfully: 'works_clean.csv' and 'reviews_clean.csv'")


Cleaned datasets exported successfully: 'works_clean.csv' and 'reviews_clean.csv'
