<h2 style="color: #75aadb;">00 | Libraries</h2>

In [None]:
# 📚 Basic libraries
import pandas as pd

# File system libraries
import os
import sys

# 📝 Text Processing
import spacy
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Add the root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Project libraries
import utils.functions as fn

# ⚙️ Settings
import warnings

# Render every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# Silence all warnings for the rest of the session
warnings.filterwarnings('ignore')

<h2 style="color: #75aadb;">01 | Data Extraction</h2>

In [None]:
# Load the raw reviews file; `data` is kept as the untouched original
data = pd.read_csv('../datasets/amazon_books_reviews.csv')
# Work on an independent copy so changes to `df` never alter `data`
df = data.copy()
df.head(10) # Explore data

Unnamed: 0,title,authors,categories,all_summaries,all_texts,avg_score
0,Its Only Art If Its Well Hung!,['Julie Strain'],['Comics & Graphic Novels'],Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,4.0
1,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography'],Really Enjoyed It Essential for every personal...,I don't care much for Dr. Seuss but after read...,4.56
2,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion'],Outstanding Resource for Small Church Pastors ...,"I just finished the book, &quot;Wonderful Wors...",5.0
3,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction'],not good Here is my opinion Buyer beware Fall ...,I bought this book because I read some glowing...,3.72
4,"Nation Dance: Religion, Identity and Cultural ...",['Edward Long'],,interplay of traditions across Caribbean,from publisher:Addresses the interplay of dive...,5.0
5,The Church of Christ: A Biblical Ecclesiology ...,['Everett Ferguson'],['Religion'],Ecclesiological Milestone Early Christian deve...,With the publication of Everett Ferguson's boo...,4.5
6,The Overbury affair (Avon),['Miriam Allen De Ford'],,Overbury,Full of intrigue and a good overview of the co...,3.0
7,A Walk in the Woods: a Play in Two Acts,['Lee Blessing'],,meaty and fast Surprisingly delightful A walk ...,"This play was excellent. It's very smart, inte...",4.67
8,Saint Hyacinth of Poland,['Mary Fabyan Windeatt'],['Biography & Autobiography'],Review from the Publisher A miracle on every p...,Tells the wonderful story of how St. Hyacinth ...,4.5
9,Rising Sons and Daughters: Life Among Japan's ...,['Steven Wardell'],['Social Science'],Wardell's book is a pure delight. A fascinatin...,Steven Wardell's book is a pure delight and I'...,4.33


In [None]:
# Keep only the books whose average rating is at least 2.5.
# .copy() makes df_filtered an independent frame: later cells add columns to
# it, and assigning into a plain boolean-mask slice is the pattern that
# triggers pandas' SettingWithCopyWarning (hidden in this notebook because
# all warnings are ignored).
df_filtered = df[df['avg_score'] >= 2.5].copy()

In [None]:
# Size of the filtered frame as (rows, columns)
df_filtered.shape

(204324, 6)

<h2 style="color: #75aadb;">03 | Text Processing: Sentiment & Genre Extraction</h2>

### Text Preprocessing

In [None]:
# Run spaCy on the GPU. require_gpu() raises an error when no GPU can be used,
# so this cell fails fast instead of silently falling back to the CPU.
spacy.require_gpu()

In [None]:
# Preprocess the review summaries with the project's spaCy helper.
# NOTE(review): the input column is 'all_summaries', although the output is
# named 'all_texts_processed' — confirm that 'all_texts' was not intended.
# The recorded run took roughly 1h06 for 204,324 rows.
df_filtered['all_texts_processed'] = fn.preprocess_text_spacy_trf(df_filtered['all_summaries'])

Processing Texts: 100%|█████████████████████████████████| 204324/204324 [1:06:03<00:00, 51.55text/s]


### Sentiment analysis: VADER

In [None]:
# Register `progress_apply` on pandas objects so the pass below shows a progress bar
tqdm.pandas()

# Score every preprocessed text with the project's VADER helper.
# NOTE(review): the scores are computed on the preprocessed text, not on the
# raw summaries. In the recorded output the summary "not good Here is my
# opinion ..." is preprocessed to "good opinion ..." and scores 0.9877, so
# negations appear to be lost before scoring — consider scoring the raw
# 'all_summaries' text instead (verify what fn.get_sentiment_vader expects).
df_filtered['sentiment_score'] = df_filtered['all_texts_processed'].progress_apply(fn.get_sentiment_vader)

100%|██████████| 204324/204324 [07:44<00:00, 440.09it/s]


In [None]:
# Inspect the frame, now including 'all_texts_processed' and 'sentiment_score'
df_filtered

Unnamed: 0,title,authors,categories,all_summaries,all_texts,avg_score,all_texts_processed,sentiment_score
0,Its Only Art If Its Well Hung!,['Julie Strain'],['Comics & Graphic Novels'],Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,4.00,nice collection julie strain image,0.3818
1,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography'],Really Enjoyed It Essential for every personal...,I don't care much for Dr. Seuss but after read...,4.56,enjoy essential personal public library phlip ...,0.9806
2,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion'],Outstanding Resource for Small Church Pastors ...,"I just finished the book, &quot;Wonderful Wors...",5.00,outstanding resource small church pastor small...,0.9022
3,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction'],not good Here is my opinion Buyer beware Fall ...,I bought this book because I read some glowing...,3.72,good opinion buyer beware fall knee bravo vero...,0.9877
4,"Nation Dance: Religion, Identity and Cultural ...",['Edward Long'],,interplay of traditions across Caribbean,from publisher:Addresses the interplay of dive...,5.00,interplay tradition caribbean,0.0000
...,...,...,...,...,...,...,...,...
212392,The Orphan Of Ellis Island (Time Travel Advent...,['Elvira Woodruff'],['Juvenile Fiction'],LCS Review Loved it! The Orphan Of Ellis Islan...,This book was about a boy named Dominic who wa...,4.69,lcs review love orphan ellis island elvira woo...,0.9914
212393,Red Boots for Christmas,,['Juvenile Fiction'],A wonderful way to share the miracle of Christ...,This beautifully illustrated children's book t...,5.00,wonderful way share miracle christ birth amazi...,0.9260
212394,Mamaw,['Wild Wild Cabbage'],,The most incredible insightful writing about a...,A truly inspiring story of a woman I could onl...,4.67,incredible insightful writing mother love unfo...,0.8834
212395,The Autograph Man,['Zadie Smith'],['Fiction'],very fractured The quest for the holy Grail Lo...,After reading 50 pages and restarting every so...,2.50,fractured quest holy grail look spirituality w...,-0.5423


### Extract Genre-Relevant Keywords

In [None]:
# Derive a genre label for every book from its preprocessed text,
# using the project's keyword-based helper
df_filtered['genre'] = df_filtered['all_texts_processed'].apply(fn.assign_genre_from_keywords)

In [None]:
# Genre distribution after keyword matching: most rows are still 'Unknown'
df_filtered['genre'].value_counts()

Unnamed: 0_level_0,count
genre,Unnamed: 1_level_1
Unknown,127995
Fiction,32596
Academic,18794
Non-Fiction,18163
Children's/Young Adult,4208
Poetry/Drama,2568


In [None]:
# Second pass: rows the keyword matcher left as 'Unknown' get a genre derived
# from their `categories` value instead; every other row keeps its label.
mask = df_filtered['genre'] == 'Unknown'
unknown_categories = df_filtered.loc[mask, 'categories']
df_filtered.loc[mask, 'genre'] = unknown_categories.apply(fn.assign_genre_from_categories)

In [None]:
# Re-check the distribution after the categories fallback: in the recorded run
# it relabelled only 278 rows ('Unknown': 127,995 -> 127,717)
df_filtered['genre'].value_counts()

Unnamed: 0_level_0,count
genre,Unnamed: 1_level_1
Unknown,127717
Fiction,32782
Academic,18857
Non-Fiction,18174
Children's/Young Adult,4219
Poetry/Drama,2575


<h2 style="color: #75aadb;">04 | Feature Engineering</h2>

In [None]:
# Expand `genre` into one 0/1 indicator column per label (named genre_<label>)
df_encoded = pd.get_dummies(df_filtered['genre'], prefix='genre', dtype=int)

# Append the indicator columns side by side, then remove the original
# 'genre' column, which the indicators now replace
df_new = pd.concat([df_filtered, df_encoded], axis=1).drop(columns='genre')

In [None]:
# Inspect the frame with the genre_* indicator columns in place of 'genre'
df_new

Unnamed: 0,title,authors,categories,all_summaries,all_texts,avg_score,all_texts_processed,sentiment_score,genre_Academic,genre_Children's/Young Adult,genre_Fiction,genre_Non-Fiction,genre_Poetry/Drama,genre_Unknown
0,Its Only Art If Its Well Hung!,['Julie Strain'],['Comics & Graphic Novels'],Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,4.00,nice collection julie strain image,0.3818,0,0,0,0,0,1
1,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography'],Really Enjoyed It Essential for every personal...,I don't care much for Dr. Seuss but after read...,4.56,enjoy essential personal public library phlip ...,0.9806,1,0,0,0,0,0
2,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion'],Outstanding Resource for Small Church Pastors ...,"I just finished the book, &quot;Wonderful Wors...",5.00,outstanding resource small church pastor small...,0.9022,0,0,0,0,0,1
3,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction'],not good Here is my opinion Buyer beware Fall ...,I bought this book because I read some glowing...,3.72,good opinion buyer beware fall knee bravo vero...,0.9877,0,0,1,0,0,0
4,"Nation Dance: Religion, Identity and Cultural ...",['Edward Long'],,interplay of traditions across Caribbean,from publisher:Addresses the interplay of dive...,5.00,interplay tradition caribbean,0.0000,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212392,The Orphan Of Ellis Island (Time Travel Advent...,['Elvira Woodruff'],['Juvenile Fiction'],LCS Review Loved it! The Orphan Of Ellis Islan...,This book was about a boy named Dominic who wa...,4.69,lcs review love orphan ellis island elvira woo...,0.9914,0,0,0,1,0,0
212393,Red Boots for Christmas,,['Juvenile Fiction'],A wonderful way to share the miracle of Christ...,This beautifully illustrated children's book t...,5.00,wonderful way share miracle christ birth amazi...,0.9260,0,0,0,0,0,1
212394,Mamaw,['Wild Wild Cabbage'],,The most incredible insightful writing about a...,A truly inspiring story of a woman I could onl...,4.67,incredible insightful writing mother love unfo...,0.8834,0,0,1,0,0,0
212395,The Autograph Man,['Zadie Smith'],['Fiction'],very fractured The quest for the holy Grail Lo...,After reading 50 pages and restarting every so...,2.50,fractured quest holy grail look spirituality w...,-0.5423,0,0,0,0,0,1


In [None]:
# Save the processed dataset (index=False: the row index is not written to the file)
df_new.to_csv('../datasets/processed_dataset.csv', index=False)

In [None]:
# Keep the title plus the numeric score and genre indicator columns;
# `genre_Unknown` is not part of the selection
feature_columns = [
    'title',
    'avg_score',
    'sentiment_score',
    'genre_Fiction',
    'genre_Non-Fiction',
    'genre_Academic',
    "genre_Children's/Young Adult",
    'genre_Poetry/Drama',
]
num_df = df_new[feature_columns]

In [None]:
# Inspect the reduced frame (title, scores and genre indicators)
num_df

Unnamed: 0,title,avg_score,sentiment_score,genre_Fiction,genre_Non-Fiction,genre_Academic,genre_Children's/Young Adult,genre_Poetry/Drama
0,Its Only Art If Its Well Hung!,4.00,0.3818,0,0,0,0,0
1,Dr. Seuss: American Icon,4.56,0.9806,0,0,1,0,0
2,Wonderful Worship in Smaller Churches,5.00,0.9022,0,0,0,0,0
3,Whispers of the Wicked Saints,3.72,0.9877,1,0,0,0,0
4,"Nation Dance: Religion, Identity and Cultural ...",5.00,0.0000,0,0,0,0,0
...,...,...,...,...,...,...,...,...
212392,The Orphan Of Ellis Island (Time Travel Advent...,4.69,0.9914,0,1,0,0,0
212393,Red Boots for Christmas,5.00,0.9260,0,0,0,0,0
212394,Mamaw,4.67,0.8834,1,0,0,0,0
212395,The Autograph Man,2.50,-0.5423,0,0,0,0,0


In [None]:
# Save the numeric dataset (index=False: the row index is not written to the file)
num_df.to_csv('../datasets/final_dataset.csv', index=False)