In [1]:
# my imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the review-level dataset: one row per review, carrying both the
# given and the predicted sentiment label
data = pd.read_csv(filepath_or_buffer='data/reviews_with_predicted_v_given_sentiment.csv')



In [5]:
data.head()

Unnamed: 0,#,book_name,reviewer_anonymous,reviewer_rating,review_description,date,sentiment_given,sentiment_predicted
0,0,The Woman in Me,6ce3606d5be9785bcd9b10b844b98cf3b337b7a97a7f9c...,4,I'm only a third way in. Shipped lightening fa...,26-10-2023,positive,positive
1,1,The Woman in Me,243aa726ab7df2a7630a3a36c7d3a12f14e9d80cd3ab83...,5,"""There have been so many times when I was scar...",06-11-2023,positive,positive
2,2,The Woman in Me,eaea2ab37288945d63173beddf5680a39c37672c4386d6...,5,The media could not be loaded. I personally ha...,01-11-2023,positive,positive
3,3,The Woman in Me,9e554d1ebb53e03ec42b99ae5842c8a7309af90010bc51...,5,I have been a fan of Britney's music since the...,25-10-2023,positive,positive
4,4,The Woman in Me,4795e19c3660f232dd519252ac99d99fd53e23c7cf9a5e...,5,"Whether or not you’re a fan, it’s a great read...",01-11-2023,positive,positive


In [6]:
# Step 1: keep just the rows whose given sentiment label is 'positive'
is_positive = data['sentiment_given'].eq('positive')
positive_reviews = data.loc[is_positive]

In [8]:
# Step 2: number of positive reviews for each book (one output row per title)
positive_count = (
    positive_reviews
    .groupby('book_name')
    .size()
    .rename('positive_count')
    .reset_index()
)

In [9]:
# Step 3: number of reviews of any sentiment for each book
total_count = (
    data
    .groupby('book_name')
    .size()
    .rename('total_count')
    .reset_index()
)

In [11]:
# Step 4: Combine the per-book positive counts with the per-book totals.
# how='right' keeps every book present in total_count: a book that has reviews
# but no positive ones is absent from positive_count, and an inner merge would
# silently drop it (biasing the summary upward). Such books get a count of 0.
# Column order (book_name, positive_count, total_count) and row order are the
# same as with the inner merge whenever no book is missing.
merged_df = (
    pd.merge(positive_count, total_count, on='book_name', how='right')
    .fillna({'positive_count': 0})
    # fillna leaves the column as float when gaps were filled; restore the integer dtype
    .astype({'positive_count': 'int64'})
)

In [14]:
# Step 5: share of each book's reviews that are positive, expressed in percent
merged_df['positive_percentage'] = (
    merged_df['positive_count']
    .div(merged_df['total_count'])
    .mul(100)
)

# Summary statistics, transposed so that each numeric column becomes one row
merged_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
positive_count,91.0,9.945055,1.177216,7.0,10.0,10.0,10.0,20.0
total_count,91.0,10.076923,1.097783,7.0,10.0,10.0,10.0,20.0
positive_percentage,91.0,98.681319,4.002441,80.0,100.0,100.0,100.0,100.0


In [21]:
# Distinct book titles present in the review summary
merged_df['book_name'].unique()

array(['A Court of Mist and Fury (A Court of Thorns and Roses, 2)',
       'A Court of Thorns and Roses (A Court of Thorns and Roses, 1)',
       'A Court of Thorns and Roses Paperback Box Set (5 books)',
       'A Court of Wings and Ruin (A Court of Thorns and Roses, 3)',
       'A Little Life', 'All the Light We Cannot See: A Novel',
       'Atomic Habits: An Easy & Proven Way to Build Good Habits & Break Bad Ones',
       "Balloons over Broadway: The True Story of the Puppeteer of Macy's Parade (Bank Street College of Education Flora Stieglitz Straus Award (Awards))",
       'Brown Bear, Brown Bear, What Do You See?',
       'Chicka Chicka Boom Boom (Board Book)',
       'Demon Copperhead: A Pulitzer Prize Winner',
       'Dirty Thirty (30) (Stephanie Plum)',
       "Don't Let the Pigeon Drive the Sleigh!", 'Elon Musk',
       'First Little Readers Parent Pack: Guided Reading Level A: 25 Irresistible Books That Are Just the Right Level for Beginning Readers',
       'Five Silly Turk

In [22]:
# Load the bestseller sales dataset used to decide which reviewed books count as successful
sales_df = pd.read_csv(filepath_or_buffer='data/cleaned_bestsellers_approximated_sales.csv')

In [29]:
# Distinct book titles present in the sales data
sales_df['book_title'].unique()

array(['A Tale of Two Cities', 'The Little Prince (Le Petit Prince)',
       'The Alchemist (O Alquimista)',
       "Harry Potter and the Philosopher's Stone",
       'And Then There Were None', 'Dream of the Red Chamber (紅樓夢)',
       'The Hobbit', "Alice's Adventures in Wonderland",
       'The Lion, the Witch and the Wardrobe',
       'She: A History of Adventure', 'The Da Vinci Code',
       'Harry Potter and the Chamber of Secrets',
       'The Catcher in the Rye',
       'Harry Potter and the Prisoner of Azkaban',
       'Harry Potter and the Goblet of Fire',
       'Harry Potter and the Order of Phoenix',
       'Harry Potter and the Half-Blood Prince',
       'Harry Potter and the Deathly Hallows',
       'The Bridges of Madison County',
       'One Hundred Years of Solitude (Cien años de soledad)', 'Lolita',
       'Heidi', 'The Common Sense Book of Baby and Child Care',
       'Anne of Green Gables', 'Black Beauty',
       'The Name of the Rose (Il Nome della Rosa)',
       '

In [28]:
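# TODO (addressed in the next two cells): normalize the titles of both datasets so they can be matched — both title columns carry parenthetical extras (series info, original-language titles)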

In [33]:
# Matching key for the sales data: the text before the first '(',
# with surrounding whitespace removed, in lower case
sales_df['normalized_title'] = (
    sales_df['book_title']
    .str.split('(')
    .str.get(0)
    .str.strip()
    .str.lower()
)

In [35]:
# Matching key for the review summary: the text before the first '(',
# with surrounding whitespace removed, in lower case
merged_df['normalized_title'] = (
    merged_df['book_name']
    .str.split('(')
    .str.get(0)
    .str.strip()
    .str.lower()
)

In [54]:
# A book is flagged successful when its normalized title also occurs in the sales data
merged_df['is_successful'] = (
    merged_df['normalized_title']
    .isin(sales_df['normalized_title'])
)

In [56]:
# Check which flag values actually occur
merged_df['is_successful'].unique()

array([False,  True])

In [57]:
# Rows of the review summary whose title was found in the sales data
merged_df.loc[merged_df['is_successful']]

Unnamed: 0,book_name,positive_count,total_count,positive_percentage,normalized_title,is_successful
24,Harry Potter and the Prisoner of Azkaban (Harr...,10,10,100.0,harry potter and the prisoner of azkaban,True
78,The Very Hungry Caterpillar,10,10,100.0,the very hungry caterpillar,True


In [58]:
# Per-book share of reviews whose *predicted* sentiment is positive.
# The filter must use 'sentiment_predicted': filtering on 'sentiment_given'
# here would only reproduce merged_df under a different name.

# Step 1: Filter to include only reviews predicted to be positive
positive_reviews_predicted = data[data['sentiment_predicted'] == 'positive']

# Step 2: Group by title to get the count of predicted-positive reviews per book
# NOTE: this rebinds positive_count, which earlier held the given-sentiment counts
positive_count = positive_reviews_predicted.groupby('book_name').size().reset_index(name='positive_count')

# Step 3: Group the entire dataset by title to get the total count of reviews per book
total_count = data.groupby('book_name').size().reset_index(name='total_count')

# Step 4: Merge the positive count and total count DataFrames.
# how='right' keeps every book in total_count; a book with no predicted-positive
# review is missing from positive_count and gets a count of 0 instead of being
# dropped, as an inner merge would do.
merged_df_predicted = (
    pd.merge(positive_count, total_count, on='book_name', how='right')
    .fillna({'positive_count': 0})
    # fillna leaves the column as float when gaps were filled; restore the integer dtype
    .astype({'positive_count': 'int64'})
)

# Step 5: Calculate the percentage of predicted-positive reviews
merged_df_predicted['positive_percentage'] = (merged_df_predicted['positive_count'] / merged_df_predicted['total_count']) * 100



In [59]:
# Show the per-book summary held in merged_df_predicted
# (book_name, positive_count, total_count, positive_percentage)
merged_df_predicted

Unnamed: 0,book_name,positive_count,total_count,positive_percentage
0,A Court of Mist and Fury (A Court of Thorns an...,10,10,100.0
1,A Court of Thorns and Roses (A Court of Thorns...,8,10,80.0
2,A Court of Thorns and Roses Paperback Box Set ...,10,10,100.0
3,A Court of Wings and Ruin (A Court of Thorns a...,9,10,90.0
4,A Little Life,10,10,100.0
...,...,...,...,...
86,Turkey Trouble,10,10,100.0
87,Verity,10,10,100.0
88,Where's Bluey?: A Search-and-Find Book,10,10,100.0
89,Where's Spot?,10,10,100.0


In [60]:
# Matching key for the predicted-sentiment summary: the text before the first '(',
# with surrounding whitespace removed, in lower case
merged_df_predicted['normalized_title'] = (
    merged_df_predicted['book_name']
    .str.split('(')
    .str.get(0)
    .str.strip()
    .str.lower()
)

In [61]:
# A book is flagged successful when its normalized title also occurs in the sales data
merged_df_predicted['is_successful'] = (
    merged_df_predicted['normalized_title']
    .isin(sales_df['normalized_title'])
)

In [62]:
# Rows of the predicted-sentiment summary whose title was found in the sales data.
# The mask comes from merged_df_predicted itself; borrowing merged_df.is_successful
# only works while both frames happen to share the same index and row order.
merged_df_predicted[merged_df_predicted['is_successful']]

Unnamed: 0,book_name,positive_count,total_count,positive_percentage,normalized_title,is_successful
24,Harry Potter and the Prisoner of Azkaban (Harr...,10,10,100.0,harry potter and the prisoner of azkaban,True
78,The Very Hungry Caterpillar,10,10,100.0,the very hungry caterpillar,True


In [66]:
# Inspect the individual reviews of the two books that matched the sales data
is_azkaban = data['book_name'].str.contains('Harry Potter and the Prisoner of Azkaban')
is_caterpillar = data['book_name'].str.contains('The Very Hungry Caterpillar')
data[is_azkaban | is_caterpillar]

Unnamed: 0,#,book_name,reviewer_anonymous,reviewer_rating,review_description,date,sentiment_given,sentiment_predicted
290,290,The Very Hungry Caterpillar,cf87a8289eeb0944ddd75a6457337e18b6925ad3f3f9e1...,5,The media could not be loaded. Where to even b...,04-09-2023,positive,positive
291,291,The Very Hungry Caterpillar,f537e53d930ce002e378cd0e503006effd1a3dadbe653a...,5,Every child’s library whether public or in the...,29-10-2023,positive,positive
292,292,The Very Hungry Caterpillar,7fb95f790d96769e5bf94be567ac03f174214d54f459a8...,5,"Cute story, colorful illustrations, and perfec...",06-11-2023,positive,positive
293,293,The Very Hungry Caterpillar,f5ea3614e396c1bf0c6ebd301a2d7b2f4b23dd26233c1e...,5,Books are my favorite gift to first-time paren...,16-10-2023,positive,positive
294,294,The Very Hungry Caterpillar,62de695c93b80996d0d5c8053c51e6220b60f5d48a3525...,5,"This story creates belly laughs, such fun and ...",25-10-2023,positive,positive
295,295,The Very Hungry Caterpillar,4d08697ada75dc79d8f4a9193efb19d01d3609928411d5...,5,"This book has been around for decades, got it ...",23-10-2023,positive,positive
296,296,The Very Hungry Caterpillar,d5ef996639091278315e21f37b279ee9f2556b49f2c2f7...,4,My little one doesn’t enjoy this book as much ...,03-11-2023,positive,positive
297,297,The Very Hungry Caterpillar,b211d529848e97e39921a1cfa942738bb5388763a19348...,5,Loved this book as a kid and It means a lot to...,18-10-2023,positive,positive
298,298,The Very Hungry Caterpillar,0b8d6362ab499eee931c70ea9bc1e48266688444ed5c03...,5,CLASSIC WONDERFULLY BOOK.,29-10-2023,positive,positive
299,299,The Very Hungry Caterpillar,1626883801c4fe49d1a7073924d0f63542e09fe5bb99ea...,5,This is a great book for children. The colors ...,18-10-2023,positive,positive


In [67]:
# Persist the per-book summary with its success flag.
# index=False: the frame's index is just a positional counter; writing it would
# add an unnamed first column that comes back as 'Unnamed: 0' when the file is
# read again.
merged_df_predicted.to_csv('data/sales_success_data.csv', index=False)