## UCSD Scraped Data

Instead of using Scrapy or Beautiful Soup, I found this scraped data of Goodreads that UCSD scraped in 2017. I will use this to create our csv file and gather the data that we need. 

In [3]:
# import the necessary libraries
import pandas as pd
import gzip
import random
from tqdm import tqdm
import ast
from ast import literal_eval
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

In [3]:
# load the book metadata
file_path_to_books = "goodreads_books.json.gz"

# try opening the file
try: 
    with gzip.open(file_path_to_books, 'rt') as f:
        first_line = f.readline()
        print(first_line)
except EOFError:
    print("this file is corrupted or incomplete")

{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin's Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "titl

In [5]:
# this file is huge. we need to process it in chunks to prevent crashes 
# my computer crashed the first time I tried to read the json file

chunk_size =10000

with gzip.open(file_path_to_books, 'rt') as f:
    reader = pd.read_json(f, lines=True, chunksize = chunk_size)

    for i, chunk in enumerate(reader):
        print("Processing chunk {i}...")
        print(chunk.head()) # show the first few rows of each chunk
        break # stop after the first chunk to rest

Processing chunk {i}...
         isbn text_reviews_count    series country_code language_code  \
0  0312853122                  1        []           US                 
1  0743509986                  6        []           US                 
2                              7  [189911]           US           eng   
3  0743294297               3282        []           US           eng   
4  0850308712                  5        []           US                 

                                     popular_shelves        asin is_ebook  \
0  [{'count': '3', 'name': 'to-read'}, {'count': ...                false   
1  [{'count': '2634', 'name': 'to-read'}, {'count...                false   
2  [{'count': '58', 'name': 'to-read'}, {'count':...  B00071IKUY    false   
3  [{'count': '7615', 'name': 'to-read'}, {'count...                false   
4  [{'count': '32', 'name': 'to-read'}, {'count':...                false   

  average_rating kindle_asin  ... publication_month edition_information  \

In [18]:
# create a small representative subset for the books 
# we want to make sure the most popular books are in our dataset

sample_size = 50000
chosen_rows = []

with gzip.open(file_path_to_books, 'rt') as f:
    reader = pd.read_json(f, lines=True, chunksize=10000)

    for i, chunk in enumerate(tqdm(reader, desc="Processing Data", unit="chunk")):
        # ensure we only keep books with enough ratings
        chunk = chunk.dropna(subset=["ratings_count"])

        # convert the columns to numeric
        chunk["ratings_count"] = pd.to_numeric(chunk["ratings_count"], errors = "coerce")
        chunk["average_rating"] = pd.to_numeric(chunk["average_rating"], errors = "coerce")
        
        # create a popularity score 
        chunk["popularity_score"] = (chunk["ratings_count"] * 0.7) + (chunk["average_rating"] * 0.3)

        #sort by popularity and take the top chunk
        top_chunk = chunk.nlargest(int(len(chunk) * 0.1), "popularity_score")
        
        chosen_rows.append(top_chunk)

        if sum(len(df) for df in chosen_rows) >= sample_size:
            break # stop when we have reached our sample size

# we need to combine all of the samples into one dataframe
sample_df = pd.concat(chosen_rows)

# let's save the sample to a csv file
sample_df.to_csv("goodreads_sample_popular.csv", index=False)

#print when we have saved the sample books to a csv file
print(f"Saved {len(sample_df)} books to goodreads_popular_sample.csv")

Processing Data: 49chunk [00:58,  1.19s/chunk]


Saved 50000 books to goodreads_popular_sample.csv


In [5]:
df = pd.read_csv("goodreads_sample_popular.csv")
df

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,popularity_score
0,075640407X,23413,['162522'],US,eng,"[{'count': '296158', 'name': 'to-read'}, {'cou...",,False,4.55,B0010SKUYM,...,,2007.0,https://www.goodreads.com/book/show/186074.The...,https://images.gr-assets.com/books/1472068073m...,186074,416634.0,2502879,The Name of the Wind (The Kingkiller Chronicle...,The Name of the Wind (The Kingkiller Chronicle...,291645.165
1,0312360282,4665,['151539'],US,en-US,"[{'count': '3787', 'name': 'to-read'}, {'count...",,False,3.93,B000UZNQQ0,...,First Edition,,https://www.goodreads.com/book/show/676924.Bet...,https://s.gr-assets.com/assets/nophoto/book/11...,676924,206646.0,662923,"Betrayed (House of Night, #2)","Betrayed (House of Night, #2)",144653.379
2,0375836675,9971,[],US,en-US,"[{'count': '3311', 'name': 'favorites'}, {'cou...",,False,4.09,B001BZRUR4,...,,,https://www.goodreads.com/book/show/19057.I_Am...,https://images.gr-assets.com/books/1398483261m...,19057,94968.0,2737065,I Am the Messenger,I Am the Messenger,66478.827
3,0800759494,2885,[],US,,"[{'count': '9381', 'name': 'to-read'}, {'count...",,False,3.91,B00B853QPM,...,,,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,89375,68157.0,2957021,90 Minutes in Heaven: A True Story of Death an...,90 Minutes in Heaven: A True Story of Death an...,47711.073
4,0385689225,7703,[],US,eng,"[{'count': '72219', 'name': 'to-read'}, {'coun...",,True,4.45,B01DHWACVY,...,,2016.0,https://www.goodreads.com/book/show/29780253-b...,https://images.gr-assets.com/books/1473867911m...,29780253,57318.0,50150838,Born a Crime: Stories From a South African Chi...,Born a Crime: Stories From a South African Chi...,40123.935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,1440583129,71,[],US,,"[{'count': '340', 'name': 'to-read'}, {'count'...",,False,3.13,B00PMIGYZ0,...,,2014.0,https://www.goodreads.com/book/show/22495074-y...,https://images.gr-assets.com/books/1413131332m...,22495074,318.0,41941702,You Could Be Home By Now,You Could Be Home By Now,223.539
49996,0802113869,26,[],US,eng,"[{'count': '347', 'name': 'to-read'}, {'count'...",,False,3.97,,...,,1990.0,https://www.goodreads.com/book/show/1242713.Th...,https://images.gr-assets.com/books/1486893483m...,1242713,317.0,1231405,The Kings and Queens of England and Scotland,The Kings and Queens of England and Scotland,223.091
49997,,11,[],US,,"[{'count': '551', 'name': 'to-read'}, {'count'...",B01CBEE0IK,True,3.92,B01CBEE0IK,...,,,https://www.goodreads.com/book/show/29503281-i...,https://s.gr-assets.com/assets/nophoto/book/11...,29503281,316.0,48845325,In Her Eyes,In Her Eyes,222.376
49998,1406912506,28,[],US,,"[{'count': '36', 'name': 'currently-reading'},...",,False,3.58,B0082STR38,...,,2006.0,https://www.goodreads.com/book/show/115467.Chance,https://images.gr-assets.com/books/1399424398m...,115467,316.0,548598,Chance,Chance,222.274


In [24]:
print(df.columns)

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series', 'popularity_score'],
      dtype='object')


In [28]:
# we need to extract genres from popular shelves
print(df["popular_shelves"].head(10))

# let's define a list of common genres first, so we filter from other random tags

common_genres = [
    "fiction", "non-fiction", "romance", "fantasy", "sci-fi", "thriller", "mystery", "science", 
    "historical", "young-adult", "children", "biography", "poetry", "philosophy",
    "graphic-novels", "adventure", "supernatural", "mythology", "fantasy sci-fi", "american-classics",
    "action", "psychology", "self-help", "religion", "horror", "classics", "historical-fiction", "music", 
    "suspense", "contemporary", "comics", "memoir", "science-fiction", "humour", "dystopian"
]

# we need to parse the popular_shelves column (because it is stored as a list of dictionaries
def extract_genres(shelves):

    # convert the data into a list of dictionaries
    if isinstance(shelves, str): # if shelves is a string
        list_of_shelves = ast.literal_eval(shelves) # convert it to a list
    else:
        list_of_shelves = shelves # if it is already a list, keep it

    # filter the shelves that match the common genres
    matching_shelves = [shelf["name"] for shelf in list_of_shelves if shelf["name"] in common_genres]

    # convert the list back to comma-separated strings
    extracted_string = ",".join(matching_shelves)
    return extracted_string

0    [{'count': '296158', 'name': 'to-read'}, {'cou...
1    [{'count': '3787', 'name': 'to-read'}, {'count...
2    [{'count': '3311', 'name': 'favorites'}, {'cou...
3    [{'count': '9381', 'name': 'to-read'}, {'count...
4    [{'count': '72219', 'name': 'to-read'}, {'coun...
5    [{'count': '39490', 'name': 'to-read'}, {'coun...
6    [{'count': '526189', 'name': 'to-read'}, {'cou...
7    [{'count': '4730', 'name': 'to-read'}, {'count...
8    [{'count': '7615', 'name': 'to-read'}, {'count...
9    [{'count': '34840', 'name': 'to-read'}, {'coun...
Name: popular_shelves, dtype: object


In [39]:
# let's apply to function to extract genres
df["genres"] = df["popular_shelves"].apply(extract_genres)

In [40]:
# save the updated CSV to our dataset
df.to_csv("goodreads_sample_with_genres.csv", index=False)

In [7]:
# check how many books are in the dataset
df_with_genres = pd.read_csv("goodreads_sample_with_genres.csv")
df_with_genres

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,popularity_score,genres
0,075640407X,23413,['162522'],US,eng,"[{'count': '296158', 'name': 'to-read'}, {'cou...",,False,4.55,B0010SKUYM,...,2007.0,https://www.goodreads.com/book/show/186074.The...,https://images.gr-assets.com/books/1472068073m...,186074,416634.0,2502879,The Name of the Wind (The Kingkiller Chronicle...,The Name of the Wind (The Kingkiller Chronicle...,291645.165,"fantasy,fiction,adventure,science-fiction,sci-..."
1,0312360282,4665,['151539'],US,en-US,"[{'count': '3787', 'name': 'to-read'}, {'count...",,False,3.93,B000UZNQQ0,...,,https://www.goodreads.com/book/show/676924.Bet...,https://s.gr-assets.com/assets/nophoto/book/11...,676924,206646.0,662923,"Betrayed (House of Night, #2)","Betrayed (House of Night, #2)",144653.379,"romance,supernatural,fiction,young-adult,fanta..."
2,0375836675,9971,[],US,en-US,"[{'count': '3311', 'name': 'favorites'}, {'cou...",,False,4.09,B001BZRUR4,...,,https://www.goodreads.com/book/show/19057.I_Am...,https://images.gr-assets.com/books/1398483261m...,19057,94968.0,2737065,I Am the Messenger,I Am the Messenger,66478.827,"young-adult,fiction,mystery,contemporary,adven..."
3,0800759494,2885,[],US,,"[{'count': '9381', 'name': 'to-read'}, {'count...",,False,3.91,B00B853QPM,...,,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,89375,68157.0,2957021,90 Minutes in Heaven: A True Story of Death an...,90 Minutes in Heaven: A True Story of Death an...,47711.073,"non-fiction,religion,memoir,biography,fiction,..."
4,0385689225,7703,[],US,eng,"[{'count': '72219', 'name': 'to-read'}, {'coun...",,True,4.45,B01DHWACVY,...,2016.0,https://www.goodreads.com/book/show/29780253-b...,https://images.gr-assets.com/books/1473867911m...,29780253,57318.0,50150838,Born a Crime: Stories From a South African Chi...,Born a Crime: Stories From a South African Chi...,40123.935,"non-fiction,memoir,biography,humour,contempora..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,1440583129,71,[],US,,"[{'count': '340', 'name': 'to-read'}, {'count'...",,False,3.13,B00PMIGYZ0,...,2014.0,https://www.goodreads.com/book/show/22495074-y...,https://images.gr-assets.com/books/1413131332m...,22495074,318.0,41941702,You Could Be Home By Now,You Could Be Home By Now,223.539,"fiction,classics"
49996,0802113869,26,[],US,eng,"[{'count': '347', 'name': 'to-read'}, {'count'...",,False,3.97,,...,1990.0,https://www.goodreads.com/book/show/1242713.Th...,https://images.gr-assets.com/books/1486893483m...,1242713,317.0,1231405,The Kings and Queens of England and Scotland,The Kings and Queens of England and Scotland,223.091,"non-fiction,biography,historical,historical-fi..."
49997,,11,[],US,,"[{'count': '551', 'name': 'to-read'}, {'count'...",B01CBEE0IK,True,3.92,B01CBEE0IK,...,,https://www.goodreads.com/book/show/29503281-i...,https://s.gr-assets.com/assets/nophoto/book/11...,29503281,316.0,48845325,In Her Eyes,In Her Eyes,222.376,"romance,contemporary,suspense"
49998,1406912506,28,[],US,,"[{'count': '36', 'name': 'currently-reading'},...",,False,3.58,B0082STR38,...,2006.0,https://www.goodreads.com/book/show/115467.Chance,https://images.gr-assets.com/books/1399424398m...,115467,316.0,548598,Chance,Chance,222.274,"fiction,classics,historical-fiction"


In [55]:
chunk_size = 1000
chunks = []
mlb = MultiLabelBinarizer()

# Step 1: Read a small sample chunk to fit MultiLabelBinarizer
sample_chunk = next(pd.read_csv("goodreads_sample_with_genres.csv", chunksize=chunk_size))

# Ensure all values in 'genres' column are lists
sample_chunk["genres"] = sample_chunk["genres"].apply(lambda x: x.split(",") if isinstance(x, str) else [])

# Fit MLB on the sample
mlb.fit(sample_chunk["genres"])

# Step 2: Process the dataset in chunks
for chunk in pd.read_csv("goodreads_sample_with_genres.csv", chunksize=chunk_size):
    # Ensure 'genres' column is a list
    chunk["genres"] = chunk["genres"].apply(lambda x: x.split(",") if isinstance(x, str) else [])

    # One-hot encode genres using the pre-fitted MLB
    encoded_genres = pd.DataFrame(mlb.transform(chunk["genres"]), columns=mlb.classes_)

    # Concatenate the chunk with the encoded genres
    chunk = pd.concat([chunk, encoded_genres], axis=1)

    chunks.append(chunk)

# Combine all processed chunks into a final dataset
df_final = pd.concat(chunks, ignore_index=True)

# Save the final dataset
df_final.to_csv("final_book_dataset.csv", index=False)

print("One-hot encoding completed and dataset saved successfully!")


One-hot encoding completed and dataset saved successfully!


In [59]:
# let's take a look at our dataset
# print(df_final.columns)
final_books_dataset = pd.read_csv("final_books_dataset.csv")
# print(final_books_dataset.columns)
final_books_dataset

  final_books_dataset = pd.read_csv("final_books_dataset.csv")


Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,religion,romance,sci-fi,science,science-fiction,self-help,supernatural,suspense,thriller,young-adult
0,075640407X,23413.0,['162522'],US,eng,"[{'count': '296158', 'name': 'to-read'}, {'cou...",,False,4.55,B0010SKUYM,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0312360282,4665.0,['151539'],US,en-US,"[{'count': '3787', 'name': 'to-read'}, {'count...",,False,3.93,B000UZNQQ0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0375836675,9971.0,[],US,en-US,"[{'count': '3311', 'name': 'favorites'}, {'cou...",,False,4.09,B001BZRUR4,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0800759494,2885.0,[],US,,"[{'count': '9381', 'name': 'to-read'}, {'count...",,False,3.91,B00B853QPM,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0385689225,7703.0,[],US,eng,"[{'count': '72219', 'name': 'to-read'}, {'coun...",,True,4.45,B01DHWACVY,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98995,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98996,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98997,,,,,,,,,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
98998,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [1]:
import psutil
print(f"Memory Usage: {psutil.virtual_memory().percent}%")

Memory Usage: 74.5%


## Extract reviews from the reviews dataset

In [None]:
# we will do this in another notebook