In [2]:
import json
import pandas as pd

In [3]:
# Load the Goodreads comics / graphic-novel metadata. lines=True makes pandas
# parse the file as one JSON object per line.
goodreads = pd.read_json(
    'data/goodreads_books_comics_graphic.json',
    lines=True,
)

# Last expression of the cell: show which columns are available.
goodreads.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [4]:
# For the interactions data we will need this key to convert a book_id to its
# work_id: find the book_id in one of the lists, then read off that row's work_id.
# One row per work_id, with every book_id of that work collected into a list.
work_id_book_id_dict = goodreads.groupby('work_id')[['book_id']].agg(list)
work_id_book_id_dict.to_csv('work_id_book_id_dict.csv')

In [99]:
# Flatten each book's list of author records into a single space-separated
# string of their author_id values.
goodreads['author_ids'] = goodreads['authors'].apply(
    lambda author_records: ' '.join(record['author_id'] for record in author_records)
)

In [100]:
# Columns of the per-work output table, in the order they should appear.
cols_to_keep = [
    'work_id',
    'title',
    'author_ids',
    'description',
    'publication_year',
    'ratings_count',
    'average_rating',
    'num_pages',
]

In [101]:
import numpy as np

# Collapse the editions to one row per work_id, keeping only the chosen columns.

# Step 1: order the rows by work_id (ascending) and, within each work_id, by
# ratings_count (descending), so the most-rated row of every work comes first.
sorted_df = goodreads.sort_values(
    by=['work_id', 'ratings_count'],
    ascending=[True, False],
)

# Pick the representative row for a group of books that share a work_id.
def get_best_row(group):
    """Return a copy of the group's first row with blank fields back-filled.

    The caller is expected to pass the group already sorted so that its first
    row is the one with the highest ratings_count. For 'description',
    'publication_year' and 'num_pages', a value that is NaN or the empty
    string is replaced by the first non-blank value found in the group (in
    the group's row order). If no row has a value, the field is left as is.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one work_id.

    Returns
    -------
    pandas.Series
        A copy of the first row; the input frame is not modified.
    """
    def _is_blank(value):
        return pd.isna(value) or value == ''

    chosen = group.iloc[0].copy()

    for field in ('description', 'publication_year', 'num_pages'):
        # Nothing to do when the top row already carries a value.
        if not _is_blank(chosen[field]):
            continue

        # Rows of the group that do have a usable value for this field.
        has_value = group[field].notna() & (group[field] != '')
        candidates = group[has_value]
        if len(candidates) == 0:
            continue

        chosen[field] = candidates.iloc[0][field]

    return chosen

# Build the per-work table: descriptive fields come from the representative
# row of each work, rating statistics are re-aggregated over all of its rows.

# Columns taken from the representative row. ratings_count and average_rating
# are left out on purpose: they are recomputed per work below, and carrying
# them on both sides of the merge would yield suffixed duplicates
# (e.g. ratings_count_x / ratings_count_y) and break the cols_to_keep selection.
# Defined here so the cell runs on a fresh kernel.
aggregated_cols = ['ratings_count', 'average_rating']
cols_from_max_rating = [col for col in cols_to_keep if col not in aggregated_cols]

# One representative row per work_id (sorted_df puts the row with the highest
# ratings_count first inside each group, which get_best_row relies on).
best_rows = [get_best_row(group) for _, group in sorted_df.groupby('work_id')]

# Create a dataframe from the best rows
max_ratings_data = pd.DataFrame(best_rows)[cols_from_max_rating]

# Per-work statistics: total number of ratings and the (unweighted) mean of
# the rows' average ratings.
agg_stats = goodreads.groupby('work_id').agg({
    'ratings_count': 'sum',
    'average_rating': 'mean'
}).reset_index()

# Combine the data; both sides hold exactly one row per work_id, and
# validate= makes pandas raise if that ever stops being true.
goodreads_works = max_ratings_data.merge(agg_stats, on='work_id', validate='one_to_one')

# Ensure the columns are in the right order
goodreads_works = goodreads_works[cols_to_keep]

# Display the shape of the resulting dataframe
print(f"Original dataframe: {goodreads.shape}")
print(f"Aggregated dataframe: {goodreads_works.shape}")

Original dataframe: (89411, 30)
Aggregated dataframe: (62944, 8)


In [102]:
# work_ids of the works whose description has at least one character.
has_description = goodreads_works['description'].apply(lambda text: len(text) > 0)
description_criterion = goodreads_works.loc[has_description, 'work_id']

In [103]:
# Filter for (mainly) English books: collect the work_id of every row whose
# language_code is one of the English codes or the empty string.
# TODO: an earlier draft intended to first normalise codes made only of spaces
# to the empty string; that normalisation is not implemented here.
accepted_language_codes = ["", 'eng', 'en-US', 'en-GB', 'en-CA']
is_accepted_language = goodreads['language_code'].isin(accepted_language_codes)
language_criterion = goodreads.loc[is_accepted_language, 'work_id']

In [104]:
# Keep only the works that satisfy both the description and the language criteria.
work_ids = goodreads_works['work_id']
passes_description = work_ids.isin(description_criterion)
passes_language = work_ids.isin(language_criterion)
final = goodreads_works[passes_description & passes_language]

In [105]:
# Persist the cleaned per-work table; index=False leaves the row index out of the file.
final.to_csv(
    'data/goodreads_books_comics_graphic_cleaned.csv',
    index=False,
)