In [1]:
import pandas as pd
import os
DIR = 'Data'
from tqdm import tqdm
tqdm.pandas()

In [2]:
file_path = os.path.join(DIR, 'goodreads_interactions.csv')
read = pd.read_csv(file_path)

In [3]:
file_path = os.path.join(DIR, 'book_id_map.csv')
book_map = pd.read_csv(file_path)

In [4]:
file_path = os.path.join(DIR, 'goodreads_book_authors.json.gz')
authors = pd.read_json(file_path, compression='gzip', lines=True)

In [5]:
file_path = os.path.join(DIR, 'user_id_map.csv')
user_map = pd.read_csv(file_path)

In [6]:
file_path = os.path.join(DIR, 'goodreads_books.json.gz')

chunk_size = 1000
num_chunks = 1000
chunks = pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip')

df_list = []

for i, chunk in enumerate(chunks):
    if i >= num_chunks:
        break
    df_list.append(chunk)

books = pd.concat(df_list, ignore_index=True)

In [7]:
file_path = os.path.join(DIR, 'goodreads_book_genres_initial.json.gz')

chunk_size = 1000
num_chunks = 1000
chunks = pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip')

df_list = []

for i, chunk in enumerate(chunks):
    if i >= num_chunks:
        break
    df_list.append(chunk)

genres = pd.concat(df_list, ignore_index=True)

In [8]:
file_path = os.path.join(DIR, 'goodreads_reviews_dedup.json.gz')

chunk_size = 1000
num_chunks = 1000
chunks = pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip')

df_list = []

for i, chunk in enumerate(chunks):
    if i >= num_chunks:
        break
    df_list.append(chunk)

reviews = pd.concat(df_list, ignore_index=True)

In [9]:
file_path = os.path.join(DIR, 'goodreads_interactions_dedup.json.gz')

chunk_size = 1000
num_chunks = 1000
chunks = pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip')

df_list = []

for i, chunk in enumerate(chunks):
    if i >= num_chunks:
        break
    df_list.append(chunk)

interactions = pd.concat(df_list, ignore_index=True)

In [10]:
def drop_empty_rows(df, column_name):
   
    df_cleaned = df.dropna(subset=[column_name])
    return df_cleaned

books = drop_empty_rows(books, 'description')

only keep needed rows

In [11]:
interactions = interactions[['user_id', 'book_id', 'review_id', 'is_read', 'rating']]
books = books[['language_code', 'description', 'authors', 'book_id', 'title', 'similar_books', 'image_url', 'url', 'popular_shelves', 'average_rating']]

In [12]:
reviews = reviews[['user_id', 'book_id', 'review_id', 'rating', 'review_text', 'n_votes', 'n_comments']]

combine list of genre names with the book description

In [13]:
genres = genres.sort_values(by='book_id')
books = books.sort_values(by='book_id')

In [14]:
# Function to filter genres
def filter_genres(genre_dict): return [genre for genre, value in genre_dict.items() if value is not None]

# Apply the function to the genres column
genres['filtered_genres'] = genres['genres'].progress_apply(filter_genres)

# Convert the list of filtered genres to a comma-separated string
genres['filtered_genres'] = genres['filtered_genres'].progress_apply(lambda x: ', '.join(x))

100%|██████████| 1000000/1000000 [01:02<00:00, 16049.21it/s]
100%|██████████| 1000000/1000000 [00:08<00:00, 122007.69it/s]


In [15]:
books = pd.merge(books, genres[['book_id', 'filtered_genres']], on='book_id', how='left')

processing popular genres column

In [16]:
exclude_shelves = ['to-read', 'read', 'currently-reading', 'default', 'owned', 'unread', 'my-library']

In [17]:
from tqdm import tqdm
tqdm.pandas()
def expand_popular_shelves(shelves_list):
    expanded_shelves = []
    for shelf in shelves_list:
        count = int(shelf['count'])
        name = shelf['name']
        if name not in exclude_shelves:
            expanded_shelves.extend([name] * count)
    return ' '.join(expanded_shelves)

books['expanded_shelves'] = books['popular_shelves'].progress_apply(expand_popular_shelves)

100%|██████████| 1000000/1000000 [18:23<00:00, 905.94it/s] 


In [18]:
interactions = interactions[interactions['is_read'] != False]

In [19]:
interactions = interactions.reset_index(drop=True)

merge dataframes to get consistent user and book ids

In [20]:
interactions = pd.merge(interactions, user_map, on='user_id', how='left')
interactions = pd.merge(interactions, book_map, on='book_id', how='left')

In [21]:
interactions.drop(columns=['user_id', 'book_id'], inplace=True)
interactions.rename(columns={'user_id_csv': 'user_id', 'book_id_csv': 'book_id'}, inplace=True)

In [22]:
reviews = pd.merge(reviews, user_map, on='user_id', how='left')
reviews = pd.merge(reviews, book_map, on='book_id', how='left')

In [23]:
reviews.drop(columns=['user_id', 'book_id'], inplace=True)
reviews.rename(columns={'user_id_csv': 'user_id', 'book_id_csv': 'book_id'}, inplace=True)

map authors and author ids

In [24]:
books['authors'] = books['authors'].progress_apply(lambda x: [d['author_id'] for d in x])

100%|██████████| 1000000/1000000 [08:08<00:00, 2047.62it/s]


In [25]:
author_name_mapping = authors.set_index('author_id')['name'].to_dict()

In [26]:
books['authors'] = books['authors'].progress_apply(lambda x: [author_name_mapping[int(author_id)] for author_id in x])

100%|██████████| 1000000/1000000 [04:19<00:00, 3859.22it/s]


In [31]:
import pickle

In [28]:
read.to_pickle('Pickle/read.pkl')

In [29]:
interactions.to_pickle('Pickle/interactions.pkl')

In [33]:
books['language_code'].value_counts()

language_code
         449587
eng      299752
en-US     38781
en-GB     24779
spa       23188
          ...  
ast           1
und           1
crh           1
chb           1
kok           1
Name: count, Length: 176, dtype: int64

In [35]:
books = books[books['language_code'].isin(['en-US', 'en-GB', 'eng'])]

In [37]:
books = books[['description', 'authors', 'book_id', 'title', 'url', 'average_rating', 'expanded_shelves', 'filtered_genres']]

In [38]:
with open('Pickle/books.pkl', 'wb') as file: 
    pickle.dump(books, file)

In [39]:
with open('Pickle/reviews.pkl', 'wb') as file: 
    pickle.dump(reviews, file)