In [None]:
import os
import pickle

import pandas as pd
from tqdm import tqdm

# Directory holding the raw input files read by the loading cells.
DIR = '../Data'

# Registers `progress_apply` on pandas objects; the transform cells rely on it.
tqdm.pandas()

In [None]:
# Interactions CSV, loaded in full into `read`.
file_path = os.path.join(DIR, 'goodreads_interactions.csv')
read = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Book id map; merged onto interactions and reviews on 'book_id' further down.
file_path = os.path.join(DIR, 'book_id_map.csv')
book_map = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Author records (gzip-compressed JSON lines); used later for the
# author_id -> name lookup.
file_path = os.path.join(DIR, 'goodreads_book_authors.json.gz')
authors = pd.read_json(path_or_buf=file_path, lines=True, compression='gzip')

In [None]:
# User id map; merged onto interactions and reviews on 'user_id' further down.
file_path = os.path.join(DIR, 'user_id_map.csv')
user_map = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Load a bounded sample of the books dump: at most `num_chunks` chunks of
# `chunk_size` JSON lines each. These two settings are reused by the loader
# cells that follow.
file_path = os.path.join(DIR, 'goodreads_books.json.gz')

chunk_size = 1000
num_chunks = 1000
chunks = pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip')

df_list = []

try:
    # zip() stops as soon as range(num_chunks) is exhausted, so exactly
    # `num_chunks` chunks are read. The former `if i >= num_chunks: break`
    # ran after the append and therefore kept one chunk more than announced.
    for i, chunk in tqdm(zip(range(num_chunks), chunks), total=num_chunks):
        # Keep each chunk whole: copying it row by row through iterrows() was
        # slow and does not preserve column dtypes.
        df_list.append(chunk)
finally:
    # Stopping early would otherwise leave the gzip stream open.
    chunks.close()

# pd.concat raises on an empty list; fall back to an empty frame instead.
books = pd.concat(df_list) if df_list else pd.DataFrame()

In [None]:
# Load a bounded sample of the genres dump, using the `chunk_size` and
# `num_chunks` values set in the books-loading cell.
file_path = os.path.join(DIR, 'goodreads_book_genres_initial.json.gz')

chunks = pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip')

df_list = []

try:
    # zip() stops as soon as range(num_chunks) is exhausted, so exactly
    # `num_chunks` chunks are read. The former `if i >= num_chunks: break`
    # ran after the append and therefore kept one chunk more than announced.
    for i, chunk in tqdm(zip(range(num_chunks), chunks), total=num_chunks):
        # Keep each chunk whole: copying it row by row through iterrows() was
        # slow and does not preserve column dtypes.
        df_list.append(chunk)
finally:
    # Stopping early would otherwise leave the gzip stream open.
    chunks.close()

# pd.concat raises on an empty list; fall back to an empty frame instead.
genres = pd.concat(df_list) if df_list else pd.DataFrame()

In [None]:
# Load a bounded sample of the reviews dump, using the `chunk_size` and
# `num_chunks` values set in the books-loading cell.
file_path = os.path.join(DIR, 'goodreads_reviews_dedup.json.gz')

chunks = pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip')

df_list = []

try:
    # zip() stops as soon as range(num_chunks) is exhausted, so exactly
    # `num_chunks` chunks are read. The former `if i >= num_chunks: break`
    # ran after the append and therefore kept one chunk more than announced.
    for i, chunk in tqdm(zip(range(num_chunks), chunks), total=num_chunks):
        # Keep each chunk whole: copying it row by row through iterrows() was
        # slow and does not preserve column dtypes.
        df_list.append(chunk)
finally:
    # Stopping early would otherwise leave the gzip stream open.
    chunks.close()

# pd.concat raises on an empty list; fall back to an empty frame instead.
reviews = pd.concat(df_list) if df_list else pd.DataFrame()

In [None]:
# Load a bounded sample of the deduplicated interactions dump, using the
# `chunk_size` and `num_chunks` values set in the books-loading cell.
file_path = os.path.join(DIR, 'goodreads_interactions_dedup.json.gz')

chunks = pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip')

df_list = []

try:
    # zip() stops as soon as range(num_chunks) is exhausted, so exactly
    # `num_chunks` chunks are read. The former `if i >= num_chunks: break`
    # ran after the append and therefore kept one chunk more than announced.
    for i, chunk in tqdm(zip(range(num_chunks), chunks), total=num_chunks):
        # Keep each chunk whole: copying it row by row through iterrows() was
        # slow and does not preserve column dtypes.
        df_list.append(chunk)
finally:
    # Stopping early would otherwise leave the gzip stream open.
    chunks.close()

# pd.concat raises on an empty list; fall back to an empty frame instead.
interactions = pd.concat(df_list) if df_list else pd.DataFrame()

In [None]:
def drop_empty_rows(df, column_name):
    """
    Return a copy of `df` without the rows whose `column_name` value is missing.

    Parameters:
    -----------
    df (pandas.DataFrame): Frame to filter; it is not modified.
    column_name (str): Column checked for missing (NaN/None) values.

    Returns:
    --------
    pandas.DataFrame: The rows of `df` that have a value in `column_name`.
    """
    return df.dropna(subset=[column_name])

# Books without a description are discarded.
books = drop_empty_rows(df=books, column_name='description')

Keep only the needed columns

In [None]:
# Columns of the interactions frame that the rest of the notebook uses.
interaction_columns = ['user_id', 'book_id', 'review_id', 'is_read', 'rating']
interactions = interactions[interaction_columns]

In [None]:
# Columns of the reviews frame that the rest of the notebook uses.
review_columns = [
    'user_id',
    'book_id',
    'review_id',
    'rating',
    'review_text',
    'n_votes',
    'n_comments',
]
reviews = reviews[review_columns]

In [None]:
# Order both frames by book id (ascending) before they are joined.
genres = genres.sort_values('book_id')
books = books.sort_values('book_id')

In [None]:
def filter_genres(genre_dict):
    """
    Collect the genre names whose value in `genre_dict` is not None.

    Parameters:
    -----------
    genre_dict (dict): Maps a genre name to its associated value.

    Returns:
    --------
    list: Genre names with a non-None value, in the dictionary's own order.
          Falsy values such as 0 are kept; only None is filtered out.
    """
    kept = []
    for name in genre_dict:
        if genre_dict[name] is None:
            continue
        kept.append(name)
    return kept

# Two passes, each with its own progress bar: pick the genre names, then
# flatten every list into one comma-separated string.
genre_name_lists = genres['genres'].progress_apply(filter_genres)
genres['filtered_genres'] = genre_name_lists.progress_apply(lambda names: ', '.join(names))


In [None]:
# Inner join: only books that also appear in `genres` survive.
genre_columns = genres[['book_id', 'filtered_genres']]
books = books.merge(genre_columns, how='inner', on='book_id')

In [None]:
# `filtered_genres` is built with ', '.join(...) and carried through an inner
# merge, so it is never NaN: a book without genres holds '' instead, and
# dropna() alone removed nothing. Treat missing and blank values alike.
has_genres = books['filtered_genres'].fillna('').str.strip().ne('')
# .copy() so that later column assignments act on an independent frame.
books = books[has_genres].copy()

In [None]:
# Shelf names that expand_popular_shelves leaves out of its output.
exclude_shelves = [
    'to-read',
    'read',
    'currently-reading',
    'default',
    'owned',
    'unread',
    'my-library',
]

In [None]:
def expand_popular_shelves(shelves_list):
    """
    Turn a book's shelf entries into one string in which every kept shelf name
    occurs `count` times.

    Parameters:
    -----------
    shelves_list (list): Dictionaries with a 'name' key (shelf name) and a
        'count' key (anything `int()` accepts).

    Returns:
    --------
    str: The shelf names joined by single spaces, each repeated `count` times.
         Names listed in the module-level `exclude_shelves` are skipped.
    """
    repeated_names = []
    for entry in shelves_list:
        # The count is parsed before the exclusion check, so a malformed count
        # raises even for a shelf that would have been skipped.
        repeats, label = int(entry['count']), entry['name']
        if label in exclude_shelves:
            continue
        repeated_names += [label] * repeats
    return ' '.join(repeated_names)

# One space-separated shelf string per book, stored as a new column.
shelf_strings = books['popular_shelves'].progress_apply(expand_popular_shelves)
books['expanded_shelves'] = shelf_strings

In [None]:
# Drop rows whose is_read equals False; every other value (missing included) is kept.
not_marked_unread = interactions['is_read'].ne(False)
interactions = interactions[not_marked_unread]

In [None]:
# Renumber the rows from 0 and discard the old index labels, which have gaps
# after the is_read filter above.
interactions = interactions.reset_index(drop=True)

Merge with the ID maps to get consistent user and book IDs

In [None]:
# Left joins keep every interaction, including ones with no match in a map.
interactions = (
    interactions
    .merge(user_map, how='left', on='user_id')
    .merge(book_map, how='left', on='book_id')
)

In [None]:
# Replace the original id columns with the *_csv ids brought in by the maps.
interactions = (
    interactions
    .drop(columns=['user_id', 'book_id'])
    .rename(columns={'user_id_csv': 'user_id', 'book_id_csv': 'book_id'})
)

In [None]:
# Left joins keep every review, including ones with no match in a map.
reviews = (
    reviews
    .merge(user_map, how='left', on='user_id')
    .merge(book_map, how='left', on='book_id')
)

In [None]:
# Replace the original id columns with the *_csv ids brought in by the maps.
reviews = (
    reviews
    .drop(columns=['user_id', 'book_id'])
    .rename(columns={'user_id_csv': 'user_id', 'book_id_csv': 'book_id'})
)

In [None]:
# Columns of the books frame that are kept for the saved output.
book_columns = [
    'language_code',
    'description',
    'authors',
    'book_id',
    'title',
    'expanded_shelves',
    'average_rating',
    'title_without_series',
    'filtered_genres',
]
books = books[book_columns]

Map author IDs to author names

In [None]:
# Rebuild every book's author collection as a fresh list with the same elements.
books.loc[:, 'authors'] = books['authors'].progress_apply(lambda entries: list(entries))

In [None]:
# author_id -> name lookup; if an id occurs more than once, the last row wins.
author_name_mapping = dict(zip(authors['author_id'], authors['name']))

In [None]:
def _author_names(author_entries):
    """Translate one book's author entries into names via `author_name_mapping`.

    Each entry is either a dict carrying an 'author_id' key or a bare id; the
    id is converted with int() before the lookup. An id that is absent from
    the mapping raises KeyError.
    """
    names = []
    for entry in author_entries:
        raw_id = entry['author_id'] if isinstance(entry, dict) else entry
        names.append(author_name_mapping[int(raw_id)])
    return names


books['authors'] = books['authors'].progress_apply(_author_names)

In [None]:
# Create the output directory first: without it this write, which comes after
# all the loading and transforming, fails with FileNotFoundError.
os.makedirs('../Pickle', exist_ok=True)
read.to_pickle('../Pickle/read.pkl')

In [None]:
# Persist the cleaned interactions frame.
interactions.to_pickle(path='../Pickle/interactions.pkl')

In [None]:
# Persist the cleaned reviews frame as a single pickled object.
with open('../Pickle/reviews.pkl', mode='wb') as reviews_file:
    pickle.dump(reviews, reviews_file)

In [None]:
# Write `books` to disk as consecutive pickled DataFrame slices in one file.
# Reading it back needs repeated pickle.load() calls on the same file handle
# until EOFError; a single load returns only the first slice.

# Rows per pickled slice. Deliberately a separate name: reusing `chunk_size` /
# `num_chunks` here overwrote the values the loader cells read, so re-running
# a loader after this cell silently loaded a different amount of data.
pickle_chunk_rows = 10000

with open('../Pickle/books.pkl', 'wb') as file, tqdm(total=len(books)) as progress_bar:
    # Stepping by slice size avoids the empty trailing slice the old
    # `len // size + 1` count produced when the length was an exact multiple.
    # max(..., 1) still writes one (empty) frame when `books` has no rows, so
    # the file always contains at least one pickled object.
    for start_idx in range(0, max(len(books), 1), pickle_chunk_rows):
        chunk = books.iloc[start_idx:start_idx + pickle_chunk_rows]
        pickle.dump(chunk, file)
        # Advance by the slice length instead of iterating over every row.
        progress_bar.update(len(chunk))