In [1]:
import pandas as pd

Load dataframes

In [13]:
# Load the `read` frame from the Pickle/ directory.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
read = pd.read_pickle('Pickle/read.pkl')

In [14]:
# Load the `interactions` frame from the Pickle/ directory.
interactions = pd.read_pickle('Pickle/interactions.pkl')

In [15]:
# Load the `genres` frame from the Pickle/ directory.
genres = pd.read_pickle('Pickle/genres.pkl')

In [16]:
# Load the `books` frame from the Pickle/ directory.
books = pd.read_pickle('Pickle/books.pkl')

In [17]:
# Number of rows currently in `books`.
len(books)

400000

In [2]:
# Load the `reviews` frame from the Pickle/ directory.
reviews = pd.read_pickle('Pickle/reviews.pkl')

In [4]:
# ID lookup tables: merged further down on 'user_id' / 'book_id' to obtain
# the 'user_id_csv' / 'book_id_csv' values.
user_map = pd.read_pickle('Pickle/user_map.pkl')
book_map = pd.read_pickle('Pickle/book_map.pkl')
# Author table; its 'author_id' and 'name' columns are used for the
# id -> name mapping later in the notebook.
authors = pd.read_pickle('Pickle/authors.pkl')

In [8]:
def drop_empty_rows(df, column_name):
    """Return a new frame without the rows whose `column_name` value is missing.

    Only missing values (NaN/None) count as empty; `df` itself is not modified.
    """
    return df.dropna(subset=[column_name])

# Drop books whose 'description' is missing (NaN/None).
# NOTE(review): dropna does not treat empty strings as missing, so books with
# a '' description are kept; confirm that is acceptable.
books = drop_empty_rows(books, 'description')

In [9]:
# Show the column labels available in `reviews`.
reviews.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')

In [10]:
# Show the column labels available in `interactions`.
interactions.columns

Index(['user_id', 'book_id', 'review_id', 'is_read', 'rating',
       'review_text_incomplete', 'date_added', 'date_updated', 'read_at',
       'started_at'],
      dtype='object')

Only keep the needed columns

In [None]:
# Narrow each frame to the columns used later in the notebook.
interactions = interactions[
    ['user_id', 'book_id', 'review_id', 'is_read', 'rating']
]
books = books[
    [
        'language_code',
        'description',
        'authors',
        'book_id',
        'title',
        'similar_books',
        'image_url',
        'url',
        'popular_shelves',
        'average_rating',
    ]
]


In [3]:
# Keep the identifier, rating, text and vote/comment count columns of `reviews`.
reviews = reviews[
    [
        'user_id',
        'book_id',
        'review_id',
        'rating',
        'review_text',
        'n_votes',
        'n_comments',
    ]
]

Build a comma-separated list of genre names for each book and merge it into the books dataframe, alongside the book description

In [18]:
# Order both frames by book_id (ascending, the sort_values default).
genres = genres.sort_values('book_id')
books = books.sort_values('book_id')

In [19]:
# Keep only the genres that actually carry a value
def filter_genres(genre_dict):
    """Return the keys of `genre_dict` whose value is not None, in dict order."""
    kept = []
    for name, val in genre_dict.items():
        if val is None:
            continue
        kept.append(name)
    return kept

# For every row, keep the genre names that have a value and store them
# as a single comma-separated string in 'filtered_genres'.
genres['filtered_genres'] = genres['genres'].apply(
    lambda genre_dict: ', '.join(filter_genres(genre_dict))
)

In [20]:
# Left join: every book is kept; books with no row in `genres` get a
# missing 'filtered_genres' value.
books = books.merge(
    genres[['book_id', 'filtered_genres']],
    how='left',
    on='book_id',
)

Process the popular_shelves column

In [21]:
# Shelf names that expand_popular_shelves leaves out of its output.
exclude_shelves = ['to-read', 'read', 'currently-reading', 'default', 'owned', 'unread', 'my-library']

In [22]:
def expand_popular_shelves(shelves_list, exclude=None):
    """Flatten a list of shelf entries into one space-separated string.

    Each kept shelf name appears `count` times in the result, in the order
    the shelves are listed.

    Parameters
    ----------
    shelves_list : iterable of dict
        Entries with a 'name' key and a 'count' key; 'count' must be
        convertible with int().
    exclude : collection of str, optional
        Shelf names to leave out. When omitted, the notebook-level
        `exclude_shelves` list is used, which matches the previous behaviour.

    Returns
    -------
    str
        The repeated shelf names joined by single spaces ('' if nothing is kept).
    """
    # Resolve the default at call time so later edits to the notebook-level
    # list are honoured; a set makes each membership test O(1).
    skipped = frozenset(exclude_shelves if exclude is None else exclude)

    expanded_shelves = []
    for shelf in shelves_list:
        # Convert the count first so a malformed count is reported even for
        # a shelf that would be skipped.
        count = int(shelf['count'])
        name = shelf['name']
        if name not in skipped:
            expanded_shelves.extend([name] * count)
    return ' '.join(expanded_shelves)

# Store the flattened shelf string for each book in a new 'expanded_shelves' column.
books['expanded_shelves'] = books['popular_shelves'].apply(expand_popular_shelves)

In [23]:
# Discard rows whose is_read flag equals False; rows with a missing flag are kept.
interactions = interactions[interactions['is_read'].ne(False)]

In [24]:
# Renumber the rows from 0 and discard the old index values.
interactions = interactions.reset_index(drop=True)

merge dataframes to get consistent user and book ids

In [25]:
# Attach the lookup columns from user_map and book_map. Left joins keep every
# interaction; ids absent from a map end up with missing values.
interactions = (
    interactions
    .merge(user_map, on='user_id', how='left')
    .merge(book_map, on='book_id', how='left')
)

In [26]:
# Replace the original id columns with the mapped ones: drop the old
# 'user_id' / 'book_id', then give the '*_csv' columns those names.
interactions = (
    interactions
    .drop(columns=['user_id', 'book_id'])
    .rename(columns={'user_id_csv': 'user_id', 'book_id_csv': 'book_id'})
)

In [5]:
# Attach the lookup columns from user_map and book_map. Left joins keep every
# review; ids absent from a map end up with missing values.
reviews = (
    reviews
    .merge(user_map, on='user_id', how='left')
    .merge(book_map, on='book_id', how='left')
)

In [6]:
# Replace the original id columns with the mapped ones: drop the old
# 'user_id' / 'book_id', then give the '*_csv' columns those names.
reviews = (
    reviews
    .drop(columns=['user_id', 'book_id'])
    .rename(columns={'user_id_csv': 'user_id', 'book_id_csv': 'book_id'})
)

Map author IDs to author names

In [29]:
# Reduce each book's author entries to just their 'author_id' values.
# NOTE(review): running this cell a second time would index the extracted ids
# instead of the original entries; it is meant to run once per load.
books['authors'] = books['authors'].apply(
    lambda entries: [entry['author_id'] for entry in entries]
)

In [30]:
# Dictionary from author_id to name, built from the `authors` frame.
author_name_mapping = authors.set_index('author_id')['name'].to_dict()

In [31]:
# Swap every author id for the matching name from author_name_mapping.
# Ids are cast to int before the lookup; an id missing from the mapping
# raises KeyError.
books['authors'] = books['authors'].apply(
    lambda ids: [author_name_mapping[int(one_id)] for one_id in ids]
)

In [32]:
# Keep only the rows whose is_read value differs from 0.
read = read[read['is_read'].ne(0)]

Remove non-English books

In [33]:
# Keep only books tagged exactly 'eng'; rows with any other or a missing
# language_code are removed.
books = books[books['language_code'].eq('eng')]

In [34]:
# book_id values of the books still present in `books`.
eng_book_ids = books['book_id']

In [35]:
# Restrict `read` to rows whose book_id appears in eng_book_ids.
# NOTE(review): assumes both book_id columns share the same id scheme and
# dtype; TODO confirm.
read = read.loc[read['book_id'].isin(eng_book_ids)]

save dataframes

In [36]:
# Persist the processed frames.
# NOTE(review): these are the same paths the frames were loaded from at the top
# of the notebook, so the input pickles are overwritten with processed data and
# a later re-run starts from already-processed frames. Consider writing to
# separate output files.
read.to_pickle('Pickle/read.pkl')
genres.to_pickle('Pickle/genres.pkl')
books.to_pickle('Pickle/books.pkl')

In [None]:
# Persist the processed interactions.
# NOTE(review): this overwrites the file loaded at the top of the notebook; the
# saved 'user_id' / 'book_id' columns now hold the mapped ('*_csv') values, so a
# re-run would apply the id mapping to already-mapped ids. Confirm this is intended.
interactions.to_pickle('Pickle/interactions.pkl')

In [7]:
# Persist the processed reviews.
# NOTE(review): this overwrites the file loaded at the top of the notebook; the
# saved 'user_id' / 'book_id' columns now hold the mapped ('*_csv') values, so a
# re-run would apply the id mapping to already-mapped ids. Confirm this is intended.
reviews.to_pickle('Pickle/reviews.pkl')