In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
books = pd.read_pickle('Pickle/books.pkl')
read = pd.read_pickle('Pickle/read.pkl')

In [3]:
books['filtered_genres'] = books['filtered_genres'].apply(lambda x: x.split(', '))
book_genre_mapping = dict(zip(books['book_id'], books['filtered_genres']))
chunk_size = 50000
results = []
for start in tqdm(range(0, len(read), chunk_size), desc="Processing Chunks"):
    end = min(start + chunk_size, len(read))
    chunk = read.iloc[start:end].copy()
    chunk['filtered_genres'] = chunk['book_id'].map(book_genre_mapping)
    results.append(chunk)

read_with_genres = pd.concat(results, ignore_index=True)

Processing Chunks: 100%|██████████| 4573/4573 [11:08<00:00,  6.84it/s]


In [None]:
# Define the number of top genres
n = 4

def most_common_genres(books_read, n):
    genres = []
    for genres_list in books_read:
        if isinstance(genres_list, list):  
            genres.extend(genres_list)
    genre_counts = pd.Series(genres).value_counts().index.tolist()[:n]
    return genre_counts

In [None]:
# Function to process a batch of user genres
def process_batch(df_batch, n):
    return df_batch.groupby('user_id')['filtered_genres'].apply(lambda x: most_common_genres(x, n)).reset_index()

from collections import defaultdict
batch_size = 5000

user_genres_dict = defaultdict(list)

In [None]:
# Process data in batches
for start in tqdm(range(0, len(read_with_genres), batch_size), desc="Processing in Batches"):
    end = min(start + batch_size, len(read_with_genres))
    df_batch = read_with_genres.iloc[start:end].copy()  
    df_batch.loc[:, 'filtered_genres'] = df_batch['filtered_genres'].apply(lambda x: x if isinstance(x, list) else [])
    batch_result = process_batch(df_batch, n)
    for _, row in batch_result.iterrows():
        user_genres_dict[row['user_id']].extend(row['filtered_genres'])

Processing in Batches: 100%|██████████| 45730/45730 [07:33<00:00, 100.93it/s]


In [None]:
for user_id in user_genres_dict:
    user_genres_dict[user_id] = pd.Series(user_genres_dict[user_id]).value_counts().index.tolist()[:n]

In [None]:
user_most_common_genres = pd.DataFrame(list(user_genres_dict.items()), columns=['user_id', 'most_common_genres'])
user_most_common_genres.set_index('user_id', inplace=True)

In [9]:
user_most_common_genres.to_pickle('Pickle/user_most_common_genres.pkl')