In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from collections import defaultdict

In [2]:
books = pd.read_pickle('Pickle/books.pkl')
read = pd.read_pickle('Pickle/read.pkl')

In [None]:
books['filtered_genres'] = books['filtered_genres'].apply(lambda x: x.split(', '))

# Create a mapping from book_id to filtered genres
book_genre_mapping = dict(zip(books['book_id'], books['filtered_genres']))

chunk_size = 50000  # Define the chunk size for processing the data in parts
results = []

# Process the data in chunks to avoid memory issues with large datasets
for start in tqdm(range(0, len(read), chunk_size), desc="Processing Chunks"):
    end = min(start + chunk_size, len(read))  # Determine the end of the current chunk
    chunk = read.iloc[start:end].copy()  # Select the current chunk of data
    # Map the book_id to its corresponding filtered genres from the book_genre_mapping
    chunk['filtered_genres'] = chunk['book_id'].map(book_genre_mapping)
    results.append(chunk)  # Add the chunk with the genres to the results list

# Concatenate the results into a single DataFrame
read_with_genres = pd.concat(results, ignore_index=True)


Processing Chunks: 100%|██████████| 4573/4573 [11:08<00:00,  6.84it/s]


In [None]:
n = 4

def most_common_genres(books_read, n):
    """
    Returns the most common genres from a list of books read.

    Args:
        books_read (list of lists): A list of lists where each inner list contains genres of a book that the user has read.
        n (int): The number of top genres to return.

    Returns:
        list: A list of the top 'n' most common genres from the user's books read.
    """
    genres = []
    # Iterate through each list of genres from books read
    for genres_list in books_read:
        if isinstance(genres_list, list):  # Ensure the genres list is a valid list
            genres.extend(genres_list)  # Add genres to the genres list
    # Get the 'n' most common genres
    genre_counts = pd.Series(genres).value_counts().index.tolist()[:n]
    return genre_counts


In [None]:
def process_batch(df_batch, n):
    """
    Processes a batch of data to determine the most common genres for each user.

    Args:
        df_batch (DataFrame): A batch of the main dataset containing at least 'user_id' and 'filtered_genres' columns.
        n (int): The number of top genres to return for each user.

    Returns:
        DataFrame: A new DataFrame with each user_id and their top 'n' most common genres.
    """
    return df_batch.groupby('user_id')['filtered_genres'].apply(lambda x: most_common_genres(x, n)).reset_index()

batch_size = 5000

user_genres_dict = defaultdict(list)

In [None]:
# Initialize a dictionary to hold the genres for each user
user_genres_dict = defaultdict(list)

# Process the dataset in batches
for start in tqdm(range(0, len(read_with_genres), batch_size), desc="Processing in Batches"):
    end = min(start + batch_size, len(read_with_genres))
    df_batch = read_with_genres.iloc[start:end].copy()  
    
    # Ensure 'filtered_genres' is a list (in case it's not)
    df_batch.loc[:, 'filtered_genres'] = df_batch['filtered_genres'].apply(lambda x: x if isinstance(x, list) else [])
    
    # Process the batch to find the most common genres for each user
    batch_result = process_batch(df_batch, n)
    
    # Update the user_genres_dict with the genres from the current batch
    for _, row in batch_result.iterrows():
        user_genres_dict[row['user_id']].extend(row['filtered_genres'])


Processing in Batches: 100%|██████████| 45730/45730 [07:33<00:00, 100.93it/s]


In [None]:
for user_id in user_genres_dict:
    user_genres_dict[user_id] = pd.Series(user_genres_dict[user_id]).value_counts().index.tolist()[:n]

In [None]:
user_most_common_genres = pd.DataFrame(list(user_genres_dict.items()), columns=['user_id', 'most_common_genres'])
user_most_common_genres.set_index('user_id', inplace=True)

In [9]:
user_most_common_genres.to_pickle('Pickle/user_most_common_genres.pkl')