In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [3]:
# Load the full source table from the local parquet file using the pyarrow engine.
raw_cat_df = pd.read_parquet(path='5_cat.parquet', engine='pyarrow')

In [4]:
# Inspect the available column names before selecting a subset of them.
raw_cat_df.columns

Index(['rating', 'title_review', 'text', 'parent_asin', 'user_id',
       'helpful_vote', 'title_meta', 'price', 'store', 'features_clean',
       'description_clean', 'timestamp_utc', 'category_depth',
       'verified_purchase_flag', 'cat_1', 'cat_2', 'cat_3', 'cat_4',
       'price_log', 'helpful_vote_clipped', 'avg_rating_parent'],
      dtype='object')

In [5]:
# Keep only the rating, the cat_1..cat_3 columns and the product identifier;
# these are the only fields the collection-building step reads.
keep_columns = ['rating', 'cat_1', 'cat_2', 'cat_3', 'parent_asin']
cat_df = raw_cat_df.loc[:, keep_columns]

In [6]:
# Display the reduced table (pandas truncates the rendering of large frames).
cat_df

Unnamed: 0,rating,cat_1,cat_2,cat_3,parent_asin
0,1,outdoor recreation,camping & hiking,lights & lanterns,B09LW2KHPM
1,5,exercise & fitness,cardio training,exercise bikes,B0BTDLFXSL
2,5,outdoor recreation,camping & hiking,tents & shelters,B09W4W9JB5
3,5,sports,cycling,kids' bikes & accessories,B07Z6Y87XH
4,5,outdoor recreation,camping & hiking,tents & shelters,B09HRDWXCK
...,...,...,...,...,...
767679,1,exercise & fitness,fitness technology,activity & fitness trackers,B00P2QCJP6
767680,1,sports,cycling,kids' bikes & accessories,B07D4ZT13Y
767681,4,exercise & fitness,fitness technology,activity & fitness trackers,B01L6RE7Z4
767682,5,exercise & fitness,strength training equipment,weight machines,B00JZKH7K2


In [7]:
# Persist the reduced table as CSV; the row index is not written.
cat_df.to_csv('category_ratings.csv', index=False)

In [8]:
# Reset the module-level state so that re-running this cell starts from a
# clean slate instead of appending to results left over from an earlier run.
collections = []    # filled with (collection_id, parent_asin, rating) tuples
collection_id = 1   # id given to the next completed collection
df = cat_df         # frame the grouping step below iterates over

# Function to create collections using cat_1 groups, and limit cat_2 to max 2 repeats and cat_3 to max 2 repeats
def create_collections_with_limits(group, max_per_cat2=2, max_per_cat3=2, collection_size=5):
    """Greedily pack the rows of ``group`` into fixed-size collections.

    Rows are scanned in their existing order. A row joins the collection that
    is currently being built only while both its ``cat_2`` value and its
    ``cat_3`` value have been used fewer than the allowed number of times in
    that collection; otherwise the row is skipped and never reconsidered.
    Once the collection holds ``collection_size`` rows, one
    ``(collection_id, parent_asin, rating)`` tuple per row is appended to the
    module-level ``collections`` list, the module-level ``collection_id`` is
    incremented, and the per-collection counters start over. A partially
    filled collection left over at the end of ``group`` is discarded.

    Parameters
    ----------
    group : pandas.DataFrame
        Must provide the columns ``cat_2``, ``cat_3``, ``parent_asin`` and
        ``rating``.
    max_per_cat2 : int, default 2
        Maximum number of rows sharing one ``cat_2`` value per collection.
    max_per_cat3 : int, default 2
        Maximum number of rows sharing one ``cat_3`` value per collection.
    collection_size : int, default 5
        Number of rows in every emitted collection.

    Returns
    -------
    None
        Results are written to the globals ``collections`` and
        ``collection_id``; both must exist before the call. Calling the
        function again without resetting them appends further collections.

    Raises
    ------
    ValueError
        If ``collection_size`` is smaller than 1 (no collection could ever
        be completed).
    """
    global collection_id

    if collection_size < 1:
        raise ValueError("collection_size must be at least 1")
    if len(group) == 0:
        return

    buffer = []  # (parent_asin, rating) pairs of the collection being built
    cat2_counts = defaultdict(int)
    cat3_counts = defaultdict(int)

    # Walk the four needed columns in lockstep instead of using iterrows():
    # iterrows() materialises a full Series for every row, which is very slow
    # on large frames, and only these fields are ever read.
    rows = zip(group['cat_2'], group['cat_3'], group['parent_asin'], group['rating'])
    for cat2, cat3, parent_asin, rating in rows:
        # Skip rows whose category is already at its cap in this collection.
        if cat2_counts[cat2] >= max_per_cat2 or cat3_counts[cat3] >= max_per_cat3:
            continue

        buffer.append((parent_asin, rating))
        cat2_counts[cat2] += 1
        cat3_counts[cat3] += 1

        if len(buffer) == collection_size:
            # Collection complete: emit it and start a fresh one.
            collections.extend(
                [(collection_id, asin, item_rating) for asin, item_rating in buffer]
            )
            collection_id += 1
            buffer = []
            cat2_counts = defaultdict(int)
            cat3_counts = defaultdict(int)

# Apply collection creation at the cat_1 level only.
# Each call starts with an empty buffer and fresh cat_2/cat_3 counters, so a
# collection never mixes rows from different cat_1 values; only the global
# collection_id counter and the collections list carry over between groups.
# NOTE: with pandas' groupby defaults, groups are visited in sorted key order
# and rows whose cat_1 is missing are left out (dropna=True).
for _, group in df.groupby('cat_1'):
    create_collections_with_limits(group)

# Turn the accumulated (collection_id, parent_asin, rating) tuples into a
# table and write it to disk without the row index.
output_path = 'collections.csv'
column_names = ['collection_id', 'item_id', 'rating']
cols_df = pd.DataFrame(data=collections, columns=column_names)
cols_df.to_csv(path_or_buf=output_path, index=False)