In [1]:
import os
import argparse
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv
import re

# Base directory that holds every dataset file used below. Defaults to the original
# /workspace/datasets location; set the DATASETS_DIR environment variable to run the
# notebook against data stored somewhere else.
datasets_dir = os.environ.get('DATASETS_DIR', '/workspace/datasets')

# XML file describing the category tree.
categories_file_name = os.path.join(datasets_dir, 'product_data', 'categories', 'categories_0001_abcat0010000_to_pcmcat99300050000.xml')

# CSV of queries (input) and the labeled query file path (output).
queries_file_name = os.path.join(datasets_dir, 'train.csv')
output_file_name = os.path.join(datasets_dir, 'labeled_query_data.txt')

In [2]:
# The root category, named Best Buy with id cat00000, doesn't have a parent.
root_category_id = 'cat00000'

tree = ET.parse(categories_file_name)
root = tree.getroot()

# Parse the category XML file to map each category id to its parent category id in a dataframe.
# `categories` and `parents` are parallel lists: parents[i] is the parent of categories[i].
categories = []
parents = []
for child in root:
    # The last id in <path> is the category itself and the one before it is its parent.
    # (The category's own <id> element is not read separately: it was unused and the
    # old `id = ...` assignment shadowed the builtin id() for the rest of the notebook.)
    cat_path_ids = [cat.find('id').text for cat in child.find('path')]
    leaf_id = cat_path_ids[-1]
    # The root has no parent, so it gets no row in the mapping.
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame({'category': categories, 'parent': parents})

In [3]:
# Display the category -> parent mapping (one row per non-root category).
parents_df

Unnamed: 0,category,parent
0,abcat0010000,cat00000
1,abcat0011000,abcat0010000
2,abcat0011001,abcat0011000
3,abcat0011002,abcat0011000
4,abcat0011003,abcat0011000
...,...,...
4634,pcmcat97200050013,cat15205
4635,pcmcat97200050015,cat15063
4636,pcmcat99000050001,pcmcat50000050006
4637,pcmcat99000050002,pcmcat99000050001


In [3]:
# Load the queries, keeping only the category label and the query text.
df = pd.read_csv(queries_file_name)[['category', 'query']]
# Keep only queries whose category was found in the category XML. The explicit .copy()
# makes df an independent frame, so assigning the normalized 'query' column below
# does not raise SettingWithCopyWarning or write into a temporary slice.
df = df[df['category'].isin(categories)].copy()

# Matches any single character that is not an ASCII letter, a digit or an underscore.
NON_ALPHA = re.compile(r"[^a-zA-Z0-9_]")
# Matches a run of one or more whitespace characters.
COMBINE_WHITESPACE = re.compile(r"\s+")

def transform_name(query: str) -> str:
    """Normalize a raw query string.

    The query is lowercased, every character other than ASCII letters, digits and
    underscores is replaced by a space, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is removed.

    Args:
        query: The raw query text.

    Returns:
        The normalized query; an empty string if nothing but punctuation/whitespace
        was present.
    """
    result = NON_ALPHA.sub(" ", query.lower())
    result = COMBINE_WHITESPACE.sub(" ", result)
    # Strip last: punctuation at either end has just been turned into spaces, so
    # stripping before the substitution would leave e.g. "tv!" as "tv ".
    return result.strip()

# Normalize every query with transform_name (lowercasing plus punctuation and whitespace cleanup).
df['query'] = df['query'].map(transform_name)

In [4]:
# Attach each query's parent category via an inner join on the category id.
joined_df = df.merge(parents_df, on="category", how="inner")
# For every row, the number of rows that share its category.
rows_per_category = joined_df.groupby('category')['category'].transform('count')
joined_df['count'] = rows_per_category

In [58]:
# Peek at the last rows of the joined frame (category, query, parent, count).
joined_df.tail()

Unnamed: 0,category,query,parent,count
1854993,pcmcat235500050003,3ds bundle,pcmcat232900050000,1
1854994,pcmcat240500050027,camera lighting,pcmcat240500050025,1
1854995,pcmcat235500050005,portable dvv,pcmcat232900050000,1
1854996,cat02737,drums,cat02014,1
1854997,pcmcat221400050013,gen 16 cymbals,pcmcat152100050012,1


In [5]:
# Relabel rows whose category has fewer than 1000 queries with that category's parent.
is_rare = joined_df['count'] < 1000
joined_df.loc[is_rare, 'category'] = joined_df['parent']

In [6]:
# Recompute the per-category row counts for the current category labels.
category_groups = joined_df.groupby('category')['category']
joined_df['count'] = category_groups.transform('count')

In [None]:
def rollup_categories(queries_df, parents_df, min_queries=1000):
    """Roll rare categories up to their parents until none can be rolled up further.

    On each pass the per-category row counts are recomputed, and every row whose
    category has fewer than ``min_queries`` rows is relabelled with that category's
    parent. The loop ends when no row can be relabelled: either every category has
    at least ``min_queries`` rows, or the only rare categories left have no parent
    in ``parents_df`` (e.g. the root).

    Args:
        queries_df: Frame with at least a 'category' column (one row per query).
        parents_df: Frame with 'category' and 'parent' columns.
        min_queries: Minimum number of rows a category needs to be kept as-is.

    Returns:
        A new frame (``queries_df`` is not modified) with a fresh RangeIndex, the
        rolled-up 'category', plus 'parent' and 'count' columns describing the
        final category of each row.
    """
    # Lookup Series: category id -> parent id. Duplicates are dropped so the index is
    # unique, which Series.map requires.
    parent_of = parents_df.drop_duplicates('category').set_index('category')['parent']
    rolled_df = queries_df.reset_index(drop=True)
    while True:
        # Counts and parents must be refreshed on every pass, because the previous
        # pass changed the category labels.
        rolled_df['count'] = rolled_df.groupby('category')['category'].transform('count')
        rolled_df['parent'] = rolled_df['category'].map(parent_of)
        can_roll_up = (
            (rolled_df['count'] < min_queries)
            & rolled_df['parent'].notna()                      # no known parent: cannot go higher
            & (rolled_df['parent'] != rolled_df['category'])   # guard against self-parent cycles
        )
        # Progress: smallest category size at the start of this pass.
        print(rolled_df['count'].min())
        if not can_roll_up.any():
            return rolled_df
        rolled_df.loc[can_roll_up, 'category'] = rolled_df.loc[can_roll_up, 'parent']


# Start again from the normalized queries in df so this cell gives the same result
# however many times it is run.
joined_df = rollup_categories(df, parents_df)

In [8]:
# Plain dict lookup: category id -> parent category id.
parent_by_category = parents_df.set_index('category')['parent']
parents_dict = dict(parent_by_category)

In [10]:
# Plain dict: category id -> number of query rows in df carrying that category.
rows_per_label = df['category'].value_counts()
category_counts = dict(rows_per_label)

In [31]:
# Plain dict: category id -> Index of the query strings labelled with that category.
queries_grouped = df.set_index('query').groupby('category')
dct = dict(queries_grouped.groups)

In [33]:
# Spot-check: show the queries grouped under category abcat0106010.
dct['abcat0106010']

Index(['bluray storage', 'dvd rack', 'dvd media tower', 'dvd racks',
       'dvd tower', 'video glasses', 'dvd towers', 'cd racks', 'dvd rack',
       'glass tv stand', 'media storage', 'dvd holders', 'dvd rack',
       'dvd stand', 'tower', 'cd dvd storage', 'dvd shelf', 'dvd cabinets',
       'walkie talkie', 'blu ray storage cabinets', 'media tower',
       'media tower', 'dvd storage', 'dvd storage', 'blueray stand',
       'dvd rack', 'atlantic', 'dvd stand', 'blu ray storage', 'dvd rack',
       'dvd racks', 'dvd tower', 'media storage', 'dvd cabinet', 'dvd racks',
       'dvd cd vhs racks', 'dvd shelves', 'media rack', 'rack', 'rack',
       'atlantic media tower', 'dvd rack', 'dvd rack', 'cd rack',
       'media tower', 'dvd rack', 'glass mount', 'dvd racks', 'dvd organizer',
       'dvd storage', 'dvd rack', 'media storage', 'cd storage', 'dvd towers',
       'dvd rack', 'dvd rack', 'dvd rack', 'dvd tower', 'cd rack',
       'media tower', 'media tower', 'media storage', 'dvd 