In [2]:
import os
import argparse
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv
import re

categories_file_name = r'/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'

queries_file_name = r'/workspace/datasets/train.csv'
output_file_name = r'/workspace/datasets/labeled_query_data.txt'

In [3]:
# The root category, named Best Buy with id cat00000, doesn't have a parent.
root_category_id = 'cat00000'

tree = ET.parse(categories_file_name)
root = tree.getroot()

# Parse the category XML file to map each category id to its parent category id in a dataframe.
categories = []
parents = []
for child in root:
    id = child.find('id').text
    cat_path = child.find('path')
    cat_path_ids = [cat.find('id').text for cat in cat_path]
    leaf_id = cat_path_ids[-1]
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame(list(zip(categories, parents)), columns =['category', 'parent'])

In [4]:
parents_df

Unnamed: 0,category,parent
0,abcat0010000,cat00000
1,abcat0011000,abcat0010000
2,abcat0011001,abcat0011000
3,abcat0011002,abcat0011000
4,abcat0011003,abcat0011000
...,...,...
4634,pcmcat97200050013,cat15205
4635,pcmcat97200050015,cat15063
4636,pcmcat99000050001,pcmcat50000050006
4637,pcmcat99000050002,pcmcat99000050001


In [5]:
df = pd.read_csv(queries_file_name)[['category', 'query']]
df = df[df['category'].isin(categories)]

NON_ALPHA = re.compile(r"[^a-zA-Z0-9_]")
COMBINE_WHITESPACE = re.compile(r"\s+")

def transform_name(query: str):
    result = query.strip().lower()
    result = NON_ALPHA.sub(" ", result)
    result = COMBINE_WHITESPACE.sub(" ", result)
    return result

# IMPLEMENT ME: Convert queries to lowercase, and optionally implement other normalization, like stemming.
df['query'] = df['query'].apply(transform_name)

In [6]:
df

Unnamed: 0,category,query
0,abcat0101001,televisiones panasonic 50 pulgadas
1,abcat0101001,sharp
2,pcmcat193100050014,nook
3,abcat0101001,rca
4,abcat0101005,rca
...,...,...
1865264,pcmcat247400050000,ttv
1865265,pcmcat218000050000,incase
1865266,pcmcat248500050020,ds games
1865267,pcmcat209000050008,archos


In [11]:
df.where(df['category'].value_counts() < 500)

Unnamed: 0,category,query
0,,
1,,
2,,
3,,
4,,
...,...,...
1865264,,
1865265,,
1865266,,
1865267,,
