In [4]:
import ast
import msgpack
import pandas as pd
from collections import Counter
from pathlib import Path

In [5]:
import os
project_dir = Path('/Users/ron/dev/torch/medium')
os.chdir(project_dir)
os.getcwd()

'/Users/ron/dev/torch/medium'

In [6]:
# index, order_id, product_id
order_list = pd.read_csv('./data/instacart/order_list.csv', header=0, index_col=0)

# Convert 'product_id' from string to list
order_list['product_id'] = order_list['product_id'].apply(ast.literal_eval)

In [7]:
# gets frequency of each product_id. This determines the order of the products in the vecabulary
product_ids = [product_id for sublist in order_list['product_id'].tolist() for product_id in sublist]
counts = Counter(product_ids)

In [12]:
# Sort first by counts in descending order and then by product id in ascending order
sorted_counts = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
SPECIAL_TOKENS = ['<PAD>', '<UNK>', '<CLS>', '<SEP>', '<MASK>']
vocab_path = Path('./vocab') / 'instacart_vocab.txt'
vocab = SPECIAL_TOKENS + [product_id for product_id, _ in sorted_counts]
with open(vocab_path, 'x') as f:
    f.write('\n'.join(map(str, vocab)))


In [13]:
# load vocab
# create product_id to token_id mapping
# create a toekn_id to product_name mapping
# translate the product_id lists to token_id lists
# create an MLM dataset from the lists by masking 15% of the tokens
# split the dataset into train, validation and test sets

# load vocab.txt into a list of tokens
with open(vocab_path, 'r') as f:
    vocab = f.read().split('\n')

In [14]:
# build product_id to token_id mapping
# we'll use it to convert the product_id lists of orders to token_id lists
product_id_to_token_id = {}
for token_id, product_id in enumerate(vocab):
    if token_id < 5:
        continue
    product_id_to_token_id[int(product_id)] = token_id


In [15]:
print(list(product_id_to_token_id.items())[:5])
print(list(product_id_to_token_id.keys())[:5])
print(list(product_id_to_token_id.values())[:5])

[(24852, 5), (13176, 6), (21137, 7), (21903, 8), (47209, 9)]
[24852, 13176, 21137, 21903, 47209]
[5, 6, 7, 8, 9]


In [23]:
# convert product_id lists to token_id lists
order_token_list = {}
for order_id, product_id_list in order_list['product_id'].items():
    order_token_list[order_id] = [int(product_id_to_token_id[product_id]) for product_id in product_id_list]

# create a dataframe
order_token_list_df = pd.DataFrame(list(order_token_list.items()), columns=['order_id', 'token_id'])

# convert the token_id column to a string
order_token_list_df['token_id'] = order_token_list_df['token_id'].apply(lambda x: str(x))

# save to csv
order_token_list_df.to_csv('./data/instacart/order_token_list.csv', index=False)

order_token_list_df.to_msgpack('./data/instacart/order_token_list.msgpack')

AttributeError: 'DataFrame' object has no attribute 'to_msgpack'

In [None]:
# create a token_id to product_name mapping
products = pd.read_csv('products.csv', header=0)
# convert product_id to int
products['product_id'] = products['product_id'].astype(int)
# filter by only products that are in the order_list
products = products.set_index('product_id').loc[product_id_to_token_id.keys()].reset_index()

# create token_id to product_name mapping
token_id_to_product_name = {}
for product_id, product_name in products[['product_id', 'product_name']].values:
    token_id = product_id_to_token_id[product_id]
    token_id_to_product_name[token_id] = product_name
df = pd.DataFrame.from_dict(token_id_to_product_name, orient='index', columns=['product_name'])
df.index.name = 'token_id'
df.to_csv('token_id_to_product_name.csv')

In [52]:
order_list.product_id.apply(len).describe()

count    3.214874e+06
mean     1.008888e+01
std      7.525398e+00
min      1.000000e+00
25%      5.000000e+00
50%      8.000000e+00
75%      1.400000e+01
max      1.450000e+02
Name: product_id, dtype: float64