In [1]:
# Number of hierarchy levels kept per product: shorter hierarchies are
# left-padded with 0 and longer ones keep only their last entries.
HIERARCHY_MAX_LEN = 4
# Width of every embedding vector produced by this notebook.
EMBEDDING_SIZE = 64
# How the per-level embeddings of one product are combined: "sum" adds them,
# any other value averages them.
MERGE_METHOD = "sum"

In [2]:
import itertools
from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn

In [3]:
def _fit_hierarchy(levels):
    """Left-pad `levels` with 0 and keep only the last HIERARCHY_MAX_LEN entries."""
    padded = [0] * HIERARCHY_MAX_LEN + levels
    return padded[-HIERARCHY_MAX_LEN:]


# NOTE(review): read_pickle executes whatever the pickle contains; only load
# files from a trusted source.
df = (
    pd.read_pickle("../raw_data/lines_hier.pkl")
    # Keep the first row seen for each product key.
    .drop_duplicates(subset=['Key_product'])
    .assign(hierarchy=lambda frame: frame['hierarchy'].apply(_fit_hierarchy))
    .loc[:, ['Key_product', 'D_PRODUCT', 'Q_AMOUNT', 'K_DITTA', 'hierarchy']]
    .reset_index(drop=True)
)
df.head(2)

Unnamed: 0,Key_product,D_PRODUCT,Q_AMOUNT,K_DITTA,hierarchy
0,6482150,BLANX WHITE SHOCK 50ML+LED,6.9,1443812,"[0, 0, 7594899, 6482150]"
1,6482152,"EUMILL GOCCE OCULARI 10FL0,5ML",20.3,1451849,"[8995977, 9241189, 9459149, 6482152]"


In [4]:
# Collect every id that appears at any hierarchy level. The padding id 0 is
# added explicitly so that `mapping_items[0]` exists even when no hierarchy
# needed left-padding (otherwise that lookup raises KeyError).
hierarchy_ids = {0}.union(itertools.chain.from_iterable(df['hierarchy'].tolist()))

# Sort before enumerating so the id -> row-index assignment does not depend on
# set iteration order and stays the same from run to run.
mapping_items = {item_id: row for row, item_id in enumerate(sorted(hierarchy_ids))}
num_items = len(mapping_items)

# Translate each hierarchy (list of raw ids) into a tensor of embedding row indices.
df['mapped_hier'] = df['hierarchy'].apply(
    lambda levels: torch.IntTensor([mapping_items[level] for level in levels])
)

In [5]:
# nn.Embedding draws its weights at random; seed first so that re-running the
# notebook exports the same vectors instead of a different table every time.
torch.manual_seed(0)
embedding_ = nn.Embedding(num_items, EMBEDDING_SIZE, padding_idx=mapping_items[0])

# Pick the reduction once: "sum" adds the level embeddings, anything else averages.
# NOTE(review): the padding row is all zeros, so the average is taken over every
# position of the padded hierarchy, padding included - confirm this is intended.
merge_levels = torch.sum if MERGE_METHOD == 'sum' else torch.mean


def _embed_hierarchy(level_indices):
    """Look up one product's hierarchy indices and merge them into a single vector (list of floats)."""
    return merge_levels(embedding_(level_indices), dim=0).numpy().tolist()


# The layer is only used as a lookup table here, so gradient tracking is not needed.
with torch.no_grad():
    df['tensor_embed'] = df['mapped_hier'].apply(_embed_hierarchy)

In [6]:
# Only the product key and its merged embedding are needed from here on.
kept_columns = ['Key_product', 'tensor_embed']
df = df[kept_columns]
df.head(2)

Unnamed: 0,Key_product,tensor_embed
0,6482150,"[1.4624515771865845, 0.002388477325439453, -1...."
1,6482152,"[-1.308363437652588, 2.7731428146362305, -0.12..."


In [7]:
# Special tokens that get their own rows in the exported table.
# NOTE(review): '[PAD]' receives a random vector like '[MASK]', not zeros -
# confirm that this is what the consumer of the file expects.
special_tokens = ['[MASK]', '[PAD]']

# Seed so the special-token vectors are reproducible; the seed differs from 0 so
# they cannot coincide with vectors drawn from another seed-0 table.
torch.manual_seed(1)
# One row per special token is all that is needed; sizing the table by
# `num_items` allocated a full vocabulary and failed when num_items < 2.
embedding_extra = nn.Embedding(len(special_tokens), EMBEDDING_SIZE)
special_vectors = embedding_extra.weight.detach().numpy().tolist()

extra_embed = pd.DataFrame([
    {'Key_product': token, 'tensor_embed': vector}
    for token, vector in zip(special_tokens, special_vectors)
])
# Append the special tokens after the real products and renumber the rows.
df = pd.concat([df, extra_embed]).reset_index(drop=True)

In [8]:
# Deleting "[", "]" and "," from the list's string form leaves the numbers
# separated by single spaces.
_list_punctuation = str.maketrans('', '', '[],')

df = (
    df.assign(
        tensor_string=df['tensor_embed'].astype(str).apply(
            lambda text: text.translate(_list_punctuation)
        ),
        Key_product=df['Key_product'].astype(str),
    )
    .loc[:, ['Key_product', 'tensor_string']]
    # Final column headers of the exported table.
    .rename(columns={'Key_product': 'iid:token', 'tensor_string': 'item_emb:float_seq'})
)

In [11]:
# Write the table tab-separated and without the pandas index column.
itememb_path = Path("../RECEIPT_LINES/RECEIPT_LINES.itememb")
# to_csv does not create missing directories, so make sure the target exists.
itememb_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(itememb_path, sep="\t", index=False)