In [1]:
# Number of hierarchy levels kept per product: shorter paths are left-padded
# with 0 and longer ones keep only their last levels.
HIERARCHY_MAX_LEN: int = 4

# Width of the hierarchy embedding and of the special-token vectors.
# The hierarchy embedding is added element-wise to the description vector,
# so the two are expected to have the same length.
EMBEDDING_SIZE: int = 64

# How the per-level hierarchy embeddings are merged into one vector:
# "sum" adds them up; any other value averages them.
MERGE_METHOD: str = "sum"

In [2]:
import itertools
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [3]:
# The project root is taken to be the parent of the current working directory.
ROOT_DIR = os.path.dirname(os.getcwd())

# Input and output data folders: <root>/data/raw and <root>/data/processed.
RAW_DATA_PATH, PROCESSED_DATA_PATH = (
    os.path.join(ROOT_DIR, "data", stage) for stage in ("raw", "processed")
)

In [4]:
# Load the receipt lines, keep one row per product, and normalise every
# hierarchy path to exactly HIERARCHY_MAX_LEN entries (left-padded with 0,
# truncated to the last levels when longer).
# NOTE: pd.read_pickle executes arbitrary code from the file; only load
# pickles produced by this project.
df = (
    pd.read_pickle(os.path.join(RAW_DATA_PATH, "lines_hier.pkl"))
    .drop_duplicates(subset=['Key_product'])
    .assign(hierarchy=lambda frame: frame['hierarchy'].apply(
        lambda x: ([0] * HIERARCHY_MAX_LEN + x)[-HIERARCHY_MAX_LEN:]))
    .loc[:, ['Key_product', 'D_PRODUCT', 'Q_AMOUNT', 'K_DITTA', 'hierarchy']]
    .reset_index(drop=True)
)

# Merge with description
df_description = pd.read_pickle(os.path.join(PROCESSED_DATA_PATH, "df_products_descriptions.pkl"))
# A left merge repeats a product once per matching description row, so the
# description table is reduced to one row per name first; otherwise a repeated
# name would undo the one-row-per-Key_product guarantee established above.
# Products without a matching description keep NaN in the description columns.
df = pd.merge(
    df,
    df_description.drop_duplicates(subset=['name']),
    left_on='D_PRODUCT',
    right_on='name',
    how='left',
)
df = df[['Key_product', 'D_PRODUCT', 'description_word2vec', 'description_glove', 'Q_AMOUNT', 'K_DITTA', 'hierarchy']]
df.head(2)

Unnamed: 0,Key_product,D_PRODUCT,description_word2vec,description_glove,Q_AMOUNT,K_DITTA,hierarchy
0,6482150,BLANX WHITE SHOCK 50ML+LED,"[-0.0003201263859539592, -0.000297295161154969...","[-0.10971122485678504, -0.16961734861111075, -...",6.9,1443812,"[0, 0, 7594899, 6482150]"
1,6482152,"EUMILL GOCCE OCULARI 10FL0,5ML","[-0.007836564183425784, -0.015759744265918602,...","[-0.056279045973986724, 0.25549609312850746, -...",20.3,1451849,"[8995977, 9241189, 9459149, 6482152]"


In [5]:
# Build a lookup from every distinct id that appears in any hierarchy path
# (including the padding value 0) to a contiguous integer index, then encode
# each path as an int32 tensor of those indices.

mapping_items = {
    item_id: position
    for position, item_id in enumerate(set(itertools.chain.from_iterable(df['hierarchy'].tolist())))
}
num_items = len(mapping_items)
df['mapped_hier'] = df['hierarchy'].apply(
    lambda hier: torch.tensor([mapping_items[item_id] for item_id in hier], dtype=torch.int32)
)
df.head(2)

Unnamed: 0,Key_product,D_PRODUCT,description_word2vec,description_glove,Q_AMOUNT,K_DITTA,hierarchy,mapped_hier
0,6482150,BLANX WHITE SHOCK 50ML+LED,"[-0.0003201263859539592, -0.000297295161154969...","[-0.10971122485678504, -0.16961734861111075, -...",6.9,1443812,"[0, 0, 7594899, 6482150]","[tensor(0, dtype=torch.int32), tensor(0, dtype..."
1,6482152,"EUMILL GOCCE OCULARI 10FL0,5ML","[-0.007836564183425784, -0.015759744265918602,...","[-0.056279045973986724, 0.25549609312850746, -...",20.3,1451849,"[8995977, 9241189, 9459149, 6482152]","[tensor(40162, dtype=torch.int32), tensor(3133..."


In [6]:
# Compute the embedding for each product and merge them

# The table is randomly initialised and never trained in this notebook, so the
# RNG is seeded to make the exported vectors identical across runs.
torch.manual_seed(42)

# The row for the padding id (0) is initialised to zeros by nn.Embedding, so
# padded hierarchy levels add nothing to a sum.
embedding_ = nn.Embedding(num_items, EMBEDDING_SIZE, padding_idx=mapping_items[0])

# "sum" adds the per-level vectors; any other MERGE_METHOD averages them.
# NOTE(review): the mean is taken over all HIERARCHY_MAX_LEN positions, so
# zero-valued padding levels are included in the average.
merge_levels = torch.sum if MERGE_METHOD == 'sum' else torch.mean

# No gradients are needed for a plain lookup.
with torch.no_grad():
    df['hier_embed'] = df['mapped_hier'].apply(
        lambda x: merge_levels(embedding_(x), dim=0).numpy().tolist()
    )

In [7]:
# Keep only the product key and the two embeddings that are combined next.
# To use the GloVe description vectors instead, select 'description_glove'
# in place of 'description_word2vec'.
df = df.loc[
    :,
    [
        'Key_product',
        'hier_embed',
        'description_word2vec',
    ],
]
df.head(2)

Unnamed: 0,Key_product,hier_embed,description_word2vec
0,6482150,"[-1.1548326015472412, 0.11676022410392761, -0....","[-0.0003201263859539592, -0.000297295161154969..."
1,6482152,"[-3.467742681503296, 0.5491219162940979, -2.24...","[-0.007836564183425784, -0.015759744265918602,..."


In [8]:
# Sum the hierarchy embedding with the description embedding

# .tolist() converts the result to plain Python floats. list(ndarray) would
# keep NumPy scalars, whose repr is "np.float64(...)" on NumPy >= 2 and would
# end up verbatim in any str()-based serialisation of this column.
# NOTE(review): a product whose description is NaN (no match in the merge)
# gets an all-NaN vector here, because NaN broadcasts over the addition.
# To use GloVe vectors instead, read 'description_glove' below.
df['tensor_embed'] = df.apply(
    lambda x: (np.array(x['hier_embed']) + np.array(x['description_word2vec'])).tolist(),
    axis=1,
)

In [9]:
# Preview the first two rows, now including the combined 'tensor_embed' column.
df.head(n=2)

Unnamed: 0,Key_product,hier_embed,description_word2vec,tensor_embed
0,6482150,"[-1.1548326015472412, 0.11676022410392761, -0....","[-0.0003201263859539592, -0.000297295161154969...","[-1.1551527279331952, 0.11646292894277264, -0...."
1,6482152,"[-3.467742681503296, 0.5491219162940979, -2.24...","[-0.007836564183425784, -0.015759744265918602,...","[-3.475579245686722, 0.5333621720281793, -2.23..."


In [10]:
# Add the extra embedding for [MASK] and [PAD]

# One randomly initialised vector per special token. The table only needs as
# many rows as there are special tokens, not one per hierarchy id.
special_tokens = ['[MASK]', '[PAD]']
embedding_extra = nn.Embedding(len(special_tokens), EMBEDDING_SIZE)
special_vectors = embedding_extra.weight.detach().numpy().tolist()

# The other columns of df are left as NaN for these two rows.
extra_embed = pd.DataFrame([
    {'Key_product': token, 'tensor_embed': vector}
    for token, vector in zip(special_tokens, special_vectors)
])
df = pd.concat([df, extra_embed]).reset_index(drop=True)

In [11]:
# Dataframe preparation

# Serialise each vector as space-separated floats. Every element is formatted
# through float() so the text is the same whether the list holds Python floats
# or NumPy scalars (whose repr inside a list is "np.float64(...)" on NumPy >= 2).
df['tensor_string'] = df['tensor_embed'].apply(
    lambda vec: " ".join(str(float(value)) for value in vec)
)
df['Key_product'] = df['Key_product'].astype(str)

# Keep the two exported columns and give them "<name>:<type>" headers.
df = df.loc[:, ['Key_product', 'tensor_string']].rename(columns={
    'Key_product': 'Key_product:token',
    'tensor_string': 'item_emb:float_seq'
})

In [12]:
# Write the item file as tab-separated text. The target folder is created
# first because to_csv does not create missing directories.
item_file_dir = os.path.join(PROCESSED_DATA_PATH, "type_2", "RECEIPT_LINES")
os.makedirs(item_file_dir, exist_ok=True)
df.to_csv(os.path.join(item_file_dir, "RECEIPT_LINES.item"), sep="\t", index=False)