In [2]:
import pandas as pd
import numpy as np
import os
import gc
import sys
from tqdm import tqdm
sys.path.append('../brand_detector/preprocessing')
from clean_helpers import clean_name_for_word_embedding

# loading raw data

In [3]:
output_dir = '../brand_detector/data/preprocessed'

personal_care_and_beauty_shopee_df = pd.read_csv(os.path.join(output_dir, 'personal_care_and_beauty.csv'))
beauty_amazon_df = pd.read_csv(os.path.join(output_dir, 'beauty_amazon.csv'))
# configuration of shopee data: train/val/test = 35/15/50 (0,1,2)
train = pd.concat([beauty_amazon_df, personal_care_and_beauty_shopee_df[personal_care_and_beauty_shopee_df.is_valid == 0]], axis = 0)
val = personal_care_and_beauty_shopee_df[personal_care_and_beauty_shopee_df.is_valid == 1]
test = personal_care_and_beauty_shopee_df[personal_care_and_beauty_shopee_df.is_valid == 2]
del personal_care_and_beauty_shopee_df, beauty_amazon_df
# preprocessing
train['eval_set'] = ['train' for i in range(len(train))] 
val['eval_set'] = ['val' for i in range(len(val))] 
test['eval_set'] = ['test' for i in range(len(test))] 
train.drop(['is_valid'], axis =1, inplace = True)
val.drop(['is_valid'], axis =1, inplace = True)
test.drop(['is_valid'], axis =1, inplace = True)

print ('train', train.shape)
print ('val', val.shape)
print ('test', test.shape)
# all_df
all_df = pd.concat([train, val, test], axis = 0)
del train, test, val
gc.collect()
all_df.eval_set.value_counts()


train (836369, 4)
val (8300, 4)
test (21508, 4)


train    836369
test      21508
val        8300
Name: eval_set, dtype: int64

# pre-trained word-embedding

In [4]:
word_embedding_path = '/data/ID_large_wordvec_300_2.h5'
word2vec = pd.read_hdf(word_embedding_path)
# get word_to_id
word_to_id = {word: int(i+1) for i, word in enumerate(word2vec.word.tolist())}
# check
assert word_to_id['dan'] == 1, 'wrong in our dict mapping word to id'
assert word_to_id['di'] == 2, 'wrong in our dict mapping word to id'
assert word_to_id['yang'] == 3, 'wrong in our dict mapping word to id'

In [9]:
word2vec.head()

Unnamed: 0,word,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_291,dim_292,dim_293,dim_294,dim_295,dim_296,dim_297,dim_298,dim_299,dim_300
0,dan,0.712676,2.387824,-2.937458,5.594606,1.196547,3.567468,-0.333335,-1.570036,-1.445439,...,-2.644627,0.048757,3.511451,-0.651203,2.474551,-0.118684,1.047346,2.676099,-3.377923,4.213373
1,di,1.646574,3.004994,-2.030986,1.284245,2.100287,0.033384,0.991525,0.005785,0.410926,...,1.630286,-5.025796,1.855228,1.364514,0.25918,2.906763,-2.368078,-1.652482,-5.045381,1.621557
2,yang,3.893595,1.641351,-4.073985,7.351041,0.277348,3.169405,1.772519,-1.548425,-3.901118,...,-2.928375,-3.055444,3.708488,-0.407081,2.588052,-0.101883,-0.059166,-0.397024,-0.894873,4.231138
3,kami,4.253416,-1.777752,-4.561904,1.851718,0.782603,2.520426,0.912112,-1.305857,-2.840159,...,-2.66383,-8.41574,3.040092,1.991707,1.335176,3.994257,4.25812,-0.209764,1.378407,0.395307
4,untuk,-3.49249,3.062385,1.732885,6.49492,1.681367,1.559415,-0.851074,-0.963648,-1.843624,...,-4.2813,-2.645144,-0.548235,-1.230889,-0.314143,0.879712,2.900071,0.65076,-5.74788,0.175559


In [29]:
def encode_word_to_idx(x, word_to_id):
    try:
        return word_to_id[x]
    except Exception:
        # way1: take all the word that do not exist in word space as another word
        return len(word_to_id)+1 
# preprocessing
all_df['clean_tokens'] = all_df.tokens.apply(lambda x: clean_name_for_word_embedding(x) if type(x)==str else x)
all_df['clean_tokens'] = all_df.clean_tokens.apply(lambda x: x.lower() if type(x)==str else x)
all_df['word_id'] = all_df.clean_tokens.apply( lambda x: encode_word_to_idx(x, word_to_id))
#
item_dict = {}
for i, i_n in enumerate(all_df.item_name.unique().tolist()):
    item_dict[i_n] = i+1
all_df['item_id'] = [item_dict[i] for i in all_df.item_name.tolist()]

# example

In [37]:
all_df[all_df.item_name.str.contains('NATURE REPUBLIC REAL SHEET MASK')]

Unnamed: 0,item_name,tokens,is_brand,eval_set,clean_tokens,word_id,item_id
30394,NATURE REPUBLIC REAL SHEET MASK,NATURE,2,test,nature,4682,92823
30395,NATURE REPUBLIC REAL SHEET MASK,REPUBLIC,1,test,republic,11412,92823
30396,NATURE REPUBLIC REAL SHEET MASK,REAL,0,test,real,653,92823
30397,NATURE REPUBLIC REAL SHEET MASK,SHEET,0,test,sheet,5776,92823
30398,NATURE REPUBLIC REAL SHEET MASK,MASK,0,test,mask,2290,92823
30399,NATURE REPUBLIC REAL SHEET MASK 23gr,NATURE,2,test,nature,4682,92824
30400,NATURE REPUBLIC REAL SHEET MASK 23gr,REPUBLIC,1,test,republic,11412,92824
30401,NATURE REPUBLIC REAL SHEET MASK 23gr,REAL,0,test,real,653,92824
30402,NATURE REPUBLIC REAL SHEET MASK 23gr,SHEET,0,test,sheet,5776,92824
30403,NATURE REPUBLIC REAL SHEET MASK 23gr,MASK,0,test,mask,2290,92824


# prepare_training_data

# output:
# -item_id: for creating own item_embedding
# -word_id: for matching word_embedding
# -final_states:

In [53]:


def pad_1d(array, max_len):
    array = array[:max_len]
    length = len(array)
    padded = array + [0]*(max_len - len(array))
    return padded, length

#-------------------
# setting
#-------------------
TRUNCATED = False
num_sentences = all_df['item_name'].nunique()
seq_len_distribution = all_df.head(100).groupby('item_name').tokens.apply( lambda x : len(x.tolist())).to_frame('seq_len').reset_index()

if TRUNCATED == False:
    max_seq_length = seq_len_distribution.seq_len.max()
else:
    max_seq_length = 100
#-------------------
# output
#-------------------
print ('number of sequences : {}'.format(num_sentences))
# 1-D
eval_set = np.zeros(shape=[num_sentences], dtype='S5')
item_id = np.zeros(shape=[num_sentences], dtype=np.int32)
history_length = np.zeros(shape=[num_sentences], dtype=np.int8)
# 2-D
word_id = np.zeros(shape=[num_sentences, max_seq_length], dtype=np.int32)
label = np.zeros(shape=[num_sentences, max_seq_length], dtype=np.int8)
i = 0
for ix, df in tqdm(all_df.head(50).groupby('item_name')):
    # 1-d
    eval_set[i] = df['eval_set'].iloc[0]
    item_id[i] = df['item_id'].iloc[0]
    # 2-d
    word_id[i, :], history_length[i] = pad_1d(list(map(int, df['word_id'])), 
                                              max_len = max_seq_length)
    label[i, :], _ = pad_1d(list(map(int, df['is_brand'])), 
                                              max_len = max_seq_length)
    i += 1





  0%|          | 0/6 [00:00<?, ?it/s][A[A[A[A



100%|██████████| 6/6 [00:00<00:00, 1199.23it/s][A[A[A[A

number of sequences : 93909


In [55]:
word_id.shape

(93909, 14)

In [56]:
label.shape

(93909, 14)

# save

In [57]:
np.save('../brand_detector/data/preprocessed/eval_set.npy', eval_set)
np.save('../brand_detector/data/preprocessed/item_id.npy', item_id)
np.save('../brand_detector/data/preprocessed/word_id.npy', word_id)
np.save('../brand_detector/data/preprocessed/history_length.npy', history_length)
np.save('../brand_detector/data/preprocessed/label.npy', label)
