In [1]:
#! /usr/bin/env python3
"""
Created on Oct 2 2018

Updated on Oct 24 2018

Prepare data for the following tensorflow model.

Noticed:
    It may take around 13.935157557328543 mins, the real number depends on ur machine.

@author: Ray


Reference:
	- LSTM character embedding : https://github.com/cristianoBY/Sequence-Tagging-NER-TensorFlow:
    - character embedding: https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html
"""
import os
import time
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import gc
sys.path.append('/home/ld-sgdev/yunrui_li/ner_project/brand_recognition_bio_FE/preprocessing')
sys.path.append('/home/ld-sgdev/yunrui_li/ner_project/brand_recognition_bio_FE/py_model')
sys.path.append('../models/')
from clean_helpers import clean_name_for_word_embedding
from utils import init_logging
from data_utils import get_glove_vocab
from data_utils import write_vocab
from data_utils import load_vocab_and_return_word_to_id_dict
from data_utils import export_glove_vectors
from data_utils import get_char_vocab
import logging
import gc

pd.options.display.max_columns = 100
pd.options.display.max_rows = 5000
pd.options.display.max_colwidth = 1000


def pad_1d(array, max_len, word_padding = True):
    if word_padding == True:
        array = array[:max_len]
        length = len(array)
        padded = array + [9858]*(max_len - len(array)) # padded index of unknown.
    else:
        array = array[:max_len]
        length = len(array)
        padded = array + [0]*(max_len - len(array)) # padded with zero.
    return padded, length

def encode_word_to_idx(word, word_to_id, vocabulary_set, lowercase = True, allow_unknown = True):
    '''encode a word (string) into id'''

    # 1. preprocess word
    if lowercase:
        word = word.lower()
    if word.isdigit():
        word = NUM

    # 2. get id of word
    if word in vocabulary_set:
        return word_to_id[word]
    else:
        if allow_unknown:
            return word_to_id[UNK]
        else:
            raise Exception("Unknow key is not allowed. Check that your vocab (tags?) is correct")

#--------------------
# setting
#--------------------
TRACE_CODE = False # for tracing funtionality and developing quickly
TRUNCATED = False # for reducing memory 
USE_CHARS = True # for character embedding
LOWERCASE = True
ALLOW_UNKNOWN = True
dim_word = 300
UNK = "$UNK$" # for the word in our own courpus which is unknown in embedding
NUM = "$NUM$" # for the word which is number


pre_trained_word_embedding_path = "/data/ID_largewv_300_2.txt"
filename_words_vec = "../models/data/wordvec/word2vec.npz".format(dim_word)
log_dir = 'log/' # log path
init_logging(log_dir)



In [2]:
[i[:-13] for i in os.listdir('../data/processed/') if 'training.' in i ][:]

['lips', 'face', 'mobile', 'dress', 'women_top']

In [6]:
#-------------------
# loading data
#-------------------

for category in [i[:-13] for i in os.listdir('../data/processed/') if 'training.' in i ][:]:
    print ('-----------category-----------', category)
    # setting
    base_dir = "../models/data/wordvec/{}".format(category)
    filename_words_voc = "../models/data/wordvec/{}/words_vocab.txt".format(category)
    filename_chars_voc = "../models/data/wordvec/{}/chars_vocab.txt".format(category)
    # loading data
    date_path = '../data/processed/{}_training.csv'.format(category)
    df = pd.read_csv(date_path)
    # analyze
    sku_df = df.groupby('item_name').eval_set.value_counts().to_frame('counts').reset_index()
    print ('# sku : {}'.format(len(sku_df)))
    print ('# validation ratio : {}'.format( sku_df[sku_df.eval_set != 'test'].eval_set.value_counts(normalize=True)))
    print ('distribution of data', sku_df.eval_set.value_counts())
    del sku_df
    # preprocessing
    s = time.time()
    
    df.dropna(subset = ['tokens'], inplace = True)
    
    df['clean_tokens'] = df.tokens.apply(lambda x: clean_name_for_word_embedding(x) if type(x)==str else x)
    if LOWERCASE:
        df['clean_tokens'] = df.clean_tokens.apply(lambda x: x.lower() if type(x)==str else x)
    df['clean_tokens'] = df.clean_tokens.astype(str)

    # item_id
    item_dict = {}
    for i, i_n in enumerate(df.item_name.unique().tolist()):
        item_dict[i_n] = i+1
    df['item_id'] = [item_dict[i_n] for i_n in df.item_name.tolist()]

    e = time.time()
    logging.info('it spend {} mins on preprocessing'.format( (e-s) / 60.0))
    #-------------------
    # word embedding 
    #-------------------
    s = time.time()
    # Build Word vocab (vocabulary set) for our customized task
    vocab_glove = get_glove_vocab(pre_trained_word_embedding_path) # word set from pre-trained word_embedding
    vocab_words = set(df.clean_tokens.tolist()) # word set from our own whole corpurs including train, dev, and test
    vocab_set = vocab_words & vocab_glove # 這裡面的字, 肯定都有相對應的vector, 不是zero vector(The reason we did是可以節省我們使用的embbedding大小, 不用沒用到的pre-trained字也佔memory)
    vocab_set.add(UNK)
    vocab_set.add(NUM)

    # Save vocab
    write_vocab(vocab_set, base_dir, filename_words_voc)
    # create dictionary mapping word to index
    word_to_id_dict = load_vocab_and_return_word_to_id_dict(filename_words_voc)
    # save word embedding matrix 
    export_glove_vectors(word_to_id_dict, glove_filename = pre_trained_word_embedding_path,
                         output_filename = filename_words_vec, dim = dim_word)
    # encode a word (string) into id
    df['word_id'] = df.clean_tokens.apply( lambda x: encode_word_to_idx(x, word_to_id_dict, vocab_set, LOWERCASE, ALLOW_UNKNOWN))

    e = time.time()
    logging.info('it spend {} mins on word embedding '.format( (e-s) / 60.0)) # it spend 16.531146574020386 mins on word embedding over 37788 words in volcabulary.

    #-------------------
    # chracter embedding 
    #-------------------
    if USE_CHARS == True:
        dim_char = 100
        # Build Char vocab (vocabulary set) 
        
        vocab_chars = get_char_vocab([w for w in df.tokens if type(w)!= float])
        # Save Char vocab
        write_vocab(vocab_chars, base_dir, filename_chars_voc)
        # create dictionary mapping char to index
        chars_to_id_dict = load_vocab_and_return_word_to_id_dict(filename_chars_voc)
        # get max_word_length for padding later
        word_len_distribution = [len(w) for w in df.tokens if type(w)!= float]
        max_word_length = max(word_len_distribution)
    
    # max_seq_length
    seq_len_distribution = df.groupby('item_name').tokens.apply( lambda x : len(x.tolist())).to_frame('seq_len').reset_index()

    if TRUNCATED == False:
        if TRACE_CODE == True:
            max_seq_length = 122
        else:
            max_seq_length = seq_len_distribution.seq_len.max()
    else:
        max_seq_length = 100

    logging.info('max_seq_length : {}'.format( max_seq_length)) # max length of sentence
    logging.info('max_word_length : {}'.format( max_word_length)) # max length of word

    #-------------------
    # output
    #-------------------

    for i in range(3):
        # 
        if i == 0:
            name = 'train'
            output = df[df.eval_set == 'train']
        elif i == 1:
            name = 'val'
            output = df[df.eval_set == 'val']
        else:
            name = 'test'
            output = df.copy()
        # setting
        num_sentences = output['item_name'].nunique()
        logging.info('number of sequences : {}'.format(num_sentences))
        # 1-D
        eval_set = np.zeros(shape=[num_sentences], dtype='S5')
        item_id = np.zeros(shape=[num_sentences], dtype=np.int32) # for recording
        history_length = np.zeros(shape=[num_sentences], dtype=np.int32) # length of sentence
        # 2-D
        word_id = np.zeros(shape=[num_sentences, max_seq_length], dtype=np.int32)
        label = np.zeros(shape=[num_sentences, max_seq_length], dtype=np.int32)
        word_length = np.zeros(shape=[num_sentences, max_seq_length], dtype=np.int32) # length of words 
        # 3-D
        char_id = np.zeros(shape=[num_sentences, max_seq_length, max_word_length], dtype=np.int32)
        i = 0
        for ix, df_ in tqdm(output.groupby('item_name')):
            #logging.info('item_id : {}'.format(i))
            # 1-D
            eval_set[i] = df_['eval_set'].iloc[0]
            item_id[i] = df_['item_id'].iloc[0]
            # 2-D
            word_id[i, :], history_length[i] = pad_1d(list(map(int, df_['word_id'])), max_len = max_seq_length, word_padding = True)
            label[i, :], _ = pad_1d(list(map(int, df_['label'])), max_len = max_seq_length, word_padding = False)
            word_length[i, :], _ = pad_1d([len([char for char in w]) for w in df_['tokens'].tolist()], max_len = max_seq_length, word_padding = False)
            if USE_CHARS == True:
                # 3-D
                for i_word_axis, w in enumerate(df_['tokens'].tolist()):
                    char_id[i, i_word_axis, : ], _ = pad_1d([chars_to_id_dict[char] for char in w], max_len = max_word_length, word_padding = False)

            i += 1

        #--------------------------
        # save
        #--------------------------
        base_path = '/data/ner_task/data_for_brand_detection_model/{}/{}'.format(category, name)
        if not os.path.isdir(base_path):
            os.makedirs(base_path)    

        if TRACE_CODE == True:
            np.save(os.path.join(base_path, 'eval_set_0.npy'), eval_set)
            np.save(os.path.join(base_path, 'word_id_0.npy'), word_id)
            np.save(os.path.join(base_path, 'history_length_0.npy'), history_length)
            np.save(os.path.join(base_path, 'label_0.npy'), label)
            np.save(os.path.join(base_path, 'item_id_0.npy'), item_id)
            if USE_CHARS == True:
                np.save(os.path.join(base_path, 'char_id_0.npy'), char_id)
                np.save(os.path.join(base_path, 'word_length_0.npy'), word_length)
        else:
            np.save(os.path.join(base_path, 'eval_set.npy'), eval_set)
            np.save(os.path.join(base_path, 'word_id.npy'), word_id)
            np.save(os.path.join(base_path, 'history_length.npy'), history_length)
            np.save(os.path.join(base_path, 'label.npy'), label)
            np.save(os.path.join(base_path, 'item_id.npy'), item_id)
            if USE_CHARS == True:
                np.save(os.path.join(base_path, 'char_id.npy'), char_id)
                np.save(os.path.join(base_path, 'word_length.npy'), word_length)


    logging.info('shape of df : {}'.format(df.shape))
    save_path = '../data/processed/{}_w_word_id.csv'.format(category)
    print ('save_path', save_path)
    df.to_csv(save_path, index = False)



-----------category----------- lips
# sku : 44031
# validation ratio : train    0.899918
val      0.100082
Name: eval_set, dtype: float64
distribution of data train    31660
test      8850
val       3521
Name: eval_set, dtype: int64


it spend 0.04522428512573242 mins on preprocessing


Building vocab...
- done. 6629250 tokens
Writing vocab...
- done. 7889 tokens


it spend 13.189097519715627 mins on word embedding 


Writing vocab...
- done. 40 tokens


max_seq_length : 27
max_word_length : 71
number of sequences : 31660
100%|██████████| 31660/31660 [00:21<00:00, 1478.65it/s]
number of sequences : 3521
100%|██████████| 3521/3521 [00:02<00:00, 1532.89it/s]
number of sequences : 44029
100%|██████████| 44029/44029 [00:28<00:00, 1543.60it/s]
shape of df : (347497, 7)


save_path ../data/processed/lips_w_word_id.csv
-----------category----------- face
# sku : 73826
# validation ratio : train    0.899877
val      0.100123
Name: eval_set, dtype: float64
distribution of data train    52848
test     15098
val       5880
Name: eval_set, dtype: int64


it spend 0.07405883073806763 mins on preprocessing


Building vocab...
- done. 6629250 tokens
Writing vocab...
- done. 9408 tokens


it spend 13.0612610856692 mins on word embedding 


Writing vocab...
- done. 41 tokens


max_seq_length : 28
max_word_length : 48
number of sequences : 52848
100%|██████████| 52848/52848 [00:35<00:00, 1495.59it/s]
number of sequences : 5880
100%|██████████| 5880/5880 [00:03<00:00, 1474.54it/s]
number of sequences : 73824
100%|██████████| 73824/73824 [00:48<00:00, 1516.08it/s]
shape of df : (642031, 7)


save_path ../data/processed/face_w_word_id.csv
-----------category----------- mobile
# sku : 186349
# validation ratio : train    0.899843
val      0.100157
Name: eval_set, dtype: float64
distribution of data train    143740
test      26610
val       15999
Name: eval_set, dtype: int64


it spend 0.21230234305063883 mins on preprocessing


Building vocab...
- done. 6629250 tokens
Writing vocab...
- done. 24106 tokens


it spend 12.600360731283823 mins on word embedding 


Writing vocab...
- done. 41 tokens


max_seq_length : 32
max_word_length : 54
number of sequences : 143740
100%|██████████| 143740/143740 [01:33<00:00, 1545.32it/s]
number of sequences : 15999
100%|██████████| 15999/15999 [00:11<00:00, 1425.68it/s]
number of sequences : 186344
100%|██████████| 186344/186344 [01:59<00:00, 1559.49it/s]
shape of df : (1844485, 7)


save_path ../data/processed/mobile_w_word_id.csv
-----------category----------- dress
# sku : 3845
# validation ratio : train    0.899142
val      0.100858
Name: eval_set, dtype: float64
distribution of data train    3040
test      464
val       341
Name: eval_set, dtype: int64


it spend 0.005031935373942057 mins on preprocessing


Building vocab...
- done. 6629250 tokens
Writing vocab...
- done. 2023 tokens


it spend 12.721617607275645 mins on word embedding 


Writing vocab...
- done. 39 tokens


max_seq_length : 19
max_word_length : 78
number of sequences : 3040
100%|██████████| 3040/3040 [00:02<00:00, 1395.53it/s]
number of sequences : 341
100%|██████████| 341/341 [00:00<00:00, 1339.01it/s]
number of sequences : 3845
100%|██████████| 3845/3845 [00:02<00:00, 1476.05it/s]
shape of df : (42558, 7)


save_path ../data/processed/dress_w_word_id.csv
-----------category----------- women_top
# sku : 7807
# validation ratio : train    0.900231
val      0.099769
Name: eval_set, dtype: float64
distribution of data train    5838
test     1322
val       647
Name: eval_set, dtype: int64


it spend 0.008811986446380616 mins on preprocessing


Building vocab...
- done. 6629250 tokens
Writing vocab...
- done. 4276 tokens


it spend 12.774761120478312 mins on word embedding 


Writing vocab...
- done. 39 tokens


max_seq_length : 20
max_word_length : 28
number of sequences : 5838
100%|██████████| 5838/5838 [00:03<00:00, 1508.23it/s]
number of sequences : 647
100%|██████████| 647/647 [00:00<00:00, 1393.20it/s]
number of sequences : 7807
100%|██████████| 7807/7807 [00:05<00:00, 1467.75it/s]
shape of df : (78717, 7)


save_path ../data/processed/women_top_w_word_id.csv


In [None]:
# s = [w for w in df.tokens if type(w) == float]
# df[df.tokens.isin(s)]

In [1]:
dict_category = {
    'lips':{'max_seq_length':27,'max_word_length':71},
    'face':{'max_seq_length':28,'max_word_length':48},
    'mobile':{'max_seq_length':32,'max_word_length':54},
    'face':{'max_seq_length':28,'max_word_length':48},
    'face':{'max_seq_length':28,'max_word_length':48},

}
dict_category

{'lips': {'max_seq_length': 27, 'max_word_length': 71}}