In [1]:
#! /usr/bin/env python3
"""
Created on Oct 2 2018

Updated on Oct 24 2018

Prepare data for the following tensorflow model.

Note:
    It may take around 13.9 mins; the actual time depends on your machine.

@author: Ray

To-do list for the future:
    - character embedding
        - padding for char_ids
        - tf.placeholder for char_ids (tf.int32, shape=[None, None, None],
                        name="char_ids")
Reference:
	- character embedding:https://github.com/cristianoBY/Sequence-Tagging-NER-TensorFlow
"""
import os
import time
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import gc
sys.path.append('/home/ld-sgdev/yunrui_li/ner_project/brand_recognition_bio_FE/preprocessing')
sys.path.append('/home/ld-sgdev/yunrui_li/ner_project/brand_recognition_bio_FE/py_model')
sys.path.append('../models/')
from clean_helpers import clean_name_for_word_embedding
from utils import init_logging
from data_utils import get_glove_vocab
from data_utils import write_vocab
from data_utils import load_vocab_and_return_word_to_id_dict
from data_utils import export_glove_vectors
import logging
import gc

# Widen the pandas display limits so wide / long frames are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 5000)
pd.set_option("display.max_colwidth", 1000)


def pad_1d(array, max_len, word_padding = True, pad_value = None):
    """Truncate or right-pad a 1-D sequence so it has exactly ``max_len`` items.

    Args:
        array: sequence of ids (list, tuple, numpy array, ...). It is not modified.
        max_len: number of items in the returned list.
        word_padding: selects the default filler when ``pad_value`` is not given:
            True pads with 9858 (the index the original code used for the
            unknown word), False pads with 0.
        pad_value: optional explicit filler. When given it overrides
            ``word_padding``. NOTE(review): 9858 is a hard-coded vocabulary
            index; presumably it is only valid for one particular vocabulary
            file, so callers should pass the real id (e.g. the id of UNK) here.

    Returns:
        (padded, length): ``padded`` is a list of ``max_len`` items and
        ``length`` is the number of original items kept (at most ``max_len``).
    """
    if pad_value is None:
        pad_value = 9858 if word_padding else 0
    # list(...) makes the concatenation below safe for tuples / numpy arrays,
    # where ``+`` would raise or perform element-wise addition instead.
    kept = list(array[:max_len])
    length = len(kept)
    padded = kept + [pad_value] * (max_len - length)
    return padded, length

def encode_word_to_idx(word, word_to_id, vocabulary_set, lowercase = True, allow_unknown = True):
    """Translate one token (string) into its integer id.

    The token is optionally lower-cased, and a token made only of digits is
    replaced by the module-level NUM symbol before the lookup. A token found
    in ``vocabulary_set`` returns ``word_to_id[token]``. Otherwise the id of
    the module-level UNK symbol is returned when ``allow_unknown`` is true,
    and an Exception is raised when it is false.
    """
    # 1. normalise the token
    token = word.lower() if lowercase else word
    token = NUM if token.isdigit() else token

    # 2. known token: direct lookup
    if token in vocabulary_set:
        return word_to_id[token]

    # 3. unknown token: refuse, or fall back to the UNK id
    if not allow_unknown:
        raise Exception("Unknow key is not allowed. Check that your vocab (tags?) is correct")
    return word_to_id[UNK]

#--------------------
# settings
#--------------------
TRACE_CODE = False # True: read only the first rows of the data, for tracing functionality and developing quickly
TRUNCATED = False # True: use a fixed maximum sequence length, for reducing memory
LOWERCASE = True # lowercase the cleaned tokens before the vocabulary lookup
ALLOW_UNKNOWN = True # map out-of-vocabulary words to UNK instead of raising
dim_word = 300 # passed as `dim` to export_glove_vectors
UNK = "$UNK$" # for words in our own corpus that are unknown to the embedding
NUM = "$NUM$" # for words that are numbers

base_dir = "../models/data/wordvec"
filename_words_voc = "../models/data/wordvec/words_vocab.txt"
filename_chars_voc = "../models/data/wordvec/chars_vocab.txt"

pre_trained_word_embedding_path = "/data/ID_largewv_300_2.txt"
# No str.format() here: the literal has no placeholder, so dim_word is not
# part of the file name.
filename_words_vec = "../models/data/wordvec/word2vec.npz"
log_dir = 'log/' # log path
init_logging(log_dir)


#-------------------
# loading data
#-------------------
# In trace mode only the first 19 rows are read; nrows=None reads the whole file.
df = pd.read_csv('../data/processed/mobile_training_v2.csv',
                 nrows = 19 if TRACE_CODE else None)

s = time.time()

# preprocessing: string tokens are cleaned (and optionally lowercased); any
# non-string value is passed through unchanged and stringified by astype(str).
df['clean_tokens'] = df.tokens.apply(lambda x: clean_name_for_word_embedding(x) if isinstance(x, str) else x)
if LOWERCASE:
    df['clean_tokens'] = df.clean_tokens.apply(lambda x: x.lower() if isinstance(x, str) else x)
df['clean_tokens'] = df.clean_tokens.astype(str)

# item_id: 1-based id per distinct item_name, in order of first appearance
item_dict = {i_n: i + 1 for i, i_n in enumerate(df.item_name.unique().tolist())}
df['item_id'] = [item_dict[i_n] for i_n in df.item_name.tolist()]

e = time.time()
logging.info('it spend {} mins on preprocessing'.format( (e-s) / 60.0))

#-------------------
# word embedding
#-------------------
s = time.time()
# Build the word vocab (vocabulary set) for our customized task
vocab_glove = get_glove_vocab(pre_trained_word_embedding_path) # word set from pre-trained word_embedding
vocab_words = set(df.clean_tokens.tolist()) # word set from our own whole corpus including train, dev, and test
# Keep only the words present in both sets: every word in here is guaranteed to
# have a corresponding pre-trained vector (not a zero vector). Restricting the
# vocabulary this way keeps the embedding we use small, so pre-trained words
# that we never use do not take up memory.
vocab_set = vocab_words & vocab_glove
# The two special symbols always get an id of their own.
vocab_set.add(UNK)
vocab_set.add(NUM)

# Save vocab
write_vocab(vocab_set, base_dir, filename_words_voc)
# create dictionary mapping word to index (read back from the file just written)
word_to_id_dict = load_vocab_and_return_word_to_id_dict(filename_words_voc)
# save word embedding matrix
export_glove_vectors(word_to_id_dict, glove_filename = pre_trained_word_embedding_path,
                     output_filename = filename_words_vec, dim = dim_word)
# encode a word (string) into id
df['word_id'] = df.clean_tokens.apply( lambda x: encode_word_to_idx(x, word_to_id_dict, vocab_set, LOWERCASE, ALLOW_UNKNOWN))

#-------------------
# character embedding (nothing is implemented in this section yet)
#-------------------

e = time.time()
logging.info('it spend {} mins on word embedding '.format( (e-s) / 60.0)) 




# Number of token rows per item_name, i.e. the length of every sequence.
seq_len_distribution = df.groupby('item_name').tokens.size().to_frame('seq_len').reset_index()

# Choose the padded sequence length:
#   - TRUNCATED: fixed cap of 100
#   - TRACE_CODE (and not truncated): fixed 122
#     NOTE(review): presumably the maximum observed on a full run; confirm.
#   - otherwise: the longest sequence actually present in the data
if TRUNCATED:
    max_seq_length = 100
elif TRACE_CODE:
    max_seq_length = 122
else:
    max_seq_length = seq_len_distribution.seq_len.max()

logging.info('max_seq_length : {}'.format( max_seq_length)) 


it spend 0.2774062951405843 mins on preprocessing


Building vocab...
- done. 6629250 tokens
Writing vocab...
- done. 27621 tokens


it spend 15.054399271806082 mins on word embedding 
max_seq_length : 36


In [2]:
df.clean_tokens.nunique()

37788

In [3]:
from data_utils import get_char_vocab


In [4]:
dim_char = 100 # NOTE(review): not used in the visible cells; presumably the character-embedding dimension, confirm
# Build the char vocab (vocabulary set) from the raw tokens (not the cleaned / lowercased ones)
vocab_chars = get_char_vocab(df.tokens)
# Save Char vocab
write_vocab(vocab_chars, base_dir, filename_chars_voc)
# create dictionary mapping char to index (read back from the file just written)
chars_to_id_dict = load_vocab_and_return_word_to_id_dict(filename_chars_voc)
# TODO: encode the chars of each word (string) into ids


Writing vocab...
- done. 77 tokens


In [5]:
df.head()

Unnamed: 0,item_name,tokens,label,eval_set,clean_tokens,item_id,word_id
0,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Samsung,2,test,samsung,1,16211
1,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Galaxy,0,test,galaxy,1,9619
2,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,J1,0,test,j1,1,7598
3,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Ace,0,test,ace,1,11594
4,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,2016,0,test,2016,1,3951


In [6]:
# w_ids = []

# for i_id, w_id, tok in zip(list(df.item_id), list(df.word_id), list(df.tokens)):
#     for t in 

SyntaxError: invalid syntax (<ipython-input-6-81f44ebcfca3>, line 4)

In [9]:
# Character count of every raw token, one entry per row of df.
word_len_distribution = list(map(len, df.tokens))
#word_len_distribution

In [10]:
# Scratch tensor of character ids, indexed as
# [sentence, word position in the sentence, character position in the word].
num_sentences, max_seq_length, max_word_length = 1, 36, 10
char_id = np.zeros((num_sentences, max_seq_length, max_word_length), dtype=np.int32)
# 3-D assign?

In [11]:
char_id.shape

(1, 36, 10)

In [12]:
#for

In [13]:
# assign on word-axis: fill all 10 character slots of word position 2 in
# sentence 0 with the values 0..9 (experiment with 3-D slice assignment)
char_id[0, 2,:] = np.arange(10)
# # assign on char-axis
# char_id[0, 0: 1] = np.arange(10)

In [None]:
char_id[0][2]

In [None]:
len('Blackview')

In [14]:
# Small working sample: every token row of the items with item_id 1, 2 and 3.
test = df.loc[df.item_id.isin([1, 2, 3])]
test

Unnamed: 0,item_name,tokens,label,eval_set,clean_tokens,item_id,word_id
0,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Samsung,2,test,samsung,1,16211
1,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Galaxy,0,test,galaxy,1,9619
2,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,J1,0,test,j1,1,7598
3,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Ace,0,test,ace,1,11594
4,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,2016,0,test,2016,1,3951
5,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,SM-J111F,0,test,smj111f,1,482
6,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,8GB,0,test,8gb,1,19027
7,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,-,0,test,,1,13519
8,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,White,0,test,white,1,6108
9,Blackview BV8000 Pro RAM 6GB 64GB IP68 Waterproff Rugged Smartphone,Blackview,2,test,blackview,2,27269


In [18]:
# One output row per item_name group, derived from the data instead of hard-coded.
num_sentences = test['item_name'].nunique()
max_word_length = 10 # need to be calculated!!
# 1-D (one value per sentence)
eval_set = np.zeros(shape=[num_sentences], dtype='S5')
item_id = np.zeros(shape=[num_sentences], dtype=np.int32) # for recording
history_length = np.zeros(shape=[num_sentences], dtype=np.int32) # number of real (unpadded) words
# 2-D (one value per word position)
word_id = np.zeros(shape=[num_sentences, max_seq_length], dtype=np.int32)
label = np.zeros(shape=[num_sentences, max_seq_length], dtype=np.int32)
word_length = np.zeros(shape=[num_sentences, max_seq_length], dtype=np.int32) # length of words
# 3-D (one value per character position of every word)
char_id = np.zeros(shape=[num_sentences, max_seq_length, max_word_length], dtype=np.int32)
for i, (ix, df_) in enumerate(tqdm(test.groupby('item_name'))):
    # 1-D
    eval_set[i] = df_['eval_set'].iloc[0]
    item_id[i] = df_['item_id'].iloc[0]
    # 2-D
    word_id[i, :], history_length[i] = pad_1d(list(map(int, df_['word_id'])), max_len = max_seq_length, word_padding = True)
    label[i, :], _ = pad_1d(list(map(int, df_['label'])), max_len = max_seq_length, word_padding = False)
    # Only the first max_seq_length tokens have a slot in the tensors; going
    # past that would index outside char_id.
    tokens_ = df_['tokens'].tolist()[:max_seq_length]
    # char_id keeps at most max_word_length characters per word, so the
    # recorded word length is clipped to the same bound to stay consistent.
    word_length[i, :], _ = pad_1d([min(len(w), max_word_length) for w in tokens_], max_len = max_seq_length, word_padding = False)
    # 3-D
    for i_word_axis, w in enumerate(tokens_):
        char_id[i, i_word_axis, : ], _ = pad_1d([chars_to_id_dict[char] for char in w], max_len = max_word_length, word_padding = False)


100%|██████████| 3/3 [00:00<00:00, 500.00it/s]


In [16]:
word_id.shape

(3, 36)

In [17]:
char_id.shape

(3, 36, 10)

In [19]:
item_id[0], item_id[1], item_id[2]

(2, 3, 1)

In [20]:
word_id[0], word_id[1], word_id[2]

(array([27269, 19235, 22956, 16279, 25336,  6729,  5608,  8436, 20950,
        10686,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858],
       dtype=int32),
 array([19580,  1726, 24053,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858],
       dtype=int32),
 array([16211,  9619,  7598, 11594,  3951,   482, 19027, 13519,  6108,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858],
       dtype=int32))

In [22]:
word_length.shape

(3, 36)

In [24]:
word_length[0]

array([ 9,  6,  3,  3,  3,  4,  4, 10,  6, 10,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0], dtype=int32)

In [None]:
# Debug walk-through of the character encoding, one token at a time.
# NOTE(review): df_ is whatever group the preceding groupby loop ended on,
# while the result is written into row 0 of char_id; confirm that both are
# meant to refer to the same sentence.
for i_word_axis, w in enumerate(df_['tokens'].tolist()):
    print ('w', w)
    print ('i_word_axis',i_word_axis)
    # look up the id of every character of the token ...
    charids = [chars_to_id_dict[char] for char in w]
    # ... then truncate / zero-pad to max_word_length (the returned length is ignored)
    charids, _ = pad_1d(charids, max_len = max_word_length, word_padding = False)
    char_id[0, i_word_axis, : ] = charids
    #for char_word_axis, char in w_token:
#     for char_word_axis, char in enumerate('w_token'):
#         #print ('char', char)
#         # convert char into charid ==> [c1_id,c2_id, ..]
#         # pad_1d(charid, max_len = max_word_length, word_padding = False)
#         char_id[0, i_word_axis, : ] = np.arange(10)

In [None]:
char_id[0]

In [None]:
# Debug walk-through: rebuild the character ids of sentence 0 from its word ids.
# Reverse lookup id -> word, needed to turn each word id back into a string.
id_to_word = {idx: word for word, idx in word_to_id_dict.items()}
for i_word_axis, w in enumerate(word_id[0]):
    print ('w_id', w)
    print ('i_word_axis',i_word_axis)
    # convert w_id into w_token (str)
    w_token = id_to_word[int(w)]
    # NOTE(review): chars_to_id_dict was built from the raw tokens, so a
    # character that only occurs in a vocabulary word (e.g. in the UNK / NUM
    # symbols) would raise KeyError here; confirm before relying on this cell.
    charids = [chars_to_id_dict[char] for char in w_token]
    # pad_1d returns (padded, length); only the padded ids go into the tensor.
    char_id[0, i_word_axis, : ], _ = pad_1d(charids, max_len = max_word_length, word_padding = False)

In [None]:
word_id

In [None]:
chars_to_id_dict['e']

In [None]:
c, _ = pad_1d([5,3,2], max_len = max_word_length, word_padding = False)
c