In [24]:
#! /usr/bin/env python3
"""
Created on Oct 2 2018

Prepare data for the following tensorflow model.

Noticed:
    It may take around 13.935157557328543 mins, the real number depends on ur machine.

@author: Ray

TO do list in the future:
    - character embedding
    
"""
import os
import time
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import gc
sys.path.append('/home/ld-sgdev/yunrui_li/ner_project/brand_recognition_bio_FE/preprocessing')
sys.path.append('/home/ld-sgdev/yunrui_li/ner_project/brand_recognition_bio_FE/py_model')
from clean_helpers import clean_name_for_word_embedding
from utils import init_logging
sys.path.append('../models/')
from data_utils import get_glove_vocab
from data_utils import write_vocab
from data_utils import load_vocab_and_return_word_to_id_dict
from data_utils import export_glove_vectors
import logging
import gc

def pad_1d(array, max_len, word_padding = True):
    if word_padding == True:
        array = array[:max_len]
        length = len(array)
        padded = array + [9858]*(max_len - len(array)) # padded index of unknown.
    else:
        array = array[:max_len]
        length = len(array)
        padded = array + [0]*(max_len - len(array)) # padded with zero.
    return padded, length

def encode_word_to_idx(word, word_to_id, vocabulary_set, lowercase = True, allow_unknown = True):
    '''encode a word (string) into id'''

    # 1. preprocess word
    if lowercase:
        word = word.lower()
    if word.isdigit():
        word = NUM

    # 2. get id of word
    if word in vocabulary_set:
        return word_to_id[word]
    else:
        if allow_unknown:
            return word_to_id[UNK]
        else:
            raise Exception("Unknow key is not allowed. Check that your vocab (tags?) is correct")

#--------------------
# setting
#--------------------
TRACE_CODE = True # for tracing funtionality and developing quickly
TRUNCATED = False # for reducing memory 
LOWERCASE = True
ALLOW_UNKNOWN = True
dim_word = 300
UNK = "$UNK$" # for the word in our own courpus which is unknown in embedding
NUM = "$NUM$" # for the word which is number

base_dir = "../models/data/wordvec"
filename_words_voc = "../models/data/wordvec/words_vocab.txt"
pre_trained_word_embedding_path = "/data/ID_largewv_300_2.txt"
filename_words_vec = "../models/data/wordvec/word2vec.npz".format(dim_word)
log_dir = 'log/' # log path
init_logging(log_dir)

In [25]:
df = pd.read_csv('../data/processed/mobile_training_v2.csv')

In [26]:
df.shape

(2107688, 4)

In [27]:
df.head()

Unnamed: 0,item_name,tokens,label,eval_set
0,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Samsung,2,test
1,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Galaxy,0,test
2,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,J1,0,test
3,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Ace,0,test
4,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,2016,0,test


In [28]:
# preprocessing
df['clean_tokens'] = df.tokens.apply(lambda x: clean_name_for_word_embedding(x) if type(x)==str else x)
if LOWERCASE:
    df['clean_tokens'] = df.clean_tokens.apply(lambda x: x.lower() if type(x)==str else x)
df['clean_tokens'] = df.clean_tokens.astype(str)


In [29]:
import time
# item_id
item_dict = {}
for i, i_n in enumerate(df.item_name.unique().tolist()):
    item_dict[i_n] = i+1
df['item_id'] = [item_dict[i_n] for i_n in df.item_name.tolist()]

#e = time.time()
#logging.info('it spend {} mins on preprocessing'.format( (e-s) / 60.0))

#-------------------
# word embedding 
#-------------------
s = time.time()
# Build Word vocab (vocabulary set) for our customized task
vocab_glove = get_glove_vocab(pre_trained_word_embedding_path) # word set from pre-trained word_embedding
vocab_words = set(df.clean_tokens.tolist()) # word set from our own whole corpurs including train, dev, and test
vocab_set = vocab_words & vocab_glove # 這裡面的字, 肯定都有相對應的vector, 不是zero vector(The reason we did是可以節省我們使用的embbedding大小, 不用沒用到的pre-trained字也佔memory)
vocab_set.add(UNK)
vocab_set.add(NUM)

# Save vocab
write_vocab(vocab_set, base_dir, filename_words_voc)
# create dictionary mapping word to index
word_to_id_dict = load_vocab_and_return_word_to_id_dict(filename_words_voc)
# save word embedding matrix 
export_glove_vectors(word_to_id_dict, glove_filename = pre_trained_word_embedding_path,
                     output_filename = filename_words_vec, dim = dim_word)
# encode a word (string) into id
df['word_id'] = df.clean_tokens.apply( lambda x: encode_word_to_idx(x, word_to_id_dict, vocab_set, LOWERCASE, ALLOW_UNKNOWN))

#e = time.time()
#logging.info('it spend {} mins on word embedding '.format( (e-s) / 60.0)) 


Building vocab...
- done. 6629250 tokens
Writing vocab...
- done. 27621 tokens


In [109]:
df.eval_set.unique()

array(['test', 'train', 'val'], dtype=object)

In [76]:
df[df.eval_set == 'train'].item_name.nunique()

200428

In [92]:
val_item_name = set(pd.Series(df[df.eval_set == 'train'].item_name.unique()).sample(frac = 0.1).unique())
val_item_name

{'hp1701-xiaomi redmi 4x pro prime 4gb ram 64gb rom 4 x mobile phone snapdragon 435 octa core',
 'nokia 105 dual sim 2017 resmi',
 'huawei p10 ram 4gb rom 64gb black gold blue',
 'sony xperia z4z3',
 'housing tab apple ipad 2 32gb original',
 'evercoss extream 1 plus',
 'flanagan 022 mm 9 h kaca melunakkan for xiaomi redmi 4 4x 4a layar pelindung film 5a not',
 'xiaomi mi max 3 ram 4gb rom 64gb garansi distributor',
 'promo xiaomi redmi 6a 216 garansi distributor abu tua',
 'promo conveter jack audio handsfree headset earphone apple iphone 7 limited',
 'nokia 216 new dualsim dual camera mp3 radio garansi resmi biru muda',
 'cuci gudang kingkong xiaomi redmi note 4x full layar black tempered glass original',
 'cuci gudang hdc s7 edge ultra ram 2gb 55 replika supercopy kingcopy',
 'xiaomi redmi note 5 pro 4gb 64gb black garansi distributor 1 tahun gold',
 'vivo v9 ram 4gb64gb smartphone black',
 'kaca lcd samsung n910 galany note 4 original king',
 'xiaomie redmi 5a gray',
 'blackberry z

In [107]:
for ix, row in df[df.eval_set == 'train'].iterrows():
    if row['item_name'] in val_item_name:
        #print (df.iloc[ix].eval_set)
        print (row['item_name'])
        df['eval_set'].iloc[ix] = 'val'

hp xiaomi c redmi note 3 pro 16gb ram 2gb garansi 1 tahun gs
baru samsung j6 case 3d motif karakter 2018


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [108]:
df[df.item_name == 'hp xiaomi c redmi note 3 pro 16gb ram 2gb garansi 1 tahun gs']

Unnamed: 0,item_name,tokens,label,eval_set,clean_tokens,item_id,word_id
813077,hp xiaomi c redmi note 3 pro 16gb ram 2gb gara...,hp,0,train,hp,80022,4715
813078,hp xiaomi c redmi note 3 pro 16gb ram 2gb gara...,xiaomi,2,train,xiaomi,80022,14132
813079,hp xiaomi c redmi note 3 pro 16gb ram 2gb gara...,c,0,train,c,80022,1509
813080,hp xiaomi c redmi note 3 pro 16gb ram 2gb gara...,redmi,0,train,redmi,80022,7033
813081,hp xiaomi c redmi note 3 pro 16gb ram 2gb gara...,note,0,val,note,80022,3504
813082,hp xiaomi c redmi note 3 pro 16gb ram 2gb gara...,3,0,train,3,80022,3249
813083,hp xiaomi c redmi note 3 pro 16gb ram 2gb gara...,pro,0,train,pro,80022,15082
813084,hp xiaomi c redmi note 3 pro 16gb ram 2gb gara...,16gb,0,train,16gb,80022,3044
813085,hp xiaomi c redmi note 3 pro 16gb ram 2gb gara...,ram,0,train,ram,80022,7908
813086,hp xiaomi c redmi note 3 pro 16gb ram 2gb gara...,2gb,0,train,2gb,80022,5340


In [31]:
num_sentences = df['item_name'].nunique()
seq_len_distribution = df.groupby('item_name').tokens.apply( lambda x : len(x.tolist())).to_frame('seq_len').reset_index()


In [33]:
seq_len_distribution.seq_len.max()


36

In [36]:
#seq_len_distribution[seq_len_distribution.seq_len == 36]

In [35]:
#df[df.item_name.str.contains('Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra')]

Unnamed: 0,item_name,tokens,label,eval_set,clean_tokens,item_id,word_id
34065,Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra...,Asus,2,test,asus,2900,922
34066,Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra...,Zenfone,0,test,zenfone,2900,22763
34067,Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra...,5Z,0,test,5z,2900,11970
34068,Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra...,ZS620KL,0,test,zs620kl,2900,24394
34069,Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra...,Ram,0,test,ram,2900,7908
34070,Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra...,6GB128GB,0,test,6gb128gb,2900,8611
34071,Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra...,-,0,test,,2900,24394
34072,Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra...,Snapdragon,0,test,snapdragon,2900,6118
34073,Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra...,845,0,test,845,2900,3249
34074,Asus Zenfone 5Z ZS620KL Ram 6GB128GB - Snapdra...,-,0,test,,2900,24394


In [49]:
from sklearn.model_selection import GroupShuffleSplit

In [51]:
gsplit = GroupShuffleSplit(n_splits = 1, random_state = 3, train_size = 0.9, test_size = 0.1)

In [52]:
help(gsplit)

Help on GroupShuffleSplit in module sklearn.model_selection._split object:

class GroupShuffleSplit(ShuffleSplit)
 |  Shuffle-Group(s)-Out cross-validation iterator
 |  
 |  Provides randomized train/test indices to split data according to a
 |  third-party provided group. This group information can be used to encode
 |  arbitrary domain specific stratifications of the samples as integers.
 |  
 |  For instance the groups could be the year of collection of the samples
 |  and thus allow for cross-validation against time-based splits.
 |  
 |  The difference between LeavePGroupsOut and GroupShuffleSplit is that
 |  the former generates splits using all subsets of size ``p`` unique groups,
 |  whereas GroupShuffleSplit generates a user-determined number of random
 |  test splits, each with a user-determined fraction of unique groups.
 |  
 |  For example, a less computationally intensive alternative to
 |  ``LeavePGroupsOut(p=10)`` would be
 |  ``GroupShuffleSplit(test_size=10, n_splits=

In [63]:
x = [1,1,2,3,4,2]
g = [1,2]

In [64]:
for train_idx, test_idx in gsplit.split(x,g):
    print ('train_idx',train_idx)

ValueError: Found input variables with inconsistent numbers of samples: [6, 2]