In [11]:
import time
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from billiard import Pool
import gc
sys.path.append('/home/ld-sgdev/yunrui_li/ner_project/brand_recognition_bio_FE/preprocessing')
sys.path.append('/home/ld-sgdev/yunrui_li/ner_project/brand_recognition_bio_FE/py_model')
from clean_helpers import clean_name_for_word_embedding
from utils import init_logging
import gc

def pad_1d(array, max_len):
    """Truncate or right-pad a sequence with zeros to ``max_len`` items.

    Args:
        array: Any sliceable sequence (list, tuple, 1-D numpy array, ...).
        max_len: Target length; expected to be non-negative.

    Returns:
        A ``(padded, length)`` tuple. ``padded`` is a new list holding the
        first ``max_len`` items of ``array`` followed by as many ``0`` fillers
        as needed; ``length`` is the number of original items that were kept.
    """
    # Copy the slice into a list so that `+` below always concatenates: on a
    # numpy array it would broadcast-add instead of padding, and on a tuple
    # it would raise TypeError.
    kept = list(array[:max_len])
    length = len(kept)
    return kept + [0] * (max_len - length), length

def encode_word_to_idx(x, word_to_id, vocabulary_set):
    """Map a token to its integer id, using the ``'None'`` entry as fallback.

    Args:
        x: Token to encode (must be hashable).
        word_to_id: Mapping from token to id; must contain the key ``'None'``.
        vocabulary_set: Set of known tokens used for the membership check.

    Returns:
        ``word_to_id[x]`` when ``x`` is in ``vocabulary_set``, otherwise
        ``word_to_id['None']``.
    """
    lookup_key = x if x in vocabulary_set else 'None'
    return word_to_id[lookup_key]

def parallelize_dataframe(df, func, num_partitions=10, num_cores=10):
    '''
    Speed up a DataFrame transformation by applying it to row chunks in parallel.

    Parameters
    ----------
    df : DataFrame to process; it is split row-wise with ``np.array_split``.
    func : callable taking one chunk and returning a pandas object that
        ``pd.concat`` can stack. It is dispatched through ``Pool.map``.
    num_partitions : number of chunks ``df`` is split into (default 10).
    num_cores : number of worker processes in the pool (default 10).

    Returns
    -------
    The concatenation of ``func(chunk)`` over all chunks, in chunk order.
    '''
    # Keep the chunks in a list rather than unpacking into a fixed number of
    # names, so the partition count is not hard-wired into the code.
    chunks = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    try:
        results = pool.map(func, chunks)
    finally:
        # Release the worker processes even when func raises in a worker.
        pool.close()
        pool.join()
    return pd.concat(results)

def speed_up_func_for_feature_engineering(df):
    """Add ``clean_tokens`` and ``word_id`` columns to ``df`` and return it.

    ``clean_tokens`` is ``tokens`` passed through
    ``clean_name_for_word_embedding`` and then lower-cased; values whose type
    is not exactly ``str`` skip both steps. ``word_id`` is the id of each
    cleaned token according to the module-level ``word_to_id`` and
    ``vocabulary_set``. The frame is modified in place.
    """
    def _clean(token):
        if type(token) == str:
            return clean_name_for_word_embedding(token)
        return token

    def _lower(token):
        if type(token) == str:
            return token.lower()
        return token

    def _to_id(token):
        return encode_word_to_idx(token, word_to_id, vocabulary_set)

    df['clean_tokens'] = df.tokens.apply(_clean)
    df['clean_tokens'] = df.clean_tokens.apply(_lower)
    df['word_id'] = df.clean_tokens.apply(_to_id)
    return df

In [12]:
#-------------------
# loading data
#-------------------
df = pd.read_csv('../data/processed/mobile_training.csv')

#-------------------
# setting
#-------------------
TRUNCATED = False
num_sentences = df['item_name'].nunique()
# Number of token rows per item_name.
seq_len_distribution = (
    df.groupby('item_name')
      .tokens
      .apply(len)
      .to_frame('seq_len')
      .reset_index()
)

# Without truncation use the longest observed sequence, otherwise cap at 100.
max_seq_length = seq_len_distribution.seq_len.max() if TRUNCATED == False else 100


#-------------------
# word embedding preparation
#-------------------
word_embedding_path = '/data/ID_large_wordvec_300_2.h5'
word2vec = pd.read_hdf(word_embedding_path)
# Word ids start at 1, following the row order of the embedding table.
word_to_id = {word: idx for idx, word in enumerate(word2vec.word.tolist(), start=1)}
# Extra id, one past the current size, for words missing from the vocabulary.
word_to_id['None'] = len(word_to_id) + 1
# Set of every known key, used for fast membership checks.
vocabulary_set = set(word_to_id)


In [12]:
# Time one single-process feature-engineering pass over the whole frame.
# The helper adds clean_tokens / word_id to df in place and returns it.
s = time.time()
df = speed_up_func_for_feature_engineering(df)
e = time.time()
print (e-s)

4.660710096359253


In [11]:
# Back-of-envelope linear extrapolation to the full frame: 0.1327 is presumably
# the seconds measured on a 5000-row sample — TODO confirm where it came from.
len(df) * 0.13265609741210938 / 5000

6.727070293426514

In [13]:
# Number of rows in df.
len(df)

253553

In [3]:
# Read the processed mobile training CSV into df (rebinds df if it already exists).
df = pd.read_csv('../data/processed/mobile_training.csv')


In [4]:
# Preview the first ten rows (a bare `df.` is an incomplete expression and a syntax error).
df.head(10)

Unnamed: 0,item_name,tokens,is_brand,is_valid
0,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Samsung,2,train
1,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Galaxy,0,train
2,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,J1,0,train
3,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Ace,0,train
4,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,2016,0,train
5,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,SM-J111F,0,train
6,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,8GB,0,train
7,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,-,0,train
8,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,White,0,train
9,Blackview BV8000 Pro RAM 6GB 64GB IP68 Wa...,Blackview,2,train


In [9]:
# Placeholder strings. NOTE(review): presumably the unknown-word token, the
# number token and the "outside" tag label — confirm against the code that uses them.
UNK = "$UNK$"
NUM = "$NUM$"
NONE = "O"

In [15]:
# Check whether the '$UNK$' placeholder is one of the vocabulary keys.
'$UNK$' in vocabulary_set

False

In [17]:
# Number of entries in the vocabulary set.
len(vocabulary_set)

6629250

In [22]:
# Small toy mapping for experimenting with dict lookups.
d = {'A': 1, 'B': 2}
d

{'A': 1, 'B': 2}

In [24]:
# Subscripting a plain dict with a key it does not contain raises KeyError;
# there is no implicit fallback value.
d[UNK]

KeyError: '$UNK$'