In [21]:
import numpy as np

In [22]:
def get_glove_vocab(filename):
    """Load vocab from file

    Args:
        filename: path to the glove vectors (xxx.txt)

    Returns:
        vocab_set: set() of strings
    """
    print("Building vocab...")
    vocab_set = set()
    with open(filename) as f:
        for line in f:
            word = line.strip().split(' ')[0]
            vocab_set.add(word)
    print("- done. {} tokens".format(len(vocab_set)))
    return vocab_set

def write_vocab(vocab, filename):
    """Writes a vocab to a file

    Writes one word per line.

    Args:
        vocab: iterable that yields word
        filename: path to vocab file

    Returns:
        write a word per line (xxx.txt)

    """
    print("Writing vocab...")
    with open(filename, "w") as f:
        for i, word in enumerate(vocab):
            if i != len(vocab) - 1:
                f.write("{}\n".format(word))
            else:
                f.write(word)
    print("- done. {} tokens".format(len(vocab)))


def load_vocab_and_return_word_to_id_dict(filename):
    """Loads vocab from a file

    Args:
        filename: (string) the format of the file must be one word per line.

    Returns:
        d: dict[word] = index

    """
    try:
        d = dict()
        with open(filename) as f:
            for idx, word in enumerate(f):
                word = word.strip()
                d[word] = idx
    
    except IOError:
        raise MyIOError(filename)
    return d

def export_glove_vectors(vocab, glove_filename, output_filename, dim):
    """Saves glove vectors in numpy array

    Args:
        vocab: dictionary vocab[word] = index
        glove_filename: a path to a glove file
        trimmed_filename: a path where to store a matrix in npy
        dim: (int) dimension of embeddings
    
    Note:
        - Unknown words have representations with values of zero
    """
    embeddings = np.zeros([len(vocab), dim]) # initialize a zero
    with open(glove_filename) as f:
        for line in f:
            line = line.strip().split(' ')
            word = line[0]
            embedding = [float(x) for x in line[1:]]
            if word in vocab:
                word_idx = vocab[word]
                # np.array (by default) will make a copy of the object, while asarray will not unless necessary.
                embeddings[word_idx] = np.asarray(embedding) # it's better for memory.
                
    # save several arrays into a single file in uncompressed .npz format.
    np.savez_compressed(output_filename, embeddings = embeddings)

In [5]:
vocab_glove = get_glove_vocab('/data/ID_largewv_300_2.txt')


Building vocab...
- done. 6629250 tokens


In [44]:
vocab_words = {'A'}

In [45]:
vocab = vocab_words & vocab_glove

In [46]:
vocab

set()

In [11]:
filename_words_voc = "../models/data/words_vocab.txt"
write_vocab(vocab, filename_words_voc)

Writing vocab...
- done. 6629250 tokens


In [16]:
word_to_id_dict = load_vocab_and_return_word_to_id_dict(filename_words_voc)

In [28]:
import time

dim_word = 300
filename_words_voc = "../models/data/word2vec.npz".format(dim_word)
s = time.time()
export_glove_vectors(word_to_id_dict, glove_filename = '/data/ID_largewv_300_2.txt',
                     output_filename = filename_words_voc, dim = dim_word)
e = time.time()
print (e-s)

1401.0644762516022


In [29]:
1401.0644762516022 /60.0

23.351074604193368

In [31]:
vocab_glove

{'flm04419',
 'btvo',
 'f25mo',
 'ikiweecorner',
 'vs20623',
 'strongkangenwaterjogja',
 'beningwarna',
 'taspalembangimport',
 'ss213428',
 'strollerm',
 'au7h',
 'nyalala',
 'afifahmom_shop',
 'neuromarketing',
 'carseatmothercare',
 'glm1124',
 'milebaby',
 'pashmm',
 'xc9399',
 'bt10123',
 'pmxg',
 'rockbors',
 '19363id',
 'bogaerts',
 'custometoll',
 'batldel33',
 'melawaninfeksi',
 'es3270',
 'vbno',
 'midrohdgrey',
 'lc41661',
 'cip830',
 'dd142',
 'vs10173',
 'zzaz',
 '32651bk',
 'pu',
 'chrisye',
 'mhr016uob',
 'metalld46s4',
 'owda',
 'effron',
 'jualclutchcantik',
 'bm10606',
 'ke95498',
 'sunshime',
 'be1836',
 't5babc',
 'taskerenbatam',
 'flx623',
 'chokerkoreafashion',
 '0gga',
 'capasitaspowerbank',
 'sur791',
 '6kyt',
 'solarchargercontroller',
 'segeralah',
 'qvtp',
 'spainia',
 'mobilephonemicrophone',
 '00d00j',
 'cr1025',
 '54866',
 'l3nu',
 'sf65',
 'ranselsekolah',
 'alpuat',
 '305001846',
 '3qt2',
 'cs736',
 '002759',
 'collarharness',
 'sepisifikasi',
 'w3q3',


In [32]:
UNK = "$UNK$"
NUM = "$NUM$"
NONE = "O"

In [35]:
NONE in vocab_glove

False

In [76]:
def get_glove_vectors(filename):
    """
    Args:
        filename: path to the npz file

    Returns:
        matrix of embeddings (np array)

    """
    try:
        with np.load(filename) as data:
            return data["embeddings"]

    except IOError:
        raise MyIOError(filename)


use_pretrained = True
embeddings = (get_glove_vectors(filename_words_voc)
                if use_pretrained else None)

NameError: name 'MyIOError' is not defined

In [40]:
embeddings.shape

(6629250, 300)

In [43]:
np.load(filename_words_voc).files

['embeddings']

In [47]:
filename_words_voc = "../models/data/wordvec/words_vocab.txt"

word_to_id_dict = load_vocab_and_return_word_to_id_dict(filename_words_voc)


In [69]:
set(word_to_id_dict.keys())

{'2016',
 '64gb',
 '6gb',
 '8gb',
 'ace',
 'blackview',
 'bv8000',
 'galaxy',
 'ip68',
 'j1',
 'pro',
 'ram',
 'rugged',
 'samsung',
 'smartphone',
 'smj111f',
 'waterproff',
 'white'}

In [77]:
use_pretrained = True
dim_word = 300
filename_words_vec = "../models/data/wordvec/word2vec.npz".format(dim_word)

embeddings = (get_glove_vectors(filename_words_vec)
                if use_pretrained else None)

In [81]:
for row in embeddings:
    print (row)

[ 1.144918e+00  8.532290e-01  1.944341e+00  6.601880e-01 -1.364533e+00
 -3.596650e-01 -4.869235e+00 -3.515282e+00 -6.933930e-01  2.253370e+00
 -5.387732e+00  1.075889e+00  1.970433e+00 -1.840181e+00  9.692900e-02
 -7.526960e-01  3.699655e+00  1.053452e+00 -2.673290e-01 -1.389383e+00
 -4.434500e-01 -1.068686e+00  4.988200e-01 -4.309552e+00 -3.892680e-01
  9.781760e-01  2.994910e-01 -1.340982e+00  1.336248e+00  2.083465e+00
  2.759360e-01  1.891687e+00  4.786110e-01 -2.728057e+00 -1.089209e+00
  2.305939e+00  3.844398e+00  7.962810e-01 -1.344278e+00 -1.256833e+00
  9.120980e-01 -3.122767e+00  3.637484e+00  4.298831e+00  5.422104e+00
 -4.545110e-01 -4.378840e-01  1.314170e-01 -7.340520e-01  9.448220e-01
  1.148148e+00 -3.007333e+00  3.653253e+00 -1.429684e+00 -1.440089e+00
  2.081840e-01  9.489870e-01 -4.290972e+00  5.159522e+00  4.288520e-01
  2.234803e+00  1.622748e+00  1.701568e+00  1.345757e+00 -1.273545e+00
  3.957765e+00 -3.641490e+00 -1.947893e+00 -6.498620e-01 -1.005549e+00
 -5.94

In [1]:
import pandas as pd
df = pd.read_csv('../data/processed/mobile_training.csv', nrows = 19)

In [2]:
import sys
LOWERCASE = True
sys.path.append('/home/ld-sgdev/yunrui_li/ner_project/brand_recognition_bio_FE/preprocessing')
sys.path.append('/home/ld-sgdev/yunrui_li/ner_project/brand_recognition_bio_FE/py_model')
from clean_helpers import clean_name_for_word_embedding
df['clean_tokens'] = df.tokens.apply(lambda x: clean_name_for_word_embedding(x) if type(x)==str else x)
if LOWERCASE:
    df['clean_tokens'] = df.clean_tokens.apply(lambda x: x.lower() if type(x)==str else x)




In [3]:
df

Unnamed: 0,item_name,tokens,is_brand,is_valid,clean_tokens
0,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Samsung,2,train,samsung
1,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Galaxy,0,train,galaxy
2,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,J1,0,train,j1
3,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Ace,0,train,ace
4,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,2016,0,train,2016
5,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,SM-J111F,0,train,smj111f
6,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,8GB,0,train,8gb
7,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,-,0,train,
8,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,White,0,train,white
9,Blackview BV8000 Pro RAM 6GB 64GB IP68 Wa...,Blackview,2,train,blackview


In [8]:
def load_vocab_and_return_word_to_id_dict(filename):
    """Loads vocab from a file

    Args:
        filename: (string) the format of the file must be one word per line.

    Returns:
        d: dict[word] = index
    """
    try:
        d = dict()
        with open(filename) as f:
            for idx, word in enumerate(f):
                # give all the vocabualary in the intersection of our own corpurs and embedding vocabulary with a certain unique id
                word = word.strip() # remove strip
                d[word] = idx
    except IOError:
        raise MyIOError(filename)
    return d

filename_words_voc = "../models/data/wordvec/words_vocab.txt"
word_to_id_dict = load_vocab_and_return_word_to_id_dict(filename_words_voc)


In [13]:
word_to_id_dict

{'j1': 0,
 'ram': 1,
 'galaxy': 2,
 'pro': 3,
 '64gb': 4,
 'smartphone': 5,
 '2016': 6,
 'blackview': 7,
 '6gb': 8,
 'bv8000': 9,
 'ip68': 10,
 'smj111f': 11,
 'ace': 12,
 'rugged': 13,
 'samsung': 14,
 'waterproff': 15,
 'white': 16,
 '8gb': 17}

In [72]:
def encode_word_to_idx(word, word_to_id, vocabulary_set):
    if word in vocabulary_set:
        return word_to_id[word]
    else:
        print ('word',word)
        return word_to_id['None']
df['word_id'] = df.clean_tokens.apply( lambda x: encode_word_to_idx(x, word_to_id_dict, set(word_to_id_dict.keys())))


word 


KeyError: 'None'

In [85]:
df.clean_tokens.iloc[7]

''

In [82]:
'-'

set()

In [87]:
set(['A',df.clean_tokens.iloc[7]])

{'', 'A'}

In [4]:
set1 = {'A','B'}
set2 = {'B','C'}
set2 & set1

{'B'}

In [6]:
'A' in {'A':1}

True

In [10]:
def get_processing_word(vocab_words=None, vocab_chars=None,
                    lowercase=False, chars=False, allow_unk=True):
    """Return lambda function that transform a word (string) into list,
    or tuple of (list, id) of int corresponding to the ids of the word and
    its corresponding characters.

    Args:
        vocab: dict[word] = idx

    Returns:
        f("cat") = ([12, 4, 32], 12345)
                 = (list of char ids, word id)

    """
    def f(word):
        # 0. get chars of words
        if vocab_chars is not None and chars == True:
            char_ids = []
            for char in word:
                # ignore chars out of vocabulary
                if char in vocab_chars:
                    char_ids += [vocab_chars[char]]

        # 1. preprocess word
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM

        # 2. get id of word
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            else:
                if allow_unk:
                    word = vocab_words[UNK]
                else:
                    raise Exception("Unknow key is not allowed. Check that "\
                                    "your vocab (tags?) is correct")

        # 3. return tuple char ids, word id
        if vocab_chars is not None and chars == True:
            return char_ids, word
        else:
            return word

    return f


In [12]:
for char in 'fuck':
    print (char)

f
u
c
k
