In [1]:
#coding:utf-8
import os
import numpy as np
import torch as t
from nltk.tokenize import word_tokenize
from torch import nn

In [2]:
import sys
sys.path.append('D:\\Jupyter\\Python\\ATAE-LSTM')
import Ipynb_importer

In [3]:
from config import opt

In [4]:
class Emb(object):
    def __init__(self):
        # create and init the items below
        # self.embedding   string word ==> np.ndarray vector
        self.embedding = {}
        
        # load the pre-trained data
        self.root = opt.base_root + opt.embedding_root
        f = open(self.root, 'r', encoding='UTF-8')
        
        l = f.readline()
        have_opened = 1
        while(l != '' and have_opened<=opt.embedding_load):
            # l : "a 0.1 0.2 0.3 ..."
            if l[-1] == '\n':
                l = l[:-1]
            l = l.split(' ')
            if not len(l)==opt.hidden_size + 1:
                l = f.readline()
                continue
            
            # l[0]  : string word
            # l[1:] : list<string> vector
            self.embedding[l[0].lower()] = np.array(l[1:], dtype=float)
            
            if(len(self.embedding)==have_opened):
                print('Embedding : have input words : '+str(have_opened))
                have_opened *= 2
            l = f.readline()
            
        print('Embedding : have input words : '+str(have_opened))
        f.close()
        
        # create the items to modify and use dynamically below
        # self.dictionary    string word ==> int index
        # self.words         int index ==> string word
        # self.no_pretrained string word ==> int appearance
        self.dictionary = {}
        self.words = []
        self.no_pretrained = {}
        
        return
    
    def _get_dic_(self):
        return self.dictionary
    
    def _get_words_(self):
        return self.words
    
    def _make_layer_(self):
        weight = []
        for word in self.words:
            weight.append(self.embedding[word])
        weight.append(np.random.uniform(-opt.epsilon, opt.epsilon, opt.hidden_size))
        
        layer = nn.Embedding.from_pretrained(t.FloatTensor(weight), freeze=False)
        
        return layer
    
    def _add_word_(self, sentence):
        # para sentence : a string to be tokenized by nltk.tokenize.word_tokenize
        sentence = word_tokenize(sentence)
        for word in sentence:
            word = word.lower()
            if word in self.dictionary:
                continue
            if word in self.embedding:
                # add this word into self.dictionary and self.words
                self.dictionary[word] = len(self.words)
                self.words.append(word)
                assert len(self.dictionary) == len(self.words)
            else:
                # if this no-pretrained word arise for at least opt.word_independence times
                # set an indepent embedding for it
                if word not in self.no_pretrained:
                    self.no_pretrained[word] = 1
                else:
                    self.no_pretrained[word] += 1
                    if self.no_pretrained[word] >= opt.word_independence:
                        self.no_pretrained.pop(word)
                        self.dictionary[word] = len(self.words)
                        self.words.append(word)
                        assert len(self.dictionary) == len(self.words)
                        
                        # set an indepent embedding for it
                        # init from U(-ε,ε) 
                        self.embedding[word] = np.random.uniform(-opt.epsilon, opt.epsilon, opt.hidden_size)
        return

In [5]:
emb = Emb()

Embedding : have input words : 1
Embedding : have input words : 2
Embedding : have input words : 4
Embedding : have input words : 8
Embedding : have input words : 16
Embedding : have input words : 32
Embedding : have input words : 64
Embedding : have input words : 128
Embedding : have input words : 256
Embedding : have input words : 512
Embedding : have input words : 1024
Embedding : have input words : 2048
Embedding : have input words : 4096
Embedding : have input words : 8192
Embedding : have input words : 16384


In [6]:
if __name__=='__main__':
    for i in range(20):
        emb._add_word_('All the appetizers and salads were fabulous, the steak was mouth watering and the pasta was delicious!!!')
        print(emb._get_dic_())

{'all': 0, 'the': 1, 'and': 2, 'salads': 3, 'were': 4, 'fabulous': 5, ',': 6, 'steak': 7, 'was': 8, 'mouth': 9, 'watering': 10, 'pasta': 11, 'delicious': 12, '!': 13}
{'all': 0, 'the': 1, 'and': 2, 'salads': 3, 'were': 4, 'fabulous': 5, ',': 6, 'steak': 7, 'was': 8, 'mouth': 9, 'watering': 10, 'pasta': 11, 'delicious': 12, '!': 13}
{'all': 0, 'the': 1, 'and': 2, 'salads': 3, 'were': 4, 'fabulous': 5, ',': 6, 'steak': 7, 'was': 8, 'mouth': 9, 'watering': 10, 'pasta': 11, 'delicious': 12, '!': 13}
{'all': 0, 'the': 1, 'and': 2, 'salads': 3, 'were': 4, 'fabulous': 5, ',': 6, 'steak': 7, 'was': 8, 'mouth': 9, 'watering': 10, 'pasta': 11, 'delicious': 12, '!': 13}
{'all': 0, 'the': 1, 'and': 2, 'salads': 3, 'were': 4, 'fabulous': 5, ',': 6, 'steak': 7, 'was': 8, 'mouth': 9, 'watering': 10, 'pasta': 11, 'delicious': 12, '!': 13, 'appetizers': 14}
{'all': 0, 'the': 1, 'and': 2, 'salads': 3, 'were': 4, 'fabulous': 5, ',': 6, 'steak': 7, 'was': 8, 'mouth': 9, 'watering': 10, 'pasta': 11, 'delic

In [8]:
E_all = emb.embedding['all']
type(E_all)

numpy.ndarray

In [9]:
np.dot(E_all, E_all).sum()

40.67011703049458

In [10]:
E_the = emb.embedding['the']
np.dot(E_the, E_the).sum()

54.210771477027805

In [11]:
E_app = emb.embedding['appetizers']
np.dot(E_app, E_app).sum()

0.00986528236536729

In [12]:
E_app

array([ 1.09201740e-03, -5.13712532e-03, -3.19205056e-03, -1.12457257e-03,
       -4.53393842e-03,  6.99012226e-03,  6.47041221e-03,  7.09443739e-04,
       -8.34251975e-03,  1.49655140e-03, -3.32683062e-03,  9.85519760e-03,
       -3.03301902e-03, -2.10983893e-03,  1.09938106e-03,  5.06186232e-03,
       -7.36367169e-03,  9.42059871e-03, -4.40424982e-03,  2.04025071e-03,
        4.92205153e-03,  3.36179628e-03, -2.98355201e-03,  1.56471938e-03,
       -6.06411934e-03, -1.04457344e-03,  4.35706646e-03,  7.39735786e-03,
        9.93128600e-03, -3.64700902e-03,  4.84594567e-03, -3.78971697e-03,
       -5.02008490e-03, -2.67187010e-03, -1.43062002e-03,  1.94243282e-03,
       -8.24167631e-03,  3.00631260e-03,  2.71851105e-03,  9.75263780e-03,
       -1.49531785e-05,  7.64968197e-03, -6.52892709e-04,  8.12582885e-03,
        6.30999275e-04, -3.19742068e-03,  3.49292842e-03, -9.06613473e-03,
        9.05621505e-03, -4.44274658e-03, -1.47550979e-03,  8.57886749e-03,
       -5.61718981e-03,  

In [13]:
E_all

array([-0.15063  , -0.73908  , -0.29427  , -0.30443  ,  0.16372  ,
       -0.51402  , -0.12802  ,  0.3488   ,  0.16604  ,  0.30374  ,
       -0.44225  ,  0.63938  , -0.03751  ,  0.18215  ,  0.21155  ,
        0.13111  , -0.40701  ,  1.7404   , -0.19956  ,  0.13135  ,
       -0.061498 , -0.39734  ,  0.1863   , -0.18395  ,  0.21751  ,
       -0.17888  , -0.23852  , -0.0082472, -0.17775  , -0.074496 ,
       -0.1261   ,  0.78656  , -0.065985 ,  0.054162 ,  0.13414  ,
        0.20414  , -0.33127  , -0.37488  ,  0.047762 ,  1.0582   ,
        0.034112 ,  0.43122  ,  0.48421  ,  0.40248  ,  0.019783 ,
        0.071812 ,  0.41956  ,  0.12729  ,  0.30718  , -0.34761  ,
       -0.20851  , -0.62237  , -0.50255  , -0.14081  ,  0.026269 ,
        0.071628 , -0.083321 , -0.24162  , -0.3319   , -0.20401  ,
        0.34444  ,  0.060308 , -0.2042   , -0.086079 ,  0.057419 ,
        0.12683  ,  0.151    ,  0.10077  ,  0.35313  ,  0.25843  ,
        0.35271  ,  0.28722  ,  0.79539  , -0.15875  ,  0.2477