In [1]:
# this notebook explores adding more words to the embedding vocabulary

In [2]:
import numpy as np
from collections import defaultdict
import tenncor as tc
from extenncor.embed import Embedding, make_embedding, vdistance

np.random.seed(0)

In [3]:
# Base sentence used to train the initial embedding.
text = "natural language processing and machine learning is fun and exciting"
# Follow-up sentence; it contains one word ('not') that the base sentence lacks.
text2 = "machine language is not natural"

# Each corpus is a list of sentences; each sentence is a list of lower-cased tokens.
corpus = [list(map(str.lower, text.split()))]
# The word that gets appended to the vocabulary later in the notebook.
corpus_addition = 'not'
corpus2 = [list(map(str.lower, text2.split()))]

In [4]:
def generate_training_data(embedding, corpus, window_size=None):
    """Build (target, context) one-hot training pairs for every word in corpus.

    Args:
        embedding: object exposing ``onehot(word)``; whatever it returns is
            stored as-is for both the target and the context words.
        corpus: iterable of sentences, each a sequence of words.
        window_size: number of neighbours taken on each side of the target
            word. When omitted, the notebook-level ``window`` setting is used,
            so existing two-argument calls behave as before.

    Returns:
        Object ndarray of shape (num_words, 2). Column 0 holds the target's
        one-hot vector, column 1 the list of context one-hot vectors (the
        list is shorter for words near the start or end of a sentence).
        Iterating the result yields ``(w_target, w_context)`` pairs.
    """
    if window_size is None:
        # Looked up at call time, so `window` only has to exist when the
        # function is invoked, not when it is defined.
        window_size = window

    training_data = []
    for sentence in corpus:
        sent_len = len(sentence)
        for i, target in enumerate(sentence):
            w_target = embedding.onehot(target)
            # Clamp the window to the sentence bounds; the target itself
            # (j == i) is never its own context word.
            first = max(i - window_size, 0)
            last = min(i + window_size, sent_len - 1)
            w_context = [
                embedding.onehot(sentence[j])
                for j in range(first, last + 1)
                if j != i
            ]
            training_data.append((w_target, w_context))

    # The context lists are ragged, so np.array(training_data) raises
    # ValueError on NumPy >= 1.24 (and warned before that). Fill an object
    # array cell by cell to get the same (N, 2) layout on every version.
    pairs = np.empty((len(training_data), 2), dtype=object)
    for row, (w_target, w_context) in enumerate(training_data):
        pairs[row, 0] = w_target
        pairs[row, 1] = w_context
    return pairs

def vec_sim(embedding, word, top_n):
    """Print the top_n vocabulary entries ranked by their vdistance score to `word`.

    Every index in ``range(len(embedding))`` is scored against the vector of
    `word` (the query word itself is not excluded). Entries are printed as
    ``label score``, highest score first.

    NOTE(review): whether a higher vdistance means "more similar" depends on
    vdistance's definition, which is not visible here - confirm.
    """
    query_vec = embedding.get_vec(word)

    scores = {}
    for idx in range(len(embedding)):
        candidate_vec = embedding[idx]
        label = embedding.idx2word[idx]
        scores[label] = vdistance(query_vec, candidate_vec)

    # Rank by score, largest first, then keep only the leading top_n entries.
    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for label, score in ranked[:top_n]:
        print(label, score)

In [5]:
n = 10 # dimension of the word embeddings (also described as the hidden-layer size)
lr = 0.01 # learning rate passed to the SGD update below
epochs = 50 # number of training epochs
window = 2 # context window: this many words on each side of the centre word

# Training of the initial embedding on `corpus`.
# The weight matrices are created inside make_embedding below; how they are
# initialised is not visible in this notebook - TODO confirm.
# With the 9 unique words of `corpus` and n = 10, the forward matrix is
# presumably 9x10 and the backward matrix 10x9 - TODO confirm against make_embedding.

# Count word occurrences with a dictionary; only the keys (the unique words) are used afterwards
word_counts = defaultdict(int)
for row in corpus:
    for word in row:
        word_counts[word] += 1
# Vocabulary lookup list: unique words in order of first appearance
index_word = list(word_counts.keys())
nwords = len(index_word)

embedding = make_embedding(index_word, n)
# Model used for training: embedding layer, then exbedding layer, then softmax
trainmodel = tc.api.layer.link([
    embedding.embedding,
    embedding.exbedding,
    tc.api.layer.bind(tc.api.softmax),
])

training_data = generate_training_data(embedding, corpus)

# Input/target variables, created with random values in [-1, 1); they are
# overwritten for every training sample in the loop below.
winput = tc.variable(np.random.rand(nwords) * 2 - 1, 'input')
woutput = tc.variable(np.random.rand(2 * window, nwords) * 2 - 1, 'output')

y_pred = trainmodel.connect(winput)

# Error: sum of squared differences between the prediction (extended by
# [1, 2 * window], presumably to line up with the 2 * window target rows -
# TODO confirm) and woutput; parameters are updated with SGD at rate lr.
train_err = tc.apply_update([trainmodel],
    lambda error, leaves: tc.api.approx.sgd(error, leaves, lr),
    lambda models: tc.api.reduce_sum(tc.api.pow(tc.api.extend(models[0].connect(winput), [1, 2 * window]) - woutput, 2.)))

# NOTE(review): the path is relative, so this presumably requires running the
# notebook from a directory that contains cfg/ - confirm.
tc.optimize("cfg/optimizations.json")

# Cycle through each epoch
for i in range(epochs):
    # Initialise loss to 0
    loss = 0

    # Cycle through each training sample
    # w_t = vector for target word, w_c = vectors for context words
    for w_t, w_c in training_data:
        wcdata = np.array(w_c)
        # NOTE(review): y_pred is read before winput is reassigned below, so
        # (assuming .get() evaluates against the current variable values) the
        # padding rows hold the prediction for the previous sample's target,
        # or for the random initial input on the very first sample - confirm
        # this is intended.
        ydata = y_pred.get().reshape(1, nwords)
        # Words near a sentence edge have fewer than 2 * window context
        # vectors; pad with copies of ydata so wcdata matches woutput's row count.
        for j in range(2 * window - wcdata.shape[0]):
            wcdata = np.concatenate((wcdata, ydata), 0)
        winput.assign(np.array(w_t))
        woutput.assign(wcdata)
        # presumably evaluating train_err also applies the SGD update - TODO confirm
        loss += train_err.get()
    print('Epoch:', i, "Loss:", loss)

print(embedding.get_vec("machine"))

# Find similar words
vec_sim(embedding, "machine", 3)

Epoch: 0 Loss: 34.94494700431824
Epoch: 1 Loss: 34.394609332084656
Epoch: 2 Loss: 34.04316842556
Epoch: 3 Loss: 33.72573661804199
Epoch: 4 Loss: 33.43827557563782
Epoch: 5 Loss: 33.17703139781952
Epoch: 6 Loss: 32.9386340379715
Epoch: 7 Loss: 32.72014570236206
Epoch: 8 Loss: 32.51902973651886
Epoch: 9 Loss: 32.333131194114685
Epoch: 10 Loss: 32.16061580181122
Epoch: 11 Loss: 31.999934434890747
Epoch: 12 Loss: 31.849769115447998
Epoch: 13 Loss: 31.708996891975403
Epoch: 14 Loss: 31.576662063598633
Epoch: 15 Loss: 31.45194125175476
Epoch: 16 Loss: 31.334125995635986
Epoch: 17 Loss: 31.222601771354675
Epoch: 18 Loss: 31.1168292760849
Epoch: 19 Loss: 31.016337037086487
Epoch: 20 Loss: 30.92071032524109
Epoch: 21 Loss: 30.829578757286072
Epoch: 22 Loss: 30.742612957954407
Epoch: 23 Loss: 30.65951669216156
Epoch: 24 Loss: 30.58002507686615
Epoch: 25 Loss: 30.503898859024048
Epoch: 26 Loss: 30.43091869354248
Epoch: 27 Loss: 30.360889077186584
Epoch: 28 Loss: 30.29362404346466
Epoch: 29 Loss: 

In [6]:
# Handles to the trained parameters of the original embedding.
# w2 is the first stored tensor of the exbedding layer (presumably its weight
# matrix - TODO confirm against the layer's storage order).
w1 = embedding.weight
w2 = embedding.exbedding.get_storage()[0]

# Snapshot the trained values before building the extended model.
forward = w1.get()
backward = w2.get()

# Grow the vocabulary by one entry: append one random row to the forward
# matrix and one random column to the backward matrix, values in [-1, 1).
# The new row/column length is the embedding dimension `n` rather than a
# literal 10, so the concatenations stay shape-compatible if `n` is changed.
mod_forward = np.concatenate((forward, np.random.uniform(-1, 1, size=(1, n))), axis=0)
mod_backward = np.concatenate((backward, np.random.uniform(-1, 1, size=(n, 1))), axis=1)

# Wrap the extended matrices in fresh variables so the original embedding's
# weights are left as they were.
mod_w1 = tc.variable(mod_forward, 'forward')
mod_w2 = tc.variable(mod_backward, 'backward')
# The added word goes last in the vocabulary list, matching the appended row/column.
mod_embedding = Embedding(mod_w1, mod_w2, index_word + [corpus_addition])
# Same layer chain as the original model: embedding, exbedding, softmax.
mod_model = tc.api.layer.link([
    mod_embedding.embedding,
    mod_embedding.exbedding,
    tc.api.layer.bind(tc.api.softmax),
])

In [7]:
# Retrain the extended model (vocabulary size nwords + 1) on corpus2.
# Input/target variables, created with random values in [-1, 1); they are
# overwritten for every training sample in the loop below.
mod_winput = tc.variable(np.random.rand(nwords + 1) * 2 - 1, 'input')
mod_woutput = tc.variable(np.random.rand(2 * window, nwords + 1) * 2 - 1, 'output')

mod_y_pred = mod_model.connect(mod_winput)

# Error: sum of squared differences between the prediction (extended by
# [1, 2 * window], presumably to line up with the 2 * window target rows -
# TODO confirm) and mod_woutput; parameters are updated with SGD at rate lr.
mod_train_err = tc.apply_update([mod_model],
    lambda error, leaves: tc.api.approx.sgd(error, leaves, lr),
    lambda models: tc.api.reduce_sum(tc.api.pow(tc.api.extend(models[0].connect(mod_winput), [1, 2 * window]) - mod_woutput, 2.)))

# NOTE(review): the path is relative, so this presumably requires running the
# notebook from a directory that contains cfg/ - confirm.
tc.optimize("cfg/optimizations.json")

training_data2 = generate_training_data(mod_embedding, corpus2)

# Cycle through each epoch
for i in range(epochs):
    # Initialise loss to 0
    loss = 0

    # Cycle through each training sample
    # w_t = vector for target word, w_c = vectors for context words
    for w_t, w_c in training_data2:
        wcdata = np.array(w_c)
        # NOTE(review): mod_y_pred is read before mod_winput is reassigned
        # below, so (assuming .get() evaluates against the current variable
        # values) the padding rows hold the prediction for the previous
        # sample's target - confirm this is intended.
        ydata = mod_y_pred.get().reshape(1, nwords + 1)
        # Words near a sentence edge have fewer than 2 * window context
        # vectors; pad with copies of ydata so wcdata matches mod_woutput's row count.
        for j in range(2 * window - wcdata.shape[0]):
            wcdata = np.concatenate((wcdata, ydata), 0)
        mod_winput.assign(np.array(w_t))
        mod_woutput.assign(wcdata)
        # presumably evaluating mod_train_err also applies the SGD update - TODO confirm
        loss += mod_train_err.get()
    print('Epoch:', i, "Loss:", loss)

print(mod_embedding.get_vec("machine"))

# Find similar words
vec_sim(mod_embedding, "machine", 3)

Epoch: 0 Loss: 14.606863260269165
Epoch: 1 Loss: 14.715889692306519
Epoch: 2 Loss: 14.64060640335083
Epoch: 3 Loss: 14.567920207977295
Epoch: 4 Loss: 14.497643232345581
Epoch: 5 Loss: 14.429615259170532
Epoch: 6 Loss: 14.363690376281738
Epoch: 7 Loss: 14.299741506576538
Epoch: 8 Loss: 14.237656354904175
Epoch: 9 Loss: 14.17733645439148
Epoch: 10 Loss: 14.118694067001343
Epoch: 11 Loss: 14.061651229858398
Epoch: 12 Loss: 14.006137132644653
Epoch: 13 Loss: 13.9520925283432
Epoch: 14 Loss: 13.899460911750793
Epoch: 15 Loss: 13.848192572593689
Epoch: 16 Loss: 13.79824185371399
Epoch: 17 Loss: 13.749569177627563
Epoch: 18 Loss: 13.702137112617493
Epoch: 19 Loss: 13.655909538269043
Epoch: 20 Loss: 13.610855102539062
Epoch: 21 Loss: 13.566941976547241
Epoch: 22 Loss: 13.524141669273376
Epoch: 23 Loss: 13.482425332069397
Epoch: 24 Loss: 13.441765904426575
Epoch: 25 Loss: 13.402134418487549
Epoch: 26 Loss: 13.363505601882935
Epoch: 27 Loss: 13.325851678848267
Epoch: 28 Loss: 13.28914475440979
E

In [8]:
# Extended vocabulary, then the weight snapshots taken before the extension
# and the current values of the extended model's weights.
print([*index_word, corpus_addition])
print(forward)
print(backward)
print(mod_w1.get())
print(mod_w2.get())

# For every word of the original vocabulary, score its vector in the original
# embedding against its vector in the extended embedding.
print('cross similiarities')
orig_data = w1.get()
mod_data = mod_w1.get()
for i, word in enumerate(index_word):
    v1, v2 = orig_data[i], mod_data[i]
    print(word, vdistance(v1, v2))

['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'exciting', 'not']
[[ 0.03082157  0.43199688  0.19140545  0.12391488 -0.1743178   0.27198088
  -0.13611728  0.6726653   0.94228876 -0.28644916]
 [ 0.49185762 -0.06332493 -0.02737583  0.8859195  -0.7707699  -0.7410697
  -0.8444996   0.9937508   0.6157334   0.49587414]
 [ 0.89807427  0.6270576  -0.13817546  0.5954936  -0.87662816  0.18292807
  -0.4780913   0.9074295   0.03989711 -0.10391264]
 [-0.6278146   0.56902015 -0.09472717  0.10327913 -0.84771305  0.24537711
   0.02896766  0.05620021  0.7442206   0.23441094]
 [-0.42575166 -0.24539575  0.28960332 -0.81587696  0.27500275  0.29652333
  -0.54270196 -0.72514606 -0.22661425 -0.15576968]
 [ 0.0539834  -0.06859595  0.94048643 -0.88526845 -0.5705192  -0.7514896
   0.37114364 -0.39261967 -0.20158623 -0.3559844 ]
 [-0.7845716  -0.577518    0.3956456  -0.7371265  -0.45208016 -0.2550295
   0.7442549  -0.80364853  0.4561647  -0.63619083]
 [ 0.9400159  -0.1209984   0