In [22]:
import tensorflow as tf
import numpy as np
import tensorflow.keras.preprocessing as preprocessing
from collections import Counter
import random

In [23]:
def text_to_word_sequence(text):
    words=preprocessing.text.text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\n', lower=True, split=' ')
    return words


def create_lookup_tables(words):
    word_counts = Counter(words)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

    return vocab_to_int, int_to_vocab

In [24]:
# p_drop(word)=1-sqrt(treshold/freq(word)) , frequent words are more likely to be removed from the dataset
def subsampling(int_words):
    word_counts=Counter(int_words) # a dictionary from int_word to number of times it appeared in the text
    total_count=len(int_words)
    p_drops={word:1-np.sqrt(1e-5/(count/total_count)) for word,count in word_counts.items()}
    train_words=[word for word in int_words if random.random()<(1-p_drops[word])] # the bigger p_drop the less likely to be chosen
    return train_words

In [25]:
text=""
#reading at most 100 lines
with open('data.txt') as f:
    i=0
    for line in f.readlines():
        text+=line    
        i+=1
        if i>=100:
            break
print(text)
    
words=text_to_word_sequence(text)

vocab_to_int, int_to_vocab = create_lookup_tables(words)
int_words = [vocab_to_int[word] for word in words]
print(int_words)
train_words=subsampling(int_words)


Since the yield keyword is only used with generators, it makes sense to recall the concept of generators first.

The idea of generators is to calculate a series of results one-by-one on demand (on the fly). In the simplest case, a generator can be used as a list, where each element is calculated lazily. Lets compare a list and a generator that do the same thing - return powers of two
Iterating over the list and the generator looks completely the same. However, although the generator is iterable, it is not a collection, and thus has no length. Collections (lists, tuples, sets, etc) keep all values in memory and we can access them whenever needed. A generator calculates the values on the fly and forgets them, so it does not have any overview about the own result set.

Generators are especially useful for memory-intensive tasks, where there is no need to keep all of the elements of a memory-heavy list accessible at the same time. Calculating a series of values one-by-one can also be usefu

In [45]:
#---------------- SKIP GRAM ------------------
def get_contexts(batch,i,window_size):
    n=window_size//2
    return list(set(batch[max(0,i-n):i]+batch[i+1:min(len(batch),i+n+1)]))

#generator for batches
def get_batch_sg(words,batch_size,window_size):
    n_batches=len(words)//batch_size
    words=words[:n_batches*batch_size]
    for batch_start in range(0,len(words),batch_size):
        batch=words[batch_start:batch_start+batch_size]
        x,y=[],[]
        for i in range(len(batch)):
            center=batch[i]
            contexts=get_contexts(batch,i,window_size)
            y.extend(contexts)
            x.extend([center]*len(contexts))
        yield x,y 

In [50]:
def train_sg(int_words,vocab_to_int):
    vocab_size=len(vocab_to_int)
    # hyperparameters
    epochs=300
    batch_size=100
    window_size=5
    word_dimension=300
    n_samples=10
    
    inputs=tf.placeholder(tf.int32,[None],name='inputs') # size is variable , inputs are indexes of words in the batch
    labels=tf.placeholder(tf.int32,[None,None],name='labels')
    with tf.variable_scope("skip_gram"):
        embedding_V=tf.Variable(tf.random_uniform((vocab_size,word_dimension),-1,1))
        embed=tf.nn.embedding_lookup(embedding_V,inputs) # chooses the given rows
        embedding_U=tf.Variable(tf.random_normal((vocab_size,word_dimension)))
        softmax_biases=tf.Variable(tf.zeros(vocab_size))
        #loss with negative_sampling
        loss=tf.nn.sampled_softmax_loss(weights=embedding_U,biases=softmax_biases,inputs=embed,labels=labels,num_sampled=n_samples,num_classes=vocab_size)
        cost=tf.reduce_mean(loss)
        optimizer=tf.train.AdamOptimizer().minimize(cost)
    
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        for epoch in range(epochs):
            cost_value=0
            batch_generator=get_batch_sg(int_words,batch_size,window_size)
            for x,y in batch_generator:
                feed_dict={inputs:x,labels:np.array(y)[:,None]}  #labels:np.array(y)[:,None] adds a dimmension, is like squeeze(1)
                _,cost_val=sess.run([optimizer,cost],feed_dict)
                cost_value+=cost_val
            print('epoch_{}'.format(epoch),'cost_value: ',cost_value)
    return embedding_V,embedding_U


In [51]:
center_embeds_sg,context_embeds_sg=train_sg(int_words,vocab_to_int)
word_embeddings=center_embeds_sg+context_embeds_sg

epoch_0 cost_value:  336.75617504119873
epoch_1 cost_value:  320.9684896469116
epoch_2 cost_value:  308.1462640762329
epoch_3 cost_value:  301.6763162612915
epoch_4 cost_value:  297.721435546875
epoch_5 cost_value:  284.02795028686523
epoch_6 cost_value:  276.26794624328613
epoch_7 cost_value:  269.2613353729248
epoch_8 cost_value:  258.493613243103
epoch_9 cost_value:  259.48313331604004
epoch_10 cost_value:  244.2716999053955
epoch_11 cost_value:  239.0437822341919
epoch_12 cost_value:  234.92295169830322
epoch_13 cost_value:  231.56304359436035
epoch_14 cost_value:  221.74530696868896
epoch_15 cost_value:  225.27306938171387
epoch_16 cost_value:  206.11533737182617
epoch_17 cost_value:  210.63945531845093
epoch_18 cost_value:  202.6174440383911
epoch_19 cost_value:  201.24781847000122
epoch_20 cost_value:  188.74456024169922
epoch_21 cost_value:  176.85069227218628
epoch_22 cost_value:  180.1824951171875
epoch_23 cost_value:  182.10415506362915
epoch_24 cost_value:  177.845038890838

epoch_202 cost_value:  14.408864200115204
epoch_203 cost_value:  14.203129380941391
epoch_204 cost_value:  14.633476048707962
epoch_205 cost_value:  14.829628735780716
epoch_206 cost_value:  14.36087229847908
epoch_207 cost_value:  14.338808357715607
epoch_208 cost_value:  15.456692457199097
epoch_209 cost_value:  15.54850059747696
epoch_210 cost_value:  14.722341388463974
epoch_211 cost_value:  15.15331345796585
epoch_212 cost_value:  15.348800837993622
epoch_213 cost_value:  14.864052891731262
epoch_214 cost_value:  15.269559264183044
epoch_215 cost_value:  14.118692606687546
epoch_216 cost_value:  14.749050915241241
epoch_217 cost_value:  12.930843979120255
epoch_218 cost_value:  15.358382910490036
epoch_219 cost_value:  14.648547917604446
epoch_220 cost_value:  13.865361362695694
epoch_221 cost_value:  14.629608184099197
epoch_222 cost_value:  13.814673274755478
epoch_223 cost_value:  16.469171434640884
epoch_224 cost_value:  14.645997166633606
epoch_225 cost_value:  13.86215126514

In [52]:
# ------------------ CBOW -------------------

def get_batch_cbow(int_words,batch_size,window_size):
    n_batches=len(int_words)//batch_size
    int_words=int_words[:n_batches*batch_size]
    center_ind=window_size//2
    
    for bath_start in range(0,len(int_words),batch_size):
        batch=int_words[bath_start:bath_start+batch_size]
        surroundings=np.ndarray((batch_size-(2*center_ind),window_size-1),np.int32)
        labels=np.ndarray((batch_size-(2*center_ind),1),np.int32)
        for i in range(center_ind,batch_size-center_ind,1):    
            center=batch[i]
            col_idx=0
            for j in range(window_size):
                if j==window_size//2:
                    continue
                else:
                    surroundings[i-center_ind,col_idx]=batch[i-center_ind+j]
                    col_idx+=1
            labels[i-center_ind,0]=center
            
        yield surroundings,labels
        

def train_cbow(int_words,vocab_to_int):
    
    # hyperparameters
    epochs=200
    batch_size=100
    window_size=5
    dimension=300
    n_samples=20
    
    half_window=window_size//2
    vocab_size=len(vocab_to_int)
    
    inputs=tf.placeholder(tf.int32,[batch_size-(2*half_window),window_size-1])
    labels=tf.placeholder(tf.int32,[batch_size-(2*half_window),1])
    with tf.variable_scope('cbow'):
    
        embeddings=tf.Variable(tf.random_uniform((vocab_size,dimension),-1,1))

        #get_avg_embed
        embeds=None
        for i in range(window_size-1):
            embedding_i=tf.nn.embedding_lookup(embeddings,inputs[:,i])
            emb_x,emb_y = embedding_i.get_shape().as_list()
            if embeds is None:
                embeds=tf.reshape(embedding_i,[emb_x,emb_y,1])
            else:
                embeds=tf.concat([embeds,tf.reshape(embedding_i,[emb_x,emb_y,1])],2)
        avg_embed=tf.reduce_mean(embeds,2,keepdims=False)

        softmax_w=tf.Variable(tf.random_normal((vocab_size,dimension)))
        softmax_b=tf.Variable(tf.zeros(vocab_size))

        loss=tf.nn.sampled_softmax_loss(weights=softmax_w,biases=softmax_b,inputs=avg_embed,labels=labels,num_sampled=n_samples,num_classes=vocab_size)
        cost=tf.reduce_mean(loss)
        optimizer=tf.train.AdamOptimizer().minimize(cost)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        for epoch in range(epochs):
            batch_generator=get_batch_cbow(int_words,batch_size,window_size)
            cost_value=0
            
            for x,y in batch_generator:
                feed_dic={inputs:x, labels:y}
                _,cost_val=sess.run([optimizer,cost],feed_dict=feed_dic)
                cost_value+=cost_val
            print('epoch_{}'.format(epoch),'cost_value: ',cost_value)
            
    return embeddings,softmax_w
        
       
            

In [53]:
center_embeds_cbow,context_embeds_cbow=train_cbow(int_words,vocab_to_int)
word_embeddings_cbow=center_embeds_cbow+context_embeds_cbow

epoch_0 cost_value:  214.35323905944824
epoch_1 cost_value:  201.5667428970337
epoch_2 cost_value:  190.93402910232544
epoch_3 cost_value:  179.54526090621948
epoch_4 cost_value:  174.1926827430725
epoch_5 cost_value:  166.70750617980957
epoch_6 cost_value:  157.18461179733276
epoch_7 cost_value:  146.86467742919922
epoch_8 cost_value:  145.0067572593689
epoch_9 cost_value:  134.81006240844727
epoch_10 cost_value:  130.02308702468872
epoch_11 cost_value:  119.76873779296875
epoch_12 cost_value:  117.49447917938232
epoch_13 cost_value:  111.73339200019836
epoch_14 cost_value:  105.98544049263
epoch_15 cost_value:  99.07962203025818
epoch_16 cost_value:  91.6895067691803
epoch_17 cost_value:  88.30649709701538
epoch_18 cost_value:  81.97561454772949
epoch_19 cost_value:  76.7828837633133
epoch_20 cost_value:  75.66244864463806
epoch_21 cost_value:  68.61311161518097
epoch_22 cost_value:  62.304757833480835
epoch_23 cost_value:  59.14339739084244
epoch_24 cost_value:  55.737908601760864
e

In [85]:
# ---------------- GLOVE -----------------
def cooccurence_mat(int_words,vocab_size,window_size):
    skip_window=window_size//2
    matrix=np.zeros((vocab_size,vocab_size),np.float32)
    # we go through the dataset and count cooccurence for every (center,context) pairs in window
    for center_ind in range(skip_window,len(int_words)-skip_window,1):
        for j in range(window_size):
            if j!=skip_window:
                matrix[int_words[center_ind],int_words[center_ind-skip_window+j]]+=1.0/abs(skip_window-j)
                matrix[int_words[center_ind-skip_window+j],int_words[center_ind]]+=1.0/abs(skip_window-j)
    return matrix

def getContexts(batch,i,window_size):
    n=window_size//2
    return list(set(batch[max(0,i-n):i]+batch[i+1:min(len(batch),i+n+1)]))

def get_batch_glove(int_words,vocab_size,batch_size,window_size):
    n_batches=len(int_words)//batch_size
    int_words=int_words[:n_batches*batch_size]
    cooccur_mat=cooccurence_mat(int_words,vocab_size,window_size)
    for batch_start in range(0,len(int_words),batch_size):
        batch=int_words[batch_start:batch_start+batch_size]
        x,y,freq=[],[],[]
        for i in range(len(batch)):
            center=batch[i]
            contexts=getContexts(batch,i,window_size)
            for i in range(len(contexts)):
                if cooccur_mat[contexts[i],center]>0:
                    freq.append(cooccur_mat[contexts[i],center])
                    x.append(center)
                    y.append(contexts[i])
        # x: indexes of centers, y: indexes of contexts, freq : number of times (center,context) cooccur
        yield x,y,freq
    
    
def train_glove(int_words,vocab_to_int):
    
    # hyperparameters
    epochs=100
    batch_size=100
    window_size=5
    dimension=300
    n_samples=20
    
    vocab_size=len(vocab_to_int)
    
    inputs=tf.placeholder(tf.int32,[None])
    labels=tf.placeholder(tf.int32,[None])
    freqs=tf.placeholder(tf.float32,[None])
    
    embedding_V=tf.Variable(tf.random_uniform([vocab_size,dimension],-1,1))
    embedding_U=tf.Variable(tf.random_uniform([vocab_size,dimension],-1,1))
    
    center_embeds=tf.nn.embedding_lookup(embedding_V,inputs)
    contexts_embeds=tf.nn.embedding_lookup(embedding_U,labels)
    
    # 2 hypermarameters
    alpha=100
    beta=3/4
    #f(freq_i)=min((freq_i/alpha)^beta,1)
    loss=tf.multiply(tf.square(tf.reduce_sum(tf.multiply(center_embeds,contexts_embeds),axis=1)-tf.log(freqs)),tf.minimum(1.0,tf.pow(tf.div(freqs,alpha),beta)))
    cost=tf.reduce_mean(loss)
    optimizer=tf.train.AdamOptimizer().minimize(cost)
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            batch_generator=get_batch_glove(int_words,vocab_size,batch_size,window_size)
            cost_value=0
            for x,y,freq in batch_generator:
                feed_dic={inputs:x , labels:y , freqs: freq}
                _,cost_val=sess.run([optimizer,cost],feed_dic)
                cost_value+=cost_val
            print('epoch_{}'.format(epoch),'cost_value: ',cost_value)
    return embedding_V,embedding_U

In [86]:
center_embeds_glove,context_embeds_glove=train_glove(int_words,vocab_to_int)
embeddings_glove=center_embeds_glove+context_embeds_glove

epoch_0 cost_value:  44.63076031208038
epoch_1 cost_value:  32.49052178859711
epoch_2 cost_value:  25.987874150276184
epoch_3 cost_value:  21.671276688575745
epoch_4 cost_value:  18.25673097372055
epoch_5 cost_value:  15.443507373332977
epoch_6 cost_value:  13.107285022735596
epoch_7 cost_value:  11.154768764972687
epoch_8 cost_value:  9.515049040317535
epoch_9 cost_value:  8.132575869560242
epoch_10 cost_value:  6.963164106011391
epoch_11 cost_value:  5.971194013953209
epoch_12 cost_value:  5.127766028046608
epoch_13 cost_value:  4.409231245517731
epoch_14 cost_value:  3.79607093334198
epoch_15 cost_value:  3.2720328345894814
epoch_16 cost_value:  2.823558524250984
epoch_17 cost_value:  2.4393343925476074
epoch_18 cost_value:  2.1098144948482513
epoch_19 cost_value:  1.8269412517547607
epoch_20 cost_value:  1.5839033722877502
epoch_21 cost_value:  1.3748552426695824
epoch_22 cost_value:  1.1948820315301418
epoch_23 cost_value:  1.0398338064551353
epoch_24 cost_value:  0.90614322014153