In [2]:
import tensorflow as tf
import numpy as np
import tensorflow.keras.preprocessing as preprocessing
from collections import Counter
import random

In [3]:
def text_to_word_sequence(text):
    words=preprocessing.text.text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\n', lower=True, split=' ')
    return words


def create_lookup_tables(words):
    word_counts = Counter(words)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

    return vocab_to_int, int_to_vocab

In [4]:
# p_drop(word)=1-sqrt(treshold/freq(word)) , frequent words are more likely to be removed from the dataset
def subsampling(int_words):
    word_counts=Counter(int_words) # a dictionary from int_word to number of times it appeared in the text
    total_count=len(int_words)
    p_drops={word:1-np.sqrt(1e-5/(count/total_count)) for word,count in word_counts.items()}
    train_words=[word for word in int_words if random.random()<(1-p_drops[word])] # the bigger p_drop the less likely to be chosen
    return train_words

In [5]:
text=""
#reading at most 100 lines
with open('data.txt') as f:
    i=0
    for line in f.readlines():
        text+=line    
        i+=1
        if i>=100:
            break
print(text)
    
words=text_to_word_sequence(text)

vocab_to_int, int_to_vocab = create_lookup_tables(words)
int_words = [vocab_to_int[word] for word in words]
print(int_words)
train_words=subsampling(int_words)


Since the yield keyword is only used with generators, it makes sense to recall the concept of generators first.

The idea of generators is to calculate a series of results one-by-one on demand (on the fly). In the simplest case, a generator can be used as a list, where each element is calculated lazily. Lets compare a list and a generator that do the same thing - return powers of two
Iterating over the list and the generator looks completely the same. However, although the generator is iterable, it is not a collection, and thus has no length. Collections (lists, tuples, sets, etc) keep all values in memory and we can access them whenever needed. A generator calculates the values on the fly and forgets them, so it does not have any overview about the own result set.

Generators are especially useful for memory-intensive tasks, where there is no need to keep all of the elements of a memory-heavy list accessible at the same time. Calculating a series of values one-by-one can also be usefu

In [6]:
#---------------- SKIP GRAM ------------------
def get_contextes(batch,i,window_size):
    n=window_size//2
    return list(set(batch[max(0,i-n):i]+batch[i+1:min(len(batch),i+n+1)]))

#generator for batches
def get_batch_sg(words,batch_size,window_size):
    n_batches=len(words)//batch_size
    words=words[:n_batches*batch_size]
    for batch_start in range(0,len(words),batch_size):
        batch=words[batch_start:batch_start+batch_size]
        for i in range(len(batch)):
            x,y=[],[]
            center=batch[i]
            y.extend(get_contextes(batch,i,window_size))
            x.extend([center]*len(y))
        yield x,y 

In [14]:
def train_sg(words,vocab_to_int):
    vocab_size=len(vocab_to_int)
    # hyperparameters
    epochs=100
    batch_size=100
    window_size=5
    word_dimension=300
    n_samples=10
    
    inputs=tf.placeholder(tf.int32,[None],name='inputs') # size is variable , inputs are indexes of words in the batch
    labels=tf.placeholder(tf.int32,[None,None],name='labels')
    with tf.variable_scope("skip_gram"):
        embedding_V=tf.Variable(tf.random_uniform((vocab_size,word_dimension),-1,1))
        embed=tf.nn.embedding_lookup(embedding_V,inputs) # chooses the given rows
        embedding_U=tf.Variable(tf.random_normal((vocab_size,word_dimension)))
        softmax_biases=tf.Variable(tf.zeros(vocab_size))
        #loss with negative_sampling
        loss=tf.nn.sampled_softmax_loss(weights=embedding_U,biases=softmax_biases,inputs=embed,labels=labels,num_sampled=n_samples,num_classes=vocab_size)
        cost=tf.reduce_mean(loss)
        optimizer=tf.train.AdamOptimizer().minimize(cost)
    
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        for epoch in range(epochs):
            cost_value=0
            batch_generator=get_batch_sg(int_words,batch_size,window_size)
            for x,y in batch_generator:
                feed_dict={inputs:x,labels:np.array(y)[:,None]}  #labels:np.array(y)[:,None] adds a dimmension, is like squeeze(1)
                _,cost_value=sess.run([optimizer,cost],feed_dict)
                cost_value+=cost_value
            print('epoch_{}'.format(epoch),'cost_value: ',cost_value)
    return embedding_V,embedding_U


In [15]:
embedding_V,embedding_U=train_sg(words,vocab_to_int)
word_embeddings=embedding_V+embedding_U

epoch_0 cost_value:  62.023907
epoch_1 cost_value:  47.717552
epoch_2 cost_value:  54.585716
epoch_3 cost_value:  46.912094
epoch_4 cost_value:  29.777925
epoch_5 cost_value:  26.018322
epoch_6 cost_value:  20.571072
epoch_7 cost_value:  36.816338
epoch_8 cost_value:  24.565477
epoch_9 cost_value:  22.84002
epoch_10 cost_value:  24.794106
epoch_11 cost_value:  6.6287956
epoch_12 cost_value:  7.7672706
epoch_13 cost_value:  5.041242
epoch_14 cost_value:  19.385658
epoch_15 cost_value:  5.813324
epoch_16 cost_value:  2.8184562
epoch_17 cost_value:  23.947823
epoch_18 cost_value:  15.932253
epoch_19 cost_value:  13.054022
epoch_20 cost_value:  0.43791658
epoch_21 cost_value:  0.015754502
epoch_22 cost_value:  0.010242688
epoch_23 cost_value:  4.792638
epoch_24 cost_value:  4.587945
epoch_25 cost_value:  0.0007517163
epoch_26 cost_value:  0.00091312587
epoch_27 cost_value:  0.023369104
epoch_28 cost_value:  3.9572878
epoch_29 cost_value:  0.00020478327
epoch_30 cost_value:  0.00026377995
e

In [92]:
# ------------------ CBOW -------------------

def get_batch_cbow(int_words,batch_size,window_size):
    n_batches=len(int_words)//batch_size
    int_words=int_words[:n_batches*batch_size]
    center_ind=window_size//2
    
    for bath_start in range(0,len(int_words),batch_size):
        batch=int_words[bath_start:bath_start+batch_size]
        surroundings=np.ndarray((batch_size-(2*center_ind),window_size-1),np.int32)
        labels=np.ndarray((batch_size-(2*center_ind),1),np.int32)
        for i in range(center_ind,batch_size-center_ind,1):    
            center=batch[i]
            col_idx=0
            for j in range(window_size):
                if j==window_size//2:
                    continue
                else:
                    surroundings[i-center_ind,col_idx]=batch[i-center_ind+j]
                    col_idx+=1
            labels[i-center_ind,0]=center
            
        yield surroundings,labels
        

def train_cbow(int_words,vocab_to_int):
    
    # hyperparameters
    epochs=200
    batch_size=100
    window_size=5
    dimension=300
    n_samples=20
    
    half_window=window_size//2
    vocab_size=len(vocab_to_int)
    
    inputs=tf.placeholder(tf.int32,[batch_size-(2*half_window),window_size-1])
    labels=tf.placeholder(tf.int32,[batch_size-(2*half_window),1])
    with tf.variable_scope('cbow'):
    
        embeddings=tf.Variable(tf.random_uniform((vocab_size,dimension),-1,1))

        #get_avg_embed
        embeds=None
        for i in range(window_size-1):
            embedding_i=tf.nn.embedding_lookup(embeddings,inputs[:,i])
            emb_x,emb_y = embedding_i.get_shape().as_list()
            if embeds is None:
                embeds=tf.reshape(embedding_i,[emb_x,emb_y,1])
            else:
                embeds=tf.concat([embeds,tf.reshape(embedding_i,[emb_x,emb_y,1])],2)
        avg_embed=tf.reduce_mean(embeds,2,keepdims=False)

        softmax_w=tf.Variable(tf.random_normal((vocab_size,dimension)))
        softmax_b=tf.Variable(tf.zeros(vocab_size))

        loss=tf.nn.sampled_softmax_loss(weights=softmax_w,biases=softmax_b,inputs=avg_embed,labels=labels,num_sampled=n_samples,num_classes=vocab_size)
        cost=tf.reduce_mean(loss)
        optimizer=tf.train.AdamOptimizer().minimize(cost)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        for epoch in range(epochs):
            batch_generator=get_batch_cbow(int_words,batch_size,window_size)
            cost_value=0
            
            for x,y in batch_generator:
                feed_dic={inputs:x, labels:y}
                _,cost_value=sess.run([optimizer,cost],feed_dict=feed_dic)
                cost_value+=cost_value
            print('epoch_{}'.format(epoch),'cost_value: ',cost_value)
            
    return embeddings,softmax_w
        
       
            

In [93]:
embedding_U,embedding_V=train_cbow(int_words,vocab_to_int)
word_embeddings=embedding_U+embedding_V

epoch_0 cost_value:  18.463247
epoch_1 cost_value:  16.95909
epoch_2 cost_value:  18.090807
epoch_3 cost_value:  15.196023
epoch_4 cost_value:  13.884656
epoch_5 cost_value:  12.252251
epoch_6 cost_value:  13.827466
epoch_7 cost_value:  11.537938
epoch_8 cost_value:  12.163001
epoch_9 cost_value:  10.341201
epoch_10 cost_value:  9.090264
epoch_11 cost_value:  8.916907
epoch_12 cost_value:  8.668685
epoch_13 cost_value:  8.29587
epoch_14 cost_value:  7.705408
epoch_15 cost_value:  7.193191
epoch_16 cost_value:  6.596796
epoch_17 cost_value:  5.3499675
epoch_18 cost_value:  4.808045
epoch_19 cost_value:  5.202244
epoch_20 cost_value:  4.412601
epoch_21 cost_value:  3.3996048
epoch_22 cost_value:  4.1421094
epoch_23 cost_value:  3.3975832
epoch_24 cost_value:  3.9550858
epoch_25 cost_value:  3.1409779
epoch_26 cost_value:  2.7786732
epoch_27 cost_value:  3.2275574
epoch_28 cost_value:  2.8136652
epoch_29 cost_value:  2.25799
epoch_30 cost_value:  1.7740451
epoch_31 cost_value:  1.6948042
