<a href="https://colab.research.google.com/github/ronykroy/NLP/blob/master/Word2vec_in_python_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Source :: https://nathanrooy.github.io/posts/2018-03-22/word2vec-from-scratch-with-python-and-numpy/?source=post_page-----13445eebd281----------------------

In [0]:
import numpy as np
import re
from collections import defaultdict


In [0]:
class word2vec():
    """Minimal skip-gram word2vec model with a full softmax, written in NumPy.

    The network has no biases and a single linear hidden layer:

        h = w1.T @ x          (hidden layer / embedding of the target word)
        u = w2.T @ h          (one score per vocabulary word)
        y = softmax(u)        (predicted context-word distribution)

    ``w1`` (v_count x n) is the embedding matrix and ``w2`` (n x v_count) is
    the context matrix.  Because ``x`` is one-hot, ``w1.T @ x`` simply selects
    the row of ``w1`` that belongs to the target word, so that row is the
    "word vector".  Words that appear in similar contexts are pushed towards
    similar output distributions and therefore towards similar word vectors.
    """

    def __init__(self, settings=None):
        """Store the hyper-parameters of the model.

        Args:
            settings: mapping with the keys ``'n'`` (embedding size),
                ``'learning_rate'``, ``'epochs'`` and ``'window_size'``
                (number of context words taken on each side of the target).
                When omitted, a module-level variable named ``settings`` is
                used, which is what the original zero-argument constructor
                relied on.

        Raises:
            NameError: if ``settings`` is omitted and no module-level
                ``settings`` exists.
            KeyError: if one of the required keys is missing.
        """
        if settings is None:
            # Backward compatibility: ``word2vec()`` used to read the global.
            try:
                settings = globals()['settings']
            except KeyError:
                raise NameError("name 'settings' is not defined") from None

        self.n = settings['n']
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']

    # GENERATE TRAINING DATA
    def generate_training_data(self, settings, corpus):
        """Build the vocabulary and the (target, context) training pairs.

        Args:
            settings: unused; kept so existing callers keep working.
            corpus: iterable of sentences, each sentence being a sequence of
                words.  It is iterated twice, so it must not be a one-shot
                iterator.

        Returns:
            An object array of shape ``(num_words_in_corpus, 2)``.  Column 0
            holds the one-hot list of the target word, column 1 holds the list
            of one-hot lists of its context words (possibly empty).

        Side effects:
            Sets ``v_count``, ``words_list``, ``word_index`` and
            ``index_word`` on the instance.
        """
        # GENERATE WORD COUNTS
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1

        self.v_count = len(word_counts)

        # GENERATE LOOKUP DICTIONARIES (vocabulary is kept in sorted order)
        self.words_list = sorted(word_counts)
        self.word_index = {word: i for i, word in enumerate(self.words_list)}
        self.index_word = dict(enumerate(self.words_list))

        samples = []
        for sentence in corpus:
            sent_len = len(sentence)
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)

                # Clamp the sliding window to the sentence boundaries.
                first = max(0, i - self.window)
                last = min(sent_len, i + self.window + 1)
                w_context = [
                    self.word2onehot(sentence[j])
                    for j in range(first, last)
                    if j != i
                ]
                samples.append((w_target, w_context))

        # The rows are ragged (context sizes differ near sentence edges), so
        # ``np.array(samples)`` raises on recent NumPy versions.  Fill an
        # object array cell by cell instead; every cell keeps its plain list.
        training_data = np.empty((len(samples), 2), dtype=object)
        for row_id, (w_target, w_context) in enumerate(samples):
            training_data[row_id, 0] = w_target
            training_data[row_id, 1] = w_context
        return training_data

    # SOFTMAX ACTIVATION FUNCTION
    def softmax(self, x):
        """Return the softmax of ``x`` along axis 0.

        The maximum of ``x`` is subtracted before exponentiating so that
        ``np.exp`` cannot overflow; this does not change the result.
        """
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    # CONVERT WORD TO ONE HOT ENCODING
    def word2onehot(self, word):
        """Return ``word`` as a one-hot list of length ``v_count``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        word_vec = [0] * self.v_count
        word_vec[self.word_index[word]] = 1
        return word_vec

    # FORWARD PASS
    def forward_pass(self, x):
        """Run the network on the one-hot target vector ``x``.

        Returns:
            ``(y_c, h, u)``: the softmax output, the hidden layer and the raw
            output scores.
        """
        h = np.dot(self.w1.T, x)   # (n, v_count) . (v_count,) -> (n,)
        u = np.dot(self.w2.T, h)   # (v_count, n) . (n,)       -> (v_count,)
        y_c = self.softmax(u)
        return y_c, h, u

    # BACKPROPAGATION
    def backprop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            e: prediction error summed over the context words, length v_count.
            h: hidden layer produced by ``forward_pass``.
            x: one-hot vector of the target word.
        """
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        # UPDATE WEIGHTS
        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)

    # TRAIN W2V model
    def train(self, training_data):
        """Train the model and print the summed loss after every epoch.

        Args:
            training_data: pairs ``(w_t, w_c)`` as returned by
                ``generate_training_data``.

        Side effects:
            (Re-)initialises ``w1`` and ``w2`` uniformly in [-0.8, 0.8) and
            stores the loss of the last epoch in ``self.loss``.
        """
        # INITIALIZE WEIGHT MATRICES
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))     # embedding matrix
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))     # context matrix

        # CYCLE THROUGH EACH EPOCH
        for i in range(self.epochs):

            self.loss = 0

            # CYCLE THROUGH EACH TRAINING SAMPLE
            for w_t, w_c in training_data:
                # A word without neighbours (e.g. a one-word sentence) carries
                # no training signal and would break the shapes in backprop.
                if len(w_c) == 0:
                    continue

                # FORWARD PASS
                y_pred, h, u = self.forward_pass(w_t)

                # CALCULATE ERROR (summed over all context words)
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # BACKPROPAGATION
                self.backprop(EI, h, w_t)

                # CALCULATE LOSS: -sum(u[context]) + |context| * log(sum(exp(u)))
                # The log-sum-exp is shifted by max(u) to avoid overflow,
                # mirroring what softmax() does.
                context_ids = [int(np.argmax(word)) for word in w_c]
                u_max = np.max(u)
                log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))
                self.loss += -np.sum(u[context_ids]) + len(w_c) * log_norm

            print('EPOCH:', i, 'LOSS:', self.loss)

    # input a word, returns a vector (if available)
    def word_vec(self, word):
        """Return the embedding (row of ``w1``) of ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        return self.w1[self.word_index[word]]

    def _ranked_similarities(self, vec):
        """Return ``(word, cosine similarity to vec)`` pairs, best match first."""
        vec_norm = np.linalg.norm(vec)
        scored = []
        for i in range(self.v_count):
            candidate = self.w1[i]
            theta = np.dot(vec, candidate) / (vec_norm * np.linalg.norm(candidate))
            scored.append((self.index_word[i], theta))
        # sorted() is stable, so ties keep vocabulary order.
        return sorted(scored, key=lambda entry: entry[1], reverse=True)

    # input a vector, returns nearest word(s)
    def vec_sim(self, vec, top_n):
        """Print the ``top_n`` vocabulary words whose embeddings are closest
        (by cosine similarity) to ``vec``."""
        for word, sim in self._ranked_similarities(vec)[:top_n]:
            print(word, sim)

    # input word, returns top [n] most similar words
    def word_sim(self, word, top_n):
        """Print the ``top_n`` vocabulary words most similar to ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        self.vec_sim(self.w1[self.word_index[word]], top_n)

In [0]:
word_counts = defaultdict(int)

In [0]:
word_counts

defaultdict(int, {})

In [0]:
corpus = [['the','quick','brown','fox','jumped','over','the','lazy','dog']]

In [0]:
# Tally how often every word occurs across all sentences of the corpus.
# word_counts is a defaultdict(int), so unseen words start from 0.
for row in corpus:
    for word in row:
        word_counts[word] = word_counts[word] + 1

        

In [0]:
word_counts

defaultdict(int,
            {'brown': 1,
             'dog': 1,
             'fox': 1,
             'jumped': 1,
             'lazy': 1,
             'over': 1,
             'quick': 1,
             'the': 2})

In [0]:
# Hyper-parameters for the run. The word2vec class in this notebook reads
# 'n', 'window_size', 'epochs' and 'learning_rate'; 'min_count' and
# 'neg_samp' are not read by it.
settings = {
    'n': 5,                   # dimension of word embeddings
    'window_size': 2,         # context window +/- center word
    'min_count': 0,           # minimum word count
    'epochs': 5000,           # number of training epochs
    'neg_samp': 10,           # number of negative words to use during training
    'learning_rate': 0.01,    # learning rate
}

# Seed NumPy's global RNG so the weight initialisation is reproducible.
np.random.seed(0)

# A single tokenised sentence serves as the whole corpus.
corpus = [['the','quick','brown','fox','jumped','over','the','lazy','dog']]

# Build the model, derive the (target, context) pairs, then fit the weights.
w2v = word2vec()
training_data = w2v.generate_training_data(settings, corpus)
w2v.train(training_data)


EPOCH: 0 LOSS: 68.37096376709992
EPOCH: 1 LOSS: 67.88313773129582
EPOCH: 2 LOSS: 67.43127505997181
EPOCH: 3 LOSS: 67.0112049943476
EPOCH: 4 LOSS: 66.61935410862232
EPOCH: 5 LOSS: 66.25264389886307
EPOCH: 6 LOSS: 65.9084083928434
EPOCH: 7 LOSS: 65.5843274465069
EPOCH: 8 LOSS: 65.2783724111086
EPOCH: 9 LOSS: 64.98876161794144
EPOCH: 10 LOSS: 64.71392370243775
EPOCH: 11 LOSS: 64.45246722531292
EPOCH: 12 LOSS: 64.20315538093011
EPOCH: 13 LOSS: 63.96488483826672
EPOCH: 14 LOSS: 63.736667956900455
EPOCH: 15 LOSS: 63.51761777345709
EPOCH: 16 LOSS: 63.30693527348039
EPOCH: 17 LOSS: 63.103898557554714
EPOCH: 18 LOSS: 62.90785358463229
EPOCH: 19 LOSS: 62.71820623435013
EPOCH: 20 LOSS: 62.5344154770562
EPOCH: 21 LOSS: 62.355987477886515
EPOCH: 22 LOSS: 62.18247049153716
EPOCH: 23 LOSS: 62.01345042888875
EPOCH: 24 LOSS: 61.8485469965609
EPOCH: 25 LOSS: 61.687410326726464
EPOCH: 26 LOSS: 61.529718027831066
EPOCH: 27 LOSS: 61.375172597813304
EPOCH: 28 LOSS: 61.22349915045999
EPOCH: 29 LOSS: 61.07444

In [0]:
w2v.words_list

['brown', 'dog', 'fox', 'jumped', 'lazy', 'over', 'quick', 'the']

In [0]:
#w2v.word_vec('doge') 
# Possible improvement: handle the KeyError that is raised when the word does not exist in the vocabulary.

In [0]:
w2v.word_vec('dog')

array([ 0.98920247,  0.54087254,  0.00658507,  3.04401467, -1.11124155])

In [0]:
w2v.vec_sim(w2v.word_vec('dog'), 2) 
# This is where wrapping the model in a class pays off: the lookup is guaranteed to run against the w2v model that was trained on this particular corpus.
# Given the limited vocabulary, the results are a tad disappointing.

dog 1.0
over 0.6268815024010889


In [0]:
w2v.word_sim('dog', 2) # interesting that over is most closely attached to dog... :)

dog 1.0
over 0.6268815024010889


### some debugging...

In [0]:
n = 5 # say we want word embeddings of size 5; usually the size is in the range of 100 to 300, and beyond 300 you experience diminishing returns

In [0]:
v_count = len(word_counts.keys())

In [0]:
v_count

8

In [0]:
w1 = np.random.uniform(-0.8, 0.8, (v_count, n))

In [0]:
w1

array([[ 0.38124162, -0.22493038,  0.36001609,  0.37986577, -0.1982128 ],
       [-0.74560557, -0.18165655,  0.47181929,  0.0169721 ,  0.05965399],
       [-0.5833557 ,  0.48368682, -0.36368072, -0.63716429,  0.44846639],
       [ 0.22461991,  0.37057726, -0.0097097 , -0.17843582,  0.66563951],
       [ 0.57894744,  0.6148898 , -0.54854941,  0.64356732, -0.00740357],
       [ 0.17540658,  0.63163639, -0.30545024, -0.25976584, -0.37109023],
       [-0.42026305,  0.0462256 , -0.79040948, -0.44317374,  0.70064574],
       [-0.07958058, -0.43943299,  0.70873324,  0.24438961,  0.41438924]])

In [0]:
# place holder
vec = w1[3]

In [0]:
#GENERATE LOOKUP DICTIONARIES
# Vocabulary in ascending (alphabetical) order.
words_list = sorted(word_counts)
# Forward lookup: word -> integer position in words_list.
word_index = {token: position for position, token in enumerate(words_list)}
# Reverse lookup: integer position -> word.
index_word = dict(enumerate(words_list))

In [0]:
 # CYCLE THROUGH VOCAB
# Stand-alone copy of the similarity loop from the class, for debugging.
# The statements are dedented to column 0: as top-level code, the indented
# form is an "unexpected indent" error outside of IPython.
# For every row of w1, compute the cosine similarity to the placeholder `vec`
# and store it under the corresponding word.
word_sim = {}
for i in range(v_count):
    v_w2 = w1[i]
    theta_num = np.dot(vec, v_w2)                             # dot product
    theta_den = np.linalg.norm(vec) * np.linalg.norm(v_w2)    # product of the two vector lengths
    theta = theta_num / theta_den                             # cosine of the angle between them

    word = index_word[i]
    word_sim[word] = theta

       

In [0]:
word_sim.items()

dict_items([('brown', -0.34589990862375336), ('dog', -0.2757259436522549), ('fox', 0.49722030549108004), ('jumped', 0.9999999999999998), ('lazy', 0.2502249726125057), ('over', 0.10907028890938095), ('quick', 0.47871844651122014), ('the', 0.05674245997691591)])

In [0]:
words_sorted = sorted(word_sim.items(), key = lambda entry: entry[1] , reverse=True)

In [0]:
words_sorted

[('jumped', 0.9999999999999998),
 ('fox', 0.49722030549108004),
 ('quick', 0.47871844651122014),
 ('lazy', 0.2502249726125057),
 ('over', 0.10907028890938095),
 ('the', 0.05674245997691591),
 ('dog', -0.2757259436522549),
 ('brown', -0.34589990862375336)]

In [0]:
  # NOTE: kept as a record of the failed attempt. This is Python 2-only syntax:
  # tuple-unpacking lambda parameters and the print statement are both invalid
  # in Python 3. `top_n` is also not defined in any cell shown in this notebook.
  words_sorted = sorted(word_sim.items(), key=lambda(word, sim):sim, reverse=True)

        for word, sim in words_sorted[:top_n]:
            print word, sim

NameError: ignored

In [0]:
d_test = defaultdict(int)

In [0]:
d_test = {'ciao': 17, 'bye': 14, 'hello': 23}

In [0]:
d_test.items()

dict_items([('ciao', 17), ('bye', 14), ('hello', 23)])

In [0]:
sorted(d_test.items())

[('bye', 14), ('ciao', 17), ('hello', 23)]

In [0]:
dir(d_test.items())

['__and__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__rand__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__ror__',
 '__rsub__',
 '__rxor__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__xor__',
 'isdisjoint']

In [0]:
# Python 2-style lambda with a tuple parameter; Python 3 removed this form, which is why this cell fails.
sorted(d_test.items(), key=lambda(k,v): v)

SyntaxError: ignored

In [0]:
# Python 3 form: the key function receives each (key, value) pair as a single tuple and sorts on its value.
sorted(d_test.items(), key=lambda item :item[1])
# Multi-key example for a list of dicts: sorted(lis, key = lambda i: (i['age'], i['name']))

[('bye', 14), ('ciao', 17), ('hello', 23)]