
## Task 1 (5 points)

Implement simplified word2vec with negative sampling from scratch (using pure numpy). Assume that in the training data objects and contexts are given explicitly, one pair per line, and objects are on the left. The result of the training should be object vectors. Please, write them to a file using *natural* text format, ie

<pre>
word1 x1_1 x1_2 ... x1_N 
word2 x2_1 x2_2 ... x2_N
...
wordK xK_1 xK_2 ... xk_N
</pre>

Use the loss from Slide 3 in Lecture NLP.2, compute the gradient manually. You can use some gradient clipping, or regularisation. 

**Remark**: the data is specially prepared to make the learning process easier. 
Present vectors using the code below. In this task we define success as 'obtaining a result which looks definitely not random'

In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
def sigmoid(x):
    sig = np.where(x < 0, np.exp(x)/(1 + np.exp(x)), 1/(1 + np.exp(-x)))
    return sig

In [113]:
# words = pd.read_csv("task1_objects_contexts_polish.txt", sep = " ", names = ["word_1", "word_2"])
# # unq_words = pd.Series(
# pd.concat([words["word_1"],words["word_2"] ]).unique()
# #     )


class Word2Vec:
    
    def __init__(self, space_dim = 20):
        self.space_dim = space_dim

    
    def upload_words(self, obj_cont_file):
        words = pd.read_csv(obj_cont_file, sep = " ", names = ["word_1", "word_2"])
        self.context_freq = words.groupby(['word_2']).agg(
            count=pd.NamedAgg(column="word_2", aggfunc="count")
        )
        self.objects = words["word_1"].unique()
        self.contexts = words["word_2"].unique()
        self.n_objects = len(self.objects)
        self.n_contexts = len(self.contexts)
        
    def get_objects(self):
        return(self.objects)
    
    def get_contexts(self):
        return(self.contexts)
    
    def get_obj_id(self, obj):
        return self.objects_to_id.loc[obj]
    
    def get_cont_id(self, cont):
        return self.contexts_to_id.loc[cont]
    
#     def get_obj_id(self, obj):
#         return self.objects_to_id.loc[obj]
    
    def make_maps(self):
        self.contexts_to_id = pd.Series(np.arange(self.n_contexts), index = self.contexts)
        self.id_to_contexts = pd.Series(self.contexts)
        self.objects_to_id = pd.Series(np.arange(self.n_objects), index = self.objects)
        self.id_to_objects = pd.Series(self.objects)
        
    def make_arrays(self):
        rng = np.random.default_rng()
        mu, sigma = 0, 0.1
        self.objects_embed = rng.normal(size = (self.n_objects, self.space_dim))
        self.contexts_embed = rng.normal(size = (self.n_contexts, self.space_dim))
        
    def calc_modified_unigram_probs(self):
        self.id_context_freq = self.context_freq.set_index(self.contexts_to_id.loc[self.context_freq.index])
        self.id_context_probs = self.id_context_freq.div(self.id_context_freq.sum(axis=0), axis=1)
        self.id_context_probs = self.id_context_probs ** (3/4)
        self.id_context_probs = self.id_context_freq.div(self.id_context_freq.sum(axis=0), axis=1)
        self.id_context_probs.sort_index(inplace = True)
        self.context_probs_array = np.squeeze(self.id_context_probs.to_numpy(copy = True))
        
    def generate_neg_samples(self, K):
        rng = np.random.default_rng()
        self.neg_samples = rng.choice(np.arange(0, self.n_contexts), size = (K, 1000000) , p = self.context_probs_array, replace = True)
        self.neg_samples_pointer = 0   

    def get_ready(self):
        self.make_maps()
        self.make_arrays()
        self.calc_modified_unigram_probs()
        self.neg_samples_pointer = None
                   
        
    def get_neg_samples(self, word_idx, K):

        # while True:
        #     if self.neg_samples_pointer > 99999:
        #         self.generate_neg_samples(K)
        #     neg_samples_indices = self.neg_samples[:,self.neg_samples_pointer]
        #     self.neg_samples_pointer += 1
            
        #     if  not np.any(np.isin(word_idx, neg_samples_indices)):
        #         break

        # while True:
        if self.neg_samples_pointer > 999999:
            self.generate_neg_samples(K)
        neg_samples_indices = self.neg_samples[:,self.neg_samples_pointer]
        self.neg_samples_pointer += 1
            
            # if  not np.any(np.isin(word_idx, neg_samples_indices)):
                # break
            
        return neg_samples_indices


        return neg_sample_indices
    
    def save_emmbedding(self, file_path):
        self.id_to_objects.sort_index(inplace = True)
        words = self.id_to_objects.to_numpy(copy = True)
        words_embed = np.hstack((words[:,np.newaxis], self.objects_embed))
#         print(words_embed)
        np.savetxt(file_path, words_embed, delimiter=" ", fmt ='%s', header = str(self.n_objects) + " " + str(self.space_dim), comments = "")
    
#     def calc_obj_fun(object_idx, context_idx, neg_contexts_indices):
    
    def train_word2vec(self, obj_cont_file_path, K = 3, lr = 0.1, reps = 1):

        if self.neg_samples_pointer is None:
            self.generate_neg_samples(K)

        for i in range(reps):
            print("Starting epoch " + str(i) )

            obj_cont_file = open(obj_cont_file_path, "r")
            line_num = 0
            for line in obj_cont_file:
                line_num += 1
                obj, cont = line.strip().split(" ")
                obj_idx = self.get_obj_id(obj)
                cont_idx = self.get_cont_id(cont)
                neg_cont_indices = self.get_neg_samples(cont_idx, K)

                obj_vec = self.objects_embed[obj_idx,:][np.newaxis, :] # row vec
                # print("obj", obj_vec.shape)
                cont_vec = self.contexts_embed[cont_idx,:][np.newaxis, :] # row vec
                # print("cont", cont_vec.shape)

                neg_cont_vecs = self.contexts_embed[neg_cont_indices,:] # one neg cont is a row

                # print("neg_cont", neg_cont_vecs.shape)

#                 obj_grad = obj_vec / (np.exp(-cont_vec.T @ obj_vec) + 1) # column vec
#                 cont_grad = (cont_vec.T / (np.exp(-cont_vec.T @ obj_vec) -1) + 
#                             (neg_cont_vecs.T / (np.exp(neg_cont_vecs.T @ obj_vec) + 1)).sum(axis = 0, keepdims = True)) # column vec

#                 neg_cont_grads = obj_vec / (np.exp(-neg_cont_vecs.T @ obj_vec) + 1).T # gradient for one u_k is one column
                
                cont_grad = (sigmoid(cont_vec @ obj_vec.T) - 1) * obj_vec # row vec            
                obj_grad = (sigmoid(cont_vec @ obj_vec.T) - 1) * cont_vec + (sigmoid(neg_cont_vecs @ obj_vec.T) * neg_cont_vecs).sum(axis = 0, keepdims = True) # row vec
                neg_cont_grads = sigmoid(neg_cont_vecs @ obj_vec.T) * neg_cont_vecs # row vec

                # print("obj_grad", obj_grad.shape)
                # print("cont_grad", cont_grad.shape)
                # print("neg_cont_grads", neg_cont_grads.shape)




                self.objects_embed[obj_idx,:] -= np.squeeze(lr * obj_grad)
                self.contexts_embed[cont_idx,:] -= np.squeeze(lr * cont_grad)

                for grad_idx, neg_idx in enumerate(neg_cont_indices):
                    # print(grad_idx, neg_idx)
                    # print(neg_cont_grads[:,grad_idx])
                    # print(lr)
                    self.contexts_embed[neg_idx,:] -= np.squeeze(lr * neg_cont_grads[grad_idx,:])

                if line_num % 100000 == 0:
                    print("now on line " + str(line_num))
                    
            obj_cont_file.close()

            self.objects_embed /= self.objects_embed.max(axis = 0)
            self.contexts_embed /= self.contexts_embed.max(axis = 0)

            print("Done with epoch " + str(i))
        
        

        
    
    

        


In [114]:
w2v = Word2Vec(20)
# w2v.upload_words("task1_objects_contexts_polish.txt")
w2v.upload_words("/content/drive/MyDrive/NN_NLP/task1_objects_contexts_polish.txt")
# context_freq = w2v.context_freq
# w2v.make_maps()
# # contexts_to_id = w2v.contexts_to_id
# w2v.make_arrays()
# id_context_freq = context_freq.set_index(contexts_to_id.loc[context_freq.index])
# id_context_freq
w2v.get_ready()
# w2v.train_word2vec("task1_objects_contexts_polish.txt")


In [118]:
# w2v.train_word2vec("task1_objects_contexts_polish.txt")
w2v.train_word2vec("/content/drive/MyDrive/NN_NLP/task1_objects_contexts_polish.txt")

w2v.save_emmbedding("task1_w2v_vectors_2.txt")

Starting epoch 0
now on line 100000
now on line 200000
now on line 300000
now on line 400000
now on line 500000
now on line 600000
now on line 700000
now on line 800000
now on line 900000
now on line 1000000
now on line 1100000
now on line 1200000
now on line 1300000
now on line 1400000
now on line 1500000
now on line 1600000
now on line 1700000
now on line 1800000
now on line 1900000
now on line 2000000
now on line 2100000
now on line 2200000
now on line 2300000
now on line 2400000
now on line 2500000
now on line 2600000
now on line 2700000
now on line 2800000
now on line 2900000
now on line 3000000
now on line 3100000
now on line 3200000
now on line 3300000
now on line 3400000
now on line 3500000
now on line 3600000
now on line 3700000
now on line 3800000
now on line 3900000
now on line 4000000
now on line 4100000
now on line 4200000
now on line 4300000
now on line 4400000
now on line 4500000
now on line 4600000
now on line 4700000
now on line 4800000
now on line 4900000
now on line 

In [124]:
w2v.train_word2vec("/content/drive/MyDrive/NN_NLP/task1_objects_contexts_polish.txt")

w2v.save_emmbedding("task1_w2v_vectors_3.txt")

Starting epoch 0
now on line 100000
now on line 200000
now on line 300000
now on line 400000
now on line 500000
now on line 600000
now on line 700000
now on line 800000
now on line 900000
now on line 1000000
now on line 1100000
now on line 1200000
now on line 1300000
now on line 1400000
now on line 1500000
now on line 1600000
now on line 1700000
now on line 1800000
now on line 1900000
now on line 2000000
now on line 2100000
now on line 2200000
now on line 2300000
now on line 2400000
now on line 2500000
now on line 2600000
now on line 2700000
now on line 2800000
now on line 2900000
now on line 3000000
now on line 3100000
now on line 3200000
now on line 3300000
now on line 3400000
now on line 3500000
now on line 3600000
now on line 3700000
now on line 3800000
now on line 3900000
now on line 4000000
now on line 4100000
now on line 4200000
now on line 4300000
now on line 4400000
now on line 4500000
now on line 4600000
now on line 4700000
now on line 4800000
now on line 4900000
now on line 

In [127]:
w2v.train_word2vec("/content/drive/MyDrive/NN_NLP/task1_objects_contexts_polish.txt")

w2v.save_emmbedding("task1_w2v_vectors_4.txt")

Starting epoch 0
now on line 100000
now on line 200000
now on line 300000
now on line 400000
now on line 500000
now on line 600000
now on line 700000
now on line 800000
now on line 900000
now on line 1000000
now on line 1100000
now on line 1200000
now on line 1300000
now on line 1400000
now on line 1500000
now on line 1600000
now on line 1700000
now on line 1800000
now on line 1900000
now on line 2000000
now on line 2100000
now on line 2200000
now on line 2300000
now on line 2400000
now on line 2500000
now on line 2600000
now on line 2700000
now on line 2800000
now on line 2900000
now on line 3000000
now on line 3100000
now on line 3200000
now on line 3300000
now on line 3400000
now on line 3500000
now on line 3600000
now on line 3700000
now on line 3800000
now on line 3900000
now on line 4000000
now on line 4100000
now on line 4200000
now on line 4300000
now on line 4400000
now on line 4500000
now on line 4600000
now on line 4700000
now on line 4800000
now on line 4900000
now on line 

In [135]:
w2v.train_word2vec("/content/drive/MyDrive/NN_NLP/task1_objects_contexts_polish.txt")

w2v.save_emmbedding("task1_w2v_vectors_5.txt")

Starting epoch 0
now on line 100000
now on line 200000
now on line 300000
now on line 400000
now on line 500000
now on line 600000
now on line 700000
now on line 800000
now on line 900000
now on line 1000000
now on line 1100000
now on line 1200000
now on line 1300000
now on line 1400000
now on line 1500000
now on line 1600000
now on line 1700000
now on line 1800000
now on line 1900000
now on line 2000000
now on line 2100000
now on line 2200000
now on line 2300000
now on line 2400000
now on line 2500000
now on line 2600000
now on line 2700000
now on line 2800000
now on line 2900000
now on line 3000000
now on line 3100000
now on line 3200000
now on line 3300000
now on line 3400000
now on line 3500000
now on line 3600000
now on line 3700000
now on line 3800000
now on line 3900000
now on line 4000000
now on line 4100000
now on line 4200000
now on line 4300000
now on line 4400000
now on line 4500000
now on line 4600000
now on line 4700000
now on line 4800000
now on line 4900000
now on line 

In [137]:
from gensim.models import KeyedVectors
# task1_wv = KeyedVectors.load_word2vec_format('task1_w2v_vectors.txt', binary=False)
# task1_wv = KeyedVectors.load_word2vec_format('task1_w2v_vectors_2.txt', binary=False)
# task1_wv = KeyedVectors.load_word2vec_format('task1_w2v_vectors_3.txt', binary=False)
# task1_wv = KeyedVectors.load_word2vec_format('task1_w2v_vectors_4.txt', binary=False)
task1_wv = KeyedVectors.load_word2vec_format('task1_w2v_vectors_5.txt', binary=False)



example_english_words = ['dog', 'dragon', 'love', 'bicycle', 'marathon', 'logic', 'butterfly']  # replace, or add your own examples
example_polish_words = ['pies', 'kot', 'miłość', 'chłopiec', 'logika', 'ustawa', 'kobieta', 'motyl']

example_words = example_polish_words

for w0 in example_words:
    print ('WORD:', w0)
    for w, v in task1_wv.most_similar(w0):
        print ('   ', w, v)
    print ()

WORD: pies
    dziewczyna 0.8752753734588623
    chłopiec 0.8642098307609558
    nadgarstek 0.8078317046165466
    antywirus 0.7606443762779236
    facet 0.757659912109375
    kobieta 0.730688214302063
    karawana 0.7299747467041016
    arciszewski 0.7172316312789917
    zwierzę 0.7157486081123352
    przechodzień 0.7089031934738159

WORD: kot
    małpa 0.8216554522514343
    kota 0.8076803684234619
    mucha 0.8019682168960571
    miś 0.796568751335144
    gęś 0.7962770462036133
    ocet 0.7945913672447205
    chłopiec 0.7893929481506348
    bez 0.7859345078468323
    anka 0.7807961106300354
    koza 0.7772140502929688

WORD: miłość
    wiara 0.8106540441513062
    mitologia 0.7806751728057861
    więź 0.7768008708953857
    fantazja 0.7763660550117493
    gynt 0.7680486440658569
    cnota 0.7609277367591858
    przyjaźń 0.7608277797698975
    wolność 0.7574243545532227
    wyobraźnia 0.7566429972648621
    młodość 0.7522676587104797

WORD: chłopiec
    dziewczyna 0.932693362236023
 

In [133]:
w2v.save_emmbedding("task1_w2v_vectors_4.txt")

In [None]:
obj_cont_file = open("task1_objects_contexts_polish.txt", "r")
for line in obj_cont_file:
    obj, cont = line.split(" ")
    print(obj, cont)
    break
obj_cont_file.close()

nagromadzenie G2_następstwo



In [98]:
100 % 10

0