In [1]:
from __future__ import print_function

In [2]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.functional as Func

import zipfile
import collections
import numpy as np
from collections import Counter

class skipgram(nn.Module):
  """Skip-gram model trained with negative sampling.

  Holds two embedding tables of shape ``(vocab_size, embedding_dim)``:
  ``u_embeddings`` is looked up with the centre-word indices and
  ``v_embeddings`` with the context / negative-sample indices.
  """

  def __init__(self, vocab_size, embedding_dim):
    """Create both embedding tables and initialise their weights."""
    super(skipgram, self).__init__()
    # sparse=True makes the embedding gradients sparse tensors.
    self.u_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
    self.v_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
    self.embedding_dim = embedding_dim
    self.init_emb()

  def init_emb(self):
    """Fill ``u_embeddings`` uniformly in +-0.5/embedding_dim and zero ``v_embeddings``."""
    initrange = 0.5 / self.embedding_dim
    self.u_embeddings.weight.data.uniform_(-initrange, initrange)
    self.v_embeddings.weight.data.zero_()

  def forward(self, u_pos, v_pos, v_neg, batch_size):
    """Return the negative-sampling loss divided by ``batch_size``.

    Args:
      u_pos: LongTensor ``(B,)`` of centre-word indices.
      v_pos: LongTensor ``(B,)`` of positive context-word indices.
      v_neg: LongTensor ``(B, K)`` of negative-sample indices.
      batch_size: divisor applied to the summed loss.

    Returns:
      0-dim tensor:
      ``-sum_b [log s(u_b.v_b) + sum_k log s(-u_b.n_bk)] / batch_size``
      where ``s`` is the logistic sigmoid.
    """
    embed_u = self.u_embeddings(u_pos)                 # (B, D)
    embed_v = self.v_embeddings(v_pos)                 # (B, D)

    pos_score = torch.sum(embed_u * embed_v, dim=1)    # (B,)
    log_target = F.logsigmoid(pos_score)

    neg_embed_v = self.v_embeddings(v_neg)             # (B, K, D)
    # squeeze(2) drops only the trailing bmm axis, so B == 1 or K == 1
    # keep their (B, K) layout instead of being collapsed.
    neg_score = torch.bmm(neg_embed_v, embed_u.unsqueeze(2)).squeeze(2)
    # log-sigmoid is applied per negative sample and *then* summed;
    # summing the raw scores first would optimise a different objective.
    sum_log_sampled = F.logsigmoid(-neg_score).sum(dim=1)  # (B,)

    loss = log_target + sum_log_sampled
    return -loss.sum() / batch_size

  def save_word_vectors(self, file_name, id2word=None):
    """Write one ``word v1 v2 ... vD`` line per row of ``u_embeddings``.

    Args:
      file_name: path of the text file to create (UTF-8).
      id2word: optional mapping or sequence from row index to word. Rows
        with no entry are skipped. When omitted, the row index itself is
        written in place of the word.
    """
    weights = self.u_embeddings.weight.detach().cpu()
    with open(file_name, 'w', encoding='utf-8') as out:
      for idx, row in enumerate(weights):
        if id2word is None:
          word = idx
        else:
          try:
            word = id2word[idx]
          except (KeyError, IndexError):
            continue  # no word is associated with this row
        vector = ' '.join('%.6f' % value for value in row.tolist())
        out.write('%s %s\n' % (word, vector))

In [None]:
class Word2Vec:
  """Builds skip-gram training pairs from a text file and trains a ``skipgram`` model."""

  def __init__(self, input_file='Dataset2.txt', vocabulary_size=100000, embedding_dim=200, epoch_num=20, batch_size=16, windows_size=5, neg_sample_num=10):
    """Read ``input_file``, preprocess it and build the vocabulary.

    Args:
      input_file: path of the text corpus.
      vocabulary_size: embedding-table size passed to the model; the
        vocabulary keeps at most this many (most frequent) tokens.
      embedding_dim: dimensionality of the word vectors.
      epoch_num: number of passes made by ``train``.
      batch_size: number of pairs per optimisation step.
      windows_size: context words taken on each side of a centre word.
      neg_sample_num: negative samples drawn per positive pair.
    """
    self.embedding_dim = embedding_dim
    self.windows_size = windows_size
    self.vocabulary_size = vocabulary_size
    self.batch_size = batch_size
    self.epoch_num = epoch_num
    self.neg_sample_num = neg_sample_num
    self.context_size = 2
    self.input_file = input_file

    # Context manager so the file handle is closed deterministically.
    with open(self.input_file, 'r') as corpus:
      self.raw_text = corpus.read()
    self.preprocessed_text = self.your_preprocessing(self.raw_text)
    # Frequency-ranked indices are reproducible between runs (set iteration
    # order is not) and, being capped at vocabulary_size, always fit the
    # embedding tables created in train().
    ranked = Counter(self.preprocessed_text).most_common(self.vocabulary_size)
    self.word_to_ix = {word: i for i, (word, _) in enumerate(ranked)}

  def your_preprocessing(self, raw_text):
    """Turn the raw corpus into a list of tokens.

    The default splits on whitespace; override this hook for lower-casing,
    punctuation stripping, etc.
    """
    return raw_text.split()

  def make_all_word_pairs(self):
    """Build every (centre, context) pair plus negative samples.

    Tokens missing from ``self.word_to_ix`` are dropped first. Each
    remaining token is paired with up to ``self.windows_size`` neighbours
    on either side. Negatives are drawn at random with probability
    proportional to token count ** 0.75; they are not filtered against
    the positive context word.

    Returns:
      ``(pos_u, pos_v, neg_v)`` int64 arrays of shapes ``(P,)``, ``(P,)``
      and ``(P, self.neg_sample_num)``, where ``P`` is the number of pairs
      (about ``N * 2 * self.windows_size`` for ``N`` tokens). Values are
      indices from ``self.word_to_ix``. The same data is stored on
      ``self.pos_u``, ``self.pos_v`` and ``self.neg_v``.
    """
    ids = [self.word_to_ix[tok] for tok in self.preprocessed_text
           if tok in self.word_to_ix]
    total = len(ids)

    centres, contexts = [], []
    for position, centre in enumerate(ids):
        first = max(0, position - self.windows_size)
        last = min(total, position + self.windows_size + 1)
        for neighbour in range(first, last):
            if neighbour != position:
                centres.append(centre)
                contexts.append(ids[neighbour])

    if centres:
        weights = np.bincount(ids, minlength=len(self.word_to_ix)).astype(np.float64) ** 0.75
        negatives = np.random.choice(len(weights),
                                     size=(len(centres), self.neg_sample_num),
                                     p=weights / weights.sum())
    else:
        # Nothing to sample for; keep the documented 2-D shape.
        negatives = np.empty((0, self.neg_sample_num), dtype=np.int64)

    self.pos_u = centres
    self.pos_v = contexts
    self.neg_v = negatives
    return np.array(self.pos_u, dtype=np.int64), np.array(self.pos_v, dtype=np.int64), np.array(self.neg_v, dtype=np.int64)

  def train(self, max_batches=5000):
    """Train a ``skipgram`` model with SGD and save its vectors to 'word2vec.txt'.

    Args:
      max_batches: upper bound on the batches processed per epoch; the
        effective count never exceeds what the data provides. ``None``
        uses every batch.

    Raises:
      ValueError: if there is no batch to train on.
    """
    pos_u_all, pos_v_all, neg_v_all = self.make_all_word_pairs()
    N = len(pos_u_all)

    starts = range(0, N, self.batch_size)
    if max_batches is not None:
        starts = starts[:max_batches]
    if len(starts) == 0:
        raise ValueError('no training batches available (%d pairs built from %r)'
                         % (N, self.input_file))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = skipgram(self.vocabulary_size, self.embedding_dim).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.2) # choose your own optimization function

    for epoch in range(self.epoch_num):
        for start in starts:
            stop = start + self.batch_size
            pos_u = torch.from_numpy(pos_u_all[start:stop]).to(device)
            pos_v = torch.from_numpy(pos_v_all[start:stop]).to(device)
            neg_v = torch.from_numpy(neg_v_all[start:stop]).to(device)

            optimizer.zero_grad()
            # The final batch may be short; normalise by its real length.
            loss = model(pos_u, pos_v, neg_v, len(pos_u))
            loss.backward()
            optimizer.step()
        print('Epoch:', epoch)
        # .item() reads the scalar; indexing a 0-dim tensor raises on PyTorch >= 0.4.
        print('Loss:', loss.item())
    model.save_word_vectors('word2vec.txt')