In [1]:
import numpy as np
import pandas as pd
import os, sys, random, time
import gensim, spacy, nltk, re, string
from collections import Counter
from sklearn.utils import shuffle
import unicodedata as ud
from IPython import display
import pickle, json
import progressbar
%matplotlib inline
from gensim.models import Word2Vec, FastText

# Word2Vec

In [6]:
class word2vec():
  """Skip-gram word2vec with negative sampling, implemented with NumPy.

  Every vocabulary token owns two vectors of length ``size``: ``Center[token]``
  is used when the token is the center word and ``Context[token]`` when it is
  a context (positive) or sampled (negative) word. Training performs per-pair
  SGD on the binary cross-entropy of ``sigmoid(Center[c] . Context[x])``.

  Parameters
  ----------
  corpus : iterable of token sequences, optional
      When given, the vocabulary, sampling weights and embeddings are built
      immediately via ``preprocess``.
  window : int
      Total window width; at most ``window // 2`` tokens are used per side.
  size : int
      Embedding dimensionality.
  k : int
      Negative-sampling multiplier: ``(window // 2) * k`` negatives are drawn
      for every center token.
  """

  def __init__(self, corpus=None, window=5, size=50, k=2):
    self.Center = None
    self.Context = None
    # Half-width of the window, clamped to 1 so get_context always has a
    # non-empty range of radii to draw from.
    self.h = max(1, int(np.floor(window/2)))
    self.k = k
    self.size = size
    self.vocab = None
    self.weights = None
    # Mean loss per epoch of the most recent train() call.
    self.losses = []
    if corpus is not None:
      self.preprocess(corpus)

  def fit(self, corpus=None, epochs=5, train=True, lr=0.001):
    """Build the vocabulary if needed and optionally train.

    Returns the list of mean per-epoch losses when ``train`` is true,
    otherwise ``None``.

    Raises
    ------
    ValueError
        If ``corpus`` is ``None`` while it is needed (no vocabulary yet, or
        ``train`` is true).
    """
    if corpus is None and (self.vocab is None or train):
      raise ValueError('corpus is required to build the vocabulary and to train')

    if self.vocab is None:
      self.preprocess(corpus)

    if train:
      return self.train(corpus, epochs, lr)
    return None

  def preprocess(self, corpus):
    """Build ``vocab``, the negative-sampling distribution and the embeddings.

    Sampling weights are the unigram frequencies raised to the power 0.75 and
    renormalised to sum to 1.

    Raises
    ------
    ValueError
        If the corpus contains no tokens.
    """
    counter = Counter(token for row in corpus for token in row)
    num_tokens = sum(counter.values())
    if num_tokens == 0:
      raise ValueError('corpus contains no tokens')
    self.vocab = list(counter.keys())
    # Explicit ndarray: np.random.choice needs probabilities, and dividing a
    # plain list in place only worked through implicit NumPy coercion.
    smoothed = (np.array(list(counter.values()), dtype=float)/num_tokens)**0.75
    self.weights = smoothed/smoothed.sum()
    self.initialize_embeddings()

  def dot(self, C, X):
    """Return the dot product of a center vector and a context vector."""
    return np.dot(C, X)

  def loss(self, p, y):
    """Binary cross-entropy of prediction ``p`` against label ``y`` (0 or 1)."""
    # Keep p strictly inside (0, 1) so log() never yields inf/nan.
    eps = 1e-12
    p = np.clip(p, eps, 1 - eps)
    return -y*np.log(p) -(1-y)*np.log(1-p)

  def sigmoid(self, z):
    """Logistic function 1 / (1 + exp(-z))."""
    # tanh form is algebraically identical and does not overflow for large |z|.
    return 0.5*(1.0 + np.tanh(0.5*z))

  def feedforward(self, C, X):
    """Predicted probability that ``X`` is a true context of ``C``."""
    return self.sigmoid(self.dot(C, X))

  def backprop(self, c, x, y, p, lr=0.001):
    """Take one SGD step for the pair (center ``c``, context ``x``).

    With p = sigmoid(Center[c] . Context[x]) the cross-entropy gradient is
    (p - y) * Context[x] w.r.t. the center vector and (p - y) * Center[c]
    w.r.t. the context vector. ``update_embeddings`` subtracts the deltas.
    """
    error = p - y
    # Both deltas are computed from the pre-update vectors.
    delta_C = lr*error*self.Context[x]
    delta_X = lr*error*self.Center[c]

    self.update_embeddings(c, x, delta_C, delta_X)

  def update_embeddings(self, c, x, delta_C, delta_X):
    """Subtract the given deltas from the center and context vectors in place."""
    self.Center[c] -= delta_C
    self.Context[x] -= delta_X

  def initialize_embeddings(self):
    """Create random vectors in [0, 0.1) for every vocabulary token lacking one.

    Existing vectors are left untouched, so calling this again after the
    vocabulary has grown only fills in the missing tokens.
    """
    for table_name in ('Center', 'Context'):
      table = getattr(self, table_name)
      if table is None:
        table = {}
        setattr(self, table_name, table)
      for token in self.vocab:
        if token not in table:
          table[token] = np.random.rand(self.size, )*0.1

  def get_context(self, ind, token, row):
    """Return the tokens around position ``ind`` of ``row``.

    The radius is drawn uniformly from 1..self.h (inclusive) on every call;
    the token at ``ind`` itself is not included.
    """
    # randint's upper bound is exclusive, hence the +1.
    h = np.random.randint(1, self.h + 1)
    before = row[max(0, ind-h):ind]
    after = row[ind+1:ind+h+1]
    return list(before) + list(after)

  def negative_sample(self, token, context):
    """Draw ``self.h * self.k`` negatives that are neither ``token`` nor in ``context``.

    Returns an empty list when the vocabulary has no token outside
    ``context + [token]``.
    """
    excluded = set(context) | {token}
    if excluded.issuperset(self.vocab):
      return []
    return [self.sample(token, context) for i in range(self.h*self.k)]

  def sample(self, token, context):
    """Draw one vocabulary token by ``self.weights``, rejecting ``token`` and ``context``.

    Raises
    ------
    ValueError
        If every vocabulary token is excluded (rejection could never succeed).
    """
    excluded = set(context) | {token}
    if excluded.issuperset(self.vocab):
      raise ValueError('no vocabulary token left to draw as a negative sample')
    # Iterative rejection sampling; drawing an index avoids converting the
    # whole vocabulary to an array on each call.
    while True:
      candidate = self.vocab[np.random.choice(len(self.vocab), p=self.weights)]
      if candidate not in excluded:
        return candidate

  def train(self, corpus, epochs=5, lr=0.001):
    """Run ``epochs`` passes of per-pair SGD over a freshly shuffled corpus.

    Prints the mean loss and wall time of every epoch, stores the per-epoch
    mean losses in ``self.losses`` and returns them.
    """
    steps = 10
    losses = []
    for epoch in range(epochs):
      print(f'Epoch: {epoch}')
      corpus = shuffle(corpus)
      # Row index -> progress step (0..steps); the last row maps to `steps`.
      marks = np.linspace(0, max(len(corpus) - 1, 0), steps + 1).astype(int)
      checkpoints = {int(row_ind): step for step, row_ind in enumerate(marks)}
      # A fresh, started bar per epoch: a finished bar must not be reused.
      bar = progressbar.ProgressBar(maxval=steps, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
      bar.start()
      epoch_loss = []
      start_time = time.time()

      for j, row in enumerate(corpus):

        for i, center in enumerate(row):
          context = self.get_context(i, center, row)
          negatives = self.negative_sample(center, context)
          pairs = [(x, 1) for x in context] + [(x, 0) for x in negatives]
          for x, y in pairs:
            p = self.feedforward(self.Center[center], self.Context[x])
            epoch_loss.append(self.loss(p, y))
            # Forward the caller's learning rate (previously dropped).
            self.backprop(center, x, y, p, lr)

        if j in checkpoints:
          bar.update(checkpoints[j])

      bar.finish()
      mean_loss = float(np.mean(epoch_loss)) if epoch_loss else float('nan')
      print(f'\nloss: {mean_loss}, \nepoch time: {self.time(start_time)}')
      losses.append(mean_loss)

    self.losses = losses
    return losses

  def time(self, start):
    """Format the time elapsed since ``start`` as 'H:M:S' (no zero padding)."""
    runtime = time.time()-start
    hours = runtime//3600
    runtime = runtime - 3600*hours
    minutes = runtime//60
    seconds = runtime - 60*minutes
    return ':'.join([str(int(t)) for t in [hours, minutes, seconds]])

### Hyperparameters

In [7]:
# Model / training hyperparameters.
SIZE = 50      # embedding dimensionality
EPOCHS = 20    # passes over the corpus
WINDOW = 5     # total context window width
LR = 0.001     # SGD learning rate
K = 2          # negative-sampling multiplier
SEED = 42      # seed for the random generators used during training

# Single place to change when the project folder moves.
DRIVE_ROOT = '/content/drive/My Drive/NLP Urdu'
CORPUS_PATH = f'{DRIVE_ROOT}/data/corpus.txt'
MODEL_PATH = f'{DRIVE_ROOT}/Models/Word Embeddings/word2vec.pkl'

# Training draws from NumPy's global generator (initialisation, window sizes,
# negative samples); seed it so a Restart & Run All reproduces the run.
random.seed(SEED)
np.random.seed(SEED)

### Train

In [8]:
# Load the corpus as a list of token lists, one per non-empty line.
# - `with` closes the file handle (the bare open() leaked it).
# - Explicit UTF-8 so the text decodes the same regardless of platform default.
# - Each line is split on whitespace: the model iterates `for token in row`,
#   so leaving rows as raw strings would train on single characters.
#   NOTE(review): assumes the corpus is already whitespace-tokenised - confirm.
with open(CORPUS_PATH, encoding='utf-8') as f:
  corpus = [tokens for tokens in (line.split() for line in f) if tokens]

In [None]:
# Build an untrained model from the hyperparameters, then let fit() create the
# vocabulary from `corpus` and run the training epochs.
model = word2vec(
  size=SIZE,
  window=WINDOW,
  k=K,
)
model.fit(corpus=corpus, epochs=EPOCHS, train=True, lr=LR)

In [None]:
# Persist the trained model. `dump` was never imported (only `import pickle`),
# so the call must be qualified. Loading the pickle later requires the
# `word2vec` class definition to be available in the loading session.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
with open(MODEL_PATH, 'wb') as f:
  pickle.dump(model, f)