In [0]:
from google.colab import drive
# Attach Google Drive at /content/gdrive; force_remount=True requests a remount on re-run.
drive.mount(mountpoint='/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [0]:
from keras.preprocessing.text import Tokenizer

In [0]:
# Vocabulary size handed to the embedding layers: 62549 words plus one extra index.
# NOTE(review): the +1 is presumably for index 0, which the Keras Tokenizer does not
# assign to any word — confirm against how the training data was built.
vocab_size = 62549 + 1 # At least 3 occurrences needed for a word to be counted

In [0]:
# Copy the gdrive.py helper module from the mounted Drive into the current
# working directory so that `from gdrive import ...` can find it.
!rsync -P "/content/gdrive/My Drive/NLP/gdrive.py" .

gdrive.py
            964 100%    0.00kB/s    0:00:00              964 100%    0.00kB/s    0:00:00 (xfr#1, to-chk=0/1)


In [0]:
from gdrive import download_file_from_google_drive
import pickle

In [0]:
# (Google Drive file id, local destination) for every artefact the notebook needs.
for drive_file_id, local_destination in (
    ('178Ulc0pKZCFVMKK0P4V3ldZaxpp35Lh0', '/content/tokenizer.pickle'),
    ('1-0V1subnINsCm1rFa36PTkgVmKjp7SeT', '/content/NepWord2VecTrainingData.npy'),
):
    download_file_from_google_drive(drive_file_id, local_destination)

In [0]:
# Load the pickled tokenizer.
# The absolute path matches the destination the tokenizer was downloaded to, so the
# load does not depend on the notebook's current working directory.
# SECURITY: pickle.load can execute arbitrary code while deserialising; only unpickle
# files that come from a source you trust.
with open('/content/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [0]:
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras.layers import Input, Dense, Reshape
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers.merge import dot
from tensorflow.python.keras.models import Model,load_model

In [0]:
from tensorflow.python.keras.initializers import TruncatedNormal
from tensorflow.python.keras.optimizers import Adagrad

In [0]:
class Word2Vec(Layer):
  """Scores a (target, context) id pair with two embedding tables.

  Each id is looked up in its own embedding table, the two vectors are combined
  with a dot product, and the resulting scalar goes through a one-unit sigmoid
  Dense layer.

  Args:
    embed_dim: size of every embedding vector.
    vocab_size: number of rows in each embedding table.
    **kwargs: forwarded unchanged to the base ``Layer`` constructor.
  """

  def __init__(self, embed_dim, vocab_size, **kwargs):
    super(Word2Vec, self).__init__(**kwargs)
    self.embed_dim = embed_dim
    self.vocab_size = vocab_size

  def build(self, input_shape):
    """Create all sub-layers once and keep them as attributes of this layer."""
    stddev = 1/np.sqrt(self.embed_dim)
    initializer = TruncatedNormal(stddev=stddev)
    # Each embedding maps (batch_size, sequence_length) ids to
    # (batch_size, sequence_length, embed_dim) vectors.
    self.word_embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim,
                     input_length=1, name="word_embedding", embeddings_initializer=initializer)
    self.context_embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim,
                     input_length=1, name="context_embedding", embeddings_initializer=initializer)
    # The output head is created here, not inside call(): a Dense instantiated in
    # call() is a brand-new layer with fresh weights on every invocation and is never
    # stored on this layer, so its kernel/bias cannot be kept with the layer's state.
    # NOTE(review): checkpoints written while the head lived inside call() may not
    # contain these two weights; confirm load_weights still accepts them before
    # resuming an old run.
    self.score_reshape = Reshape(target_shape=(1,))
    self.score_head = Dense(1, activation='sigmoid')
    super(Word2Vec, self).build(input_shape)

  def call(self, inputs, **kwargs):
    """Return the sigmoid score for a ``[target_ids, context_ids]`` input pair."""
    target_input, context_input = inputs
    target = self.word_embedding(target_input)
    context = self.context_embedding(context_input)
    # With one id per sample both tensors are (batch_size, 1, embed_dim);
    # contracting the last axis leaves one similarity value per sample.
    dot_product = dot([target,context], axes=(2,2))
    dot_product = self.score_reshape(dot_product)
    return self.score_head(dot_product)

  def get_config(self):
    """Return the base config extended with the constructor arguments."""
    config = super(Word2Vec, self).get_config()
    config.update({
        'embed_dim': self.embed_dim,
        'vocab_size': self.vocab_size
    })
    return config
  
# Class-name -> class mapping for the notebook's custom Keras layers.
custom_layers = {layer_cls.__name__: layer_cls for layer_cls in (Word2Vec,)}

In [0]:
def get_model(embed_dim,vocab_size):
  """Build and compile the two-input Word2Vec Keras model.

  Args:
    embed_dim: embedding size passed to the ``Word2Vec`` layer.
    vocab_size: vocabulary size passed to the ``Word2Vec`` layer.

  Returns:
    A compiled ``Model`` named "Word2Vec" whose inputs are ``target_input`` and
    ``context_input`` (each of shape ``(1,)``), using binary cross-entropy loss,
    the 'adam' optimizer and an accuracy metric.
  """
  pair_inputs = [
      Input(shape=(1,), name="target_input"),
      Input(shape=(1,), name="context_input"),
  ]
  score = Word2Vec(embed_dim=embed_dim, vocab_size=vocab_size)(pair_inputs)

  skipgram_model = Model(inputs=pair_inputs, outputs=score, name="Word2Vec")
  skipgram_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return skipgram_model

In [0]:
# Note: a model compiled with optimizer=Adagrad() could not be loaded back from its saved file; see https://stackoverflow.com/questions/47092185/tensorflow-word2vec-model-running-on-gpu

In [0]:
from tensorflow.python.keras.utils import Sequence
# from keras.utils import Sequence

In [0]:
class TrainGenerator(Sequence):
  """Serves training batches from a saved NumPy array.

  Only columns 0, 1 and 2 of the array are read: column 0 feeds the target
  input, column 1 the context input and column 2 is returned as the labels.

  Args:
    batch_size: maximum number of rows per batch; must be positive.
    data_path: ``.npy`` file to load. Defaults to the file name used by this
      notebook, resolved against the current working directory.

  Raises:
    ValueError: if ``batch_size`` is not positive.
  """

  def __init__(self, batch_size, data_path='NepWord2VecTrainingData.npy'):
    if batch_size <= 0:
      raise ValueError('batch_size must be positive, got {!r}'.format(batch_size))
    self.batch_size = batch_size
    # get train data from saved file
    self.train_data = np.load(data_path)

  def __len__(self):
    """Number of batches per epoch, counting a final partial batch."""
    # Ceil division: rows left over after the last full batch still form a batch,
    # and a dataset smaller than batch_size yields one batch instead of zero.
    return (len(self.train_data) + self.batch_size - 1) // self.batch_size

  def __getitem__(self, index):
    """Return ``([target_inputs, context_inputs], output_labels)`` for batch ``index``."""
    start = index * self.batch_size
    batch = self.train_data[start:start + self.batch_size]
    # reshape(-1, 1) rather than (batch_size, 1): the last batch may be shorter.
    target_inputs = batch[:, 0].reshape(-1, 1)
    context_inputs = batch[:, 1].reshape(-1, 1)
    output_labels = batch[:, 2]
    return [target_inputs, context_inputs], output_labels

In [0]:
import numpy as np

In [0]:
import glob, os
import shutil
import numpy as np
import pandas as pd

In [0]:
LOG_DIR = '/content/gdrive/My Drive/NLP/Word2Vec/training_logs/Word2Vec-Nepali'
MODEL_CHECKPOINT_NAME = 'model{epoch:03d}.hdf5'
MODEL_CHECKPOINT_DIRNAME = LOG_DIR + '/checkpoints/'
MODEL_CHECKPOINT_BEST = LOG_DIR + '/best/'
log_dir = './logs'  # Tensorboard
!mkdir -p '{MODEL_CHECKPOINT_DIRNAME}'
!mkdir -p '{MODEL_CHECKPOINT_BEST}'

In [0]:
# Download and unpack the ngrok client unless its archive is already present.
# NOTE(review): wget saves into the current working directory while the guard checks
# /content, so the skip only works when the notebook runs from /content — confirm.
if not os.path.exists('/content/ngrok-stable-linux-amd64.zip'):
  !wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
  !unzip ngrok-stable-linux-amd64.zip

In [0]:
# Launch TensorBoard in the background, reading event files from log_dir on port 6006.
tensorboard_command = 'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'.format(log_dir)
get_ipython().system_raw(tensorboard_command)

In [0]:
get_ipython().system_raw('./ngrok authtoken 4LoCPh77Ein9HNXiHKJXx_48gvTwxa9wfQKrjK84UXp &')
get_ipython().system_raw('./ngrok http 6006 &')
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

https://973adbc8.ngrok.io


In [0]:
from tensorflow.python.keras.callbacks import TensorBoard, ModelCheckpoint
import re, os

In [0]:
def get_init_epoch(check_point_path):
    """Return the highest epoch number among saved checkpoints, or 0 if there are none.

    Only files named exactly ``model<digits>.hdf5`` are counted. Matching the whole
    basename matters because the ``hdf5`` extension itself contains a digit: a loose
    "first run of digits" search would read ``model_best.hdf5`` as epoch 5.

    Args:
        check_point_path: directory that holds the checkpoint files.

    Returns:
        The largest epoch number found as a plain ``int``; 0 when the directory is
        missing, empty, or contains no matching file.
    """
    epoch_pattern = re.compile(r'model(\d+)\.hdf5\Z')
    epochs = []
    for check_point in glob.glob(os.path.join(check_point_path, 'model*.hdf5')):
        matched = epoch_pattern.match(os.path.basename(check_point))
        if matched:
            epochs.append(int(matched.group(1)))
    return max(epochs) if epochs else 0
  
def load_saved_model(model_path):
    """Load the model saved at ``model_path``, passing ``custom_layers`` as ``custom_objects``."""
    restored_model = load_model(model_path, custom_objects=custom_layers)
    return restored_model

In [0]:
# Always build the model so that a first run (no checkpoint yet) can start training;
# weights are restored only when a checkpoint from an earlier epoch exists.
init_epoch = get_init_epoch(MODEL_CHECKPOINT_DIRNAME)
model = get_model(300,vocab_size)
model_path = None  # stays None when training starts from scratch
if init_epoch:
    model_path = MODEL_CHECKPOINT_DIRNAME + MODEL_CHECKPOINT_NAME.format(epoch=init_epoch)
    # Restore weights into the freshly built model; loading the whole saved model
    # via load_saved_model(model_path) is the alternative.
    model.load_weights(model_path)
# Show which checkpoint (if any) was restored.
model_path


In [0]:
# TensorBoard writes to log_dir with update_freq='batch'; ModelCheckpoint saves to a
# file whose name is the checkpoint directory plus the epoch-numbered pattern.
tensorboard_callback = TensorBoard(log_dir, update_freq='batch')
checkpoint_callback = ModelCheckpoint(MODEL_CHECKPOINT_DIRNAME + MODEL_CHECKPOINT_NAME)
callbacks = [tensorboard_callback, checkpoint_callback]

In [0]:
# Feed 512-row batches to the model and train up to epoch 100, starting the epoch
# counter at init_epoch.
generator = TrainGenerator(batch_size=512)
model.fit_generator(
    generator=generator,
    initial_epoch=init_epoch,
    epochs=100,
    callbacks=callbacks,
)