# Training ELMo

### References
1. https://appliedmachinelearning.blog/2019/11/30/training-elmo-from-scratch-on-custom-data-set-for-generating-embeddings-tensorflow/
2. https://github.com/allenai/bilm-tf

## Setting up the environment

In [0]:
import warnings
warnings.filterwarnings('ignore')

#comment/uncomment MODEL_LANGUAGE according to what language you want to train ELMO for
#MODEL_LANGUAGE = 'urdu'
MODEL_LANGUAGE = 'roman-urdu'

!git clone https://github.com/allenai/bilm-tf.git

In [0]:
# Mount Google Drive so the files under "My Drive" are reachable from the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [0]:
cd /content/bilm-tf

In [0]:
# Install the package described by setup.py in the current working directory
# (expected to be the bilm-tf checkout entered in the previous cell).
!python setup.py install

In [0]:
# Second mount request for Google Drive; an earlier cell already mounts it at this location.
from google.colab import drive

drive.mount("/content/drive/")

In [0]:
#!rm "/content/bilm-tf/bilm/data.py"
!rm "/content/bilm-tf/bin/train_elmo.py"


In [0]:
!cp "/content/drive/My Drive/elmo/roman_urdu/train_elmo.py" "/content/bilm-tf/bin/train_elmo.py"
#!cp "/content/drive/My Drive/elmo/roman_urdu/data.py" "/content/bilm-tf/bilm/data.py"

## Getting the training data

In [0]:
def load_corpus(corpus, encoding='utf-8'):
    """Read a text corpus into a list of lines.

    Args:
        corpus: Path of the text file to read.
        encoding: Text encoding of the file. Defaults to UTF-8, matching the
            files this notebook writes, instead of the platform default.

    Returns:
        A list with one string per line of the file; each line keeps its
        line terminator. An empty file yields an empty list.
    """
    with open(corpus, encoding=encoding) as input_file:
        return input_file.readlines()

corpus = None

%cd ..
if MODEL_LANGUAGE == 'urdu':
    corpus = load_corpus('/content/drive/My Drive/FYP/Corpora/Training/urdu_filtered.txt')
else:
    corpus = load_corpus('/content/drive/My Drive/FYP/Corpora/Training/roman_filtered.txt')

### Splitting the training data

In [0]:
import os

def split_corpus(corpus, lines_per_file, save_dir):
    """Write a corpus to disk as a series of fixed-size shard files.

    Each shard holds up to ``lines_per_file`` consecutive lines and is named
    after the index of its first line (``0.txt``, ``1000.txt``, ...).

    Args:
        corpus: Sequence of lines; a trailing line terminator on a line is tolerated.
        lines_per_file: Maximum number of lines per shard; must be positive.
        save_dir: Output directory, created if missing. A trailing path
            separator is optional.
    """
    os.makedirs(save_dir, exist_ok=True)

    for start in range(0, len(corpus), lines_per_file):
        # Lines obtained from readlines() still end in a newline; strip it before joining,
        # otherwise every sentence in the shard is followed by an empty line.
        shard_lines = [line.rstrip("\r\n") for line in corpus[start: start + lines_per_file]]
        shard_path = os.path.join(save_dir, str(start) + ".txt")

        with open(shard_path, "w", encoding='utf-8', errors='ignore') as fp:
            fp.write("\n".join(shard_lines))

In [0]:
split_corpus(corpus, 1000, "/content/swb/train/")

### Creating vocab file

In [0]:
def get_tokens_dict(corpus):
    """Count whitespace-separated tokens across the corpus.

    Args:
        corpus: Iterable of sentences (strings).

    Returns:
        A list of ``(token, count)`` tuples ordered from most to least
        frequent; tokens with equal counts stay in first-seen order.
    """
    counts = {}

    for line in corpus:
        for token in line.split():
            counts[token] = counts.get(token, 0) + 1

    by_frequency = sorted(counts.items(), key=lambda entry: entry[1], reverse=True)
    return by_frequency

def write_vocab_file(filename, tokens):
    """Write a vocabulary file: the three special tokens, then one token per line.

    Args:
        filename: Path of the vocabulary file to create (overwritten if present).
        tokens: Iterable of ``(token, count)`` pairs, as returned by
            ``get_tokens_dict``; only the token is written, in the given order.
    """
    with open(filename, 'w', encoding='utf-8', errors='ignore') as vocab_file:
        vocab_file.write("<S>\n</S>\n<UNK>\n")
        # Nothing is printed per token: two debug lines for every vocabulary entry
        # flood the cell output on a real corpus.
        vocab_file.writelines(token[0] + "\n" for token in tokens)

In [0]:
# The training cell reads the vocabulary from the model directory on Drive, so write it there
# instead of to the current working directory.
if MODEL_LANGUAGE == 'urdu':
    vocab_dir = "/content/drive/My Drive/FYP/elmo/elmo_urdu(2)"
else:
    vocab_dir = "/content/drive/My Drive/FYP/elmo/roman_urdu(2)"
os.makedirs(vocab_dir, exist_ok=True)

tokens_dict = get_tokens_dict(corpus)
write_vocab_file(os.path.join(vocab_dir, 'vocab.txt'), tokens_dict)

## Training

In [0]:
# Delete the file from the cloned repo
!rm "/content/bilm-tf/bin/train_elmo.py"

# Copy the modified file to the repo directory
if MODEL_LANGUAGE == 'urdu':
    !cp "/content/drive/My Drive/FYP/elmo/urdu/train_elmo.py" "/content/bilm-tf/bin/train_elmo.py"
else:
    !cp "/content/drive/My Drive/FYP/elmo/roman_urdu/train_elmo.py" "/content/bilm-tf/bin/train_elmo.py"

In [0]:
import json

# Per-language settings: the model directory on Drive and the value written as
# "max_characters_per_token" in the options file.
if MODEL_LANGUAGE == 'urdu':
    model_dir = "/content/drive/My Drive/FYP/elmo/elmo_urdu(2)"
    max_characters_per_token = 45
else:
    model_dir = "/content/drive/My Drive/FYP/elmo/roman_urdu(2)"
    max_characters_per_token = 15

# Create the checkpoint directory and write options.json into that same directory. Deriving both
# from one path keeps them in sync (the Roman Urdu branch used to write outside the directory
# it had just created).
checkpoint_dir = os.path.join(model_dir, "checkpoint")
os.makedirs(checkpoint_dir, exist_ok=True)

options = {
    "lstm": {
        "use_skip_connections": True,
        "projection_dim": 500,
        "cell_clip": 3,
        "proj_clip": 3,
        "dim": 1024,
        "n_layers": 2,
    },
    "char_cnn": {
        "activation": "relu",
        "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
        "n_highway": 1,
        "embedding": {"dim": 16},
        "n_characters": 261,
        "max_characters_per_token": max_characters_per_token,
    },
}
json_file = json.dumps(options)

with open(os.path.join(checkpoint_dir, "options.json"), "w") as fp:
    fp.write(json_file)

In [0]:
if MODEL_LANGUAGE == 'urdu': 
    !python bilm-tf/bin/train_elmo.py \
        --train_prefix='/content/swb/train/*' \
        --vocab_file "/content/drive/My Drive/FYP/elmo/elmo_urdu(2)/vocab.txt" \
        --save_dir '/content/drive/My Drive/FYP/elmo/elmo_urdu(2)/checkpoint/'
else:
    !python bilm-tf/bin/train_elmo.py \
        --train_prefix='/content/swb/train/*' \
        --vocab_file "/content/drive/My Drive/FYP/elmo/roman_urdu(2)/vocab.txt" \
        --save_dir '/content/drive/My Drive/FYP/elmo/roman_urdu(2)/checkpoint/'

## Converting the TF checkpoint to hdf5

In [0]:
if MODEL_LANGUAGE == 'urdu':
    !python bilm-tf/bin/dump_weights.py \
        --save_dir '/content/drive/My Drive/FYP/elmo/elmo_urdu(2)/checkpoint/' \
        --outfile '/content/drive/My Drive/FYP/elmo/elmo_urdu(2)/weights.hdf5'
else:
    !python bilm-tf/bin/dump_weights.py \
        --save_dir '/content/drive/My Drive/FYP/elmo/roman_urdu(2)/checkpoint/' \
        --outfile '/content/drive/My Drive/FYP/elmo/roman_urdu(2)/weights.hdf5'

## Extracting embeddings

In [0]:

# Show the contents of the current working directory.
%ls

In [0]:
# We have the output of this cell on Drive, no need to execute it again!

from bilm.model import dump_token_embeddings

# Both languages read from the locations the earlier cells write to: options.json inside
# checkpoint/, and weights.hdf5 next to it in the model directory under FYP/elmo.
if MODEL_LANGUAGE == 'urdu':
    model_dir = '/content/drive/My Drive/FYP/elmo/elmo_urdu(2)'
else:
    model_dir = '/content/drive/My Drive/FYP/elmo/roman_urdu(2)'

dump_token_embeddings(model_dir + '/vocab.txt',
                      model_dir + '/checkpoint/options.json',
                      model_dir + '/weights.hdf5',
                      model_dir + '/embeddings.hdf5')

In [0]:
import h5py
import numpy as np

# NOTE(review): these directories have no "(2)" suffix (and no FYP/ for Urdu), unlike the
# directories the dump cell writes to. Confirm which embeddings are meant to be exported.
if MODEL_LANGUAGE == 'urdu':
    # Absolute paths, so the cell does not depend on the current working directory.
    embeddings_file_path = '/content/drive/My Drive/elmo/elmo_urdu/embeddings.hdf5'
    vocab_file_path = '/content/drive/My Drive/elmo/elmo_urdu/vocab.txt'
else:
    embeddings_file_path = '/content/drive/My Drive/FYP/elmo/roman_urdu/embeddings.hdf5'
    vocab_file_path = '/content/drive/My Drive/FYP/elmo/roman_urdu/vocab.txt'

# Copy the 'embedding' dataset into memory, then close the HDF5 file right away
# instead of leaving the handle open for the rest of the session.
with h5py.File(embeddings_file_path, 'r') as embeddings_file:
    arr = np.array(embeddings_file['embedding'])

# Filled by the next cell.
word_vector_dict = {}

In [0]:
# Pair each vocabulary line with the embedding row at the same position.
# Keys are the raw lines as read from the file, so they keep their trailing newline.
with open(vocab_file_path, encoding='utf-8', errors='ignore') as vocab_lines:
    word_vector_dict.update(zip(vocab_lines, arr))

In [0]:
# Export as text: a "<count> <dim>" header line, then one "<word> <v1> <v2> ..." line per word.
with open('elmo_embeddings.txt', 'w', encoding='utf-8', errors='ignore') as o_f:
    # The header ends with a newline so the first word does not run into it, and it reports
    # the number of entries written and the vector width taken from the data itself.
    # Assumes arr is 2-D (one row per token). TODO confirm against the HDF5 file.
    o_f.write("{} {}\n".format(len(word_vector_dict), arr.shape[1]))

    for key, vector in word_vector_dict.items():
        # Keys may still carry the newline read from the vocab file; strip it so that the
        # word and its vector end up on the same line.
        o_f.write(key.strip() + " " + " ".join(str(dim) for dim in vector) + "\n")

In [0]:
%cat elmo_embeddings.txt