# Train ELMo

## Setting up the environment

In [0]:
import warnings
warnings.filterwarnings('ignore')

#MODEL_LANGUAGE = 'urdu'
MODEL_LANGUAGE = 'roman_urdu'

!git clone https://github.com/allenai/bilm-tf.git

In [0]:
cd /content/bilm-tf

In [0]:
# Run the repository's setup.py install step. The relative script path relies
# on the working directory being the repository root (set by the preceding
# `cd` cell).
!python setup.py install

In [0]:
from google.colab import drive

# Mount Google Drive. The corpus, vocab file, customised training script and
# checkpoints used below all live under /content/drive/My Drive/FYP/.
drive.mount("/content/drive/")

In [0]:
#!rm "/content/bilm-tf/bilm/data.py"
!rm "/content/bilm-tf/bin/train_elmo.py"

In [0]:
#!cp "/content/drive/My Drive/FYP/Models/ELMo/roman_urdu/data.py" "/content/bilm-tf/bilm/data.py"

## Preparing the corpus

### Loading the corpus

In [0]:
def load_corpus(corpus):
    """Read a text corpus into a list of lines.

    Args:
        corpus: Path of the text file to read.

    Returns:
        list[str]: The file's lines in order, each still carrying its
        trailing newline (as produced by ``readlines``).
    """
    # Decode explicitly as UTF-8 -- the encoding every writer in this notebook
    # uses -- instead of depending on the platform's default encoding.
    with open(corpus, encoding='utf-8') as input_file:
        return input_file.readlines()

# Training corpus for the selected language, stored on the mounted Drive.
corpus_path = '/content/drive/My Drive/FYP/Corpora/Training/' + MODEL_LANGUAGE + '-filtered.txt'
corpus = load_corpus(corpus_path)

### Splitting the corpus

In [0]:
import os

def split_corpus(corpus, lines_per_file, save_dir):
    """Write the corpus to ``save_dir`` as shards of ``lines_per_file`` lines.

    Each shard is named ``<index of its first line>.txt`` (``0.txt``,
    ``1000.txt``, ...). Lines inside a shard are separated by exactly one
    newline.

    Args:
        corpus: Sequence of sentence strings, with or without trailing
            newlines.
        lines_per_file: Maximum number of lines per shard (must be > 0).
        save_dir: Output directory; created if missing. A trailing slash is
            optional.
    """
    os.makedirs(save_dir, exist_ok=True)

    for start in range(0, len(corpus), lines_per_file):
        chunk = corpus[start:start + lines_per_file]
        # Lines read with readlines() still end in a newline; strip it before
        # joining, otherwise every sentence is followed by an empty line.
        text = "\n".join(line.rstrip("\r\n") for line in chunk)

        shard_path = os.path.join(save_dir, str(start) + ".txt")
        with open(shard_path, "w", encoding='utf-8', errors='ignore') as fp:
            fp.write(text)

# Shard the corpus into 1000-line files under /content/swb/train/, the
# directory matched by the training command's --train_prefix glob.
split_corpus(corpus, lines_per_file=1000, save_dir="/content/swb/train/")

### Creating the vocab file

In [0]:
def get_tokens_dict(corpus):
    """Count whitespace-separated tokens across the corpus.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        list[tuple[str, int]]: ``(token, count)`` pairs ordered from the most
        to the least frequent token.
    """
    counts = {}

    for line in corpus:
        for token in line.split():
            counts[token] = counts.get(token, 0) + 1

    ranked = list(counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def write_vocab_file(filename, tokens):
    """Write a vocabulary file: the special tokens first, then one token per line.

    Args:
        filename: Destination path; overwritten if it already exists.
        tokens: Iterable of ``(token, count)`` pairs. Only the token (index 0)
            is written, in the order given.
    """
    with open(filename, 'w', encoding='utf-8', errors='ignore') as vocab_file:
        # Header lines: sentence start, sentence end and unknown-word markers.
        vocab_file.write("<S>\n</S>\n<UNK>\n")
        vocab_file.writelines(entry[0] + "\n" for entry in tokens)

# Write to an absolute path: the copy-to-Drive cell reads /content/vocab.txt,
# while the working directory was moved to /content/bilm-tf during setup, so a
# bare 'vocab.txt' would be created in the wrong folder.
tokens_dict = get_tokens_dict(corpus)
write_vocab_file('/content/vocab.txt', tokens_dict)

In [0]:
if MODEL_LANGUAGE == 'urdu':
    !cp /content/vocab.txt "/content/drive/My Drive/FYP/Models/ELMo/urdu/vocab.txt"
else:
    !cp /content/vocab.txt "/content/drive/My Drive/FYP/Models/ELMo/roman_urdu/vocab.txt"

## Training

In [0]:
# Delete the file from the cloned repo
!rm "/content/bilm-tf/bin/train_elmo.py"

# Copy the modified file to the repo directory
if MODEL_LANGUAGE == 'urdu':
    !cp "/content/drive/My Drive/FYP/Models/ELMo/urdu/train_elmo.py" "/content/bilm-tf/bin/train_elmo.py"
else:
    !cp "/content/drive/My Drive/FYP/Models/ELMo/roman_urdu/train_elmo.py" "/content/bilm-tf/bin/train_elmo.py"

In [0]:
import json

checkpoint_dir = "/content/drive/My Drive/FYP/Models/ELMo/" + MODEL_LANGUAGE + "/checkpoint"
os.makedirs(checkpoint_dir, exist_ok=True)

# Both languages share every setting except the per-token character limit:
# 45 for 'urdu', 15 otherwise.
max_characters_per_token = 45 if MODEL_LANGUAGE == 'urdu' else 15

model_options = {
    "lstm": {
        "use_skip_connections": True,
        "projection_dim": 500,
        "cell_clip": 3,
        "proj_clip": 3,
        "dim": 1024,
        "n_layers": 2,
    },
    "char_cnn": {
        "activation": "relu",
        "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
        "n_highway": 1,
        "embedding": {"dim": 16},
        "n_characters": 261,
        "max_characters_per_token": max_characters_per_token,
    },
}

# json.dumps with its default separators and this key order produces the same
# text as the previously hand-written JSON string.
json_file = json.dumps(model_options)

# NOTE(review): the embedding-extraction cell reads
# <MODEL_LANGUAGE>/options.json, not this checkpoint/options.json -- confirm
# where that file is expected to come from.
with open(checkpoint_dir + "/options.json", "w") as fp:
    fp.write(json_file)

In [0]:
if MODEL_LANGUAGE == 'urdu': 
    !python bilm-tf/bin/train_elmo.py \
        --train_prefix='/content/swb/train/*' \
        --vocab_file "/content/drive/My Drive/FYP/Models/ELMo/urdu/vocab.txt" \
        --save_dir '/content/drive/My Drive/FYP/Models/ELMo/urdu/checkpoint/'
else:
    !python bilm-tf/bin/train_elmo.py \
        --train_prefix='/content/swb/train/*' \
        --vocab_file "/content/drive/My Drive/FYP/Models/ELMo/roman_urdu/vocab.txt" \
        --save_dir '/content/drive/My Drive/FYP/Models/ELMo/roman_urdu/checkpoint/'

### Converting the TensorFlow checkpoint to hdf5

In [0]:
if MODEL_LANGUAGE == 'urdu':
    !python bilm-tf/bin/dump_weights.py \
        --save_dir '/content/drive/My Drive/FYP/Models/ELMo/urdu/checkpoint/' \
        --outfile '/content/drive/My Drive/FYP/Models/ELMo/urdu/weights.hdf5'
else:
    !python bilm-tf/bin/dump_weights.py \
        --save_dir '/content/drive/My Drive/FYP/Models/ELMo/roman_urdu/checkpoint/' \
        --outfile '/content/drive/My Drive/FYP/Models/ELMo/roman_urdu/weights.hdf5'

## Extracting embeddings

### Dumping the token embeddings

In [0]:
from bilm.model import dump_token_embeddings

# Folder on Drive holding the selected language's model artefacts.
model_dir = '/content/drive/My Drive/FYP/Models/ELMo/' + MODEL_LANGUAGE

# Arguments in order: vocab file, options file, weights file, output file.
dump_token_embeddings(
    model_dir + '/vocab.txt',
    model_dir + '/options.json',
    model_dir + '/weights.hdf5',
    model_dir + '/embeddings.txt',
)

### Saving context-independent embeddings in Word2Vec format

In [0]:
import h5py

embeddings_file = '/content/drive/My Drive/FYP/Models/ELMo/' + MODEL_LANGUAGE + '/embeddings.txt'
vocab_file = '/content/drive/My Drive/FYP/Models/ELMo/' + MODEL_LANGUAGE + '/vocab.txt'

# Despite the .txt name, this path is opened as an HDF5 file (it is the output
# path passed to dump_token_embeddings). Use a `with` block so the handle is
# closed as soon as the data is in memory, and keep `embeddings_file` bound to
# the path string instead of rebinding it to the file object.
# NOTE: the final cell of the notebook writes its text export to this same
# path, so this cell cannot be re-run after that without regenerating the dump.
with h5py.File(embeddings_file, 'r') as embeddings_h5:
    # Read the whole 'embedding' dataset in one call, then split it into one
    # row vector per vocabulary entry.
    embeddings = list(embeddings_h5['embedding'][...])

In [0]:
def create_word_embeddings_dict(words, embeddings):
    """Pair each vocabulary token with its embedding vector.

    Args:
        words: Path of the vocab file, one token per line, in the same order
            as ``embeddings``.
        embeddings: Iterable of vectors aligned line-for-line with the vocab
            file; pairing stops at the shorter of the two.

    Returns:
        dict: Maps token -> vector, leaving out the special tokens ``<S>``,
        ``</S>`` and ``<UNK>``.
    """
    special_tokens = {'<S>', '</S>', '<UNK>'}
    word_embeddings = {}

    with open(words, encoding='utf-8', errors='ignore') as words_file:
        for line, vector in zip(words_file, embeddings):
            # Strip the line ending before comparing, so a special token is
            # recognised whether the line ends in "\n", "\r\n" or nothing
            # (last line of the file).
            word = line.rstrip()
            if word not in special_tokens:
                word_embeddings[word] = vector

    return word_embeddings
    
# Build the token -> vector mapping from the vocab file and the loaded vectors.
word_embeddings = create_word_embeddings_dict(words=vocab_file, embeddings=embeddings)

In [0]:
def write_embeddings_file(output_file, word_vector_dict):
    """Write token vectors in the Word2Vec text format.

    The first line is ``"<number of vectors> <dimension>"``. Every following
    line is a token followed by its vector components, separated by single
    spaces.

    Args:
        output_file: Destination path; overwritten if it already exists.
        word_vector_dict: Mapping of token -> sequence of numbers.
    """
    # The header has to describe exactly what is written below: one line per
    # key of the mapping that was passed in (not of a notebook global, and
    # without subtracting anything).
    vector_count = len(word_vector_dict)

    # Take the dimension from the data. 500, the value that used to be
    # hard-coded, remains only as the fallback for an empty mapping.
    dimension = 500
    for vector in word_vector_dict.values():
        dimension = len(vector)
        break

    with open(output_file, 'w', encoding='utf-8', errors='ignore') as o_f:
        o_f.write(str(vector_count) + " " + str(dimension) + "\n")

        for key, vector in word_vector_dict.items():
            o_f.write(" ".join([key] + [str(dim) for dim in vector]) + "\n")

# NOTE(review): this is the same path the h5py cell above reads (the HDF5 dump
# produced by dump_token_embeddings); writing here replaces that file with the
# text export.
word2vec_path = '/content/drive/My Drive/FYP/Models/ELMo/' + MODEL_LANGUAGE + '/embeddings.txt'
write_embeddings_file(word2vec_path, word_embeddings)