#Training ELMo

##References
1. https://appliedmachinelearning.blog/2019/11/30/training-elmo-from-scratch-on-custom-data-set-for-generating-embeddings-tensorflow/
2. https://github.com/allenai/bilm-tf

In [0]:
RESUME = False
MODEL_LANGUAGE = 'urdu' # urdu or roman-urdu

In [0]:
import warnings
warnings.filterwarnings('ignore')

In [0]:
!pip install h5py
!git clone https://github.com/allenai/bilm-tf.git

Cloning into 'bilm-tf'...
remote: Enumerating objects: 292, done.[K
remote: Total 292 (delta 0), reused 0 (delta 0), pack-reused 292[K
Receiving objects: 100% (292/292), 588.40 KiB | 1.44 MiB/s, done.
Resolving deltas: 100% (137/137), done.


## Getting the training data

In [0]:
from google.colab import drive

drive.mount("/content/drive/")

# These variables store the path to the corpus files (urdu_filtered.txt or roman_filtered.txt)
base_path = '/content/drive/My Drive/FYP/Corpora/Training/'
urdu_path = 'urdu_filtered.txt'
roman_path = 'roman_filtered.txt'
corpus_path = None

if MODEL_LANGUAGE == 'urdu':
    corpus_path = base_path + urdu_path
else:
    corpus_path = base_path + roman_path

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
def get_lines(corpus_path):
    lines = []

    with open(corpus_path) as input_file:
        lines = input_file.readlines()

    return lines

dataset = get_lines(corpus_path)

In [0]:
!rm "/content/bilm-tf/bilm/data.py"
!rm "/content/bilm-tf/bin/train_elmo.py"

!cp "/content/drive/My Drive/elmo/train_elmo_urdu.py" "/content/bilm-tf/bin/train_elmo.py"
!cp "/content/drive/My Drive/elmo/data.py" "/content/bilm-tf/bilm/data.py"

### Splitting the training data

In [0]:
import os

# Path to store the split training data
if not os.path.exists("/content/swb/train"):
    os.makedirs("/content/swb/train")

lines_per_file = 1000

for i in range(0, len(dataset), lines_per_file):
    text = "\n".join(dataset[i: i + lines_per_file])
    
    with open("/content/swb/train/" + str(i) + ".txt", "w", encoding="utf-8", errors="ignore") as fp:
        fp.write(text)

### Creating vocab file

In [0]:
from collections import Counter

texts = " ".join(dataset)
words = texts.split(" ")
dictionary = Counter(words)

print("Number of tokens in Training data = ", len(words))
print("Size of Vocab", len(dictionary))

sorted_vocab = ["<S>", "</S>", "<UNK>"]
sorted_vocab.extend([pair[0] for pair in dictionary.most_common()])

text = "\n".join(sorted_vocab)

with open("/content/swb/vocab.txt", "w", encoding="utf-8", errors="ignore") as fp:
    fp.write(text)

Number of tokens in Training data =  9768986
Size of Vocab 57442


## Installing BiLM

In [0]:
cd /content/bilm-tf

/content/bilm-tf


In [0]:
!python setup.py install

running install
running bdist_egg
running egg_info
writing bilm.egg-info/PKG-INFO
writing dependency_links to bilm.egg-info/dependency_links.txt
writing requirements to bilm.egg-info/requires.txt
writing top-level names to bilm.egg-info/top_level.txt
writing manifest file 'bilm.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
copying bilm/data.py -> build/lib/bilm
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/bilm
copying build/lib/bilm/__init__.py -> build/bdist.linux-x86_64/egg/bilm
copying build/lib/bilm/training.py -> build/bdist.linux-x86_64/egg/bilm
copying build/lib/bilm/model.py -> build/bdist.linux-x86_64/egg/bilm
copying build/lib/bilm/elmo.py -> build/bdist.linux-x86_64/egg/bilm
copying build/lib/bilm/data.py -> build/bdist.linux-x86_64/egg/bilm
byte-compiling build/bdist.linux-x86_64/egg/bilm/__init__.py to __init__.cpython-36.pyc
byte-compiling build/bdist.linux-x86_64/egg/bilm/

In [0]:
if RESUME:
    print("Resuming training, so not writing options.JSON")
else:
    # Creating the checkpoint directory
    if not os.path.exists("/content/drive/My Drive/elmo/checkpoint"):
        os.makedirs("/content/drive/My Drive/elmo/checkpoint")
    # Urdu
    json_file = '{"lstm": {"use_skip_connections": true, "projection_dim": 500, "cell_clip": 3, "proj_clip": 3, "dim": 1024, "n_layers": 2}, "char_cnn": {"activation": "relu", "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]], "n_highway": 1, "embedding": {"dim": 16}, "n_characters": 262, "max_characters_per_token": 45}}'

    with open("/content/drive/My Drive/elmo/checkpoint/options.json", "w") as fp:
        fp.write(json_file)

## Training

In [0]:
if RESUME:
    !cd /content/bilm-tf
    !python bin/train_elmo.py \
        --train_prefix='/content/swb/train/*' \
        --vocab_file '/content/swb/vocab.txt' \
        --save_dir '/content/drive/My Drive/elmo/checkpoint/' \
        --ckpt_file '/content/drive/My Drive/elmo/checkpoint/model.ckpt-406250'
else:
    !python bin/train_elmo.py \
        --train_prefix='/content/swb/train/*' \
        --vocab_file '/content/swb/vocab.txt' \
        --save_dir '/content/drive/My Drive/elmo/checkpoint/'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Finished loading
Loading data from: /content/swb/train/973000.txt
Loaded 1999 sentences.
Finished loading
Batch 1600, train_perplexity=1.5238061
Total time: 178.72392988204956
Loading data from: /content/swb/train/65000.txt
Loaded 1999 sentences.
Finished loading
Loading data from: /content/swb/train/966000.txt
Loaded 1999 sentences.
Finished loading
Loading data from: /content/swb/train/519000.txt
Loaded 1999 sentences.
Finished loading
Loading data from: /content/swb/train/876000.txt
Loaded 1999 sentences.
Finished loading
Loading data from: /content/swb/train/251000.txt
Loaded 1999 sentences.
Finished loading
Loading data from: /content/swb/train/270000.txt
Loaded 1999 sentences.
Finished loading
Loading data from: /content/swb/train/678000.txt
Loaded 1999 sentences.
Finished loading
Loading data from: /content/swb/train/75000.txt
Loaded 1999 sentences.
Finished loading
Batch 1700, train_perplexity=3.9334123
Total time