# Import basic libraries and dependencies

In [4]:
import sys
import importlib
import sentencepiece as spm

# Reload `config` before importing names from it, so edits made to the module
# while the kernel is running are picked up without restarting.
import config
importlib.reload(config)
from config import English, Bengali

# Add ./scripts to the module search path for the imports in later cells
# (presumably where `preprocessing` and `segmentation` live -- confirm).
sys.path.append('./scripts')

# Set a language

In [5]:
# Language configuration used by every later cell (data file names, preprocessing, training).
LANG = English # English, Bengali

# 1. Data Preparation

In [6]:
# import libraries and dependencies
import preprocessing
importlib.reload(preprocessing)
from preprocessing import raw_preprocess, split_train_test

## 1.1. Preprocess data for sentencepiece

In [7]:
# preprocess raw data to get a list of sentences.
sentences = raw_preprocess(LANG)

# write the sentences into a file, each sentence on one line.
# The encoding is pinned to UTF-8 so non-ASCII text (e.g. Bengali) is written
# the same way on every platform instead of depending on the locale default.
with open(f'data/{LANG.name}_preprocessed.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{sentence}\n' for sentence in sentences)

## 1.2. Train/test split

In [8]:
# split data to train and test.
train, test = split_train_test(sentences, LANG)

# write train and test data into corresponding files.
train_path = f'data/{LANG.name}_train.txt'
test_path = f'data/{LANG.name}_test.txt'

# One sentence per line, no trailing newline after the last one.
# The encoding is pinned to UTF-8 so non-ASCII text (e.g. Bengali) does not
# depend on the platform's default encoding.
for split_path, split_sentences in ((train_path, train), (test_path, test)):
    with open(split_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(split_sentences))

# 2. Subword segmentation

We compare three segmentation granularities: single characters, subwords with a small vocabulary size of 500, and subwords with a large vocabulary size of 1500.

In [10]:
# import libraries and dependencies
from collections import namedtuple
import segmentation
importlib.reload(segmentation)
from segmentation import train_segmentation, encode_text_file, decode_text_file

In [25]:
# make a profile for each granularity level.
# Fields:
#   id         - short tag embedded in the names of the generated files
#   vocab_size - vocabulary size passed to train_segmentation (None for 'char')
#   model_type - segmentation model type passed to train_segmentation
# The field names are given as an explicit list rather than one string that
# mixed space and comma separators.
Segmentation_config = namedtuple(
    'Segmentation_config', ['id', 'vocab_size', 'model_type'])
seg_profiles = [
    # segmentation by characters.
    Segmentation_config(id='s1', vocab_size=None, model_type='char'),
    # segmentation by subwords with small vocabulary.
    Segmentation_config(id='s2', vocab_size=500, model_type='bpe'),
    # segmentation by subwords with large vocabulary.
    Segmentation_config(id='s3', vocab_size=1500, model_type='bpe'),
]

In [26]:
for seg in seg_profiles:
    # fit a segmentation model for this profile on the train split;
    # the returned value is what encode/decode take as `model_path`.
    spm1 = train_segmentation(
        text_file=train_path,
        LANG=LANG,
        vocab_size=seg.vocab_size,
        model_type=seg.model_type)

    # (input text, encoded output, decoded output): train split first,
    # then the held-out test split.
    round_trips = (
        (train_path,
         f'data/{LANG.name}_{seg.id}.txt',
         f'data/{LANG.name}_{seg.id}_decoded.txt'),
        (test_path,
         f'data/{LANG.name}_{seg.id}_test.txt',
         f'data/{LANG.name}_{seg.id}_decoded_test.txt'),
    )

    for source_file, encoded_file, decoded_file in round_trips:
        # encode the text with the trained model, then decode the encoded file back
        encode_text_file(text_file=source_file, model_path=spm1, output_file=encoded_file)
        decode_text_file(text_file=encoded_file, model_path=spm1, output_file=decoded_file)

# 3. Language Model

In [20]:
import  neptune.new as neptune 
from config import NeptuneConfig

## Baseline

In [None]:
for seg in seg_profiles:
    # Hyper parameters
    hidden = 40
    rand_seed = 1
    debug = 2
    bptt = 3
    _class = 9999

    # Parameters
    model_path = f'models/rnnlm/{LANG.name}_{seg.id}'
    train_path = f'../../../data/{LANG.name}_{seg.id}.txt'
    valid_path = f'../../../data/{LANG.name}_{seg.id}_test.txt'

    # run the model 
    !rm -rf $model_path \
      && mkdir $model_path \
      && cd $model_path \
      && ../../../rnnlm-0.3e/rnnlm \
        -train $train_path \
        -valid $valid_path \
        -rnnlm model \
        -hidden $hidden \
        -rand-seed $rand_seed \
        -debug $debug \
        -bptt $bptt \
        -class $_class
    
    # read output and print perplexity
    with open(f'{model_path}/model.output.txt', 'r') as f:
        output = f.read()
        pp = output.split()[-1]
        print(f'Baseline {LANG.name}-{seg.id} validation perplexity: {pp}')

## Experiments

In [57]:
import pickle
import itertools

In [58]:
# results of finished experiments are kept in a file
grid_result_file = 'Task3.grid_search.pickle'
# grid_results stores the results in memory
grid_results = {}

# import previous results if they exist
# NOTE: pickle.load can execute arbitrary code, so only load a results file
# that this notebook wrote itself.
try:
    with open(grid_result_file, 'rb') as f:
        previous_results = pickle.load(f)
        grid_results.update(previous_results)
except FileNotFoundError:
    # first run: nothing has been cached yet, start from an empty grid
    pass
except (pickle.UnpicklingError, EOFError) as err:
    # e.g. a previous run was interrupted while writing; start over, but say so
    print(f'Ignoring unreadable {grid_result_file}: {err!r}')

print(grid_results)

{}


In [59]:
hidden_choices = [5, 20, 40, 70, 100]
bptt_choices = [0, 1, 3, 5]
class_choices = [10, 100, 1501]
rand_seed = 1
debug = 2

for seg, hidden, bptt, _class in itertools.product(
    seg_profiles, hidden_choices, bptt_choices, class_choices):

    if (LANG.name, seg.id, hidden, bptt, _class) in grid_results:
        print(f'skip {LANG.name} {seg.id} {hidden} {bptt} {_class}')
        continue

    # Parameters
    model_path = f'models/rnnlm/{LANG.name}_{seg.id}'
    train_path = f'../../../data/{LANG.name}_{seg.id}.txt'
    valid_path = f'../../../data/{LANG.name}_{seg.id}_test.txt'

    # run the model 
    !rm -rf $model_path \
      && mkdir $model_path \
      && cd $model_path \
      && ../../../rnnlm-0.3e/rnnlm \
        -train $train_path \
        -valid $valid_path \
        -rnnlm model \
        -hidden $hidden \
        -rand-seed $rand_seed \
        -debug $debug \
        -bptt $bptt \
        -class $_class \
        2> error.txt

    # read output extract validation perplexity
    with open(f'{model_path}/model.output.txt', 'r') as f:
        output = f.read()
        pp = float(output.split()[-1])
        grid_results[(LANG.name, seg.id, hidden, bptt, _class)] = pp

    # write current results to file
    with open(grid_result_file, 'wb') as f:
        pickle.dump(grid_results, f)

debug mode: 2
train file: ../../../data/en_s1.txt
valid file: ../../../data/en_s1_test.txt
class size: 10
Hidden layer size: 5
BPTT: 0
Rand seed: 1
rnnlm file: model
Starting training using file ../../../data/en_s1.txt
Vocab size: 72
Words in train file: 115565
Iter:   0	Alpha: 0.100000	   TRAIN entropy: 3.6635    Words/sec: 1509068.8   VALID entropy: 3.5505
Iter:   1	Alpha: 0.100000	   TRAIN entropy: 3.5470    Words/sec: 1569081.6   VALID entropy: 3.5424
Iter:   2	Alpha: 0.050000	   TRAIN entropy: 3.4905    Words/sec: 1591379.8   VALID entropy: 3.4945
Iter:   3	Alpha: 0.025000	   TRAIN entropy: 3.4614    Words/sec: 1577800.5   VALID entropy: 3.4669
Iter:   4	Alpha: 0.012500	   TRAIN entropy: 3.4460    Words/sec: 1588143.1   VALID entropy: 3.4524
Iter:   5	Alpha: 0.006250	   TRAIN entropy: 3.4380    Words/sec: 1574082.7   VALID entropy: 3.4446
debug mode: 2
train file: ../../../data/en_s1.txt
valid file: ../../../data/en_s1_test.txt
class size: 100
Hidden layer size: 5
BPTT: 0
Rand see

In [60]:
# show all cached results: (language, segmentation id, hidden, bptt, class) -> validation perplexity
grid_results

{('en', 's1', 5, 0, 10): 10.887227,
 ('en', 's1', 5, 0, 100): 10.645946,
 ('en', 's1', 5, 0, 1501): 10.482386,
 ('en', 's1', 5, 1, 10): 10.605196,
 ('en', 's1', 5, 1, 100): 10.271953,
 ('en', 's1', 5, 1, 1501): 10.261143,
 ('en', 's1', 5, 3, 10): 10.650879,
 ('en', 's1', 5, 3, 100): 10.273488,
 ('en', 's1', 5, 3, 1501): 10.1756,
 ('en', 's1', 5, 5, 10): 10.650509,
 ('en', 's1', 5, 5, 100): 10.275037,
 ('en', 's1', 5, 5, 1501): 10.170872,
 ('en', 's1', 20, 0, 10): 7.372125,
 ('en', 's1', 20, 0, 100): 6.910608,
 ('en', 's1', 20, 0, 1501): 7.039472,
 ('en', 's1', 20, 1, 10): 6.759173,
 ('en', 's1', 20, 1, 100): 6.477795,
 ('en', 's1', 20, 1, 1501): 6.552387,
 ('en', 's1', 20, 3, 10): 6.617757,
 ('en', 's1', 20, 3, 100): 6.550642,
 ('en', 's1', 20, 3, 1501): 6.441925,
 ('en', 's1', 20, 5, 10): 6.675042,
 ('en', 's1', 20, 5, 100): 6.571401,
 ('en', 's1', 20, 5, 1501): 6.611228,
 ('en', 's1', 40, 0, 10): 6.051518,
 ('en', 's1', 40, 0, 100): 5.971961,
 ('en', 's1', 40, 0, 1501): 5.99838,
 ('e