# Import basic libraries and dependencies

In [125]:
import sys
import importlib
import sentencepiece as spm

import config
importlib.reload(config)
from config import English, Bengali

sys.path.append('./scripts')

# Set a language

In [126]:
LANG = English # English, Bengali

# 1. Data Preparation

In [127]:
# import libraries and dependencies
import preprocessing
importlib.reload(preprocessing)
from preprocessing import raw_preprocess, split_train_test

## 1.1. Preprocess data for sentencepiece

In [128]:
# preprocess raw data to get a list of sentences.
sentences = raw_preprocess(LANG)

# write the sentences into a file, each sentence on one line.
with open(f'data/{LANG.name}_preprocessed.txt', 'w') as f:
    for sentence in sentences:
        f.write(f'{sentence}\n')

## 1.2. Train/test split

In [129]:
# split data to train and test.
train, test = split_train_test(sentences, LANG)

# write train and test data into corresponding files.
train_path = f'data/{LANG.name}_train.txt'
test_path = f'data/{LANG.name}_test.txt'

with open(train_path, 'w') as f:
    f.write('\n'.join(train))
with open(test_path, 'w') as f:
    f.write('\n'.join(test))

# 2. Subword segmentation

In [130]:
#TODO: experiment to get the best vocab_size for subword segmentation

In [131]:
# import libraries and dependencies
from collections import namedtuple
import segmentation
importlib.reload(segmentation)
from segmentation import train_segmentation, encode_text_file, decode_text_file

In [132]:
# make a profile for each granularity level.
Segmentation_config = namedtuple('Segmentation_config', 'id vocab_size, model_type')
seg_profiles = [
    Segmentation_config('s1', None, 'char'), # segmentation by characters.
    Segmentation_config('s2', 500, 'bpe'), # segmentation by subwords with small vocabulary.
    Segmentation_config('s3', 2000, 'bpe'), # segmentation by subwords with large vocabulary.
]

In [133]:
for seg in seg_profiles:
    # do segmentation on the train data
    spm1 = train_segmentation(
        text_file=train_path, 
        LANG=LANG, 
        vocab_size=seg.vocab_size, 
        model_type=seg.model_type)

    encoded_file = f'data/{LANG.name}_{seg.id}.txt'
    encode_text_file(text_file=train_path, model_path=spm1, output_file=encoded_file)
    decoded_file = f'data/{LANG.name}_{seg.id}_decoded.txt'
    decode_text_file(text_file=encoded_file, model_path=spm1, output_file=decoded_file)
    
    # do segmentation on the test data
    spm1 = train_segmentation(
        text_file=test_path, 
        LANG=LANG, 
        vocab_size=seg.vocab_size, 
        model_type=seg.model_type)

    encoded_file = f'data/{LANG.name}_{seg.id}_test.txt'
    encode_text_file(text_file=train_path, model_path=spm1, output_file=encoded_file)
    decoded_file = f'data/{LANG.name}_{seg.id}_decoded_test.txt'
    decode_text_file(text_file=encoded_file, model_path=spm1, output_file=decoded_file)

In [None]:
#TODO: comment briefly on what you observe in terms of word segmentation.

# 3. Language Model

In [135]:
!rm -rf models/rnnlm \
  && mkdir models/rnnlm \
  && cd models/rnnlm \
  && ../../rnnlm-0.3e/rnnlm \
    -train ../../data/en_s1.txt \
    -valid ../../data/en_s1_test.txt \
    -rnnlm model \
    -hidden 40 \
    -rand-seed 1 \
    -debug 2 \
    -bptt 3 \
    -class 9999

debug mode: 2
train file: ../../data/en_s1.txt
valid file: ../../data/en_s1_test.txt
class size: 9999
Hidden layer size: 40
BPTT: 3
Rand seed: 1
rnnlm file: model
Starting training using file ../../data/en_s1.txt
Vocab size: 72
Words in train file: 115565
Iter:   0	Alpha: 0.100000	   TRAIN entropy: 3.2465    Words/sec: 1323.4   VALID entropy: 2.8076
Iter:   1	Alpha: 0.100000	   TRAIN entropy: 2.7571    Words/sec: 1365.8   VALID entropy: 2.6472
Iter:   2	Alpha: 0.100000	   TRAIN entropy: 2.6525    Words/sec: 1410.2   VALID entropy: 2.5773
Iter:   3	Alpha: 0.100000	   TRAIN entropy: 2.5983    Words/sec: 1328.1   VALID entropy: 2.5351
Iter:   4	Alpha: 0.100000	   TRAIN entropy: 2.5616    Words/sec: 1313.6   VALID entropy: 2.5055
Iter:   5	Alpha: 0.100000	   TRAIN entropy: 2.5380    Words/sec: 1372.2   VALID entropy: 2.4962
Iter:   6	Alpha: 0.100000	   TRAIN entropy: 2.5222    Words/sec: 1413.5   VALID entropy: 2.4884
Iter:   7	Alpha: 0.100000	   TRAIN entropy: 2.5083    Words/sec: 1415.6 