# Import basic libraries and dependencies

In [4]:
import sys
import importlib
import sentencepiece as spm

import config
importlib.reload(config)
from config import English, Bengali

# Make the helper modules under ./scripts importable. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
if './scripts' not in sys.path:
    sys.path.append('./scripts')

# Set a language

In [5]:
LANG = English # language profile used by every later cell; options imported from config: English, Bengali

# 1. Data Preparation

In [6]:
# import libraries and dependencies
import preprocessing
importlib.reload(preprocessing)
from preprocessing import raw_preprocess, split_train_test

## 1.1. Preprocess data for sentencepiece

In [7]:
# Preprocess the raw data for the selected language to get a list of sentences.
sentences = raw_preprocess(LANG)

# Write the sentences into a file, each sentence on one line.
# The encoding is pinned to UTF-8 so that non-ASCII text (e.g. Bengali) is
# written the same way on every platform instead of depending on the locale's
# default encoding.
with open(f'data/{LANG.name}_preprocessed.txt', 'w', encoding='utf-8') as f:
    for sentence in sentences:
        f.write(f'{sentence}\n')

## 1.2. Train/test split

In [8]:
# Split the preprocessed sentences into train and test portions.
train, test = split_train_test(sentences, LANG)

# Output locations for the two portions; later cells read these paths.
train_path = f'data/{LANG.name}_train.txt'
test_path = f'data/{LANG.name}_test.txt'

# Write one sentence per line (no trailing newline after the last one).
# UTF-8 is set explicitly so non-ASCII text such as Bengali does not depend on
# the platform's default encoding.
with open(train_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(train))
with open(test_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(test))

# 2. Subword segmentation

In [9]:
#TODO: experiment to get the best vocab_size for subword segmentation

In [10]:
# import libraries and dependencies
from collections import namedtuple
import segmentation
importlib.reload(segmentation)
from segmentation import train_segmentation, encode_text_file, decode_text_file

In [25]:
# One profile per granularity level: an identifier used in output file names,
# the vocabulary size, and the segmentation model type.
Segmentation_config = namedtuple(
    'Segmentation_config',
    ['id', 'vocab_size', 'model_type'],
)

seg_profiles = [
    # segmentation by characters.
    Segmentation_config(id='s1', vocab_size=None, model_type='char'),
    # segmentation by subwords with small vocabulary.
    Segmentation_config(id='s2', vocab_size=800, model_type='bpe'),
    # segmentation by subwords with large vocabulary.
    Segmentation_config(id='s3', vocab_size=2000, model_type='bpe'),
]

In [26]:
for seg in seg_profiles:
    # Fit a segmentation model for this profile on the training split only.
    spm1 = train_segmentation(
        text_file=train_path,
        LANG=LANG,
        vocab_size=seg.vocab_size,
        model_type=seg.model_type,
    )

    # (input text, encoded output, decoded output) for the train split first,
    # then for the held-out test split.
    round_trips = (
        (
            train_path,
            f'data/{LANG.name}_{seg.id}.txt',
            f'data/{LANG.name}_{seg.id}_decoded.txt',
        ),
        (
            test_path,
            f'data/{LANG.name}_{seg.id}_test.txt',
            f'data/{LANG.name}_{seg.id}_decoded_test.txt',
        ),
    )

    # Encode each split with the trained model, then decode the encoded file
    # back to text.
    for source_file, encoded_file, decoded_file in round_trips:
        encode_text_file(text_file=source_file, model_path=spm1, output_file=encoded_file)
        decode_text_file(text_file=encoded_file, model_path=spm1, output_file=decoded_file)

In [None]:
#TODO: comment briefly on what you observe in terms of word segmentation.

# 3. Language Model

In [20]:
import  neptune.new as neptune 
from config import NeptuneConfig

#### Baseline

In [34]:
# Hyper parameters
seg = seg_profiles[0]
hidden = 40
rand_seed = 1
debug = 2
bptt = 3
_class = 9999

# Parameters
model_path = f'models/rnnlm/{LANG.name}_{seg.id}'
train_path = f'../../../data/{LANG.name}_{seg.id}.txt'
valid_path = f'../../../data/{LANG.name}_{seg.id}_test.txt'

# run the model 
!rm -rf $model_path \
  && mkdir $model_path \
  && cd $model_path \
  && ../../../rnnlm-0.3e/rnnlm \
    -train $train_path \
    -valid $valid_path \
    -rnnlm model \
    -hidden $hidden \
    -rand-seed $rand_seed \
    -debug $debug \
    -bptt $bptt \
    -class $_class

/bin/bash: ../../rnnlm-0.3e/rnnlm: No such file or directory


[]