# Import basic libraries and dependencies

In [8]:
import sys
import importlib
import sentencepiece as spm

import config
importlib.reload(config)
from config import English, Bengali

# Make the helper modules under ./scripts importable. The membership check
# keeps the cell idempotent: re-running it does not pile up duplicate entries.
if './scripts' not in sys.path:
    sys.path.append('./scripts')

# Set a language

In [9]:
# Language profile used by the cells below (output file names are built from
# LANG.name). Options imported from config: English, Bengali.
LANG = English

# 1. Data Preparation

In [4]:
# import libraries and dependencies
import preprocessing
importlib.reload(preprocessing)
from preprocessing import raw_preprocess, split_train_test

## 1.1. Preprocess data for sentencepiece

In [5]:
# Turn the raw corpus for the selected language into a list of sentences.
sentences = raw_preprocess(LANG)

# Persist the sentences, one per line. The encoding is pinned to UTF-8 so that
# non-ASCII text (e.g. Bengali) is written the same way on every platform
# instead of depending on the locale's default encoding.
with open(f'data/{LANG.name}_preprocessed.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{sentence}\n' for sentence in sentences)

## 1.2. Train/test split

In [6]:
# Split the preprocessed sentences into a train part and a test part.
train, test = split_train_test(sentences, LANG)

# Destination files for the two splits; later cells read these paths.
train_path = f'data/{LANG.name}_train.txt'
test_path = f'data/{LANG.name}_test.txt'

# Write one sentence per line (no trailing newline after the last sentence).
# UTF-8 is set explicitly so non-ASCII text does not depend on the platform's
# default encoding.
with open(train_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(train))
with open(test_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(test))

# 2. Subword segmentation

In [7]:
#TODO: experiment to get the best vocab_size for subword segmentation

In [8]:
# import libraries and dependencies
from collections import namedtuple
import segmentation
importlib.reload(segmentation)
from segmentation import train_segmentation, encode_text_file, decode_text_file

In [9]:
# One record per segmentation granularity: an id used in output file names,
# the vocabulary size handed to the trainer, and the model type.
Segmentation_config = namedtuple(
    'Segmentation_config',
    ['id', 'vocab_size', 'model_type'],
)
seg_profiles = [
    Segmentation_config(id=profile_id, vocab_size=size, model_type=kind)
    for profile_id, size, kind in (
        ('s1', None, 'char'),  # character-level segmentation.
        ('s2', 500, 'bpe'),    # subwords, small vocabulary.
        ('s3', 2000, 'bpe'),   # subwords, large vocabulary.
    )
]

In [10]:
for seg in seg_profiles:
    # Fit the segmentation model on the training split only. The previous
    # version re-trained on the test split as well, which fits the segmenter
    # on held-out data and (judging by the logged model_prefix, which has no
    # split component) presumably overwrote the train-fitted model on disk.
    spm1 = train_segmentation(
        text_file=train_path,
        LANG=LANG,
        vocab_size=seg.vocab_size,
        model_type=seg.model_type)

    # Encode each split with that single model, then decode the encoded file
    # again so the round trip can be inspected. The previous version encoded
    # train_path into the *_test.txt file; the test split is now encoded from
    # test_path. Output file names are unchanged.
    for split_path, suffix in ((train_path, ''), (test_path, '_test')):
        encoded_file = f'data/{LANG.name}_{seg.id}{suffix}.txt'
        encode_text_file(text_file=split_path, model_path=spm1, output_file=encoded_file)

        decoded_file = f'data/{LANG.name}_{seg.id}_decoded{suffix}.txt'
        decode_text_file(text_file=encoded_file, model_path=spm1, output_file=decoded_file)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: data/en_train.txt
  input_format: 
  model_prefix: ./seg_models/spm_en_char
  model_type: CHAR
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces:

In [None]:
#TODO: comment briefly on what you observe in terms of word segmentation.

# 3. Language Model

In [20]:
import  neptune.new as neptune 
from config import NeptuneConfig

#### Baseline

In [34]:
# Hyper parameters
enc = 's1'
hidden = 40
rand_seed = 1
debug = 2
bptt = 3
_class = 9999

# Parameters
model_path = f'models/rnnlm/{LANG.name}-{enc}'
train_path = f'../../../data/{LANG.name}_{enc}.txt'
valid_path = f'../../../data/{LANG.name}_{enc}_test.txt'

# run the model 
!rm -rf $model_path \
  && mkdir $model_path \
  && cd $model_path \
  && ../../../rnnlm-0.3e/rnnlm \
    -train $train_path \
    -valid $valid_path \
    -rnnlm model \
    -hidden $hidden \
    -rand-seed $rand_seed \
    -debug $debug \
    -bptt $bptt \
    -class $_class

/bin/bash: ../../rnnlm-0.3e/rnnlm: No such file or directory


[]