### Author - **Nilesh Bansal**

### Objective - NMT using Attention to translate French into Fongbe and Ewe (two African languages - collectively called Gbe)


#### Step 1 - Install the relevant libraries and load them

In [None]:
!pip -q install trax
!pip install nltk

[K     |████████████████████████████████| 522kB 13.2MB/s 
[K     |████████████████████████████████| 235kB 30.0MB/s 
[K     |████████████████████████████████| 3.4MB 31.0MB/s 
[K     |████████████████████████████████| 1.2MB 56.1MB/s 
[K     |████████████████████████████████| 368kB 58.5MB/s 
[K     |████████████████████████████████| 61kB 7.4MB/s 
[K     |████████████████████████████████| 3.8MB 54.2MB/s 
[K     |████████████████████████████████| 1.9MB 53.2MB/s 
[K     |████████████████████████████████| 3.2MB 52.3MB/s 
[K     |████████████████████████████████| 890kB 60.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
from termcolor import colored
import random
import numpy as np
import pandas as pd

import trax
from trax import layers as tl
from trax.fastmath import numpy as fastnp
from trax.supervised import training
from trax.supervised import decoding
import textwrap

import pickle
import string
import ast

import nltk
from nltk.tokenize import TweetTokenizer

# Text wrapper that breaks text into lines of at most 70 characters.
wrapper = textwrap.TextWrapper(width=70)

In [None]:
# Show which trax version is installed in this environment.
!pip list | grep trax

trax                          1.3.7                


#### Step 2 - Load the train, test sets from the google drive

In [None]:
# Project folder on Google Drive that holds the competition CSV files.
# NOTE(review): assumes Drive is already mounted at /content/drive - no mount call is visible in this notebook.
DATA_DIR = r"/content/drive/MyDrive/competitions/Zindi Competition_NMT_AI4D"

df_train = pd.read_csv(f"{DATA_DIR}/Train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/Test.csv")

#### Step 3 - Separate the data by translation direction
1. French to Fongbe \
2. French to Ewe \
Possibly reuse the model trained for (1) and apply transfer learning for (2).

In [None]:
# prepare train and eval sets

# Share of each language pair that is held out (taken from the end) for evaluation.
EVAL_FRACTION = 0.03


def _split_train_eval(data_x, data_y, eval_fraction=EVAL_FRACTION):
  """Split two aligned lists into a leading train part and a trailing eval part.

  Args:
      data_x (list): source sentences
      data_y (list): target sentences, aligned with data_x
      eval_fraction (float): share of the examples kept for evaluation

  Returns:
      tuple: (div_slice, x_train, x_eval, y_train, y_eval), where div_slice
      is the index of the first eval example
  """
  assert len(data_x) == len(data_y)

  # same arithmetic for both language pairs: the eval set is the last
  # int(len * eval_fraction) examples, everything before it is train
  div_slice = len(data_x) - int(len(data_x) * eval_fraction)
  x_train, x_eval = data_x[:div_slice], data_x[div_slice:]
  y_train, y_eval = data_y[:div_slice], data_y[div_slice:]

  assert len(x_train) + len(x_eval) == len(data_x)
  return div_slice, x_train, x_eval, y_train, y_eval


df_train_fr_to_fo = df_train[df_train["Target_Language"] == "Fon"]
df_train_fr_to_ewe = df_train[df_train["Target_Language"] == "Ewe"]

# french to fongbe: train (97% of data) and eval (3% of data)
data_x_fr_to_fo = list(df_train_fr_to_fo["French"])
data_y_fr_to_fo = list(df_train_fr_to_fo["Target"])
(div_slice_ff,
 data_x_fr_to_fo_train, data_x_fr_to_fo_eval,
 data_y_fr_to_fo_train, data_y_fr_to_fo_eval) = _split_train_eval(data_x_fr_to_fo, data_y_fr_to_fo)

# french to ewe: same split
data_x_fr_to_ewe = list(df_train_fr_to_ewe["French"])
data_y_fr_to_ewe = list(df_train_fr_to_ewe["Target"])
(div_slice_fe,
 data_x_fr_to_ewe_train, data_x_fr_to_ewe_eval,
 data_y_fr_to_ewe_train, data_y_fr_to_ewe_eval) = _split_train_eval(data_x_fr_to_ewe, data_y_fr_to_ewe)

#### Create vocab files for all 3 languages
1. French
2. Fongbe
3. Ewe \

Later work on BPE for each of the languages for better results

In [None]:
def create_vocab(language_text):
  """Build a list of the unique tokens found in a collection of sentences.

  Args:
      language_text (iterable of str): sentences to tokenize

  Returns:
      list: unique tokens produced by nltk's TweetTokenizer, sorted so the
      result is the same on every run. Special tokens such as <pad> and
      <EOS> are NOT added here.
  """
  # one tokenizer serves every sentence; no need to rebuild it per element
  tokenizer = TweetTokenizer()

  unique_tokens = set()
  for elem in language_text:
    unique_tokens.update(tokenizer.tokenize(elem))

  # iteration order of a set of strings can differ between interpreter runs,
  # so sort to keep token positions reproducible
  return sorted(unique_tokens)

In [None]:
french_vocab_list = create_vocab(data_x_fr_to_fo + data_x_fr_to_ewe)
fongbe_vocab_list = create_vocab(data_y_fr_to_fo)
ewe_vocab_list = create_vocab(data_y_fr_to_ewe)

# Special tokens first, then the tokens of every language.
# The per-language lists can share tokens (punctuation, names, ...), so
# repeats are dropped while the first-seen order is kept.
vocab_list = list(dict.fromkeys(
    ["<pad>", "<EOS>"] + french_vocab_list + fongbe_vocab_list + ewe_vocab_list))

In [None]:
# Write one token per line, each followed by "_".
# UTF-8 is requested explicitly: the target text contains non-ASCII letters
# and open()'s default encoding depends on the platform.
# NOTE(review): this writes fr_vocab.txt, while the tokenization cells refer to
# a file named vocab.txt - confirm which name is intended.
with open(r"/content/drive/MyDrive/competitions/Zindi Competition_NMT_AI4D/fr_vocab.txt", "w", encoding="utf-8") as output:
  for elem in vocab_list:
    output.write(elem + "_" + "\n")

#### Step 4 - Develop data generator
to yield (x, y) tuples, one pair at a time

In [None]:
def data_generator(data_x, data_y, shuffle = False):
  """Endlessly yield (x, y) pairs, one pair at a time.

  The data is walked epoch by epoch: every example is yielded exactly once
  before any example is repeated. With shuffle=True the visiting order is
  re-drawn at the start of every epoch; otherwise the original order is used.

  Args:
      data_x (list): inputs
      data_y (list): targets, aligned with data_x
      shuffle (bool): visit the examples in random order

  Yields:
      tuple: (data_x[i], data_y[i])
  """

  # len of data_x and data_y must be same, and there must be data to serve
  assert(len(data_x) == len(data_y))
  assert(len(data_x) != 0)

  index_list = list(range(len(data_x)))

  while True:

    # new random order for each pass over the data
    if shuffle:
      random.shuffle(index_list)

    # advance through every index; yielding index_list[0] on each step
    # would keep serving the same example when shuffle is off
    for idx in index_list:
      yield (data_x[idx], data_y[idx])

In [None]:
# Endless (source, target) generators, one train and one eval stream per
# language pair. Nothing is consumed until the streams are iterated.

# french to fongbe
train_stream_ff = data_generator(data_x_fr_to_fo_train, data_y_fr_to_fo_train, shuffle=True)
eval_stream_ff = data_generator(data_x_fr_to_fo_eval, data_y_fr_to_fo_eval, shuffle=True)

# french to ewe
train_stream_fe = data_generator(data_x_fr_to_ewe_train, data_y_fr_to_ewe_train, shuffle=True)
eval_stream_fe = data_generator(data_x_fr_to_ewe_eval, data_y_fr_to_ewe_eval, shuffle=True)

#### Step 5 - Tokenize and format

In [None]:
# Name and directory of the vocabulary file (module-level, used by later cells)
VOCAB_FILE = "vocab.txt"
VOCAB_DIR = "/content/drive/MyDrive/competitions/Zindi Competition_NMT_AI4D"

# Every stream is tokenized with the same character-level settings
_char_vocab_kwargs = dict(vocab_type='char', vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

# french to ewe tokenization
tokenized_train_stream_fe = trax.data.Tokenize(**_char_vocab_kwargs)(train_stream_fe)
tokenized_eval_stream_fe = trax.data.Tokenize(**_char_vocab_kwargs)(eval_stream_fe)

# french to fongbe tokenization
tokenized_train_stream_ff = trax.data.Tokenize(**_char_vocab_kwargs)(train_stream_ff)
tokenized_eval_stream_ff = trax.data.Tokenize(**_char_vocab_kwargs)(eval_stream_ff)

In [None]:
# Vocabulary size for the character-level encoding.
# Module-level on purpose: NMTAttn takes it as the default for both vocab sizes.
input_vocab_size = trax.data.vocab_size(vocab_type='char', vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

In [None]:
# Every sentence gets an end-of-sentence marker appended.
# The id 1 is used for <EOS>.
EOS = 1

def append_eos(stream):
  """Generator helper: append EOS to both sequences of every (input, target) pair.

  Args:
      stream: iterable of (input ids, target ids) pairs

  Yields:
      tuple: (numpy.ndarray, numpy.ndarray), each being the original ids
      followed by EOS
  """
  for source_ids, target_ids in stream:
    yield tuple(np.array([*ids, EOS]) for ids in (source_ids, target_ids))

# Wrap each tokenized stream so that every sentence ends with <EOS>.

# french to fongbe (train, then eval)
tokenized_train_stream_ff = append_eos(tokenized_train_stream_ff)
tokenized_eval_stream_ff = append_eos(tokenized_eval_stream_ff)

# french to ewe (train, then eval)
tokenized_train_stream_fe = append_eos(tokenized_train_stream_fe)
tokenized_eval_stream_fe = append_eos(tokenized_eval_stream_fe)

In [None]:
# filter by max length - later; the data is already processed in this case
# pass

In [None]:
# tokenize and detokenize helper functions
def tokenize(input_str, vocab_file = None, vocab_dir = None):
  """Encode one string as a batch of character-level ids ending in EOS.

  Args:
      input_str (str): human readable string to encode
      vocab_file (str): vocabulary file name, forwarded to trax.data.tokenize
      vocab_dir (str): vocabulary directory, forwarded to trax.data.tokenize

  Returns:
      numpy.ndarray: ids of the input followed by EOS, with a leading
      batch axis of size 1
  """
  EOS = 1  # id used for end-of-sentence

  token_stream = trax.data.tokenize(iter([input_str]), vocab_type = 'char',
                                    vocab_file=vocab_file, vocab_dir=vocab_dir)
  ids = [*next(token_stream), EOS]

  # the model expects a batch axis in front
  return np.array(ids).reshape(1, -1)

def detokenize(integers, vocab_file = None, vocab_dir = None):
  """ Decodes an array of integers to human readable string

  Everything from the first EOS (id 1) onwards is dropped before decoding.

  Args:
      integers (numpy.ndarray or list): ids to decode; axes of size 1
          (such as a batch axis) are removed first
      vocab_file (str): vocabulary file name, forwarded to trax.data.detokenize
      vocab_dir (str): vocabulary directory, forwarded to trax.data.detokenize

  Returns:
      str: decoded sentence
  """

  # remove dimensions of size 1; atleast_1d keeps a single id as a
  # one-element sequence instead of a 0-d array, which list() cannot iterate
  integers = list(np.atleast_1d(np.squeeze(integers)))

  EOS = 1

  # remove EOS and whatever follows it
  if EOS in integers:
    integers = integers[:integers.index(EOS)]

  return trax.data.detokenize(integers, vocab_type = 'char',
                              vocab_file=vocab_file, vocab_dir=vocab_dir)

In [None]:
# check for tokenize and detokenize functionality
# (this takes one example out of the French -> Ewe training stream)
train_input, train_target = next(tokenized_train_stream_fe)

# Detokenize an input-target pair of tokenized sentences.
# Both sides use the same VOCAB_FILE / VOCAB_DIR settings as every other call in this notebook.
print(colored('Single detokenized example input:', 'red'), detokenize(train_input, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print(colored('Single detokenized example target:', 'red'), detokenize(train_target, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print()

# Tokenize and detokenize a word that is not explicitly saved in the vocabulary file.
# This is not applicable right now since SPE is not created, just used for checking the function
print(colored("tokenize('baptise'): ", 'green'), tokenize('baptise', vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print(colored("detokenize(train_input): ", 'green'), detokenize(train_input, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print(colored("detokenize(train_target): ", 'green'), detokenize(train_target, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))

[31mSingle detokenized example input:[0m  Je t’offre un toit ce soir, et comme on ne sait jamais… Bref, c’est toi qui vois, Jessica
[31mSingle detokenized example target:[0m mana mlɔƒe wo zã sia me eye esi ame aɖe menya nui o la, wo ŋutɔ kpɔe ɖae Jessica

[32mtokenize('baptise'): [0m [[ 98  97 112 116 105 115 101   1]]
[32mdetokenize(train_input): [0m  Je t’offre un toit ce soir, et comme on ne sait jamais… Bref, c’est toi qui vois, Jessica
[32mdetokenize(train_target): [0m mana mlɔƒe wo zã sia me eye esi ame aɖe menya nui o la, wo ŋutɔ kpɔe ɖae Jessica


#### Step 6 - Bucketing for creating streams of batches

In [None]:
# A bucketing scheme is a list of length boundaries plus one batch size per bucket.
# batch_sizes[i] is the batch size for items with length < boundaries[i];
# the extra, last batch size is for items beyond the final boundary.
# With the values below: 256 sentences per batch when length < 8, 128 when the
# length is between 8 and 16, and so on -- down to 2 when length is over 512.
boundaries =  [8,   16,  32, 64, 128, 256, 512]
batch_sizes = [256, 128, 64, 32, 16,    8,   4,  2]

# length_keys=[0, 1] everywhere: both inputs and targets count towards the length.

# french to ewe batch generators
train_batch_stream_fe = trax.data.BucketByLength(boundaries, batch_sizes, length_keys=[0, 1])(tokenized_train_stream_fe)
eval_batch_stream_fe = trax.data.BucketByLength(boundaries, batch_sizes, length_keys=[0, 1])(tokenized_eval_stream_fe)

# french to fongbe batch generators
train_batch_stream_ff = trax.data.BucketByLength(boundaries, batch_sizes, length_keys=[0, 1])(tokenized_train_stream_ff)
eval_batch_stream_ff = trax.data.BucketByLength(boundaries, batch_sizes, length_keys=[0, 1])(tokenized_eval_stream_ff)

In [None]:
# Add loss weights that mask out the padding id (0), so padded positions do not count in the loss.

# french to fongbe
train_batch_stream_ff = trax.data.AddLossWeights(id_to_mask=0)(train_batch_stream_ff)
eval_batch_stream_ff = trax.data.AddLossWeights(id_to_mask=0)(eval_batch_stream_ff)

# french to ewe
train_batch_stream_fe = trax.data.AddLossWeights(id_to_mask=0)(train_batch_stream_fe)
eval_batch_stream_fe = trax.data.AddLossWeights(id_to_mask=0)(eval_batch_stream_fe)

#### Helper functions

##### Input encoder

In [None]:
# inspired from NLP specialization
def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):
    """Build the input encoder: an embedding followed by a stack of LSTMs.

    Its activations on the input sentence serve as keys and values for attention.

    Args:
        input_vocab_size: int: vocab size of the input
        d_model: int: depth of embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder
    Returns:
        tl.Serial: The input encoder
    """
    # tokens -> vectors
    embedding = tl.Embedding(input_vocab_size, d_model)

    # n_encoder_layers LSTM layers, applied one after the other
    lstm_stack = [tl.LSTM(d_model) for _ in range(n_encoder_layers)]

    return tl.Serial(embedding, lstm_stack)

##### Pre attention decoder

In [None]:
# inspired from NLP specialization
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    """Build the pre-attention decoder: shift right, embed, one LSTM.

    It runs on the targets; its activations are used as queries in attention.

    Args:
        mode: str: 'train' or 'eval'. Accepted for interface compatibility;
            it is not forwarded to any layer here (tl.ShiftRight is built
            with its default arguments).
        target_vocab_size: int: vocab size of the target
        d_model: int: depth of embedding (n_units in the LSTM cell)
    Returns:
        tl.Serial: The pre-attention decoder
    """
    layers = [
        # shifting right inserts a start-of-sentence position, which is what
        # makes teacher forcing possible during training
        tl.ShiftRight(),
        # tokens -> vectors
        tl.Embedding(target_vocab_size, d_model),
        # a single LSTM layer on top of the embeddings
        tl.LSTM(d_model),
    ]
    return tl.Serial(*layers)

##### Prepare attention input

In [None]:
# inspired from NLP specialization
def prepare_attention_input(encoder_activations, decoder_activations, inputs):
    """Prepare queries, keys, values and mask for attention.
    
    Args:
        encoder_activations fastnp.array(batch_size, padded_input_length, d_model): output from the input encoder
        decoder_activations fastnp.array(batch_size, padded_target_length, d_model): output from the pre-attention decoder
        inputs fastnp.array(batch_size, padded_input_length): padded input tokens
    
    Returns:
        queries, keys, values and mask for attention.
    """
    
    # keys and values both come from the encoder activations
    keys = encoder_activations
    values = encoder_activations
    
    # queries come from the decoder activations
    queries = decoder_activations
    
    # mask that distinguishes real tokens from padding:
    # 1 where the token id is positive, the (non-positive) id itself elsewhere,
    # i.e. 0 at padding positions.
    # (No printing here: this function runs inside the traced model, where a
    # print only emits tracer objects.)
    mask = fastnp.where(inputs > 0, 1, inputs)
    
    # add axes to the mask for attention heads and decoder length.
    mask = fastnp.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))
    
    # broadcast so mask shape is [batch size, attention heads, decoder-len, encoder-len].
    # the attention-heads axis stays at size 1 here.
    mask = mask + fastnp.zeros((1, 1, decoder_activations.shape[1], 1))
    
    return queries, keys, values, mask

#### Step 7 - NMT Model

In [None]:
def NMTAttn(input_vocab_size=input_vocab_size,
            target_vocab_size=input_vocab_size,
            d_model=1024,
            n_encoder_layers=24,
            n_decoder_layers=24,
            n_attention_heads=16,
            attention_dropout=0.1,
            mode='train'):
    """Build an LSTM sequence-to-sequence model with attention.

    The model consumes a pair (input tokens, target tokens), e.g. a tokenized
    source sentence and its tokenized translation.

    Args:
    input_vocab_size: int: vocab size of the input
    target_vocab_size: int: vocab size of the target
    d_model: int:  depth of embedding (n_units in the LSTM cell)
    n_encoder_layers: int: number of LSTM layers in the encoder
    n_decoder_layers: int: number of LSTM layers in the decoder after attention
    n_attention_heads: int: number of attention heads
    attention_dropout: float, dropout for the attention layer
    mode: str: 'train', 'eval' or 'predict'; passed to the pre-attention
        decoder helper and to the attention layer

    Returns:
    A LSTM sequence-to-sequence model with attention.
    """

    # sub-networks built by the helper functions
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    # attention wrapped in a residual connection, so its output is added to
    # the pre-attention decoder activations (the queries)
    attention = tl.Residual(
        tl.AttentionQKV(d_model, n_heads=n_attention_heads, dropout=attention_dropout, mode=mode))

    # decoder LSTM stack that runs after attention
    decoder_lstms = [tl.LSTM(d_model) for _ in range(n_decoder_layers)]

    return tl.Serial(
        # duplicate input and target tokens; the copies are needed further down
        tl.Select([0, 1, 0, 1]),

        # encoder on the inputs, pre-attention decoder on the targets
        tl.Parallel(input_encoder, pre_attention_decoder),

        # queries, keys, values and mask for attention
        tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4),

        attention,

        # keep stack items 0 and 2; this is where the attention mask is dropped
        tl.Select([0, 2]),

        decoder_lstms,

        # project to the target vocabulary size
        tl.Dense(target_vocab_size),

        # log-probabilities over the target vocabulary
        tl.LogSoftmax(),
    )

#### Step 8 - Training

##### Train Task

In [None]:
# Training task: French to Ewe
train_task_fe = training.TrainTask(
    labeled_data=train_batch_stream_fe,                      # train batch stream as labeled data
    loss_layer=tl.CrossEntropyLoss(),                        # cross entropy loss
    optimizer=trax.optimizers.Adam(0.01),                    # Adam with learning rate 0.01
    lr_schedule=trax.lr.warmup_and_rsqrt_decay(1000, 0.01),  # 1000 warmup steps, max value 0.01
    n_steps_per_checkpoint=10,                               # checkpoint every 10 steps
)

##### Eval task

In [None]:
# Evaluation task: French to Ewe.
# Uses the eval batch stream and reports cross entropy loss and accuracy.
eval_task_fe = training.EvalTask(labeled_data=eval_batch_stream_fe,
                                 metrics=[tl.CrossEntropyLoss(), tl.Accuracy()])

##### Loop

In [None]:
# define the output directory
output_dir = r"/content/drive/MyDrive/competitions/Zindi Competition_NMT_AI4D/output"

# remove old model if it exists. restarts training.
!rm -f ~/output_dir/model.pkl.gz  

# define the training loop
training_loop = training.Loop(NMTAttn(mode='train'),
                              train_task_fe,
                              eval_tasks=[eval_task_fe],
                              output_dir=output_dir)

Traced<ShapedArray(int32[16,128])>with<DynamicJaxprTrace(level=1/0)>
Traced<ShapedArray(int32[16,128])>with<DynamicJaxprTrace(level=1/0)>
Traced<ShapedArray(int32[8,256])>with<DynamicJaxprTrace(level=1/0)>


In [None]:
# Run the training loop for 200 steps.
# The model output goes to the Google Drive folder set as output_dir.
training_loop.run(n_steps=200)

In [None]:
# instantiate the model we built in eval mode.
# The decoding helpers re-feed the whole output prefix to the model on every
# step, which is eval-style use rather than incremental 'predict' decoding.
model = NMTAttn(mode='eval')

# initialize weights from a pre-trained model
model.init_from_file(r"/content/drive/MyDrive/competitions/Zindi Competition_NMT_AI4D/output/model.pkl.gz", weights_only=True)
model = tl.Accelerate(model)

Traced<ShapedArray(int32[16,128])>with<DynamicJaxprTrace(level=2/0)>
Traced<ShapedArray(int32[16,128])>with<DynamicJaxprTrace(level=1/0)>


In [None]:
# inspired from NLP specialization
def next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature):
    """Returns the index of the next token.

    Args:
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        input_tokens (np.ndarray 1 x n_tokens): tokenized representation of the input sentence
        cur_output_tokens (list): tokenized representation of previously translated words
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)

    Returns:
        int: index of the next token in the translated sentence
        float: log probability of the next symbol
    """

    # number of tokens produced so far
    token_length = len(cur_output_tokens)

    # pad up to a power of 2 that is strictly greater than token_length,
    # so that position `token_length` always exists in the model output
    padded_length = 2**int(np.ceil(np.log2(token_length + 1)))
    padded = cur_output_tokens + [0] * (padded_length - token_length)

    # the model expects a batch axis in front: shape (1, padded_length)
    padded_with_batch = np.expand_dims(padded, axis=0)

    # the model takes a tuple (inputs, targets); its second output is not needed
    output, _ = NMTAttn((input_tokens, padded_with_batch))

    # log probabilities at the first position that has not been filled yet
    log_probs = output[0, token_length, :]

    # sample the next symbol; the builtin int() is used for the cast because
    # the np.int alias no longer exists in current NumPy releases
    symbol = int(tl.logsoftmax_sample(log_probs, temperature=temperature))

    return symbol, float(log_probs[symbol])



In [None]:
def sampling_decode(input_sentence, NMTAttn = None, temperature=0.0, vocab_file=None, vocab_dir=None, max_length=512):
    """Returns the translated sentence.

    Tokens are generated one at a time with next_symbol until EOS is produced
    or max_length tokens have been generated.

    Args:
        input_sentence (str): sentence to translate.
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file
        max_length (int or None): upper bound on the number of generated
            tokens, so decoding terminates even if the model never emits EOS.
            None removes the bound.

    Returns:
        tuple: (list, float, str)
            list of int: tokenized version of the translated sentence
                (ends with EOS unless max_length was reached first)
            float: log probability of the last generated symbol
                (0.0 if nothing was generated)
            str: the translated sentence
    """

    # encode the input sentence
    input_tokens = tokenize(input_sentence, vocab_file, vocab_dir)

    # tokens generated so far, and the most recent one
    cur_output_tokens = []
    cur_output = 0

    # defined up front so the return value exists even if the loop never runs
    log_prob = 0.0

    # Set the encoding of the "end of sentence" as 1
    EOS = 1

    # keep generating until EOS shows up or the length budget is used up
    while cur_output != EOS and (max_length is None or len(cur_output_tokens) < max_length):

        # index and log probability of the next token
        cur_output, log_prob = next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature)

        # append the current output token to the list of output tokens
        cur_output_tokens.append(cur_output)

    # detokenize the output tokens
    sentence = detokenize(cur_output_tokens, vocab_file, vocab_dir)

    return cur_output_tokens, log_prob, sentence



In [None]:
# Try the decoder on one sentence.
# Vary the temperature and run the cell several times per setting to see
# how often the output changes.
sampling_decode(
    "Oui, parce que même dans le drame, le Camerounais aime le sensationnel. ",
    model,
    temperature=0.0,
    vocab_file=VOCAB_FILE,
    vocab_dir=VOCAB_DIR,
)