In [91]:
import MeCab
import numpy as np
import io
import spacy
import string
import os
#pip install mecab-python3
#pip install unidic-lite
#pip install spacy

# Japanese tokenizer (MeCab).
# "-Owakati" selects MeCab's wakati output format, i.e. the parsed sentence is
# returned as space-separated tokens. No dictionary is named here, so MeCab
# presumably falls back to the installed default (e.g. unidic-lite from the
# install notes above) -- TODO confirm.
ja_tagger = MeCab.Tagger("-Owakati")

# English tokenizer (spaCy).
# The model must be downloaded once before it can be loaded:
# !python -m spacy download en_core_web_sm
# The 'parser' and 'ner' pipeline components are disabled when loading.
en_tagger = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

### preprocess.py

In [105]:
def write_file(fpath, content):
    """ Writes content to the file at fpath, replacing any existing file.
    The file is always encoded as UTF-8 so that Japanese text is written
    the same way on every platform instead of in the locale's default encoding.
    @param fpath (str): filepath to write data to
    @param content (str): file content
    """
    # `with` closes the handle even if the write raises
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(content)
    return

def read_and_process_data(fpath):
    """ Reads a combined English/Japanese corpus file and splits it into
    2 separate files, `<fpath>.en` and `<fpath>.ja`, in the same data folder.

    Every input line is expected to hold an English sentence and its Japanese
    translation separated by a single TAB. Lines that do not split into exactly
    two fields (blank lines, lines with extra tabs) are skipped, so line i of
    the .en file always corresponds to line i of the .ja file.

    NOTE: the input file is deleted once both outputs are written, so calling
    this a second time on the same path raises FileNotFoundError.

    @param fpath (str): filepath to read data
    """

    # reads data; `with` releases the handle before the file is removed below
    with io.open(fpath, mode="r", encoding="utf-8") as f:
        d = f.read()

    # split on '\n' to get one item per example, then on '\t' to separate the
    # English side (first field) from the Japanese side (second field);
    # only keep examples where there is a 1 to 1 map (so length == 2)
    pairs = [fields for fields in (line.split('\t') for line in d.split('\n'))
             if len(fields) == 2]
    en_ex = [fields[0] for fields in pairs]
    ja_ex = [fields[1] for fields in pairs]

    # join on '\n' to create each file; UTF-8 is set explicitly so the Japanese
    # side does not depend on the platform's default encoding
    for suffix, sentences in (('en', en_ex), ('ja', ja_ex)):
        with io.open(f'{fpath}.{suffix}', mode='w', encoding='utf-8') as out:
            out.write('\n'.join(sentences))

    os.remove(fpath)
    return

# Split every dataset file into its .en / .ja halves.
# NOTE: read_and_process_data deletes its input file, so re-running this cell
# after a successful run raises FileNotFoundError.
for _split_path in ('ja_en_data/split/train',
                    'ja_en_data/split/dev',
                    'ja_en_data/split/test'):
    read_and_process_data(_split_path)

FileNotFoundError: [Errno 2] No such file or directory: 'ja_en_data/split/train'

### utils.py

### vocab.py

In [87]:
from collections import Counter
from docopt import docopt
from itertools import chain
import json
import torch
from typing import List
from utils import read_corpus, pad_sents
import sentencepiece as spm


class _Id2WordMap(dict):
    """ Index -> word dictionary that can also be called like a function.

    VocabEntry keeps this mapping in its `id2word` instance attribute. Because
    an instance attribute hides a method of the same name, a plain dict there
    made `entry.id2word(wid)` fail with "'dict' object is not callable".
    Making the mapping callable lets both `entry.id2word[wid]` and
    `entry.id2word(wid)` work.
    """
    def __call__(self, wid):
        """ Return the word stored for an index.
        @param wid (int): word index
        @returns word (str): word corresponding to index
        """
        return self[wid]


class VocabEntry(object):
    """ Vocabulary Entry, i.e. structure containing either
    src or tgt language terms.
    """
    def __init__(self, word2id=None):
        """ Init VocabEntry Instance.
        @param word2id (dict): dictionary mapping words 2 indices. When it is
            None or empty, the entry starts with only the four special tokens.
            A non-empty dict must contain '<unk>' and is used as-is (not copied).
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id['<pad>'] = 0   # Pad Token
            self.word2id['<s>'] = 1 # Start Token
            self.word2id['</s>'] = 2    # End Token
            self.word2id['<unk>'] = 3   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        # reverse mapping; callable so that entry.id2word(wid) also works
        self.id2word = _Id2WordMap((v, k) for k, v in self.word2id.items())

    def __getitem__(self, word):
        """ Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.
        @param word (str): word to look up.
        @returns index (int): index of word 
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """ Check if word is captured by VocabEntry.
        @param word (str): word to look up
        @returns contains (bool): whether word is contained    
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """ Raise error, if one tries to edit the VocabEntry.
        Use add() to insert new words.
        """
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        """ Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.word2id)

    def __repr__(self):
        """ Representation of VocabEntry to be used
        when printing the object.
        """
        return 'Vocabulary[size=%d]' % len(self)

    def id2word(self, wid):
        """ Return mapping of index to word.
        On instances this method is hidden by the `id2word` mapping assigned
        in __init__; that mapping is callable, so `entry.id2word(wid)` returns
        the same result. The method is kept so the unbound form
        `VocabEntry.id2word(entry, wid)` keeps working.
        @param wid (int): word index
        @returns word (str): word corresponding to index
        """
        return self.id2word[wid]

    def add(self, word):
        """ Add word to VocabEntry, if it is previously unseen.
        @param word (str): word to add to VocabEntry
        @return index (int): index that the word has been assigned
        """
        if word not in self:
            # the next free index is the current vocabulary size
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    def words2indices(self, sents):
        """ Convert list of words or list of sentences of words
        into list or list of list of indices.
        @param sents (list[str] or list[list[str]]): sentence(s) in words
        @return word_ids (list[int] or list[list[int]]): sentence(s) in indices;
            an empty input gives an empty list
        """
        # nothing to inspect or convert; avoids IndexError on sents[0]
        if not sents:
            return []
        # the first element decides whether this is a batch or a single sentence
        if isinstance(sents[0], list):
            return [[self[w] for w in s] for s in sents]
        else:
            return [self[w] for w in sents]

    def indices2words(self, word_ids):
        """ Convert list of indices into words.
        @param word_ids (list[int]): list of word ids
        @return sents (list[str]): list of words
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size)
        """
        word_ids = self.words2indices(sents)
        sents_t = pad_sents(word_ids, self['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        # transpose so that the sentence-length dimension comes first
        return torch.t(sents_var)

    @staticmethod
    def from_corpus(corpus, size, freq_cutoff=2):
        """ Given a corpus construct a Vocab Entry.
        @param corpus (list[str]): corpus of text produced by read_corpus function
        @param size (int): # of words in vocabulary
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
        """
        vocab_entry = VocabEntry()
        word_freq = Counter(chain(*corpus))
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
              .format(len(word_freq), freq_cutoff, len(valid_words)))
        # most frequent first, truncated to the requested vocabulary size
        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]
        for word in top_k_words:
            vocab_entry.add(word)
        return vocab_entry
    
    @staticmethod
    def from_subword_list(subword_list):
        """ Construct a Vocab Entry from a list of subwords.
        Subwords are added in order after the four special tokens; entries
        that are already present are not added again.
        @param subword_list (list[str]): subwords to add
        @returns vocab_entry (VocabEntry): VocabEntry instance containing the subwords
        """
        vocab_entry = VocabEntry()
        for subword in subword_list:
            vocab_entry.add(subword)
        return vocab_entry


class Vocab(object):
    """ Vocab encapsulating src and target languages.
    """
    def __init__(self, src_vocab: VocabEntry, tgt_vocab: VocabEntry):
        """ Init Vocab.
        @param src_vocab (VocabEntry): VocabEntry for source language
        @param tgt_vocab (VocabEntry): VocabEntry for target language
        """
        self.src = src_vocab
        self.tgt = tgt_vocab

    @staticmethod
    def build(src_sents, tgt_sents) -> 'Vocab':
        """ Build Vocabulary.
        @param src_sents (list[str]): Source subwords provided by SentencePiece
        @param tgt_sents (list[str]): Target subwords provided by SentencePiece
        @returns Vocab object holding one VocabEntry per language
        """
        print('initialize source vocabulary ..')
        src = VocabEntry.from_subword_list(src_sents)

        print('initialize target vocabulary ..')
        tgt = VocabEntry.from_subword_list(tgt_sents)

        return Vocab(src, tgt)

    def save(self, file_path):
        """ Save Vocab to file as JSON dump.
        Only the two word2id dictionaries are stored.
        @param file_path (str): file path to vocab file
        """
        # json.dump escapes non-ASCII characters by default, so pinning UTF-8
        # does not change the bytes written; it only removes the dependency on
        # the platform's default encoding.
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id), f, indent=2)

    @staticmethod
    def load(file_path):
        """ Load vocabulary from JSON dump.
        @param file_path (str): file path to vocab file
        @returns Vocab object loaded from JSON dump
        """
        # `with` closes the handle as soon as the JSON has been parsed
        with open(file_path, 'r', encoding='utf-8') as f:
            entry = json.load(f)
        src_word2id = entry['src_word2id']
        tgt_word2id = entry['tgt_word2id']

        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))

    def __repr__(self):
        """ Representation of Vocab to be used
        when printing the object.
        """
        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))

def get_vocab_list(file_path, source, vocab_size):
    """ Train a SentencePiece model on a corpus and return the subwords it learned.
    @param file_path (str): file path to corpus
    @param source (str): tgt or src; also used as the prefix of the trained model's files
    @param vocab_size: desired vocabulary size
    @returns sp_list (list[str]): one subword per SentencePiece id, in id order
    """
    # training is the step that saves the .model and .vocab files under the `source` prefix
    spm.SentencePieceTrainer.Train(input=file_path, model_prefix=source, vocab_size=vocab_size)

    # load the freshly trained model (tgt.model or src.model) into a processor
    model_path = '{}.model'.format(source)
    processor = spm.SentencePieceProcessor()
    processor.Load(model_path)

    # walk every piece id and collect the matching subword
    pieces = []
    for piece_id in range(processor.GetPieceSize()):
        pieces.append(processor.IdToPiece(piece_id))
    return pieces



if __name__ == '__main__':
    # Parse the command line against the module docstring.
    # NOTE(review): no usage docstring is visible in this cell; docopt(__doc__)
    # needs one that declares --train-src, --train-tgt and VOCAB_FILE.
    args = docopt(__doc__)

    print('read in source sentences: %s' % args['--train-src'])
    print('read in target sentences: %s' % args['--train-tgt'])

    # Train one SentencePiece model per language; the source side uses the
    # larger subword vocabulary (32000) and the target side the smaller (8000).
    src_sents = get_vocab_list(args['--train-src'], source='src', vocab_size=32000)
    tgt_sents = get_vocab_list(args['--train-tgt'], source='tgt', vocab_size=8000)
    vocab = Vocab.build(src_sents, tgt_sents)

    # Report the size of the vocabularies that were actually built. This can
    # differ from the subword list lengths because each VocabEntry starts with
    # four special tokens and add() skips subwords that are already present.
    print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

    vocab.save(args['VOCAB_FILE'])
    print('vocabulary saved to %s' % args['VOCAB_FILE'])


{'10': 4,
 '100000': 5,
 '11': 6,
 '12': 7,
 '120': 8,
 '1200': 9,
 '15': 10,
 '150': 11,
 '1500': 12,
 '17': 13,
 '18': 14,
 '180': 15,
 '1970s': 16,
 '1995': 17,
 '1996': 18,
 '2': 19,
 '20': 20,
 '200': 21,
 '25': 22,
 '26': 23,
 '30': 24,
 '35': 25,
 '38': 26,
 '45': 27,
 '450': 28,
 '46': 29,
 '49th': 30,
 '50': 31,
 '500th': 32,
 '55': 33,
 '5550167': 34,
 '58': 35,
 '6': 36,
 '65': 37,
 '74': 38,
 '7cfi06x': 39,
 '81': 40,
 '8track': 41,
 '9': 42,
 'a': 43,
 'aaron': 44,
 'abaddon': 45,
 'abandoned': 46,
 'abby': 47,
 'ability': 48,
 'abkhazia': 49,
 'able': 50,
 'about': 51,
 'above': 52,
 'absolutely': 53,
 'abu': 54,
 'accept': 55,
 'acceptable': 56,
 'accepting': 57,
 'access': 58,
 'according': 59,
 'accounted': 60,
 'accumulate': 61,
 'accustomed': 62,
 'achievement': 63,
 'acquaintance': 64,
 'acre': 65,
 'across': 66,
 'act': 67,
 'acting': 68,
 'action': 69,
 'actionable': 70,
 'active': 71,
 'actual': 72,
 'actually': 73,
 'adam': 74,
 'adapt': 75,
 'address': 76,
 'ad