In [57]:
import numpy as np
import pandas as pd
import spacy
import torchtext

In [58]:
the_seed = 42
torch.manual_seed(the_seed)
torch.cuda.manual_seed(the_seed)
np.random.seed(the_seed)

Read data. To be generic we do not hard code variable names as their language name. Let's simply call them source and destination

In [59]:
# set language here
language_meta = {
    "en": {
        "spacy": "en_core_web_sm",
        "input_file": "news-commentary-v13.zh-en.en"
    },
    "zh": {
        "spacy": "zh_core_web_sm",
        "input_file": "news-commentary-v13.zh-en.zh"
    }
}

source_language = "en"
dest_language = "zh"

source_nlp = spacy.load(language_meta[source_language]["spacy"])
dest_nlp = spacy.load(language_meta[dest_language]["spacy"])
source_file = language_meta[source_language]["input_file"]
dest_file = language_meta[dest_language]["input_file"]

tokens = dest_nlp.tokenizer("秦始皇是一个好国王？")

print('test tokenizing: ', [t.text for t in tokens])



test tokenizing:  ['秦始皇', '是', '一个', '好', '国王', '？']


In [60]:
# todo: remove cap before start training
src_lines = open(source_file).readlines()[:100]
dest_lines = open(dest_file).readlines()[:100]

# Shuffle the DataFrame rows
df = pd.DataFrame(list(zip(src_lines, dest_lines)), columns=['source', 'destination'])
df = df.sample(frac=1).reset_index(drop=True)

train_offset = int(len(df) * 0.8)
test_offset = int(len(df) * 0.9)

train_pairs = df.iloc[:train_offset]
test_pairs = df.iloc[train_offset: test_offset]
valid_pairs = df.iloc[test_offset:]




In [61]:
UNKNOWN_TOKEN = "<UNK>"
PADDING_TOKEN = "<PAD>"
EOS_TOKEN = "<EOS>"
SOS_TOKEN = "<SOS>"

SPECIAL_TOKENS = [UNKNOWN_TOKEN, PADDING_TOKEN, EOS_TOKEN, SOS_TOKEN]


def process_sentence(src_sent, nlp_tokenizer, max_token=100):
    return [SOS_TOKEN] + [t.text for t in nlp_tokenizer.tokenizer(src_sent.lower())][:max_token] + [EOS_TOKEN]


src_tokens_list = [process_sentence(pair.source, source_nlp) for _, pair in train_pairs.iterrows()]

dest_tokens_list = [process_sentence(pair.destination, dest_nlp) for _, pair in train_pairs.iterrows()]

print(src_tokens_list[0])

src_vocab = torchtext.vocab.build_vocab_from_iterator(
    src_tokens_list,
    min_freq=1,
    specials=SPECIAL_TOKENS
)

dest_vocab = torchtext.vocab.build_vocab_from_iterator(
    dest_tokens_list,
    min_freq=1,
    specials=SPECIAL_TOKENS
)

src_vocab.set_default_index(0)
dest_vocab.set_default_index(0)

assert src_vocab.lookup_indices([UNKNOWN_TOKEN, PADDING_TOKEN, EOS_TOKEN, SOS_TOKEN]) == [0, 1, 2, 3]
assert dest_vocab.lookup_indices([UNKNOWN_TOKEN, PADDING_TOKEN, EOS_TOKEN, SOS_TOKEN]) == [0, 1, 2, 3]


['<SOS>', 'the', 'ambition', 'was', 'to', 'complete', 'the', 'negotiations', 'on', '“', 'one', 'tank', 'of', 'gas', '.', '”', '\n', '<EOS>']
