In [1]:
import numpy as np
import pandas as pd
import spacy
import torchtext
import torch

In [2]:
the_seed = 42
torch.manual_seed(the_seed)
torch.cuda.manual_seed(the_seed)
np.random.seed(the_seed)

Read data. To be generic we do not hard code variable names as their language name. Let's simply call them source and destination

In [3]:
# set language here
language_meta = {
    "en": {
        "spacy": "en_core_web_sm",
        "input_file": ""
    },
    "zh": {
        "spacy": "zh_core_web_sm",
        "input_file": ""
    }
}

source_language = "en"
dest_language = "zh"

source_nlp = spacy.load(language_meta[source_language]["spacy"])
dest_nlp = spacy.load(language_meta[dest_language]["spacy"])
source_file = language_meta[source_language]["input_file"]
dest_file = language_meta[dest_language]["input_file"]

tokens = dest_nlp.tokenizer("秦始皇是一个好国王？")

print('test tokenizing: ', [t.text for t in tokens])



test tokenizing:  ['秦始皇', '是', '一个', '好', '国王', '？']


In [4]:
# todo: remove cap before start training
src_lines = open(source_file).readlines()[:100]
dest_lines = open(dest_file).readlines()[:100]

# Shuffle the DataFrame rows
df = pd.DataFrame(list(zip(src_lines, dest_lines)), columns=['source', 'destination'])
df = df.sample(frac=1).reset_index(drop=True)

train_offset = int(len(df) * 0.8)
test_offset = int(len(df) * 0.9)

train_pairs = df.iloc[:train_offset]
test_pairs = df.iloc[train_offset: test_offset]
valid_pairs = df.iloc[test_offset:]




In [5]:
UNKNOWN_TOKEN = "<UNK>"
PADDING_TOKEN = "<PAD>"
EOS_TOKEN = "<EOS>"
SOS_TOKEN = "<SOS>"

SPECIAL_TOKENS = [UNKNOWN_TOKEN, PADDING_TOKEN, EOS_TOKEN, SOS_TOKEN]


def process_sentence(src_sent, nlp_tokenizer, max_token=100):
    return [SOS_TOKEN] + [t.text for t in nlp_tokenizer.tokenizer(src_sent.lower())][:max_token] + [EOS_TOKEN]


src_tokens_list = [process_sentence(pair.source, source_nlp) for _, pair in train_pairs.iterrows()]

dest_tokens_list = [process_sentence(pair.destination, dest_nlp) for _, pair in train_pairs.iterrows()]

print(src_tokens_list[0])

src_vocab = torchtext.vocab.build_vocab_from_iterator(
    src_tokens_list,
    min_freq=1,
    specials=SPECIAL_TOKENS
)

dest_vocab = torchtext.vocab.build_vocab_from_iterator(
    dest_tokens_list,
    min_freq=1,
    specials=SPECIAL_TOKENS
)

src_vocab.set_default_index(0)
dest_vocab.set_default_index(0)

assert src_vocab.lookup_indices([UNKNOWN_TOKEN, PADDING_TOKEN, EOS_TOKEN, SOS_TOKEN]) == [0, 1, 2, 3]
assert dest_vocab.lookup_indices([UNKNOWN_TOKEN, PADDING_TOKEN, EOS_TOKEN, SOS_TOKEN]) == [0, 1, 2, 3]


['<SOS>', 'the', 'ambition', 'was', 'to', 'complete', 'the', 'negotiations', 'on', '“', 'one', 'tank', 'of', 'gas', '.', '”', '\n', '<EOS>']


In [8]:
print(src_tokens_list[0])
print(src_vocab.lookup_indices(src_tokens_list[0]))

print(dest_vocab.get_stoi())

['<SOS>', 'the', 'ambition', 'was', 'to', 'complete', 'the', 'negotiations', 'on', '“', 'one', 'tank', 'of', 'gas', '.', '”', '\n', '<EOS>']
[3, 4, 277, 37, 7, 360, 4, 189, 44, 52, 117, 738, 10, 470, 8, 53, 6, 2]
{'��': 825, '�觑': 824, '�币': 823, '鸿沟': 821, '鸡肉': 820, '马德里—': 819, '领袖': 816, '预期': 815, '障碍': 810, '随着': 809, '陷入': 807, '除了': 806, '降低': 804, '防止': 802, '门徒': 801, '长期': 800, '错': 798, '里': 795, '那样': 793, '措施': 586, '那些': 792, '足以': 768, '道路': 789, '比如': 255, '选举': 786, '假设': 393, '追溯': 785, '远离': 783, '这次': 781, '迎来': 775, '1929年': 154, '无论是': 601, '过去': 774, '超过': 767, '起来': 766, '扩大': 569, '走出': 765, '谈成': 756, '协议': 51, '记者': 750, '造成': 787, '解散': 747, '倾向': 171, '解体': 746, '视': 744, '表面': 741, '行动': 739, '行为': 738, '蔓延': 736, '良机': 735, '至少': 732, '仍': 80, '空间': 695, '自身': 730, '试图': 754, '自大': 728, '柏林墙': 250, '脆弱': 727, '脆': 726, '能量': 725, '否决': 460, '能够': 724, '认识': 748, '胜利': 723, '考虑': 720, '继续': 717, '统一': 716, '时': 96, '结构': 713, '经济学家': 711, '经济体': 710, '在内'