# Sequence to Sequence Learning with Neural Networks

Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27.

__Goal__: to translate a sentence from German to English.

In [1]:
import os
import spacy
import numpy as np

import random
import math
import time

from collections import Counter, OrderedDict

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as tu_data

# text api 
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab

In [2]:
# use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# set random seeds for reproducibility
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.use_deterministic_algorithms(True)
torch.backends.cudnn.deterministic = True

In [None]:
# download the spacy models via command line:
# python -m spacy download en
# or in a jupyter notebook cell:
# !python -m spacy download en
# python -m spacy download de

In [4]:
# load the spacy models for English and German
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [24]:
# take a look at the function of the tokenizer
[token.text for token in spacy_en.tokenizer('Hello world!')]

['Hello', 'world', '!']

In [25]:
# define the tokenizer function
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings 
    (tokens) and reverses it
    authors of the paper found that reversing the order of the
    source sentence improved performance
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

In [26]:
# define the tokenizer function
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings 
    (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [27]:
tokenize_de("Guten Morgen!")

['!', 'Morgen', 'Guten']

In [28]:
tokenize_en("Good morning!")

['Good', 'morning', '!']

## Pytorch 2.0 Implementation

I notice that APIs of Pytorch 2.0 are different from those of Pytorch 1.0. So I rewrite the code
in Pytorch 2.0.

In [5]:
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

In [6]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')

In [67]:
token_transform[SRC_LANGUAGE]('Guten Morgen!')

['Guten', 'Morgen', '!']

In [69]:
token_transform[SRC_LANGUAGE]('Good morning!')[::-1]

['!', 'morning', 'Good']

In [90]:
# get the data from original source
URL = {
    "train": "http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz",
    "valid": "http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz",
    "test": "http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz",
}


In [93]:
for (split, url) in URL.items():
    torchtext.utils.download_from_url(url, root="./textdata")

100%|██████████| 1.21M/1.21M [00:00<00:00, 2.45MB/s]
100%|██████████| 46.3k/46.3k [00:00<00:00, 1.25MB/s]
100%|██████████| 43.9k/43.9k [00:00<00:00, 864kB/s]


In [95]:
# extract the data
import tarfile

tar_file = ['training.tar.gz', 'validation.tar.gz', 'mmt16_task1_test.tar.gz']

for file in tar_file:
    with tarfile.open('./textdata/'+file, 'r:gz') as tar:
        tar.extractall('./textdata')

In [96]:
# take a look at the data
!ls ./textdata

mmt16_task1_test.tar.gz  test.en   train.en	    val.de  validation.tar.gz
test.de			 train.de  training.tar.gz  val.en


In [7]:
# read all files with .en or .de suffix
# and save them in a list
text_data_dict = {}

for file in os.listdir('./textdata'):
    if file.endswith('.en') or file.endswith('.de'):
        with open('./textdata/'+file, 'r') as f:
            text_data_dict[file] = f.read().splitlines()


In [105]:
# take a look at the data
text_data_dict.keys()

dict_keys(['val.en', 'train.de', 'test.de', 'train.en', 'val.de', 'test.en'])

In [8]:
# print out some examples
for key in text_data_dict.keys():
     print(f"------------{key}:\n {text_data_dict[key][0]}\n")

------------val.en:
 A group of men are loading cotton onto a truck

------------train.de:
 Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.

------------test.de:
 Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.

------------train.en:
 Two young, White males are outside near many bushes.

------------val.de:
 Eine Gruppe von Männern lädt Baumwolle auf einen Lastwagen

------------test.en:
 A man in an orange hat starring at something.



In [116]:
# print out the number of examples
print(f"Number of training examples: {len(text_data_dict['train.en'])}")
print(f"Number of validation examples: {len(text_data_dict['train.en'])}")
print(f"Number of test examples: {len(text_data_dict['test.en'])}")
print(f"Number of test examples: {len(text_data_dict['test.de'])}")

Number of training examples: 29001
Number of validation examples: 29001
Number of test examples: 1000
Number of test examples: 1000


In [14]:
# build up the train and valid data
train_data = {'split': 'train', 'src': [], 'trg': []}
valid_data = {'split': 'val', 'src': [], 'trg': []}
test_data = {'split': 'test', 'src': [], 'trg': []}

# define a function to build up the data
def tokenize_data():
    splits = [train_data, valid_data, test_data]

    for sdata in splits:
        key_temp_src = sdata['split'] + '.de'
        key_temp_trg = sdata['split'] + '.en'
        print(f"Processing {key_temp_src} and {key_temp_trg} ...")
        if sdata['split'] in ['train', 'val']:
            for i in range(len(text_data_dict[key_temp_src])-1):
                setenct_src_temp = text_data_dict[key_temp_src][i]
                setenct_trg_temp = text_data_dict[key_temp_trg][i]

                # tokenize the source and target sentences
                setenct_src_temp = token_transform[SRC_LANGUAGE](setenct_src_temp)[::-1]
                setenct_trg_temp = token_transform[TGT_LANGUAGE](setenct_trg_temp)

                # lower case the src and trg sentences
                setenct_src_temp = [t.lower() for t in setenct_src_temp]
                setenct_trg_temp = [t.lower() for t in setenct_trg_temp]

                # if the first or last token is '.', remove it
                # if setenct_src_temp[0] == '.':
                #     setenct_src_temp = setenct_src_temp[1:]
                # if setenct_src_temp[-1] == '.':
                #     setenct_src_temp = setenct_src_temp[:-1]
                
                # if setenct_trg_temp[0] == '.':
                #     setenct_trg_temp = setenct_trg_temp[1:]
                # if setenct_trg_temp[-1] == '.':
                #     setenct_trg_temp = setenct_trg_temp[:-1]

                # append the tokenized source and target sentences
                sdata['src'].append(setenct_src_temp)
                sdata['trg'].append(setenct_trg_temp)
        else:
            for i in range(len(text_data_dict[key_temp_src])):
                setenct_src_temp = text_data_dict[key_temp_src][i]
                setenct_trg_temp = text_data_dict[key_temp_trg][i]

                # tokenize the source and target sentences
                setenct_src_temp = token_transform[SRC_LANGUAGE](setenct_src_temp)[::-1]
                setenct_trg_temp = token_transform[TGT_LANGUAGE](setenct_trg_temp)

                # lower case the src and trg sentences
                setenct_src_temp = [t.lower() for t in setenct_src_temp]
                setenct_trg_temp = [t.lower() for t in setenct_trg_temp]

                # if setenct_src_temp[0] == '.':
                #     setenct_src_temp = setenct_src_temp[1:]
                # if setenct_src_temp[-1] == '.':
                #     setenct_src_temp = setenct_src_temp[:-1]
                
                # if setenct_trg_temp[0] == '.':
                #     setenct_trg_temp = setenct_trg_temp[1:]
                # if setenct_trg_temp[-1] == '.':
                #     setenct_trg_temp = setenct_trg_temp[:-1]

                # append the tokenized source and target sentences
                sdata['src'].append(setenct_src_temp)
                sdata['trg'].append(setenct_trg_temp)

tokenize_data()

Processing train.de and train.en ...
Processing val.de and val.en ...
Processing test.de and test.en ...


In [10]:
# take a look at the train_data
print(train_data['src'][0], '\n', train_data['trg'][0])

['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'] 
 ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [202]:
# print out test data example
print(test_data['src'][0], '\n', test_data['trg'][0])

['anstarrt', 'etwas', 'der', ',', 'hut', 'orangefarbenen', 'einem', 'mit', 'mann', 'ein'] 
 ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something']


In [203]:
# print out the valid data example
print(valid_data['src'][1], '\n', valid_data['trg'][1])

['sofa', 'einem', 'auf', 'raum', 'grünen', 'einem', 'in', 'schläft', 'mann', 'ein'] 
 ['a', 'man', 'sleeping', 'in', 'a', 'green', 'room', 'on', 'a', 'couch']


In [204]:
# check train_data
print(f"Number of training examples for src: {len(train_data['src'])}")
print(f"Number of training examples for trg: {len(train_data['trg'])}")

Number of training examples for src: 29000
Number of training examples for trg: 29000


In [15]:
# now add the BOS and EOS tokens to the src and trg sentences
BOS_WORD = '<s>'
EOS_WORD = '</s>'

# define a function to add the BOS and EOS tokens
def add_bos_eos(data):
    for i in range(len(data['src'])):
        data['trg'][i] = [BOS_WORD] + data['trg'][i] + [EOS_WORD]
        # since we reverse the source sentence
        # we need to add the EOS token at the beginning
        data['src'][i] = [EOS_WORD] + data['src'][i] + [BOS_WORD]

    return data

train_data = add_bos_eos(train_data)

In [16]:
# check train_data
print(f"Number of training examples for src: {len(train_data['src'])}")
# print out the first example
print(train_data['src'][0], '\n', train_data['trg'][0])

Number of training examples for src: 29000
['</s>', '.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei', '<s>'] 
 ['<s>', 'two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '</s>']


In [17]:
# now we will build up the vocabulary
# first we need to define the special tokens
UNK_WORD = "<unk>"

# define the special tokens
SRC_SPECIALS = [UNK_WORD, BOS_WORD, EOS_WORD]

# vocabulareis only shwon one time are treated as UNK_WORD
SRC_MIN_FREQ = 2
TRG_MIN_FREQ = 2

# build up the vocabularies
def build_vocab(data):
    # build up the vocabularies
    src_list = []
    for src in data['src']:
        src_list += src
    trg_list = []
    for trg in data['trg']:
        trg_list += trg
    # build up the counter  
    src_counter = Counter(src_list)
    trg_counter = Counter(trg_list)
    # sort the counter
    src_counter = sorted(src_counter.items(), key=lambda x: x[1], reverse=True)
    trg_counter = sorted(trg_counter.items(), key=lambda x: x[1], reverse=True)

    # create an ordered dictionary from the counter
    src_dict = OrderedDict(src_counter)
    trg_dict = OrderedDict(trg_counter)

    # build up the vocabularies
    src_vocab = vocab(src_dict, specials=SRC_SPECIALS, min_freq=SRC_MIN_FREQ)
    trg_vocab = vocab(trg_dict, specials=SRC_SPECIALS, min_freq=TRG_MIN_FREQ)

    return src_vocab, trg_vocab

In [18]:
train_src_vocab, train_trg_vocab = build_vocab(train_data)

In [19]:
print(f"Number of unique tokens in source (de) vocabulary: {len(train_src_vocab)}")
print(f"Number of unique tokens in target (en) vocabulary: {len(train_trg_vocab)}")

Number of unique tokens in source (de) vocabulary: 7852
Number of unique tokens in target (en) vocabulary: 5892


In [181]:
type(train_src_vocab)

torchtext.vocab.vocab.Vocab

In [20]:
# set up the default index for the UNK_WORD
for idx, token in enumerate(train_trg_vocab.get_itos()):
    if idx in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
        print(idx, token)

0 <unk>
1 <s>
2 </s>
3 a
4 .
5 in
6 the
7 on
8 man
9 is
10 and


In [21]:
for idx, token in enumerate(train_src_vocab.get_itos()):
    if idx in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
        print(idx, token)

0 <unk>
1 <s>
2 </s>
3 .
4 ein
5 einem
6 in
7 eine
8 ,
9 und
10 mit


In [22]:
# set up the default index for the UNK_WORD
# this step is very important
train_trg_vocab.set_default_index(train_trg_vocab['<unk>'])
train_src_vocab.set_default_index(train_src_vocab['<unk>'])

In [23]:
# with vocabularies, we can now convert the tokens into indices
def data_process(data, src_vocab, trg_vocab):
    # convert tokens into indices
    for i in range(len(data['src'])):
        data['src'][i] = src_vocab.forward(data['src'][i])
        data['trg'][i] = trg_vocab.forward(data['trg'][i])

    return data


train_data = data_process(train_data, train_src_vocab, train_trg_vocab)
valid_data = data_process(valid_data, train_src_vocab, train_trg_vocab)

In [24]:
train_data['src'][0]

[2, 3, 3098, 5373, 109, 14, 6, 87, 19, 83, 29, 252, 25, 17, 1]

In [25]:
train_data['trg'][0]

[1, 15, 23, 14, 24, 773, 16, 56, 79, 201, 1304, 4, 2]

In [26]:
for idx, token in enumerate(train_trg_vocab.get_itos()):
    if idx in train_data['trg'][0]:
        print(idx, token)

1 <s>
2 </s>
4 .
14 ,
15 two
16 are
23 young
24 white
56 outside
79 near
201 many
773 males
1304 bushes


In [27]:
for idx, token in enumerate(train_src_vocab.get_itos()):
    if idx in train_data['src'][0]:
        print(idx, token)

1 <s>
2 </s>
3 .
6 in
14 der
17 zwei
19 im
25 junge
29 männer
83 sind
87 freien
109 nähe
252 weiße
3098 büsche
5373 vieler


In [None]:
# build up the model
class Encoder(nn.Module):

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()

        # define the embedding layer
        self.embedding = nn.Embedding(input_dim, emb_dim)

        # define the LSTM layer
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout)

        # define the dropout layer
        self.dropout = nn.Dropout(dropout)