In [1]:
%load_ext autoreload
%autoreload 2

# [Torchtext Detailed Tutorial](https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/)

In [34]:
import warnings
warnings.filterwarnings('ignore')

import torch
import torchtext

from translator.languages import Language
from translator.data import EuroParl, make_dataset, make_fields
from translator.networks import Encoder, Decoder, Encode_Decoder_Model
from translator.utils import Directory, tokenize_sent, translate_sentence, load_checkpoint, save_checkpoint

# Project root relative to this notebook; `directory` exposes the project
# paths used by later cells (e.g. languages_path, data_path).
SOURCE_DIR = '..'
directory = Directory(SOURCE_DIR)

# Language codes; they are paired with the language fields when the datasets are built.
ENGLISH = 'en'
FRENCH = 'fr'

# Stage 1: Load the Language Vocabularies (built and saved locally beforehand)

In [33]:
# Create both language objects, then restore each one from its saved file
# under the languages directory.
english, french = Language('english'), Language('french')
for language, file_name in ((english, 'en.p'), (french, 'fr.p')):
    language.load_language(directory.languages_path / file_name)

# Display the vocabulary sizes as (english, french).
(len(english.vocab), len(french.vocab))

(10004, 10004)

# Stage 2: Build Training Dataset

In [35]:
## create tabular dataset (run once)
# Folder of the fr-en EuroParl corpus; the CSV export below targets the same folder,
# and the next cell reads train/valid/test CSVs from it.
EURO_PARL_DATA_PATH = directory.data_path / 'fr-en'

# Dataset abstraction over a 50k sample of the corpus (english = lang1, french = lang2).
euro_parl_options = dict(
    data_dir=EURO_PARL_DATA_PATH,
    lang1_name=ENGLISH,
    lang2_name=FRENCH,
    sample_size=50_000,
)
euro_parl = EuroParl(**euro_parl_options)

# Split the sample, then export it as CSV.
euro_parl.train_valid_test_split(valid_size=0.3, test_size=0.2)
euro_parl.to_csv(EURO_PARL_DATA_PATH)

sampling 50000 out of 2007724 samples


In [36]:
## build torch datasets
# Each data field pairs a language code with that language's field.
english_data_field = (ENGLISH, english.field)
french_data_field = (FRENCH, french.field)

# CSV files of the three splits, all located in the fr-en data folder.
fr_en_dir = directory.data_path / 'fr-en'
train_csv, valid_csv, test_csv = (
    fr_en_dir / f'{split}.csv' for split in ('train', 'valid', 'test')
)

train, val, test = make_dataset(
    english_data_field, french_data_field, train_csv, valid_csv, test_csv
)

# Model

In [37]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Set to True to restore weights from the checkpoint file named below.
load_model = False

# network hyper-parameters
embedding_dim, hidden_size, num_layers = 20, 5, 1

# vocabulary sizes: the encoder consumes English tokens, the decoder consumes
# and emits French tokens
sequenec_size_encoder = len(english.vocab)
sequenec_size_decoder = output_size = len(french.vocab)

# Build the two halves, then wrap them into the full encoder-decoder model.
encoder = Encoder(sequenec_size_encoder, embedding_dim, hidden_size, num_layers)
encoder = encoder.to(device)
decoder = Decoder(sequenec_size_decoder, embedding_dim, output_size, hidden_size, num_layers)
decoder = decoder.to(device)
model = Encode_Decoder_Model(encoder, decoder, device)
model = model.to(device)

if load_model:
    # The checkpoint tensors are mapped onto the CPU when the file is read.
    checkpoint_path = '../data/models/checkpoint_epoch=4.pth.tar'
    cpu_device = torch.device('cpu')
    checkpoint = torch.load(checkpoint_path, map_location=cpu_device)
    load_checkpoint(checkpoint, model)

cpu


In [6]:
# task: translate from English to French
# NOTE(review): everything below is disabled scratch code kept inside a string
# literal. As the last expression of the cell, the string itself is echoed as
# the cell output. The snippet refers to `english_field` / `french_field`,
# which no cell in this notebook defines, so it would need updating before
# being re-enabled. Consider deleting this cell.
'''
test_sentence = "The world is watching today France is issuing an order."
tokenized_sent = tokenize_sent(test_sentence, ENGLISH, english_field, device)

tokenized_sent.shape

h_0, c_0 = encoder(tokenized_sent)
start_token = torch.tensor([english_field.vocab.stoi['hello']]) # one word, one batch
output, hidden, cell = decoder(start_token, h_0, c_0)
vocab_probabilities = output.squeeze(0)
best_guess = torch.argmax(vocab_probabilities)
french_field.vocab.itos[best_guess]
'''


'\ntest_sentence = "The world is watching today France is issuing an order."\ntokenized_sent = tokenize_sent(test_sentence, ENGLISH, english_field, device)\n\ntokenized_sent.shape\n\nh_0, c_0 = encoder(tokenized_sent)\nstart_token = torch.tensor([english_field.vocab.stoi[\'hello\']]) # one word, one batch\noutput, hidden, cell = decoder(start_token, h_0, c_0)\nvocab_probabilities = output.squeeze(0)\nbest_guess = torch.argmax(vocab_probabilities)\nfrench_field.vocab.itos[best_guess]\n'

In [39]:
# training params
learning_rate = 0.001
num_epochs = 1
batch_size = 32

# making iterators
train_iterator = torchtext.data.BucketIterator(train, batch_size=batch_size, shuffle=True)

# training
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# The loss is computed against the French targets, so the index to ignore must
# be the "<pad>" index of the French vocabulary. The previous code read it from
# `english_field`, a name that no cell in this notebook defines (NameError on a
# fresh kernel).
# NOTE(review): assumes `french.field` is a torchtext field with a built vocab — confirm.
pad_idx = french.field.vocab.stoi["<pad>"]
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)

# Fixed example sentence, translated at the start of every epoch as a sanity check.
# NOTE(review): this cell passes the Language objects (`english`, `french`) to
# tokenize_sent / translate_sentence, while the inference cell passes fields
# (`french.field`); confirm which one the helpers expect.
test_sentence = "The world is watching today France is issuing an order."
tokenized_sent = tokenize_sent(test_sentence, ENGLISH, english, device)

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # setup model for evaluation
    model.eval()
    translated_sentence = translate_sentence(model, tokenized_sent, FRENCH, french, device)
    print(f"Translated example sentence: \n {translated_sentence}")

    # back to training mode for the optimisation steps below
    model.train()

    for batch_idx, batch in enumerate(train_iterator):

        # Get input and targets and move them to the selected device
        inp_data = batch.en.to(device)
        target = batch.fr.to(device)

        # Forward prop
        output = model(inp_data, target, len(french.vocab))

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way: we have output_words * batch_size predictions to send into our cost
        # function, so we need to do some reshaping. While we're at it, we also drop
        # the first position (the start token).
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

[Epoch 0 / 1]
Translated example sentence: 
 limiter dans graves on on on on on ans avons dans publics on on ans avons dans publics on on ans avons dans publics on on ans avons dans publics on on ans avons dans publics on on ans avons dans publics on on ans avons dans publics on on
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154


KeyboardInterrupt: 

# Inference

In [76]:
# Restore trained weights from the most recent checkpoint of `model_name`.
load_model = True

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# model parameters
model_name = 'encoder_decoder_1'

# network hyper-parameters
# NOTE(review): presumably these must match the architecture stored in the checkpoint — confirm.
embedding_dim, hidden_size, num_layers = 200, 100, 2

# vocabulary sizes: the encoder consumes English tokens, the decoder consumes
# and emits French tokens
sequenec_size_encoder = len(english.vocab)
sequenec_size_decoder = output_size = len(french.vocab)

# Build the two halves, then wrap them into the full encoder-decoder model.
encoder = Encoder(sequenec_size_encoder, embedding_dim, hidden_size, num_layers)
encoder = encoder.to(device)
decoder = Decoder(sequenec_size_decoder, embedding_dim, output_size, hidden_size, num_layers)
decoder = decoder.to(device)
model = Encode_Decoder_Model(encoder, decoder, device)
model = model.to(device)

if load_model:
    # Look up the latest checkpoint file, show which one is used, then load it
    # with its tensors mapped onto the selected device.
    checkpoint_path = directory.get_latest_checkpint(model_name)
    print(checkpoint_path)
    target_device = torch.device(device)
    checkpoint = torch.load(checkpoint_path, map_location=target_device)
    load_checkpoint(checkpoint, model)

cpu
../data/models/encoder_decoder_1/1609185187.pth.tar
=> Loading checkpoint


In [78]:
# Translate one English example sentence with the model built in the previous cell.
test_sentence = "The world is watching today France is issuing an order."
# `english.field` replaces `english_field`, a name that no cell in this notebook
# defines (NameError on a fresh kernel); it mirrors `french.field` used below.
tokenized_sent = tokenize_sent(test_sentence, ENGLISH, english.field, device)

# Last expression of the cell: the translated sentence is shown as the cell output.
translate_sentence(model, tokenized_sent, dest_language=FRENCH, language_field=french.field, device=device)

'm. <unk> , , , , pas <unk> <unk> <eos>'

In [79]:
# Inspect the value returned by tokenize_sent for the test sentence.
tokenized_sent

tensor([[   0],
        [   2],
        [ 193],
        [  10],
        [   0],
        [ 201],
        [   0],
        [  10],
        [5045],
        [  32],
        [ 176],
        [   4],
        [   0]])