# Translating English to Romanian with a RNN
I'm trying to get a better understanding of RNN's before I move to transformers so I will be implementing a RNN that translates english to romanian!  
I will be following this [tutorial](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html) but will train it to translate it to romanian. Afterwards, I want to ask my model questions in English and have it respond in Japanese.

## Table of Contents


In [58]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cpu")

# Data Cleaning
Our data is from https://www.manythings.org/anki/ and is a text file.  The file is a tab separated list of translation pairs: `Hi.	もしもし`.

We will represent every word in our language as a one-hot vector. We'll need a unique index per word to use as the input and targets of our network.  
Our Lang class will keep track of word to index as well as index to word, and we'll keep track of the number of words and use the final index as the index of rare words.

In [59]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

The files are in unicode. To simplify the files, we will convert them to ASCII, make everything lowercase, and trim most of the punctuation.

In [60]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

## Filtering Data
There are a lot of example sentences so we'll only take the smaller sentences.

We're filtering so that the length of the of the sentences is less than 10 and they only start with certain prefixes. 

In [61]:
MAX_LENGTH = 10

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

## Reading the Data
To read the file, we'll split the file into lines, then split the lines into pairs; we'll also add a reverse function

In [69]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('%s-%s/ron.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    pairs = [p[:2] for p in pairs]
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [79]:
def prepare_data(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs('ron', 'eng', reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [80]:
input_lang, output_lang, pairs = prepare_data('ron', 'eng', reverse=True)

Reading lines...
Read 14237 sentence pairs
Trimmed to 11339 sentence pairs
Counting words...
Counted words:
eng 6602
ron 4717


In [81]:
for i in pairs:
    print(i)

['buna !', 'hi .']
['fugi !', 'run !']
['cine ?', 'who ?']
['fereste te !', 'duck !']
['foc !', 'fire !']
['ajutor !', 'help !']
['sari !', 'jump !']
['opreste te !', 'stop !']
['renunta !', 'stop !']
['asteapta !', 'wait !']
['salut .', 'hello !']
['buna !', 'hello !']
['haide !', 'hurry !']
['grabeste te !', 'hurry !']
['calmeaza te .', 'relax .']
['zambeste .', 'smile .']
['ataca !', 'attack !']
['atacati !', 'attack !']
['noroc !', 'cheers !']
['sanatate !', 'cheers !']
['noroc bun !', 'cheers !']
['stai pe loc !', 'freeze !']
['ridica te .', 'get up .']
['serios ?', 'really ?']
['multumim !', 'thanks !']
['multumesc .', 'thanks !']
['multumesc .', 'thanks .']
['intreaba l pe tom .', 'ask tom .']
['maxim !', 'awesome !']
['excelent !', 'awesome !']
['suna ma .', 'call me .']
['pleaca de aici !', 'get out !']
['pleaca de aici !', 'get out .']
['pleaca !', 'go away !']
['lasa ma .', 'go away .']
['la revedere !', 'goodbye !']
['rezista .', 'hold on .']
['sunt de acord .', 'i agree .'

['trebuie sa ma duc acum ?', 'do i have to go now ?']
['bei cafea ?', 'do you drink coffee ?']
['iti place sa pescuiesti ?', 'do you like fishing ?']
['ai nevoie de masina ?', 'do you need the car ?']
['vorbiti franceza ?', 'do you speak french ?']
['tom vrea o slujba ?', 'does tom want a job ?']
['te doare capul ?', 'does your head hurt ?']
['nu cere bani .', 'don t ask for money .']
['nici macar sa nu ma atingi .', 'don t even touch me .']
['nu l lasa pe tom sa doarma .', 'don t let tom sleep .']
['nu munci prea mult .', 'don t work too much .']
['orice minut conteaza .', 'every minute counts .']
['toti au aplaudat .', 'everybody applauded .']
['toata lumea are defecte .', 'everyone has faults .']
['toata lumea rade .', 'everyone s laughing .']
['fermierii sunt ingrijorati .', 'farmers are worried .']
['porneste motoarele .', 'fire up the engines .']
['da drumul la motoare .', 'fire up the engines .']
['urmareste masina rosie .', 'follow that red car .']
['libertatea nu este gratuita

['arati palid astazi .', 'you look pale today .']
['poti sa intri acum .', 'you may come in now .']
['poti pleca acasa acum .', 'you may go home now .']
['poate c o sa razi de mine .', 'you may laugh at me .']
['tu nu poti veni .', 'you may not come in .']
['trebuie sa studiezi din greu .', 'you must study hard .']
['trebuie sa studiezi mai mult .', 'you must study more .']
['ai nevoie de protectie .', 'you need protection .']
['ai trecut pe rosu .', 'you ran a red light .']
['ai fost foarte norocos .', 'you were very lucky .']
['nu vei avea nevoie de asta .', 'you won t need this .']
['ar trebui sa nu pleci .', 'you d better not go .']
['ai aproape dreptate .', 'you re almost right .']
['esti irezistibil .', 'you re irresistible .']
['esti in siguranta aici .', 'you re safe in here .']
['tu esti inginerul .', 'you re the engineer .']
['tu esti lipsit de scrupule .', 'you re unscrupulous .']
['esti foarte egoist .', 'you re very selfish .']
['ai fost de ajutor .', 'you ve been helpful 

['nimeni nu a venit la petrecere .', 'nobody came to the party .']
['nimeni nu poate sa o inteleaga .', 'nobody can understand it .']
['niciunul dintre ei nu e prezent .', 'none of them are present .']
['nu toate blondele sunt proaste .', 'not all blondes are dumb .']
['intotdeauna se poate gasi timp .', 'one can always find time .']
['sistemele noastre au fost avariate .', 'our systems were damaged .']
['te rog sa te intorci de ndata .', 'please come back at once .']
['va rog sa faceti la dreapta .', 'please make a right turn .']
['te rog sa ti pui pantofii .', 'please put your shoes on .']
['te rog asaza te aici si asteapta .', 'please sit here and wait .']
['asteptati cinci minute va rog .', 'please wait five minutes .']
['te rog sa astepti o jumatate de ora .', 'please wait half an hour .']
['citeste cat mai mult posibil .', 'read as much as possible .']
['adu ti aminte de ce esti aici .', 'remember why you re here .']
['ne vedem luni la scoala .', 'see you monday at school .']
['i

['sting lumina .', 'i m turning off the light .']
['am mancat prea mult astazi .', 'i ve eaten too much today .']
['mi a placut compania voastra .', 'i ve enjoyed your company .']
['am facut primul pas .', 'i ve taken the first step .']
['daca nu mananci mori .', 'if you don t eat you die .']
['daca bei nu conduce .', 'if you drink don t drive .']
['o sa ploua astazi ?', 'is it going to rain today ?']
['este ea singura ta fata ?', 'is she your only daughter ?']
['poate fi folosit drept cutit .', 'it can be used as a knife .']
['depinde de context .', 'it depends on the context .']
['nu se pune problema .', 'it is out of the question .']
['nici nu intra in discutie .', 'it is out of the question .']
['este dreptul lor sa voteze .', 'it is their right to vote .']
['a fost descoperit in .', 'it was discovered in .']
['a fost extrem de istovitor .', 'it was extremely grueling .']
['a fost incredibil de ireal .', 'it was incredibly surreal .']
['este putin complicat .', 'it s a little compl

['nu mi prea place sa fac aia .', 'i don t really like doing that .']
['nu mi amintesc sa fi cerut asta .', 'i don t remember asking for it .']
['eu nu cred ca tom este in boston .', 'i don t think tom is in boston .']
['nu cred ca tom a fost sincer .', 'i don t think tom was truthful .']
['nu cred ca stie cineva deja .', 'i don t think anyone knows yet .']
['nu cred ca imi place asta .', 'i don t think that i like that .']
['nu cred ca tom este lenes .', 'i don t think that tom is lazy .']
['nu vreau sa vorbesc cu tine .', 'i don t want to speak with you .']
['nu vreau sa vorbesc cu dumneavoastra .', 'i don t want to speak with you .']
['nu vreau sa astept atat de mult timp .', 'i don t want to wait that long .']
['i am explicat procesul .', 'i explained the process to him .']
['am dat locul meu doamnei in varsta .', 'i gave my seat to the old lady .']
['am o vorba cu tine .', 'i have a bone to pick with you .']
['am cateva bilete pe randul .', 'i have a few tickets in row .']
['am as

['sotia mea si cu mine traim in australia .', 'my wife and i live in australia .']
['nici un sistem de securitate nu este sigur .', 'no security system is foolproof .']
['nimeni nu va avea nevoie de o explicatie .', 'nobody will need an explanation .']
['nu toate cartile merita sa fie citite .', 'not all books are worth reading .']
['nu toate cartile merita citite .', 'not all books are worth reading .']
['unul din degetele lui tom s a rupt .', 'one of tom s fingers was broken .']
['operatiile sunt deja in desfasurare .', 'operations are already underway .']
['bugetul familiei noastre este pe rosu .', 'our family budget is in the red .']
['ma scuzati vorbiti engleza ?', 'pardon me do you speak english ?']
['te rog nu atinge exponatele .', 'please don t touch the exhibits .']
['te rog da mi un pahar de apa .', 'please give me a glass of water .']
['preincalzeste cuptorul la de grade .', 'preheat the oven to degrees .']
['inregistrarea a inceput pe octombrie .', 'registration began octob

# Seq2Seq Model
Seq2Seq models are models consisting of two RNN's: an encoder and a decoder. The encoder reads a sequence and outputs a single vector, the decoder reads that vector to produce an output sequence.

When you translate words directly from one language to another, the meaning is sometimes lost because the words are in different orders. This means it's difficult to produce a correct translation from just a sequence of words.  
We feed the sequence into an encoder, which ideally encodes the *meaning* of the input sentence into a single vector.

## The Encoder
The encoder outputs some value for every word in the input sentence. For every input word the encoder outputs a vector and a hidden state, and uses the hidden state as input for the next input word.

In [82]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
    
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

## The Decoder
The decoder takes the encoder output vectors and outputs a sequence of words to create the translation

### Simple Decoder
In the simplest Seq2Seq decoder, we only use the last output of the encoder, sometimes referred to as the *context vector* as it encodes context from the entire sequence. This context vector is used as the initial input for the hidden state of the decoder.  

At every step of decoding, the decoder is given an input token and a hidden state. The initial input token is the *SOS* token and the initial hidden state is the *context vector*.

In [83]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(self, DecoderRNN).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.ReLU(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Training
