[NLP From Scratch: Translation with a Sequence to Sequence Network and Attention](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html#nlp-from-scratch-translation-with-a-sequence-to-sequence-network-and-attention)


This project aims to train a neural network to translate from French to English.


In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

In [2]:
import torch
import  torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [3]:
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [4]:
# Choose the compute backend in priority order: CUDA, then MPS, then plain CPU.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

We’ll need a unique index per word to use as the inputs and targets of the networks later. To keep track of all this we will use a helper class called Lang, which has word → index (word2index) and index → word (index2word) dictionaries, as well as a count of each word (word2count), which will be used to replace rare words later.

In [6]:
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary bookkeeping for a single language.

    Maintains a word -> index map, an index -> word map and a per-word
    occurrence count. Indices 0 and 1 are pre-assigned to the "SOS" and
    "EOS" entries, so added words are numbered from 2 upwards.
    """

    def __init__(self, name) -> None:
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # the two pre-assigned entries are already counted

    def addSentence(self, sentence):
        """Register every whitespace-separated token of ``sentence``."""
        for token in sentence.split():
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word not in self.word2index:
            new_index = self.n_words
            self.word2index[word] = new_index
            self.word2count[word] = 1
            self.index2word[new_index] = word
            self.n_words = new_index + 1
            return

        # Already known: only the occurrence counter changes.
        self.word2count[word] += 1


In [7]:
# https://stackoverflow.com/a/518232/2809427

def unicodeToAscii(s):
    """Return ``s`` with combining marks removed.

    The string is NFD-normalized so accented characters split into a base
    character plus combining marks; every character whose Unicode category
    is 'Mn' is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)



# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Return a lowercased, accent-stripped, letters-and-"!?"-only form of ``s``.

    Steps: lowercase and strip the input, pass it through ``unicodeToAscii``,
    insert a space before each ".", "!" or "?", replace every run of
    characters other than ASCII letters, "!" and "?" with a single space,
    and finally strip the result. Periods therefore do not survive the
    second substitution.
    """
    ascii_text = unicodeToAscii(s.lower().strip())
    # Detach sentence punctuation from the preceding word.
    spaced = re.sub(r"([.!?])", r" \1", ascii_text)
    # Anything that is not a letter, "!" or "?" collapses into one space.
    cleaned = re.sub(r"[^a-zA-Z!?]+", r" ", spaced)
    return cleaned.strip()

In [8]:
def readLangs(lang1, lang2, reverse=False):
    """Load ``data/<lang1>-<lang2>.txt`` as a list of normalized sentence pairs.

    Each non-blank line of the file is split on tab characters and every
    resulting field is passed through ``normalizeString``.

    Args:
        lang1: Name of the first language; also the first part of the file name.
        lang2: Name of the second language; also the second part of the file name.
        reverse: When true, reverse every pair and swap the two returned names.

    Returns:
        A tuple ``(input_lang, output_lang, pairs)``. The first two items are
        the language names exactly as passed in (swapped when ``reverse`` is
        true); ``pairs`` is a list with one inner list of normalized strings
        per line.

    Raises:
        OSError: If the data file cannot be opened.

    NOTE(review): this returns the language *names*, not ``Lang`` objects;
    presumably the caller wraps them in ``Lang`` -- confirm against the caller.
    """
    print("Reading Lines")

    # Use a context manager so the file handle is closed even if reading fails.
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    # The outer loop over lines must come last in a nested comprehension so
    # that `line` is bound before the inner one splits it; each line becomes
    # its own inner list. Blank lines are skipped so they cannot produce
    # one-element "pairs".
    pairs = [
        [normalizeString(s) for s in line.split('\t')]
        for line in lines
        if line.strip()
    ]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang, output_lang = lang2, lang1
    else:
        input_lang, output_lang = lang1, lang2

    return input_lang, output_lang, pairs