In [18]:
import os 
import torch 
import numpy as np
from torch import nn
import torch.optim as optim
from d2l import torch as d2l
from vocab import Vocab
from tqdm import tqdm

## Preprocessing

In [2]:
# Read the tab-separated English/French sentence pairs into one string.
# The encoding is passed explicitly because the corpus contains non-ASCII
# characters (e.g. 'ç') and the platform default encoding is not always UTF-8.
with open("../data/fra-eng/fra.txt", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
def preprocess(text):
    """Normalize raw corpus text so punctuation can be split off as tokens.

    Non-breaking spaces (U+202F and U+00A0) become regular spaces, the text is
    lower-cased, and a space is inserted in front of each of ``,.!?`` that is
    not already preceded by one.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # Lower-case *before* scanning: str.lower() can change the length of the
    # string (e.g. 'İ' lowers to two code points), so the look-behind below has
    # to index into the very same string that is being enumerated.
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()

    def needs_space(char, prev_char):
        # True when `char` is punctuation glued directly to the previous token.
        return char in ',.!?' and prev_char != ' '

    out = [
        ' ' + char if i > 0 and needs_space(char, text[i - 1]) else char
        for i, char in enumerate(text)
    ]
    return ''.join(out)

In [4]:
# Clean the whole corpus, then preview its first 100 characters.
text = preprocess(raw_text)
text[:100]

'go .\tva !\nhi .\tsalut !\nrun !\tcours !\nrun !\tcourez !\nwho ?\tqui ?\nwow !\tça alors !\nfire !\tau feu !\nhel'

In [5]:
def tokenize(text):
    """Split the cleaned corpus into parallel token lists.

    Each line of ``text`` is expected to look like ``"<english>\\t<french>"``.
    Lines that do not consist of exactly two tab-separated fields are ignored.
    Both sides are split on single spaces and terminated with ``"<eos>"``.

    Returns:
        A tuple ``(eng, fra)`` of two equally long lists; ``eng[i]`` and
        ``fra[i]`` are the token lists of the i-th sentence pair.
    """
    english_sentences, french_sentences = [], []
    for fields in (row.split("\t") for row in text.split("\n")):
        if len(fields) != 2:
            continue
        source, target = fields
        english_sentences.append([*source.split(" "), "<eos>"])
        french_sentences.append([*target.split(" "), "<eos>"])
    return english_sentences, french_sentences

In [6]:
# tokenized[0] holds the English sentences and tokenized[1] the French ones;
# preview the first ten English token lists.
tokenized = tokenize(text)
tokenized[0][:10]

[['go', '.', '<eos>'],
 ['hi', '.', '<eos>'],
 ['run', '!', '<eos>'],
 ['run', '!', '<eos>'],
 ['who', '?', '<eos>'],
 ['wow', '!', '<eos>'],
 ['fire', '!', '<eos>'],
 ['help', '!', '<eos>'],
 ['jump', '.', '<eos>'],
 ['stop', '!', '<eos>']]

## Further data preparation: starting with just the English part

In [7]:
class EnglishDataset(torch.utils.data.Dataset):
    """Dataset that wraps an indexable collection of examples.

    Items are handed back exactly as stored; no tensor conversion happens here.
    """
    
    def __init__(self, data):
        """Store ``data``, which must support ``len()`` and integer indexing."""
        self.data = data

    def __getitem__(self, index):
        """Return the example stored at position ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.data)

In [9]:
# The model consumes integer ids, so build a vocabulary over the English
# sentences and look up every sentence's tokens in it.
eng = Vocab(tokenized[0])
indeces = [eng[sentence] for sentence in tqdm(tokenized[0])]
print(indeces[:5])

100%|██████████| 167130/167130 [00:01<00:00, 157630.11it/s]

[[5906, 153, 424], [6459, 153, 424], [11242, 0, 424], [11242, 0, 424], [14551, 426, 424]]





In [10]:
# Pad (or truncate) index sequences so that all of them share one length.
def pad(seq, length, pad_value=0):
    """Fit ``seq`` to exactly ``length`` items and build the matching mask.

    Args:
        seq: Sequence of token ids. It is not modified.
        length: Target length. Shorter sequences are right-padded; longer ones
            are truncated so the result never exceeds ``length``.
        pad_value: Id used for the padded positions. Defaults to 0 for
            backward compatibility. The sample printed earlier in this
            notebook shows id 0 assigned to a real token, so either rely on
            the returned mask or pass a dedicated padding id.

    Returns:
        ``(padded, mask)``: two lists of ``length`` items each, where ``mask``
        is 1 at positions holding real tokens and 0 at padded positions.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")
    kept = list(seq[:length])
    padding = length - len(kept)
    masking = [1] * len(kept) + [0] * padding
    return kept + [pad_value] * padding, masking

In [12]:
# Pad every index sequence to the length of the longest one, collecting the
# padded sequences and their masks in two parallel lists.
inputs, masks = [], []
longest = max(len(line) for line in indeces)
for line in indeces:
    # Named `padded` rather than `input` so the built-in input() is not
    # shadowed for the rest of the kernel session.
    padded, mask = pad(line, longest)
    inputs.append(padded)
    masks.append(mask)

In [42]:
def batchify(data):
    """Collate a list of equal-length sequences into a single tensor.

    Intended for use as a DataLoader ``collate_fn``. It is called once per
    batch, so it must not print: the earlier debug ``print`` dumped every batch
    to the cell output.

    Args:
        data: List of samples, each a list of numbers of the same length.

    Returns:
        A tensor with one row per sample.
    """
    return torch.tensor(data)

In [43]:
# Wrap the padded sequences in a dataset and draw shuffled mini-batches from it.
BATCH_SIZE = 10
dataset = EnglishDataset(inputs)
data_iter = torch.utils.data.DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=batchify,
    num_workers=0,
)

## Trying to create an LSTM encoder only

In [44]:
class Encoder(nn.Module):
    """Embedding -> dropout -> single-layer LSTM encoder.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        dropout: Dropout probability applied to the embedded tokens.
        batch_first: Forwarded to ``nn.LSTM``. The default ``False`` keeps the
            previous behaviour, where the first axis of the input is treated
            as time, i.e. ``(seq_len, batch)``. Pass ``True`` when batches are
            laid out as ``(batch, seq_len)``, which is what stacking one
            padded sequence per row produces.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, dropout, batch_first=False):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=batch_first)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        """Encode a batch of token ids.

        Args:
            input: Integer tensor of token ids, laid out according to the
                ``batch_first`` setting chosen at construction time.

        Returns:
            ``(output, hidden, cell)``: the per-step LSTM outputs, the final
            hidden state (short-term memory) and the final cell state
            (long-term memory).
        """
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded)
        return output, hidden, cell

In [48]:
class Decoder(nn.Module):
    """Embedding -> single-layer LSTM -> linear projection onto the vocabulary.

    Args:
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        output_dim: Vocabulary size; used both for the embedding table and for
            the width of the output projection.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden, cell):
        """Run the decoder LSTM on ``input`` starting from the given states.

        Args:
            input: Integer tensor of token ids, either ``(steps, batch)`` or,
                for a single decoding step, a flat ``(batch,)`` vector.
            hidden: Initial LSTM hidden state.
            cell: Initial LSTM cell state.

        Returns:
            ``(prediction, hidden, cell)``: the projected outputs (with the
            leading axis removed when it has size 1) and the updated states.
        """
        # The output below has its leading step axis squeezed away, so a
        # single step may also arrive without that axis. With batched (3-D)
        # states the LSTM would reject such a flat vector; add the axis here.
        # Unbatched calls (2-D states) are left untouched.
        if input.dim() == 1 and hidden.dim() == 3:
            input = input.unsqueeze(0)
        embedded = self.embedding(input)
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.out(output.squeeze(0))
        return prediction, hidden, cell

In [49]:
class Seq2Seq(nn.Module):
    """Container pairing an encoder with a decoder (work in progress)."""
    
    def __init__(self, encoder, decoder):
        """Keep references to the two sub-modules."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, trg):
        """Run the encoder on ``src``.

        Unfinished: ``trg`` and ``self.decoder`` are not used yet and there is
        no return statement, so calling this currently yields ``None``.
        """
        encoder_outputs, hidden, cell = self.encoder(src)
        # TODO: FINISH THIS PART AND SEE HOW THE ENCODER WOULD CONNECT TO THE DECODER

SyntaxError: invalid syntax (2515453892.py, line 10)

In [None]:
# Hyperparameters and training objects for the encoder-only experiment.
input_dim = len(eng)
embedding_size = 10
hidden_dim = 32
dropout = 0.1
device = 'cpu'
# Move the model to `device` so that the setting above actually takes effect.
encoder = Encoder(input_dim, embedding_size, hidden_dim, dropout).to(device)
# NOTE(review): MSELoss is a placeholder; predicting token ids would normally
# call for a classification loss such as nn.CrossEntropyLoss. TODO confirm once
# the training loop exists.
criterion = nn.MSELoss()  # Example loss function
optimizer = optim.Adam(encoder.parameters(), lr=0.001)