In [59]:
import cv2 
import numpy as np 
import os 
import matplotlib.pyplot as plt 
import time 
import torch 
import torch.nn as nn 
from tqdm import tqdm
import pandas as pd
import torch.nn as nn 
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim

In [60]:
data = pd.read_csv('/kaggle/input/asl-sentences/asl_sentences.tsv', sep='\t', header=None, names=['input', 'target'])

In [61]:
data = data[1:]

In [62]:
data

Unnamed: 0,input,target
1,meet you please,Please meet you.
2,yes you sit,"Yes, you can sit down."
3,sit please,Please sit down.
4,sit please,Please sit down.
5,yes you sit,"Yes, you can sit down."
...,...,...
96,meet you please,Please meet you.
97,hello my name Ben,"Hello, my name is Ben."
98,meet you please,Please meet you.
99,hello my name Ben,"Hello, my name is Ben."


In [63]:
def tokenize(text):
    return text.lower().strip().split()

In [64]:
all_input_words = []
all_target_words = []
for idx,row in data.iterrows():
    all_input_words.extend(tokenize(row['input']))
    all_target_words.extend(tokenize(row['target'])) # Đảm bảo đây là row['target']
PAD_TOKEN = '<PAD>'
SOS_TOKEN = '<SOS>'
EOS_TOKEN = '<EOS>'

input_vocab = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN] + list(set(all_input_words))

target_vocab = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN] + list(set(all_target_words))

input_word2idx = {word : idx for idx,word in enumerate(input_vocab)}
input_idx2word = {idx : word for idx,word in input_word2idx.items()} # Cách này đúng

target_word2idx = {word : idx for idx,word in enumerate(target_vocab)}
target_idx2word = {idx : word for idx,word in enumerate(target_vocab)} # Cách xây dựng trực tiếp target_idx2word từ target_vocab



print("\n--- DEBUG VOCAB CREATION ---")
print(f"target_vocab (len {len(target_vocab)}): {target_vocab}")
print(f"target_word2idx['<EOS>']: {target_word2idx.get('<EOS>')}")
eos_idx = target_word2idx.get('<EOS>')
if eos_idx is not None:
    print(f"target_idx2word[{eos_idx}]: {target_idx2word.get(eos_idx)}")
print(f"Keys in target_idx2word (first 10 of {len(target_idx2word)}): {sorted(list(target_idx2word.keys()))[:10]}")
print("--- END DEBUG VOCAB CREATION ---\n")


--- DEBUG VOCAB CREATION ---
target_vocab (len 20): ['<PAD>', '<SOS>', '<EOS>', 'yes,', 'to', 'nice', 'name', 'ben.', 'hello,', 'eva.', 'you.', 'down.', 'you', 'my', 'sit', 'please', 'meet', 'can', 'is', 'john.']
target_word2idx['<EOS>']: 2
target_idx2word[2]: <EOS>
Keys in target_idx2word (first 10 of 20): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
--- END DEBUG VOCAB CREATION ---



In [65]:
len(input_vocab), len(target_vocab) 


(15, 20)

In [66]:
def encode_input(text):
    return [input_word2idx[word] for word in tokenize(text)]
def encode_target(text):
    return [target_word2idx[SOS_TOKEN]] + [target_word2idx[word] for word in tokenize(text)] + [target_word2idx[EOS_TOKEN]]

In [67]:
class Seq2SeqDataset(Dataset):
    def __init__(self, df):
        self.inputs = df['input'].apply(encode_input).tolist()
        self.targets = df['target'].apply(encode_target).tolist()
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]
        
dataset = Seq2SeqDataset(data)

In [68]:
def collate_fn(batch):
    inputs, targets = zip(*batch)
    
    input_lens = [len(seq) for seq in inputs]
    target_lens = [len(seq) for seq in targets]
    
    max_input_len = max(input_lens)
    max_target_len = max(target_lens)
    
    padded_inputs = []
    padded_targets = []
    
    for seq in inputs:
        padded_seq = seq + [input_word2idx[PAD_TOKEN]] * (max_input_len - len(seq))
        padded_inputs.append(padded_seq)
        
    for seq in targets:
        padded_seq = seq + [target_word2idx[PAD_TOKEN]] * (max_target_len - len(seq))
        padded_targets.append(padded_seq)
        
    return torch.tensor(padded_inputs), torch.tensor(input_lens), torch.tensor(padded_targets), torch.tensor(target_lens)

dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [69]:
class Encoder(nn.Module):
    def __init__(self, input_vocab_size, embed_size, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)
    
    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.gru(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        return outputs, hidden

In [70]:
class Decoder(nn.Module):
    def __init__(self, target_vocab_size, embed_size, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(target_vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, target_vocab_size)
    
    def forward(self, x, hidden):
        x = x.unsqueeze(1)  # batch_size x 1
        embedded = self.embedding(x)
        output, hidden = self.gru(embedded, hidden)
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden

In [71]:
def train_epoch(encoder, decoder, dataloader, encoder_optimizer, decoder_optimizer, criterion, device):
    encoder.train()
    decoder.train()
    
    total_loss = 0
    
    for inputs, input_lens, targets, target_lens in dataloader:
        inputs, input_lens = inputs.to(device), input_lens.to(device)
        targets, target_lens = targets.to(device), target_lens.to(device)
        
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        encoder_outputs, hidden = encoder(inputs, input_lens)
        
        batch_size = inputs.size(0)
        max_target_len = targets.size(1)
        
        # decoder input start with <SOS>
        decoder_input = targets[:, 0]  
        decoder_hidden = hidden
        
        loss = 0
        # Teacher forcing ratio
        teacher_forcing_ratio = 0.5
        
        for t in range(1, max_target_len):
            output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(output, targets[:, t])
            
            teacher_force = np.random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            decoder_input = targets[:, t] if teacher_force else top1
        
        loss.backward()
        
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        total_loss += loss.item() / max_target_len
    
    return total_loss / len(dataloader)

In [72]:
def encode_input(text): 
    return [input_word2idx[word] for word in tokenize(text)]

In [73]:
def evaluate(encoder, decoder, sentence, input_word2idx, target_word2idx, target_idx2word, encode_input_func, max_len=20, device='cpu'): # đổi tên encode_input để tránh nhầm lẫn
    encoder.eval()
    decoder.eval()

    # Sử dụng hàm encode_input_func được truyền vào
    tokens = encode_input_func(sentence) # Đổi tên ở đây nếu cần
    print(f"Input sentence: '{sentence}'")
    print(f"Tokens: {tokens}")
    print(f"Input vocab size for check: {len(input_word2idx)}")


    inputs = torch.tensor(tokens).unsqueeze(0).to(device)
    lengths = torch.tensor([len(tokens)]).to(device)

    with torch.no_grad():
        encoder_outputs, hidden = encoder(inputs, lengths)
        decoder_input = torch.tensor([target_word2idx['<SOS>']]).to(device)
        decoder_hidden = hidden

        decoded_words = []
 

        for i in range(max_len):
            output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            top1 = output.argmax(1).item()
   

            if top1 not in target_idx2word:
                print(f"----> CRITICAL: Index {top1} is NOT a key in target_idx2word (max index is {len(target_idx2word)-1})")
            
            if top1 == target_word2idx['<EOS>']:
                print("Predicted EOS token.")
                break

            word = target_idx2word.get(top1, '<UNK>') # Default to <UNK>
            if word == '<UNK>':
                print(f"----> WARNING: Predicted token is <UNK> for index {top1}")
            decoded_words.append(word)

            decoder_input = torch.tensor([top1]).to(device)
    
    return ' '.join(decoded_words)

In [74]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embed_size = 256
hidden_size = 512

encoder = Encoder(len(input_vocab), embed_size, hidden_size).to(device)
decoder = Decoder(len(target_vocab), embed_size, hidden_size).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=target_word2idx[PAD_TOKEN])

num_epochs = 100
for epoch in range(num_epochs):
    loss = train_epoch(encoder, decoder, dataloader, encoder_optimizer, decoder_optimizer, criterion, device)
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')

Epoch 1, Loss: 1.9429
Epoch 2, Loss: 1.1226
Epoch 3, Loss: 0.4915
Epoch 4, Loss: 0.1519
Epoch 5, Loss: 0.0997
Epoch 6, Loss: 0.0551
Epoch 7, Loss: 0.0463
Epoch 8, Loss: 0.0455
Epoch 9, Loss: 0.0311
Epoch 10, Loss: 0.0277
Epoch 11, Loss: 0.0177
Epoch 12, Loss: 0.0133
Epoch 13, Loss: 0.0124
Epoch 14, Loss: 0.0091
Epoch 15, Loss: 0.0080
Epoch 16, Loss: 0.0076
Epoch 17, Loss: 0.0034
Epoch 18, Loss: 0.0035
Epoch 19, Loss: 0.0039
Epoch 20, Loss: 0.0019
Epoch 21, Loss: 0.0017
Epoch 22, Loss: 0.0014
Epoch 23, Loss: 0.0016
Epoch 24, Loss: 0.0015
Epoch 25, Loss: 0.0013
Epoch 26, Loss: 0.0012
Epoch 27, Loss: 0.0010
Epoch 28, Loss: 0.0011
Epoch 29, Loss: 0.0011
Epoch 30, Loss: 0.0009
Epoch 31, Loss: 0.0010
Epoch 32, Loss: 0.0009
Epoch 33, Loss: 0.0008
Epoch 34, Loss: 0.0008
Epoch 35, Loss: 0.0008
Epoch 36, Loss: 0.0007
Epoch 37, Loss: 0.0008
Epoch 38, Loss: 0.0008
Epoch 39, Loss: 0.0007
Epoch 40, Loss: 0.0006
Epoch 41, Loss: 0.0007
Epoch 42, Loss: 0.0006
Epoch 43, Loss: 0.0005
Epoch 44, Loss: 0.00

In [77]:
def encode_input_for_eval(sentence):
    return [input_word2idx.get(word, input_word2idx['<PAD>']) for word in sentence.lower().split()]

predicted_sentence = evaluate(encoder, decoder, "hello my name John", input_word2idx, target_word2idx, target_idx2word, encode_input_for_eval, device=device)
print(predicted_sentence)

Input sentence: 'hello my name John'
Tokens: [7, 9, 5, 4]
Input vocab size for check: 15
Predicted EOS token.
hello, my name is john.


In [76]:
torch.save(encoder.state_dict(),"/kaggle/working/encoder.pth")
torch.save(decoder.state_dict(),"/kaggle/working/decoder.pth")