In [None]:
!pip install torchtext==0.17.2

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [None]:
!pip install datasets
from datasets import load_dataset

# Load the "harouzie/vi_en-translation" dataset through `datasets.load_dataset`.
# Later cells read its 'train', 'valid' and 'test' splits and the
# 'English' / 'Vietnamese' columns of each sample.
data = load_dataset("harouzie/vi_en-translation")

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [None]:
# The dataset column names double as the language keys used everywhere below.
src_lang, tgt_lang = 'English', 'Vietnamese'

# Both languages are tokenized with torchtext's 'basic_english' tokenizer.
token_transform = {
    src_lang: get_tokenizer('basic_english'),
    tgt_lang: get_tokenizer('basic_english'),
}
# Placeholders; one vocabulary per language is filled in further down.
vocab_transform = dict.fromkeys((src_lang, tgt_lang))

# Reserved tokens: each id is the token's position in `special_symbols`
# (<unk>=0, <pad>=1, <sos>=2, <eos>=3).
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']
unk_id, pad_id, sos_id, eos_id = range(len(special_symbols))
def yield_tokens(data_iter, lang):
    """Lazily yield the token list of every `lang` entry in `data_iter`.

    `data_iter[lang]` is iterated and each item is passed through the
    tokenizer registered for `lang` in `token_transform`.
    """
    tokenize = token_transform[lang]
    yield from map(tokenize, data_iter[lang])

# The training split is the only source of vocabulary entries.
train_iter = data['train']

for lang in (src_lang, tgt_lang):
    # Special symbols are inserted first so that they receive ids 0..3.
    vocab_transform[lang] = build_vocab_from_iterator(
        yield_tokens(train_iter, lang),
        specials=special_symbols,
        special_first=True,
        min_freq=1,
    )
    # Tokens missing from the vocabulary map to <unk>.
    vocab_transform[lang].set_default_index(unk_id)

In [None]:
# NOTE(review): this module-level `max_len` is not referenced by any other
# visible cell (functions that take a `max_len` parameter shadow it) —
# confirm whether sequence truncation was intended.
max_len = 100

def sequential_transforms(*transforms):
    """Compose `transforms` into a single callable applied left to right.

    The returned function feeds its argument to the first transform, the
    result to the second, and so on, returning the final value. With no
    transforms the input is returned unchanged.
    """
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result

    return func


def tensor_transform(token_ids):
    """Wrap a list of token ids with <sos>/<eos> ids and return it as a tensor."""
    framed_ids = [sos_id, *token_ids, eos_id]
    return torch.tensor(framed_ids)


# Per-language pipeline: raw string -> tokens -> vocabulary ids -> framed tensor.
text_transform = {}
for lang in [src_lang, tgt_lang]:
    text_transform[lang] = sequential_transforms(
        token_transform[lang],
        vocab_transform[lang],
        tensor_transform,
    )

def collate_fn(batch):
    """Turn a list of dataset samples into padded (source, target) id tensors.

    Each sample is indexed by `src_lang` / `tgt_lang`, converted with the
    matching `text_transform`, cast to int64 and right-padded with `pad_id`.
    Both returned tensors are batch-first.
    """
    def encode(sample, lang):
        return text_transform[lang](sample[lang]).to(dtype=torch.int64)

    src_seqs = [encode(sample, src_lang) for sample in batch]
    tgt_seqs = [encode(sample, tgt_lang) for sample in batch]

    return (
        pad_sequence(src_seqs, padding_value=pad_id, batch_first=True),
        pad_sequence(tgt_seqs, padding_value=pad_id, batch_first=True),
    )

In [None]:
batch_size = 16

# One loader per split; all three share the batch size and the collate function.
train_dataloader, valid_dataloader, test_dataloader = (
    DataLoader(data[split], batch_size=batch_size, collate_fn=collate_fn)
    for split in ('train', 'valid', 'test')
)

In [None]:
@dataclass
class TransformerConfig:
    """Hyper-parameters shared by every module of the translation Transformer.

    Declared as a dataclass so individual fields can be overridden at
    construction time, e.g. `TransformerConfig(encoder_depth=6)`;
    `TransformerConfig()` keeps returning the defaults below.
    """

    # Width of token embeddings and of every attention / MLP input and output.
    embedding_dimension: int = 512
    # Number of attention heads; must divide `embedding_dimension`.
    num_attention_heads: int = 8
    # Dropout probability applied inside attention while training.
    attention_dropout_p: float = 0.0
    # Dropout probability applied to residual branches and inside the MLP.
    hidden_dropout_p: float = 0.0
    # MLP hidden width is `embedding_dimension * mlp_ratio`.
    mlp_ratio: int = 4
    # Number of stacked encoder / decoder layers.
    encoder_depth: int = 3
    decoder_depth: int = 3

    # Vocabulary sizes, read once from the vocabularies when the class is defined.
    src_vocab_size: int = len(vocab_transform["English"])
    tgt_vocab_size: int = len(vocab_transform["Vietnamese"])

    # Number of rows in the source / target positional-encoding tables.
    max_src_len: int = 512
    max_tgt_len: int = 512
    # Forwarded as `requires_grad` of the positional-encoding parameters.
    learn_pos_embed: bool = False

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encodings that are added to token embeddings.

    A `(max_len, embed_dim)` table is built once: even columns hold
    `sin(pos / 10000^(i / embed_dim))` and odd columns the matching `cos`.
    The table is stored as an `nn.Parameter` named `encodings` whose
    `requires_grad` flag is the `requires_grad` constructor argument.

    Args:
        max_len: number of positions in the table.
        embed_dim: width of each encoding (odd widths are supported).
        requires_grad: whether the table receives gradients.
    """

    def __init__(self, max_len, embed_dim, requires_grad=False):
        super(PositionalEncoding, self).__init__()

        self.max_len = max_len
        self.embed_dim = embed_dim
        self.requires_grad = requires_grad

        self.encodings = self._build_positional_encodings()

    def _build_positional_encodings(self):
        """Return the sinusoidal table wrapped in an `nn.Parameter`."""
        encoding = torch.zeros(self.max_len, self.embed_dim, dtype=torch.float)
        position_idx = torch.arange(0, self.max_len, dtype=torch.float).reshape(-1, 1)
        embed_dim_skip_idx = torch.arange(0, self.embed_dim, step=2, dtype=torch.float)

        # Shared argument of the sin/cos pair, computed once.
        angles = position_idx / (10000 ** (embed_dim_skip_idx / self.embed_dim))

        encoding[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one fewer odd column than even columns, so the
        # cosine term is trimmed to fit (a no-op for even widths).
        encoding[:, 1::2] = torch.cos(angles[:, : self.embed_dim // 2])

        return nn.Parameter(encoding, requires_grad=self.requires_grad)

    def forward(self, x):
        """Add the first `x.shape[1]` encodings to `x`.

        `x` is indexed as (batch, sequence, embedding).

        Raises:
            ValueError: if the sequence is longer than the table.
        """
        seq_len = x.shape[1]
        if seq_len > self.max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional table size {self.max_len}"
            )

        return x + self.encodings[:seq_len]


class Embeddings(nn.Module):
    """
    Token and positional embeddings for both sides of the translation pair.

    Source and target each own an `nn.Embedding` lookup table and a
    `PositionalEncoding` whose output is the token embedding plus the
    positional encoding.
    """

    def __init__(self, config):
        super().__init__()

        dim = config.embedding_dimension
        learnable_positions = config.learn_pos_embed

        # Token lookup tables (source first, then target).
        self.src_embeddings = nn.Embedding(config.src_vocab_size, dim)
        self.tgt_embeddings = nn.Embedding(config.tgt_vocab_size, dim)

        # Positional tables sized by the per-side maximum length.
        self.src_positional_encodings = PositionalEncoding(
            config.max_src_len, dim, learnable_positions
        )
        self.tgt_positional_encodings = PositionalEncoding(
            config.max_tgt_len, dim, learnable_positions
        )

    def forward_src(self, input_ids):
        """Embed source token ids and add positional encodings."""
        return self.src_positional_encodings(self.src_embeddings(input_ids))

    def forward_tgt(self, input_ids):
        """Embed target token ids and add positional encodings."""
        return self.tgt_positional_encodings(self.tgt_embeddings(input_ids))

class Attention(nn.Module):
    """
    Multi-head attention built on `F.scaled_dot_product_attention`.

    With only `src` it is self-attention; when `tgt` is also given it is
    cross-attention (queries come from `tgt`, keys and values from `src`).
    """
    def __init__(self, config):
        super(Attention, self).__init__()

        self.config = config

        assert config.embedding_dimension % config.num_attention_heads == 0, "Double check embedding dim divisible by number of heads"

        self.head_dim = config.embedding_dimension // config.num_attention_heads

        self.q_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
        self.k_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
        self.v_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)

        self.out_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)

    def _split_heads(self, x):
        """Reshape (batch, seq, embed) into (batch, heads, seq, head_dim)."""
        batch, seq_len, _ = x.shape
        return x.reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(self,
                src,
                tgt=None,
                attention_mask=None,
                causal=False):
        """
        Args:
            src: (batch, src_len, embed) tensor providing keys and values
                (and queries when `tgt` is None).
            tgt: optional (batch, tgt_len, embed) tensor providing queries.
            attention_mask: optional (batch, src_len) mask over the keys;
                truthy entries are attended to, falsy entries are ignored.
            causal: restrict each query to keys at the same or earlier
                position. Only honoured for self-attention (`tgt is None`).

        Returns:
            (batch, query_len, embed) tensor.
        """
        query_source = src if tgt is None else tgt

        q = self._split_heads(self.q_proj(query_source))
        k = self._split_heads(self.k_proj(src))
        v = self._split_heads(self.v_proj(src))

        query_len, key_len = q.shape[2], k.shape[2]

        # Cross-attention is never causal.
        is_causal = bool(causal) and tgt is None

        if attention_mask is not None:
            # (batch, key_len) -> (batch, 1, query_len, key_len)
            attention_mask = attention_mask.bool()
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, 1, query_len, 1)

            if is_causal:
                # scaled_dot_product_attention rejects an explicit attn_mask
                # combined with is_causal=True, so fold the lower-triangular
                # constraint into the padding mask instead.
                causal_mask = torch.ones(query_len, key_len,
                                         dtype=torch.bool,
                                         device=attention_mask.device).tril()
                attention_mask = attention_mask & causal_mask
                is_causal = False

        attention_out = F.scaled_dot_product_attention(q, k, v,
                                                       attn_mask=attention_mask,
                                                       dropout_p=self.config.attention_dropout_p if self.training else 0.0,
                                                       is_causal=is_causal)

        # (batch, heads, query_len, head_dim) -> (batch, query_len, embed)
        attention_out = attention_out.transpose(1, 2).flatten(2)
        attention_out = self.out_proj(attention_out)

        return attention_out
class FeedForward(nn.Module):
    """
    Position-wise MLP applied after attention.

    Expands to `embedding_dimension * mlp_ratio`, applies GELU, projects
    back to `embedding_dimension`; dropout follows each of the two stages.
    """
    def __init__(self, config):
        super().__init__()

        embed_dim = config.embedding_dimension
        inner_dim = embed_dim * config.mlp_ratio
        drop_p = config.hidden_dropout_p

        # Expansion stage.
        self.intermediate_dense = nn.Linear(embed_dim, inner_dim)
        self.activation = nn.GELU()
        self.intermediate_dropout = nn.Dropout(drop_p)

        # Projection back to the model width.
        self.output_dense = nn.Linear(inner_dim, embed_dim)
        self.output_dropout = nn.Dropout(drop_p)

    def forward(self, x):
        hidden = self.intermediate_dropout(self.activation(self.intermediate_dense(x)))
        return self.output_dropout(self.output_dense(hidden))

class TransformerEncoderLayer(nn.Module):

    """
    One encoder block: self-attention followed by the MLP, each wrapped in a
    residual connection with layer normalisation applied after the addition.
    """

    def __init__(self, config):
        super().__init__()

        width = config.embedding_dimension

        self.enc_attention = Attention(config)
        self.dropout = nn.Dropout(config.hidden_dropout_p)
        self.layer_norm = nn.LayerNorm(width)
        self.feed_forward = FeedForward(config)
        self.final_layer_norm = nn.LayerNorm(width)

    def forward(self, x, attention_mask=None):
        # Self-attention sub-layer.
        attended = self.enc_attention(x, attention_mask=attention_mask)
        x = self.layer_norm(x + self.dropout(attended))

        # Feed-forward sub-layer.
        return self.final_layer_norm(x + self.feed_forward(x))

class TransformerDecoderLayer(nn.Module):

    """
    One decoder block: causal self-attention over the target, cross-attention
    onto the encoded source, then the MLP. Every sub-layer is residual and is
    followed by layer normalisation.
    """

    def __init__(self, config):
        super().__init__()

        width = config.embedding_dimension
        drop_p = config.hidden_dropout_p

        # Causal self-attention over the target sequence.
        self.dec_attention = Attention(config)
        self.dec_attention_dropout = nn.Dropout(drop_p)
        self.dec_attention_layernorm = nn.LayerNorm(width)

        # Cross-attention: target queries, source keys/values.
        self.cross_attention = Attention(config)
        self.cross_attention_dropout = nn.Dropout(drop_p)
        self.cross_attention_layernorm = nn.LayerNorm(width)

        self.feed_forward = FeedForward(config)
        self.final_layer_norm = nn.LayerNorm(width)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        self_attended = self.dec_attention(tgt, attention_mask=tgt_mask, causal=True)
        tgt = self.dec_attention_layernorm(tgt + self.dec_attention_dropout(self_attended))

        cross_attended = self.cross_attention(src, tgt, attention_mask=src_mask)
        tgt = self.cross_attention_layernorm(tgt + self.cross_attention_dropout(cross_attended))

        return self.final_layer_norm(tgt + self.feed_forward(tgt))

class Transformer(nn.Module):
    """
    Encoder-decoder Transformer for translation.

    Holds the source/target embeddings, a stack of encoder layers, a stack of
    decoder layers and a linear head projecting onto the target vocabulary.
    """

    def __init__(self, config):
        super(Transformer, self).__init__()

        self.config = config

        self.encodings = Embeddings(config)

        self.encoder = nn.ModuleList(
            [TransformerEncoderLayer(config) for _ in range(config.encoder_depth)]
        )

        self.decoder = nn.ModuleList(
            [TransformerDecoderLayer(config) for _ in range(config.decoder_depth)]
        )

        self.head = nn.Linear(config.embedding_dimension, config.tgt_vocab_size)

    def forward(self,
                src_ids,
                tgt_ids,
                src_attention_mask=None,
                tgt_attention_mask=None):
        """
        Teacher-forced pass.

        Args:
            src_ids: source token ids, indexed (batch, src_len).
            tgt_ids: target token ids, indexed (batch, tgt_len).
            src_attention_mask: optional mask over source positions, used by
                the encoder and by decoder cross-attention.
            tgt_attention_mask: optional mask over target positions, used by
                decoder self-attention.

        Returns:
            Scores over the target vocabulary for every target position.
        """
        src_embeddings = self.encodings.forward_src(src_ids)
        tgt_embeddings = self.encodings.forward_tgt(tgt_ids)

        for layer in self.encoder:
            src_embeddings = layer(src_embeddings,
                                   src_attention_mask)

        for layer in self.decoder:
            tgt_embeddings = layer(src_embeddings,
                                   tgt_embeddings,
                                   src_attention_mask,
                                   tgt_attention_mask)

        pred = self.head(tgt_embeddings)

        return pred

    @torch.no_grad()  # decoding never needs gradients; avoids building a graph per step
    def inference(self,
                  src_ids,
                  tgt_start_id=2,
                  tgt_end_id=3,
                  max_len=512):
        """
        Greedy decoding of a single source sentence.

        Args:
            src_ids: source token ids for one sentence (the target buffer is
                created with batch size 1).
            tgt_start_id: id the target sequence starts with.
            tgt_end_id: id that stops decoding once predicted.
            max_len: maximum number of decoding steps; capped at
                `config.max_tgt_len` so the target never outgrows the
                positional-encoding table.

        Returns:
            List of generated ids, including the start id and, when it was
            produced, the end id.
        """
        tgt_ids = torch.tensor([tgt_start_id], device=src_ids.device).reshape(1, 1)

        # Encode the source once; it is reused at every decoding step.
        src_embeddings = self.encodings.forward_src(src_ids)
        for layer in self.encoder:
            src_embeddings = layer(src_embeddings)

        max_steps = min(max_len, self.config.max_tgt_len)

        for _ in range(max_steps):

            tgt_embeddings = self.encodings.forward_tgt(tgt_ids)
            for layer in self.decoder:
                tgt_embeddings = layer(src_embeddings,
                                       tgt_embeddings)

            # Only the last position predicts the next token.
            tgt_embeddings = tgt_embeddings[:, -1]

            pred = self.head(tgt_embeddings)
            pred = pred.argmax(axis=-1).unsqueeze(0)
            tgt_ids = torch.cat([tgt_ids, pred], axis=-1)

            if torch.all(pred == tgt_end_id):
                break

        return tgt_ids.squeeze().cpu().tolist()

In [None]:
# Hyper-parameters: the defaults declared on TransformerConfig.
config = TransformerConfig()

# Transformer assembled from that configuration.
model = Transformer(config)

In [None]:
import torch
import pickle

# Pick the device once so the checkpoint tensors and the model agree on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
checkpoint = torch.load("/content/model_after.pth", map_location=device)

model.load_state_dict(checkpoint)

# The translation cells send their inputs to the GPU when one is available,
# so the model has to live on the same device.
model.to(device)

model.eval()


In [None]:
!pip install faiss-cpu

In [None]:
import pandas as pd
import numpy as np
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer


file_path = "/content/Dataset_Final.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Column that gets indexed.
column_name = "English"

if df[column_name].dtype == object:
    print("Dữ liệu là văn bản -> Dùng TF-IDF để tạo vector")

    # Text column: vectorise with TF-IDF limited to 300 features.
    text_data = df[column_name].astype(str).tolist()
    vectorizer = TfidfVectorizer(max_features=300)
    vectors = vectorizer.fit_transform(text_data).toarray().astype(np.float32)

else:
    print("Dữ liệu là số -> Dùng trực tiếp")
    # Non-text column: use every numeric column of the frame as the vector.
    vectors = df.select_dtypes(include=[np.number]).to_numpy().astype(np.float32)

# L2-normalise so that inner product equals cosine similarity. Rows with no
# non-zero component have norm 0; dividing by it would fill the row with NaN,
# so those rows are divided by 1 and stay all-zero.
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
norms[norms == 0] = 1.0
vectors = vectors / norms

# Exact inner-product index over the normalised vectors.
vector_dim = vectors.shape[1]
index = faiss.IndexFlatIP(vector_dim)
index.add(vectors)

print(f"Đã tạo FAISS index với {index.ntotal} vector, chiều: {vector_dim}")

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics.pairwise import cosine_similarity

def translate_with_reference(input_text, model, df, text_transform, vocab_transform, config, device):
    """
    Translate `input_text` to Vietnamese while letting the encoder also see
    the most similar English sentence from `df`.

    Steps:
      1. Embed the input and every sentence of `df["English"]` with the
         source embeddings and mean-pool over real (non-padding) tokens.
      2. Pick the reference with the highest cosine similarity to the input.
      3. Run the encoder over `[reference ; input]` and keep only the
         positions that belong to the input.
      4. Greedily decode from those encoder states.

    Args:
        input_text: English sentence to translate.
        model: object exposing `encodings.forward_src/forward_tgt`,
            `encoder`, `decoder`, `head`, `to` and `eval`.
        df: frame whose "English" column holds the reference sentences.
        text_transform: mapping language -> callable turning text into a
            1-D tensor of token ids.
        vocab_transform: mapping language -> vocabulary with `lookup_token`.
        config: provides `max_src_len` and `max_tgt_len`.
        device: device the computation runs on; the model is moved to it.

    Returns:
        The translation as a space-joined string of target tokens with
        <pad>/<sos>/<eos> removed.

    Raises:
        ValueError: if `df["English"]` is empty.
    """
    src_lang = "English"
    tgt_lang = "Vietnamese"
    # Ids follow the order of the special symbols used to build the
    # vocabularies: <unk>=0, <pad>=1, <sos>=2, <eos>=3.
    pad_id = 1
    sos_id = 2
    eos_id = 3
    # References are scored in chunks so the padded batch stays small.
    ref_chunk_size = 256

    # Inputs are sent to `device`, so the model must be there as well, and it
    # must be in eval mode before the first forward pass (not only before decoding).
    model.to(device)
    model.eval()

    english_sentences = df["English"].tolist()
    if not english_sentences:
        raise ValueError("df['English'] must contain at least one reference sentence")

    input_tensor = text_transform[src_lang](input_text).unsqueeze(0).to(device)
    with torch.no_grad():
        src_embedding = model.encodings.forward_src(input_tensor)
    src_embedding_mean = src_embedding.mean(dim=1)

    # --- retrieve the closest reference sentence -------------------------
    ref_tensors = [text_transform[src_lang](sent) for sent in english_sentences]

    similarity_chunks = []
    with torch.no_grad():
        for start in range(0, len(ref_tensors), ref_chunk_size):
            ref_batch = pad_sequence(ref_tensors[start:start + ref_chunk_size],
                                     padding_value=pad_id,
                                     batch_first=True).to(device)
            ref_embeddings = model.encodings.forward_src(ref_batch)

            # Mean over real tokens only; padded positions would otherwise
            # drag every short sentence towards the same vector.
            keep = (ref_batch != pad_id).unsqueeze(-1).to(ref_embeddings.dtype)
            ref_embeddings_mean = (ref_embeddings * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1)

            similarity_chunks.append(
                F.cosine_similarity(src_embedding_mean, ref_embeddings_mean, dim=1)
            )

    similarities = torch.cat(similarity_chunks)
    ref_idx = similarities.argmax().item()

    # --- encode [reference ; input] --------------------------------------
    ref_tensor = ref_tensors[ref_idx].unsqueeze(0).to(device)
    with torch.no_grad():
        ref_embedding = model.encodings.forward_src(ref_tensor)

    # Keep the combined length within max_src_len by trimming the reference,
    # never the sentence being translated.
    src_len = src_embedding.shape[1]
    max_ref_len = max(config.max_src_len - src_len, 0)
    ref_embedding = ref_embedding[:, :max_ref_len, :]
    ref_len = ref_embedding.shape[1]

    combined_embedding = torch.cat([ref_embedding, src_embedding], dim=1)
    combined_embedding = combined_embedding[:, :config.max_src_len, :]
    src_attention_mask = torch.ones(1, combined_embedding.shape[1],
                                    dtype=torch.int64, device=device)

    with torch.no_grad():
        for layer in model.encoder:
            combined_embedding = layer(combined_embedding, attention_mask=src_attention_mask)

    # Only the positions of the input sentence are handed to the decoder.
    src_embedding_processed = combined_embedding[:, ref_len:, :]

    def inference_with_embedding(model, src_embedding, tgt_start_id=sos_id, tgt_end_id=eos_id, max_len=512):
        """Greedy decoding from already-encoded source states."""
        # Never let the target outgrow the target positional table.
        max_steps = min(max_len, config.max_tgt_len)
        tgt_ids = torch.tensor([tgt_start_id], device=src_embedding.device).reshape(1, 1)
        with torch.no_grad():
            for _ in range(max_steps):
                tgt_embeddings = model.encodings.forward_tgt(tgt_ids)
                for layer in model.decoder:
                    tgt_embeddings = layer(src_embedding, tgt_embeddings)
                tgt_embeddings = tgt_embeddings[:, -1]
                pred = model.head(tgt_embeddings)
                pred = pred.argmax(dim=-1).unsqueeze(0)
                tgt_ids = torch.cat([tgt_ids, pred], dim=-1)
                if torch.all(pred == tgt_end_id):
                    break
        return tgt_ids.squeeze().cpu().tolist()

    tgt_ids = inference_with_embedding(model, src_embedding_processed)

    # --- ids -> text -------------------------------------------------------
    vocab = vocab_transform[tgt_lang]
    special_ids = {pad_id, sos_id, eos_id}
    translated_sentence = " ".join(
        vocab.lookup_token(token_id) for token_id in tgt_ids if token_id not in special_ids
    )
    return translated_sentence

In [None]:
def translate_without_reference(input_text, model, text_transform, vocab_transform, config, device):
    """
    Translate `input_text` to Vietnamese with plain greedy decoding (no
    retrieved reference sentence).

    Args:
        input_text: English sentence to translate.
        model: model exposing `inference(src_ids, tgt_start_id, tgt_end_id,
            max_len)`, `to` and `eval`.
        text_transform: mapping language -> callable turning text into a
            1-D tensor of token ids.
        vocab_transform: mapping language -> vocabulary with `lookup_token`.
        config: unused; kept so the signature mirrors
            `translate_with_reference`.
        device: device the computation runs on; the model is moved to it.

    Returns:
        The translation as a space-joined string of target tokens with
        <pad>/<sos>/<eos> removed.
    """
    src_lang = "English"
    tgt_lang = "Vietnamese"
    # Ids follow the order of the special symbols used to build the
    # vocabularies: <unk>=0, <pad>=1, <sos>=2, <eos>=3.
    pad_id = 1
    sos_id = 2
    eos_id = 3

    # Align the model with the inputs and switch to eval mode before any
    # forward pass (encoder included).
    model.to(device)
    model.eval()

    input_tensor = text_transform[src_lang](input_text).unsqueeze(0).to(device)

    # The model's own greedy decoder performs exactly the encode-then-decode
    # loop this function used to duplicate.
    with torch.no_grad():
        tgt_ids = model.inference(input_tensor,
                                  tgt_start_id=sos_id,
                                  tgt_end_id=eos_id,
                                  max_len=512)

    vocab = vocab_transform[tgt_lang]
    special_ids = {pad_id, sos_id, eos_id}
    translated_sentence = " ".join(
        vocab.lookup_token(token_id) for token_id in tgt_ids if token_id not in special_ids
    )
    return translated_sentence

In [None]:
import numpy as np
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
input_text = "Anxiety disorders can interfere with daily life and cause severe mental health issues."

# Keep only rows that have an English sentence and make every entry a str.
df = df.dropna(subset=["English"]).assign(
    English=lambda frame: frame["English"].astype(str)
)

# Retrieval-augmented translation.
translated_text_with_rag = translate_with_reference(
    input_text, model, df, text_transform, vocab_transform, config, device
)
print(f"Bản dịch với RAG: {translated_text_with_rag}")

# Plain greedy translation for comparison.
translated_text_without_rag = translate_without_reference(
    input_text, model, text_transform, vocab_transform, config, device
)
print(f"Bản dịch không RAG: {translated_text_without_rag}")

In [None]:
import numpy as np
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentences translated both with and without the retrieved reference.
sentences = [
    "Hepatitis B virus infection",
    "Doctors are concerned about Other disorders of nervous system in diseases classified elsewhere in the community",
    "Dengue hemorrhagic fever needs careful monitoring to prevent complications.",
    "Chronic kidney disease requires long-term treatment.",
    "Asthma is a chronic disease that affects the airways and causes breathing difficulties.",
    "Anxiety disorders can interfere with daily life and cause severe mental health issues."
]

# Keep only rows that have an English sentence and make every entry a str.
df = df.dropna(subset=["English"]).assign(
    English=lambda frame: frame["English"].astype(str)
)

for input_text in sentences:
    # Retrieval-augmented translation.
    translated_text_with_rag = translate_with_reference(
        input_text, model, df, text_transform, vocab_transform, config, device
    )
    print(f"Câu gốc: {input_text}")
    print(f"Bản dịch với RAG: {translated_text_with_rag}")

    # Plain greedy translation for comparison.
    translated_text_without_rag = translate_without_reference(
        input_text, model, text_transform, vocab_transform, config, device
    )
    print(f"Bản dịch không RAG: {translated_text_without_rag}")

    print("__________________________")