In [1]:
!pip install --upgrade sentencepiece accelerate peft bitsandbytes

Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting accelerate
  Downloading accelerate-1.10.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.17.0-py3-none-any.whl.metadata (14 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch

In [2]:
import torch
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import PeftModel
from tqdm.auto import tqdm

2025-08-14 14:40:56.358242: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755182456.580197      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755182456.642812      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# --- PATHS ---
# English -> Vietnamese: base checkpoint, adapter checkpoint, input CSV
en_vi_base_model_name = "Helsinki-NLP/opus-mt-en-vi"
en_vi_adapter_path = "/kaggle/input/opus-checkpoint-56250"
en_vi_input_csv = "/kaggle/input/medical-private-test/en.csv"

# Vietnamese -> English: the same three settings for the reverse direction
vi_en_base_model_name = "Helsinki-NLP/opus-mt-vi-en"
vi_en_adapter_path = "/kaggle/input/opus-checkpoint-28125"
vi_en_input_csv = "/kaggle/input/medical-private-test/vi.csv"

# --- RUNTIME ---
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Quantization settings shared by both models: 4-bit NF4 weights, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

def load_finetuned_model(base_model_name, adapter_path):
    """Load the base seq2seq model, attach the PEFT adapter and merge it in.

    Args:
        base_model_name: Model id or path handed to
            ``AutoModelForSeq2SeqLM.from_pretrained``.
        adapter_path: Directory handed to both ``AutoTokenizer.from_pretrained``
            and ``PeftModel.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is what ``merge_and_unload()``
        returns, already switched to eval mode.
    """
    print(f"Loading model: {base_model_name}")

    # The base weights are loaded with the module-level ``bnb_config`` and the
    # whole model is mapped to device index 0.
    quantized_base = AutoModelForSeq2SeqLM.from_pretrained(
        base_model_name, quantization_config=bnb_config, device_map={"": 0},
    )

    # The tokenizer is read from the adapter directory, not from the base model id.
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)

    # NOTE(review): the adapter is merged into a quantized base; this may introduce
    # small rounding differences versus merging in full precision — confirm acceptable.
    merged = PeftModel.from_pretrained(quantized_base, adapter_path).merge_and_unload()
    merged.eval()

    print(f"Model {base_model_name} is ready.")
    return merged, tokenizer

def translate_series(series, model, tokenizer, batch_size=16):
    """Translate every entry of a DataFrame column in fixed-size batches.

    Args:
        series: pandas Series holding the source sentences. Missing values
            (NaN/None) are translated as empty strings and any other
            non-string value is converted with ``str``, so the result stays
            row-aligned with the input instead of crashing in the tokenizer.
        model: Seq2seq model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``; used for encoding and decoding.
        batch_size: Number of sentences sent to ``generate`` per step.

    Returns:
        A list with one decoded translation per input row, in input order.
    """
    # An empty CSV field is read as NaN (a float); the tokenizer only accepts
    # strings, so normalise every cell before batching.
    sentences = ["" if pd.isna(s) else str(s) for s in series.tolist()]

    # Move inputs to wherever the model's weights live instead of depending on
    # a module-level ``device`` global.
    target_device = model.device

    predictions = []
    for start in tqdm(range(0, len(sentences), batch_size), desc=f"Translating {len(sentences)} sentences"):
        batch = sentences[start:start + batch_size]
        with torch.no_grad():  # inference only; no gradients needed
            inputs = tokenizer(
                batch, return_tensors="pt", padding=True, truncation=True, max_length=128
            ).to(target_device)
            generated_ids = model.generate(
                **inputs, max_new_tokens=128, num_beams=5, no_repeat_ngram_size=2, early_stopping=True
            )
            batch_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        predictions.extend(batch_preds)
    return predictions

# --- STEP 1: LOAD BOTH MODELS ---
model_en_vi, tokenizer_en_vi = load_finetuned_model(
    base_model_name=en_vi_base_model_name,
    adapter_path=en_vi_adapter_path,
)
model_vi_en, tokenizer_vi_en = load_finetuned_model(
    base_model_name=vi_en_base_model_name,
    adapter_path=vi_en_adapter_path,
)

# --- STEP 2: READ AND TRANSLATE EACH FILE ---
# English source column -> Vietnamese predictions
print("\n--- Starting EN -> VI Translation ---")
df_en_vi = pd.read_csv(en_vi_input_csv)
translated_vietnamese = translate_series(
    series=df_en_vi["English"], model=model_en_vi, tokenizer=tokenizer_en_vi
)

# Vietnamese source column -> English predictions
print("\n--- Starting VI -> EN Translation ---")
df_vi_en = pd.read_csv(vi_en_input_csv)
translated_english = translate_series(
    series=df_vi_en["Vietnamese"], model=model_vi_en, tokenizer=tokenizer_vi_en
)

Loading model: Helsinki-NLP/opus-mt-en-vi


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/289M [00:00<?, ?B/s]



Model Helsinki-NLP/opus-mt-en-vi is ready.
Loading model: Helsinki-NLP/opus-mt-vi-en


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/289M [00:00<?, ?B/s]

Model Helsinki-NLP/opus-mt-vi-en is ready.

--- Starting EN -> VI Translation ---


Translating 1000 sentences:   0%|          | 0/63 [00:00<?, ?it/s]


--- Starting VI -> EN Translation ---


Translating 1000 sentences:   0%|          | 0/63 [00:00<?, ?it/s]

In [4]:
# --- Post-process the translations ---
# Announce the cleanup stage; the cleaning itself happens further down in this cell.
print("Cleaning and organizing results...")

# Function to clean the output text (remove potential artifacts)
def clean_output(text):
    """Strip leftover markup from one decoded translation.

    Two passes are applied in order:
      1. every ``<think>...</think>`` span is deleted (may span several lines);
      2. every ``<|...|>`` marker that sits on a single line is deleted.
    Surrounding whitespace is trimmed after each pass.

    Args:
        text: Decoded model output.

    Returns:
        The cleaned string.
    """
    without_reasoning = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    # No DOTALL here: a marker split across lines is left untouched.
    without_markers = re.sub(r'<\|.*?\|>', '', without_reasoning.strip())
    return without_markers.strip()

# Run every translation, in both directions, through clean_output
translated_english = list(map(clean_output, translated_english))
translated_vietnamese = list(map(clean_output, translated_vietnamese))

Cleaning and organizing results...


In [5]:
# --- 6. Create the Final DataFrame and Save ---
print("Creating the final DataFrame...")

# One column per translation direction; both lists must have the same length
# or pandas raises when building the frame.
result_columns = {
    'English': translated_english,
    'Vietnamese': translated_vietnamese,
}
results_df = pd.DataFrame(result_columns)

# Show the first rows between a header and a footer line as a quick sanity check
print(
    "\n--- Sample of Results ---",
    results_df.head(),
    "--------------------------",
    sep="\n",
)

# Write the frame to the Kaggle working directory; a failure is reported, not raised
output_file = '/kaggle/working/results.csv'
try:
    results_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"\nDone. CSV file with bidirectional translations saved successfully at: {output_file}")
except Exception as e:
    print(f"\nError: Could not save the file. Reason: {e}")

Creating the final DataFrame...

--- Sample of Results ---
                                             English  \
0  Inhalation (A18.0.05), the region of the phosp...   
1  Absorptions of image diagnosis of other parts ...   
2      The tendon and muscle damage in the arm range   
3                         Intravenous nerve syndrome   
4        Fake joints after fixed spinal cord (M96.0)   

                                          Vietnamese  
0  Đánh giá các triệu chứng lâm sàng, cận lâm lòn...  
1  Đánh giá triệu chứng lâm sàng, cận lâm lòng củ...  
2   Có mối liên quan giữa rối loạn chức năng âm đạo.  
3  Nhiễm dịch đường truyền trên V là một bệnh phổ...  
4  Các triệu chứng chính là viêm họng, tắc mũi, v...  
--------------------------

Done. CSV file with bidirectional translations saved successfully at: /kaggle/working/results.csv
