# In [None]:
import os
import warnings
import pandas as pd
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Silence all Python warnings for cleaner notebook output.
warnings.filterwarnings("ignore")

# Directory the final model and tokenizer are saved to after training.
checkpoint_dir = "./vit5-finetuned-vietnews-3"
# Checkpoint that both the tokenizer and the model weights are loaded from.
# NOTE(review): judging by the path this is checkpoint-4000 of an earlier
# "vietnews-2" run -- confirm.
model_name = "/kaggle/input/train2/kaggle/working/vit5-finetuned-vietnews-2/checkpoint-4000"

# Load the tokenizer and the seq2seq model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# .cuda() moves the model to the GPU, so this cell needs a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()

# Load the tab-separated dataset; the printed count is the total number of
# rows before the train/validation split below.
train_data = pd.read_csv("/kaggle/input/datasetkhai/articles_training.tsv", sep="\t")
print(f'Train data: {len(train_data)}')

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
train_data, valid_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Define a custom dataset class
class VietNewsDataset(Dataset):
    """Map-style dataset that turns (content, tags) rows into seq2seq features.

    Each row's ``content`` column is prefixed with ``"vietnews: "`` and
    tokenized as the encoder input; the ``tags`` column is tokenized as the
    decoder target. Both are padded/truncated to fixed lengths.

    Args:
        data: Object supporting ``len()`` and ``.iloc[index]`` whose rows
            expose ``'content'`` and ``'tags'`` strings (a DataFrame here).
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors="pt"``.
        max_input_length: Fixed token length of the encoder input.
        max_target_length: Fixed token length of the target sequence.
        ignore_pad_token_for_loss: When True (default), padded label
            positions are set to -100 so the cross-entropy loss skips them.
            Pass False if the labels will be decoded back to text.
    """

    def __init__(self, data, tokenizer, max_input_length=512, max_target_length=128,
                 ignore_pad_token_for_loss=True):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.ignore_pad_token_for_loss = ignore_pad_token_for_loss

    def __len__(self):
        """Return the number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row."""
        item = self.data.iloc[index]
        # NOTE(review): "</s>" is appended by hand here; the tokenizer may add
        # its own end-of-sequence token, and truncation can cut this literal
        # off for long articles. Left unchanged so inputs stay consistent with
        # earlier runs of this pipeline -- confirm against the tokenizer in use.
        input_text = "vietnews: " + item['content'] + "</s>"
        target_text = item['tags']

        # Tokenize input and target to fixed-length tensors of shape (1, L).
        inputs = self.tokenizer(
            input_text, max_length=self.max_input_length,
            padding="max_length", truncation=True, return_tensors="pt"
        )
        targets = self.tokenizer(
            target_text, max_length=self.max_target_length,
            padding="max_length", truncation=True, return_tensors="pt"
        )

        labels = targets["input_ids"].squeeze(0)
        if self.ignore_pad_token_for_loss:
            # Without this the model is also trained to predict padding:
            # positions where the target attention mask is 0 become -100,
            # the index the loss ignores.
            padding_positions = targets["attention_mask"].squeeze(0) == 0
            labels = labels.masked_fill(padding_positions, -100)

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels,
        }

# Wrap both splits with the default sequence lengths of VietNewsDataset.
train_dataset = VietNewsDataset(train_data, tokenizer)
valid_dataset = VietNewsDataset(valid_data, tokenizer)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    # Reuse checkpoint_dir so intermediate checkpoints and the final save
    # below always land in the same directory.
    output_dir=checkpoint_dir,
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # NOTE(review): no compute_metrics is passed to the trainer below, so the
    # generated predictions are not scored -- confirm generation is needed
    # during evaluation.
    predict_with_generate=True,
    logging_dir="./logs",
    # Log, checkpoint and evaluate on the same 500-step cadence.
    logging_steps=500,
    save_steps=500,
    eval_steps=500,
    # Together with the per-device batch size of 8 this gives 16 samples per
    # optimizer step on each device.
    gradient_accumulation_steps=2,
    fp16=True,  # Enable mixed precision for faster training
    report_to="none",
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


# Save the fine-tuned model after training
trainer.save_model(checkpoint_dir)

# Save the tokenizer next to the model so the directory can be loaded on its own
tokenizer.save_pretrained(checkpoint_dir)

# Evaluate the model on the validation split and print the resulting metrics
metrics = trainer.evaluate()
print(metrics)

# In [None]:
from torch.utils.data import DataLoader
import torch
import warnings
import pandas as pd
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from tqdm import tqdm
class VietNewsDataset(Dataset):
    """Dataset yielding tokenized article inputs and tag targets for evaluation.

    Rows are read with ``data.iloc[index]`` and must expose ``'content'`` and
    ``'tags'``. The article text is prefixed with ``"vietnews: "`` and both
    sequences are padded/truncated to their configured maximum lengths.
    Labels keep the tokenizer's own padding ids, so they can be decoded back
    to text.
    """

    def __init__(self, data, tokenizer, max_input_length=512, max_target_length=64):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Number of rows available."""
        return len(self.data)

    def _encode(self, text, limit):
        """Tokenize ``text`` to exactly ``limit`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=limit,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Build the feature dict (input ids, attention mask, labels) for one row."""
        row = self.data.iloc[index]

        # Encode the prompt first, then the reference tags.
        source = self._encode(
            "vietnews: " + row['content'] + "</s>", self.max_input_length
        )
        reference = self._encode(row['tags'], self.max_target_length)

        # The tokenizer returns a leading batch axis of size 1; drop it.
        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": reference["input_ids"].squeeze(0),
        }

# Checkpoint produced by the fine-tuning run; model and tokenizer both load from it.
eval_checkpoint = "/kaggle/working/vit5-finetuned-vietnews-3/checkpoint-2000"
model = AutoModelForSeq2SeqLM.from_pretrained(eval_checkpoint).cuda()
tokenizer = AutoTokenizer.from_pretrained(eval_checkpoint)

# Evaluate on the first 300 rows of the file only.
test_data = pd.read_csv("/kaggle/input/dataset/train1.tsv", sep="\t")
test_data = test_data.head(300)
test_dataset = VietNewsDataset(test_data, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Collect one {"Prediction", "Target"} record per sample.
model.eval()
results = []
for batch in tqdm(test_dataloader, desc="Generating Predictions"):
    # Gradients are not needed for generation.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=batch["input_ids"].cuda(),
            attention_mask=batch["attention_mask"].cuda(),
            max_length=128,
            num_beams=4,
            no_repeat_ngram_size=3,
        )

    # Turn generated ids and reference label ids back into text.
    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_targets = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

    results.extend(
        {"Prediction": pred, "Target": target}
        for pred, target in zip(decoded_preds, decoded_targets)
    )

def _split_tags(text):
    """Return the set of non-empty, whitespace-stripped tags in a comma-separated string."""
    return {tag for tag in (part.strip() for part in text.split(',')) if tag}


def calculate_accuracy(results, *, show_progress=True):
    """Print and return the fraction of target tags recovered by the predictions.

    For every record the predicted and target tag strings are split on
    commas; tags are compared after stripping surrounding whitespace and
    empty tags are ignored. The per-record score is
    ``matched tags / target tags``; the overall score pools the counts over
    all records.

    Args:
        results: Iterable of dicts with ``'Prediction'`` and ``'Target'``
            comma-separated tag strings.
        show_progress: Wrap the iteration in a tqdm progress bar (default).
            Pass False to iterate without one.

    Returns:
        The overall ratio as a float in [0, 1]; 0.0 when there are no target
        tags at all (including empty ``results``).
    """
    total_correct = 0
    total_target = 0
    records = tqdm(results, desc="Calculating Accuracy") if show_progress else results
    for result in records:
        prediction_tags = _split_tags(result['Prediction'])
        target_tags = _split_tags(result['Target'])

        num_correct_matches = len(prediction_tags & target_tags)

        total_correct += num_correct_matches
        total_target += len(target_tags)

        # A record without any target tag scores 0 instead of dividing by zero.
        local_accuracy = num_correct_matches / len(target_tags) if target_tags else 0.0
        print(prediction_tags)
        print(target_tags)
        print(f'Local Accuracy: {local_accuracy:.2f}')

    overall_accuracy = total_correct / total_target if total_target else 0.0
    print(f'Overall Accuracy: {overall_accuracy:.2f}')
    return overall_accuracy

# Score the prediction/target pairs collected in `results` by the loop above.
calculate_accuracy(results)

# In [None]:
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset
import torch
from tqdm import tqdm

class VietNewsDataset(Dataset):
    """Inference-only dataset: tokenizes the ``'content'`` column of each row.

    No targets are produced; every item holds just the encoder ``input_ids``
    and ``attention_mask``, padded/truncated to ``max_input_length`` tokens.
    """

    def __init__(self, data, tokenizer, max_input_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length

    def __len__(self):
        """Number of rows available."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized prompt for the row at position ``index``."""
        row = self.data.iloc[index]
        prompt = "vietnews: " + row['content'] + "</s>"

        encoded = self.tokenizer(
            prompt,
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Drop the leading batch axis of size 1 from each returned tensor.
        return {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }

# Articles to tag; only the 'content' column is read below.
test_data = pd.read_csv('testall.tsv', sep='\t')

# Load tokenizer and model from the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/vit5-finetuned-vietnews-3/checkpoint-2000")
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/vit5-finetuned-vietnews-3/checkpoint-2000").cuda()

test_dataset = VietNewsDataset(test_data, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=64)

model.eval()
predictions = []

for batch in tqdm(test_dataloader, desc="Generating Predictions"):
    # Gradients are not needed for generation.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=batch["input_ids"].cuda(),
            attention_mask=batch["attention_mask"].cuda(),
            max_length=128,
            num_beams=4,
            no_repeat_ngram_size=3,
        )

    # Decode the whole batch and keep the strings in dataset order.
    predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

# Pair every article with its generated tags and write them as a TSV file.
output_df = pd.DataFrame({'content': test_data['content'], 'tags': predictions})
output_df.to_csv('predictions.tsv', sep='\t', index=False)
