# Import libraries

In [None]:
!pip install datasets
!pip install evaluate rouge_score
!pip install accelerate

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoConfig, MBartForConditionalGeneration, PretrainedConfig
from datasets import load_dataset, Dataset, DatasetDict
import evaluate
import numpy as np
import pandas as pd
import accelerate

# Model

In [None]:
class AddLayer(nn.Module):
    """Two stacked linear projections with dropout in between.

    Maps ``in_features -> hidden_features -> len_output`` along the last
    dimension of the input. No non-linearity is applied between the two
    projections.

    Args:
        len_output: Size of the last dimension of the output.
        in_features: Size of the last dimension of the input. Defaults to
            64001, the value this layer previously hard-coded.
        hidden_features: Width of the intermediate projection (default 1024).
        dropout: Dropout probability applied after the first projection
            (default 0.1).
    """

    def __init__(
        self,
        len_output,
        in_features=64001,
        hidden_features=1024,
        dropout=0.1,
    ):
        super().__init__()
        # Attribute names are kept so existing state-dict keys stay valid.
        self.linear1 = nn.Linear(in_features, hidden_features)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(hidden_features, len_output)

    def forward(self, x):
        """Return ``linear2(dropout(linear1(x)))``."""
        return self.linear2(self.dropout(self.linear1(x)))

class CustomeLinear(MBartForConditionalGeneration):
    """MBart conditional-generation model whose ``lm_head`` is an ``AddLayer``.

    The constructor reads the module-level names ``model_path`` and ``device``
    to load the pretrained seq2seq checkpoint that is stored in ``self.model``.

    Args:
        len_output: Output size handed to ``AddLayer``.
        config: Configuration object forwarded to the parent constructor.
    """

    def __init__(self, len_output, config):
        super().__init__(config)
        # Load the pretrained checkpoint and place it on the selected device.
        pretrained = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        self.model = pretrained.to(device)
        # Remove the existing output projection, then install the custom head.
        del self.lm_head
        self.lm_head = AddLayer(len_output)
        self.init_output_layer()

    def init_output_layer(self):
        """Xavier-initialise the head's weights and set its biases to zero."""
        for param_name, tensor in self.lm_head.named_parameters():
            if "weight" in param_name:
                nn.init.xavier_uniform_(tensor)
                continue
            if "bias" in param_name:
                nn.init.constant_(tensor, 0.0)

# Checkpoint name, device and tokenizer shared by the rest of the notebook.
model_path = "vinai/bartpho-word"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path)
# AutoConfig resolves the checkpoint's own configuration class; loading through
# the bare PretrainedConfig base class only yields a generic config object.
config = AutoConfig.from_pretrained(model_path, forced_eos_token_id=2)
# len_output=64001 equals the input width hard-coded in AddLayer.
model = CustomeLinear(len_output=64001, config=config).to(device)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="pt")

# Dataset

In [None]:
# Raw dataset; the mapping below reads its "Segmented_document" and
# "Segmented_summary" columns.
data = load_dataset("Valleyy/final_nlp_data")


def data_processing(dataset):
    """Tokenize one batch of documents and summaries.

    Args:
        dataset: Batch handed over by ``map(batched=True)``; it must provide
            the "Segmented_document" and "Segmented_summary" columns.

    Returns:
        A dict with "input_ids" and "attention_mask" taken from the tokenized
        documents and "labels" holding the token ids of the summaries. Both
        sides are truncated to at most 1024 tokens.
    """
    doc_encoding = tokenizer(dataset["Segmented_document"], max_length=1024, truncation=True)
    summary_encoding = tokenizer(dataset["Segmented_summary"], max_length=1024, truncation=True)
    # Return a plain dict of columns: DatasetDict is a container of named
    # splits, not the return type for a mapped function.
    return {
        "input_ids": doc_encoding["input_ids"],
        "attention_mask": doc_encoding["attention_mask"],
        "labels": summary_encoding["input_ids"],
    }


# Tokenize in batches of 128 and drop the raw text columns afterwards.
dataset = data.map(
    data_processing,
    remove_columns=["Document", "Summary", "Segmented_document", "Segmented_summary", "Dataset"],
    batched=True,
    batch_size=128,
)

# Training

In [None]:
metric = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. Entries
            equal to ``-100`` are replaced with the pad id before decoding.

    Returns:
        A dict mapping each metric name (plus "gen_len") to its value rounded
        to 4 decimal places.
    """
    predictions, labels = eval_pred
    # Some models hand back extra outputs next to the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded. np.where builds new arrays, so the caller's
    # arrays are left untouched (the previous in-place write modified them).
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Training configuration: evaluate, save and log once per epoch, keep a single
# checkpoint on disk and reload the best one when training ends.
args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    learning_rate=1e-4,
    weight_decay=1e-2,
    num_train_epochs=5,
    load_best_model_at_end=True,
    save_total_limit=1,
    predict_with_generate=True,
    generation_max_length=256,
    # The notebook falls back to CPU when CUDA is missing; only request fp16
    # mixed precision when a CUDA device is actually available.
    fp16=torch.cuda.is_available(),
    remove_unused_columns=False,
)

# Wire the model, the "train"/"valid" splits, the collator and the metric
# function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import os

# Allocator options are read from the PYTORCH_CUDA_ALLOC_CONF environment
# variable; a Python variable with that name has no effect on the allocator.
# NOTE(review): the allocator reads this setting when it initialises, so for
# it to take effect it should be set before the first CUDA allocation --
# consider moving this line to the top of the notebook. If a split-size limit
# was intended instead, the option would be "max_split_size_mb:256" -- confirm.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

# Release cached blocks and report what is still allocated.
torch.cuda.empty_cache()
print(f"CUDA memory allocated: {torch.cuda.memory_allocated()} bytes")

In [None]:
import os

import wandb

# Credentials must not live in source control: read the API key from the
# WANDB_API_KEY environment variable. With key=None, wandb falls back to its
# own credential lookup.
# NOTE(review): the key that was previously committed here should be treated
# as leaked and revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

wandb.init(project="NLP-bartpho")

In [None]:
# Run fine-tuning with the trainer built from `args`, `model` and `dataset`.
trainer.train()

In [None]:
import os

# Credentials must not live in source control: take the Hub token from the
# HF_TOKEN environment variable. When it is unset, None is passed and the
# library falls back to locally stored login credentials.
# NOTE(review): the token that was previously committed here should be treated
# as leaked and revoked.
model.push_to_hub(repo_id="MinhViet/bartpho-linear", use_auth_token=os.environ.get("HF_TOKEN"))

In [None]:
import os

# Directory that will receive the exported model.
save_model_dir = '/kaggle/working/save_model'

# Try to create the directory and react to the outcome, instead of checking
# for existence first (which leaves a window between the check and the create).
try:
    os.makedirs(save_model_dir)
except FileExistsError:
    print(f"The directory '{save_model_dir}' already exists.")
else:
    print(f"The directory '{save_model_dir}' has been created.")

In [None]:
# Export the trainer's model to the save directory (same path as
# `save_model_dir` in the previous cell).
trainer.save_model("/kaggle/working/save_model")