# Import libraries

In [1]:
!pip install datasets
!pip install evaluate rouge_score
!pip install accelerate
!pip install transformers

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a4020957f707341c9c2d6d2e9ec3a9a3fa76a2e6645fc865f96b42e8ce12fc0d
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.2 rouge_score-0.1.2


In [2]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, Dataset, DatasetDict
import evaluate
import numpy as np
import pandas as pd
import accelerate


2024-05-30 10:50:07.743785: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-30 10:50:07.743876: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-30 10:50:07.915512: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from transformers.models.mbart.modeling_mbart import shift_tokens_right


# Model

In [4]:
model_path ="vinai/bartpho-word"
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = model, return_tensors = "pt")

config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


# Dataset

In [5]:
data = load_dataset("Valleyy/extract_plus_original_nlp")

Downloading readme:   0%|          | 0.00/596 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 30.2M/30.2M [00:00<00:00, 76.7MB/s]
Downloading data: 100%|██████████| 7.52M/7.52M [00:00<00:00, 39.6MB/s]


Generating train split:   0%|          | 0/8363 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/2091 [00:00<?, ? examples/s]

In [6]:
def data_processing(dataset):
    document = dataset["Extracted_document"]
    summary = dataset["Segmented_summary"]
    doc_tokenizer = tokenizer(document, max_length = 1024, truncation = True)
    sum_tokenizer = tokenizer(summary, max_length = 1024, truncation = True)
    input_ids = doc_tokenizer["input_ids"]
    attention_mask = doc_tokenizer["attention_mask"]
    labels = sum_tokenizer["input_ids"]
    dataset = DatasetDict({"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels})
    return dataset

In [7]:
dataset = data.map(data_processing, remove_columns=[ 'Document', 'Summary', 'Segmented_document','Segmented_summary', 'Dataset','Extracted_document'], batched = True, batch_size = 128)

Map:   0%|          | 0/8363 [00:00<?, ? examples/s]

Map:   0%|          | 0/2091 [00:00<?, ? examples/s]

# Training

In [8]:
metric = evaluate.load("rouge")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions[predictions == -100] = 1
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.eos_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True,)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [9]:
#model1 = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

In [10]:
class CustomTrainer(Seq2SeqTrainer):
    def __init__(self, *args, loss_length_penalty=0.6, margin_value=0.0, a=1.0, b=1.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_length_penalty = loss_length_penalty
        self.margin_value = margin_value
        self.a = a
        self.b = b
        self.tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-word")
        
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        pad_token_id = self.tokenizer.pad_token_id
        # Generate silver summaries
        with torch.no_grad():
            silver_tgt_ids = model.module.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                use_cache=False,
                decoder_start_token_id=model.module.config.decoder_start_token_id,
                num_beams=1,
                max_length=model.module.config.max_length,
            )
            
            # Calculate negative logits
        p_lm_logits = model(inputs["input_ids"], attention_mask=inputs["attention_mask"], decoder_input_ids=shift_tokens_right(labels, pad_token_id))["logits"]
        n_lm_logits = model(inputs["input_ids"], attention_mask=inputs["attention_mask"], decoder_input_ids=shift_tokens_right(silver_tgt_ids, pad_token_id))["logits"]

        # Calculate custom loss

        loss, p_score, n_score = self.cal_loss(p_lm_logits, n_lm_logits, labels, silver_tgt_ids)
        outputs.loss = loss

        return (loss, outputs) if return_outputs else loss
    
    def cal_loss(self, p_lm_logits, n_lm_logits, p_tgt_ids, n_tgt_ids):
        p_lodprobs = torch.nn.functional.log_softmax(p_lm_logits, dim=-1)
        n_lodprobs = torch.nn.functional.log_softmax(n_lm_logits, dim=-1)
        p_tgt_ids[p_tgt_ids == -100] = self.tokenizer.pad_token_id
        n_tgt_ids[n_tgt_ids == -100] = self.tokenizer.pad_token_id
        if p_tgt_ids.dim() == p_lodprobs.dim() - 1:
            p_tgt_ids = p_tgt_ids.unsqueeze(-1)
        if n_tgt_ids.dim() == n_lodprobs.dim() - 1:
            n_tgt_ids = n_tgt_ids.unsqueeze(-1)
        '''
        print("p_lodprobs shape:", p_lodprobs.shape)
        print("n_lodprobs shape:", n_lodprobs.shape)
        print("p_tgt_ids shape:", p_tgt_ids.shape)
        print("n_tgt_ids shape:", n_tgt_ids.shape)
        max_index = p_lodprobs.size(-1) - 1
        min_index = 0
        out_of_bounds = (p_tgt_ids < min_index) | (p_tgt_ids > max_index)
        print("Indices out of bounds:", out_of_bounds)
        
        indices_true = torch.nonzero(out_of_bounds, as_tuple=False)

        # print out_of_bounds index
        for index in indices_true:
            value = p_tgt_ids[index[0], index[1], index[2]].item()
            print(f"Value at index {index}: {value}")
        '''
        p_logprobs = p_lodprobs.gather(dim=-1, index=p_tgt_ids)
        n_logprobs = n_lodprobs.gather(dim=-1, index=n_tgt_ids)

        p_pad_mask = p_tgt_ids.eq(self.tokenizer.pad_token_id)
        n_pad_mask = n_tgt_ids.eq(self.tokenizer.pad_token_id)

        p_logprobs.masked_fill_(p_pad_mask, 0.0)
        n_logprobs.masked_fill_(n_pad_mask, 0.0)

        p_logprobs = p_logprobs.squeeze(-1)
        n_logprobs = n_logprobs.squeeze(-1)

        p_pad_mask = p_pad_mask.squeeze(-1)
        n_pad_mask = n_pad_mask.squeeze(-1)

        p_sum_logprobs = p_logprobs.sum(dim=-1)
        n_sum_logprobs = n_logprobs.sum(dim=-1)

        p_length = p_tgt_ids.size(1) - p_pad_mask.sum(dim=-1)
        n_length = n_tgt_ids.size(1) - n_pad_mask.sum(dim=-1)

        p_score = p_sum_logprobs / (p_length ** self.loss_length_penalty)
        n_score = n_sum_logprobs / (n_length ** self.loss_length_penalty)

        ce_loss = - p_sum_logprobs / p_length

        con_loss = torch.nn.functional.relu(n_score - p_score + self.margin_value)

        loss = con_loss * self.a + ce_loss * self.b

        loss = loss.mean()

        return loss, -p_score.mean(), -n_score.mean()



In [11]:
args = Seq2SeqTrainingArguments(output_dir = "/kaggle/working/",
                                evaluation_strategy="epoch",
                                save_strategy="epoch",
                               per_device_train_batch_size=1,
                               per_device_eval_batch_size=1,
                               learning_rate=1e-4,
                               weight_decay=1e-2,
                               load_best_model_at_end =True,
                               predict_with_generate=True,
                               num_train_epochs=4,
                               logging_strategy="epoch",
                               generation_max_length=1024,
                                save_total_limit = 1,
                               fp16=True)

custom_trainer = CustomTrainer(model=model,
                        args=args,
                        train_dataset=dataset["train"],
                        eval_dataset=dataset["valid"],
                        tokenizer=tokenizer,
                        compute_metrics=compute_metrics,
                        data_collator = data_collator,
                        loss_length_penalty=0.8,
                        margin_value=1.5,
                        a=1.0,
                        b=1.0,)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [12]:
torch.cuda.empty_cache()
torch.cuda.memory_allocated()
PYTORCH_CUDA_ALLOC_CONF=expandable_segments=256

In [13]:
import wandb
wandb.login(key="")

wandb.init(project="SMALL_FINE_TUNE_BART")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdoanh-earth99[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240530_105109-imlm364e[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mstellar-puddle-56[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/doanh-earth99/SMALL_FINE_TUNE_BART[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/doanh-earth99/SMALL_FINE_TUNE_BART/runs/imlm364e[0m


In [14]:
custom_trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.0032,2.766715,0.5135,0.2211,0.3369,0.337,39.9048
2,2.0492,2.569997,0.5501,0.2555,0.3627,0.3627,42.6227
3,1.1441,2.7088,0.5743,0.2643,0.3728,0.3727,30.4816
4,0.4595,2.861406,0.5859,0.2745,0.3803,0.3801,32.3066


Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=16728, training_loss=1.6639956780337422, metrics={'train_runtime': 33592.3282, 'train_samples_per_second': 0.996, 'train_steps_per_second': 0.498, 'total_flos': 1.859730029169869e+16, 'train_loss': 1.6639956780337422, 'epoch': 4.0})

In [15]:
model.push_to_hub(repo_id="Valleyy/extract_nlp_cl", use_auth_token="")



README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Non-default generation parameters: {'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Valleyy/extract_nlp_cl/commit/890f9a901c8a4a8d63057d9108b3a0723806d910', commit_message='Upload MBartForConditionalGeneration', commit_description='', oid='890f9a901c8a4a8d63057d9108b3a0723806d910', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
tokenizer.push_to_hub(repo_id="Valleyy/extract_nlp_cl", use_auth_token="")



CommitInfo(commit_url='https://huggingface.co/Valleyy/extract_nlp_cl/commit/b9caea5387cd1df8fa97db969bc8f690325d3f1a', commit_message='Upload tokenizer', commit_description='', oid='b9caea5387cd1df8fa97db969bc8f690325d3f1a', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
import os

# Path to the directory to save the model
save_model_dir = '/kaggle/working/save_model'

# Check if the directory exists, if not, create it
if not os.path.exists(save_model_dir):
    os.makedirs(save_model_dir)
    print(f"The directory '{save_model_dir}' has been created.")
else:
    print(f"The directory '{save_model_dir}' already exists.")

The directory '/kaggle/working/save_model' has been created.


In [18]:
trainer.save_model("/kaggle/working/save_model")

NameError: name 'trainer' is not defined