In [None]:
import random
import numpy as np
import nltk
import os
import torch
import json  
from bert_score import score
import pandas as pd
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel, PeftConfig
from datasets import load_dataset
from evaluate import load
from IPython.display import display, HTML
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, MT5ForConditionalGeneration
from tqdm import tqdm 
from torch.utils.data import DataLoader


nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     /home/mdaffarudiyanto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/mdaffarudiyanto/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_random_seed(7)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
raw_datasets = load_dataset(
    "mdaffarudiyanto/GR_custom_dataset",
    'stratified_articles_below_1000',
    data_dir="/home/student/Documents/MDR/custom_liputan6_data",
    ignore_verifications=True
)

model_checkpoint = "google/mt5-large"
model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint, load_in_8bit = True, device_map = "auto")
# model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
rouge_metric = load("rouge")

max_input_length = 512
max_target_length = 128
prefix = "summarize: "

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
lora_config = LoraConfig(
    r=256,
    lora_alpha=256,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="lora_only",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = prepare_model_for_kbit_training(model)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 75,497,472 || all params: 1,305,078,784 || trainable%: 5.7849


In [5]:
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["clean_article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    labels = tokenizer(text_target=examples["clean_summary"], max_length=max_target_length, truncation=True, padding="max_length")
 
    model_inputs["labels"] = labels["input_ids"]

    model_inputs["labels"] = [
    [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
    for labels_example in model_inputs["labels"]
    ]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)



Map:   0%|          | 0/171963 [00:00<?, ? examples/s]

In [6]:
batch_size = 24
model_name = model_checkpoint.split("/")[-1]
output_dir = f"GR-{model_name}-LoRA-3-1"
args = Seq2SeqTrainingArguments(
    output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",  
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
)
label_pad_token_id = -100

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8)



In [7]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {
        "rouge1": rouge_result["rouge1"]* 100,
        "rouge2": rouge_result["rouge2"]* 100,
        "rougeL": rouge_result["rougeL"]* 100,
        "rougeLsum": rouge_result["rougeLsum"]* 100
    }

    return {k: round(v, 4) for k, v in result.items()}

In [8]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
model.config.use_cache = False

trainer.train()

  0%|          | 0/35830 [00:00<?, ?it/s]

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 3.8583, 'grad_norm': 0.5766535401344299, 'learning_rate': 9.860452135082335e-05, 'epoch': 0.07}
{'loss': 1.5106, 'grad_norm': 0.6792715787887573, 'learning_rate': 9.720904270164667e-05, 'epoch': 0.14}
{'loss': 1.4457, 'grad_norm': 0.5304698348045349, 'learning_rate': 9.581356405247e-05, 'epoch': 0.21}
{'loss': 1.4192, 'grad_norm': 0.5255179405212402, 'learning_rate': 9.441808540329333e-05, 'epoch': 0.28}
{'loss': 1.3752, 'grad_norm': 0.45272547006607056, 'learning_rate': 9.302260675411667e-05, 'epoch': 0.35}
{'loss': 1.3625, 'grad_norm': 0.46935856342315674, 'learning_rate': 9.162712810494e-05, 'epoch': 0.42}
{'loss': 1.3568, 'grad_norm': 0.4943619966506958, 'learning_rate': 9.023164945576334e-05, 'epoch': 0.49}
{'loss': 1.3299, 'grad_norm': 0.5023877024650574, 'learning_rate': 8.883617080658666e-05, 'epoch': 0.56}
{'loss': 1.3232, 'grad_norm': 0.5666508078575134, 'learning_rate': 8.744069215741e-05, 'epoch': 0.63}
{'loss': 1.3116, 'grad_norm': 0.7473479509353638, 'learning_ra



  0%|          | 0/896 [00:00<?, ?it/s]

{'eval_loss': 1.0792471170425415, 'eval_rouge1': 30.7314, 'eval_rouge2': 19.5819, 'eval_rougeL': 28.0315, 'eval_rougeLsum': 29.1585, 'eval_runtime': 1916.7673, 'eval_samples_per_second': 11.214, 'eval_steps_per_second': 0.467, 'epoch': 1.0}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.2792, 'grad_norm': 0.45638561248779297, 'learning_rate': 7.906782026235e-05, 'epoch': 1.05}
{'loss': 1.2691, 'grad_norm': 0.5121920108795166, 'learning_rate': 7.767234161317332e-05, 'epoch': 1.12}
{'loss': 1.2615, 'grad_norm': 1.1628835201263428, 'learning_rate': 7.627686296399665e-05, 'epoch': 1.19}
{'loss': 1.257, 'grad_norm': 0.36726051568984985, 'learning_rate': 7.488138431481999e-05, 'epoch': 1.26}
{'loss': 1.2664, 'grad_norm': 0.49681922793388367, 'learning_rate': 7.348590566564332e-05, 'epoch': 1.33}
{'loss': 1.2532, 'grad_norm': 0.4368649125099182, 'learning_rate': 7.209042701646666e-05, 'epoch': 1.4}
{'loss': 1.2503, 'grad_norm': 0.9414704442024231, 'learning_rate': 7.069494836728998e-05, 'epoch': 1.47}
{'loss': 1.2436, 'grad_norm': 0.4505850076675415, 'learning_rate': 6.929946971811332e-05, 'epoch': 1.54}
{'loss': 1.2328, 'grad_norm': 0.31621021032333374, 'learning_rate': 6.790399106893665e-05, 'epoch': 1.6}
{'loss': 1.2374, 'grad_norm': 0.605460524559021, 'learnin



  0%|          | 0/896 [00:00<?, ?it/s]

{'eval_loss': 1.0441831350326538, 'eval_rouge1': 31.1356, 'eval_rouge2': 19.9742, 'eval_rougeL': 28.4552, 'eval_rougeLsum': 29.5827, 'eval_runtime': 1922.3674, 'eval_samples_per_second': 11.181, 'eval_steps_per_second': 0.466, 'epoch': 2.0}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.2129, 'grad_norm': 0.5214790105819702, 'learning_rate': 5.9531119173876635e-05, 'epoch': 2.02}
{'loss': 1.2196, 'grad_norm': 0.45561397075653076, 'learning_rate': 5.8135640524699976e-05, 'epoch': 2.09}
{'loss': 1.2, 'grad_norm': 0.6206215023994446, 'learning_rate': 5.674016187552331e-05, 'epoch': 2.16}
{'loss': 1.2091, 'grad_norm': 0.39133739471435547, 'learning_rate': 5.534468322634664e-05, 'epoch': 2.23}
{'loss': 1.2014, 'grad_norm': 0.5446147918701172, 'learning_rate': 5.394920457716998e-05, 'epoch': 2.3}
{'loss': 1.2043, 'grad_norm': 0.5717746615409851, 'learning_rate': 5.2553725927993305e-05, 'epoch': 2.37}
{'loss': 1.2047, 'grad_norm': 0.7855800986289978, 'learning_rate': 5.115824727881664e-05, 'epoch': 2.44}
{'loss': 1.197, 'grad_norm': 0.4432491958141327, 'learning_rate': 4.9762768629639966e-05, 'epoch': 2.51}
{'loss': 1.1849, 'grad_norm': 0.41299527883529663, 'learning_rate': 4.8367289980463307e-05, 'epoch': 2.58}
{'loss': 1.1838, 'grad_norm': 0.4671424627304077, 'l



  0%|          | 0/896 [00:00<?, ?it/s]

{'eval_loss': 1.024794340133667, 'eval_rouge1': 31.0846, 'eval_rouge2': 19.9549, 'eval_rougeL': 28.3784, 'eval_rougeLsum': 29.494, 'eval_runtime': 1921.5781, 'eval_samples_per_second': 11.186, 'eval_steps_per_second': 0.466, 'epoch': 3.0}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.1896, 'grad_norm': 0.4501349925994873, 'learning_rate': 3.99944180854033e-05, 'epoch': 3.0}
{'loss': 1.1775, 'grad_norm': 0.41838252544403076, 'learning_rate': 3.859893943622663e-05, 'epoch': 3.07}
{'loss': 1.1752, 'grad_norm': 0.449942946434021, 'learning_rate': 3.720346078704996e-05, 'epoch': 3.14}
{'loss': 1.1711, 'grad_norm': 0.4604440927505493, 'learning_rate': 3.580798213787329e-05, 'epoch': 3.21}
{'loss': 1.1721, 'grad_norm': 0.47075650095939636, 'learning_rate': 3.4412503488696626e-05, 'epoch': 3.28}
{'loss': 1.1735, 'grad_norm': 0.43812647461891174, 'learning_rate': 3.301702483951995e-05, 'epoch': 3.35}
{'loss': 1.1649, 'grad_norm': 0.4246861934661865, 'learning_rate': 3.162154619034329e-05, 'epoch': 3.42}
{'loss': 1.1681, 'grad_norm': 0.4019034802913666, 'learning_rate': 3.022606754116662e-05, 'epoch': 3.49}
{'loss': 1.169, 'grad_norm': 0.40389928221702576, 'learning_rate': 2.883058889198995e-05, 'epoch': 3.56}
{'loss': 1.1523, 'grad_norm': 0.4047265350818634, 'lea



  0%|          | 0/896 [00:00<?, ?it/s]

{'eval_loss': 1.0112848281860352, 'eval_rouge1': 31.3254, 'eval_rouge2': 20.1852, 'eval_rougeL': 28.6607, 'eval_rougeLsum': 29.7518, 'eval_runtime': 1921.2515, 'eval_samples_per_second': 11.187, 'eval_steps_per_second': 0.466, 'epoch': 4.0}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.1493, 'grad_norm': 0.5039042830467224, 'learning_rate': 1.9062238347753282e-05, 'epoch': 4.05}
{'loss': 1.1436, 'grad_norm': 0.43603894114494324, 'learning_rate': 1.7666759698576613e-05, 'epoch': 4.12}
{'loss': 1.1337, 'grad_norm': 0.41771361231803894, 'learning_rate': 1.6271281049399943e-05, 'epoch': 4.19}
{'loss': 1.1543, 'grad_norm': 0.49262088537216187, 'learning_rate': 1.4875802400223277e-05, 'epoch': 4.26}
{'loss': 1.1488, 'grad_norm': 0.471996933221817, 'learning_rate': 1.3480323751046608e-05, 'epoch': 4.33}
{'loss': 1.1453, 'grad_norm': 0.4908730089664459, 'learning_rate': 1.2084845101869942e-05, 'epoch': 4.4}
{'loss': 1.1438, 'grad_norm': 0.4520229697227478, 'learning_rate': 1.0689366452693274e-05, 'epoch': 4.47}
{'loss': 1.1527, 'grad_norm': 0.5432513952255249, 'learning_rate': 9.293887803516608e-06, 'epoch': 4.54}
{'loss': 1.1536, 'grad_norm': 0.47976547479629517, 'learning_rate': 7.898409154339938e-06, 'epoch': 4.61}
{'loss': 1.1553, 'grad_norm': 0.45382902026176



  0%|          | 0/896 [00:00<?, ?it/s]

{'eval_loss': 1.0041303634643555, 'eval_rouge1': 31.4683, 'eval_rouge2': 20.255, 'eval_rougeL': 28.7584, 'eval_rougeLsum': 29.8822, 'eval_runtime': 1913.1327, 'eval_samples_per_second': 11.235, 'eval_steps_per_second': 0.468, 'epoch': 5.0}
{'train_runtime': 94123.6574, 'train_samples_per_second': 9.135, 'train_steps_per_second': 0.381, 'train_loss': 1.2565770755291916, 'epoch': 5.0}


TrainOutput(global_step=35830, training_loss=1.2565770755291916, metrics={'train_runtime': 94123.6574, 'train_samples_per_second': 9.135, 'train_steps_per_second': 0.381, 'total_flos': 2.770680372485161e+18, 'train_loss': 1.2565770755291916, 'epoch': 5.0})

In [9]:
best_model_path = os.path.join(output_dir, "best_model")
trainer.save_model(best_model_path)
print(f"Best model saved to {best_model_path}")

Best model saved to GR-mt5-large-LoRA-3-1/best_model


In [None]:
peft_model_id = "mT5-large_LoRA_3-1_results"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)


GENERATE SUMMARIES

In [3]:
peft_model_id = "mT5-small_LoRA_3_results"
config = PeftConfig.from_pretrained(peft_model_id)

model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path,  load_in_8bit=True,  device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
 
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
model.eval()
 
print("Peft model loaded")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Peft model loaded


In [4]:
raw_datasets = load_dataset(
    "mdaffarudiyanto/GR_custom_dataset",
    'stratified_articles_below_1000',
    data_dir="custom_liputan6_data",
    ignore_verifications=True
)

max_input_length = 512
max_target_length = 128
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["clean_article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    labels = tokenizer(text_target=examples["clean_summary"], max_length=max_target_length, truncation=True, padding="max_length")
  
    model_inputs["labels"] = labels["input_ids"]

    model_inputs["labels"] = [
    [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
    for labels_example in model_inputs["labels"]
    ]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/21494 [00:00<?, ? examples/s]

Map:   0%|          | 0/21500 [00:00<?, ? examples/s]

Map:   0%|          | 0/171963 [00:00<?, ? examples/s]

In [5]:
def collate_fn(batch):
    articles = [item['clean_article'] for item in batch]
    summaries = [item['clean_summary'] for item in batch]

    inputs = tokenizer(
        articles,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=max_input_length
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    return inputs, articles, summaries

batch_size = 64
test_dataloader = DataLoader(
    raw_datasets["test"],
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False
)

In [6]:
def generate_and_compare_summaries(model, tokenizer, dataloader, device):
    model.eval()

    display_data = []
    all_generated_summaries = []
    all_reference_summaries = []

    for inputs, articles, summaries in tqdm(dataloader, desc="Processing"):
        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=max_target_length,
                num_beams=5,
                early_stopping=True
            )

        generated_summaries = tokenizer.batch_decode(
            summary_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )

        for article, reference_summary, generated_summary in zip(articles, summaries, generated_summaries):
            display_data.append({
                "Original Text": article,
                "Reference Summary": reference_summary,
                "Generated Summary": generated_summary
            })
            all_generated_summaries.append(generated_summary)
            all_reference_summaries.append(reference_summary)

    P, R, F1 = score(
        all_generated_summaries,
        all_reference_summaries,
        model_type="bert-base-multilingual-cased",
        num_layers=9,
        lang='id',
        verbose=True
    )

    avg_precision = P.mean().item() * 100
    avg_recall = R.mean().item() * 100
    avg_f1 = F1.mean().item() * 100

    rouge_metric = load("rouge")
    rouge_scores = rouge_metric.compute(
        predictions=all_generated_summaries,
        references=all_reference_summaries,
        use_stemmer=True
    )
    
    rouge1 = rouge_scores['rouge1'] * 100
    rouge2 = rouge_scores['rouge2'] * 100
    rougeL = rouge_scores['rougeL'] * 100
    rougeLsum = rouge_scores['rougeLsum'] * 100
    
    print(f"Average Precision: {avg_precision:.2f}")
    print(f"Average Recall: {avg_recall:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")
    
    print(f"ROUGE Scores:")
    print(f"ROUGE-1 F1 Score: {rouge1:.2f}")
    print(f"ROUGE-2 F1 Score: {rouge2:.2f}")
    print(f"ROUGE-L F1 Score: {rougeL:.2f}")
    print(f"ROUGE-Lsum F1 Score: {rougeLsum:.2f}")
     
    scores_dict = {
        'BERTScore': {
            'Precision': avg_precision,
            'Recall': avg_recall,
            'F1': avg_f1
        },
        'ROUGE': {
            'ROUGE-1': rouge1,
            'ROUGE-2': rouge2,
            'ROUGE-L': rougeL,
            'ROUGE-Lsum': rougeLsum
        }
    }

    with open('evaluation_scores.json', 'w') as f:
        json.dump(scores_dict, f, indent=4)

    return pd.DataFrame(display_data)

summary_comparison_df = generate_and_compare_summaries(
    model,
    tokenizer,
    test_dataloader,
    device
)

summary_comparison_df.to_csv("LoRA_summary_results.csv", index=False)

display(HTML(summary_comparison_df.head(30).to_html(escape=False)))


Processing:   0%|          | 0/336 [00:00<?, ?it/s]

Processing: 100%|██████████| 336/336 [55:09<00:00,  9.85s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/669 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/336 [00:00<?, ?it/s]

done in 70.03 seconds, 307.02 sentences/sec
Average Precision: 78.31
Average Recall: 78.23
Average F1 Score: 78.22
ROUGE Scores:
ROUGE-1 F1 Score: 43.77
ROUGE-2 F1 Score: 28.97
ROUGE-L F1 Score: 37.99
ROUGE-Lsum F1 Score: 37.99


NameError: name 'HTML' is not defined