<a href="https://colab.research.google.com/github/olonok69/LLM_Notebooks/blob/main/mlflow/summarization/T5_large_Finetune_multi_news_summarization_mlflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install mlflow pyngrok  --quiet
get_ipython().system_raw("mlflow ui --port 5000 &")

from pyngrok import ngrok


# Terminate open tunnels if exist
ngrok.kill()

In [3]:
# Transformers installation
! pip install -q --disable-pip-version-check py7zr sentencepiece loralib peft trl
! pip install -q    wandb bitsandbytes
! pip install datasets evaluate rouge_score -q
! pip install transformers[torch] -q
! pip install accelerate -U -q
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

In [4]:
ngrok.kill()

In [5]:
from google.colab import userdata
NGROK_AUTH_TOKEN  = userdata.get('NGROK')

ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open an HTTPs tunnel on port 5000 for http://localhost:5000
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

MLflow Tracking UI: https://074f-34-171-42-25.ngrok-free.app


In [6]:

import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from torch import cuda, bfloat16
import transformers

import torch
import torch.nn as nn
from google.colab import userdata

In [7]:

from google.colab import output
output.enable_custom_widget_manager()

from transformers.utils import logging
logging.set_verbosity_error()

os.environ["TRANSFORMERS_VERBOSITY"] = "error"

In [8]:


device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
device


'cuda:0'

# Load multi_news dataset
https://huggingface.co/datasets/multi_news

In [9]:
from datasets import load_dataset

dataset  = load_dataset("multi_news", trust_remote_code=True)

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})

In [11]:

print(f"Train dataset size: {len(dataset['train'])}")
print(f"test dataset size: {len(dataset['test'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")

Train dataset size: 44972
test dataset size: 5622
Validation dataset size: 5622


In [12]:
dataset['train'][100]['document']

'Katy Perry is all about breaking conventional beauty rules, from her love of everything technicolor and coated in glitter, to her no-brows, black lipstick Met Gala look. So, of course, the pop star — and face of CoverGirl — was the perfect person to help announce that the beauty brand has named its first-ever male CoverGirl, social media star James Charles. \n \n According to a press release from the brand, all CoverGirls “are role models and boundary-breakers, fearlessly expressing themselves, standing up for what they believe, and redefining what it means to be beautiful,” and who better to embody that ethos than Instagram sensation James Charles. After launching his beauty account a year ago, the teen has since quickly attracted hundreds of thousands of followers (427,000 to be exact) thanks to his unique, transformative approach to makeup artistry. \n \n RELATED PHOTOS: Katy Perry’s Most Outrageous Twitpics \n \n While Charles’ partnership with the brand kicks off today, we’ll hav

In [13]:

len(dataset['train'][100]['document'])

6217

In [14]:

len(dataset['train'][100]['summary'])

1268

# There are two fields that you'll want to use:

- text: the text of the bill which'll be the input to the model.
- summary: a condensed version of text which'll be the model target.
# Preprocess
The next step is to load a T5 tokenizer to process text and summary:

Model--> https://huggingface.co/google-t5/t5-large

In [15]:
from transformers import AutoTokenizer

PROJECT = "T5-large-Summarization"
MODEL_NAME = "google-t5/t5-large"
DATASET = "multi_news"
checkpoint = model_id = MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [16]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [17]:

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/5622 [00:00<?, ? examples/s]

In [18]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5622
    })
})

In [19]:

len(tokenized_dataset['train'][100]['labels']), len(tokenized_dataset['train'][100]['input_ids'])


(128, 1024)

In [20]:
tokenized_dataset['train'][100]['labels']

[3,
 104,
 156,
 3,
 9,
 2335,
 54,
 36,
 2753,
 6,
 113,
 31,
 7,
 12,
 497,
 3,
 9,
 388,
 54,
 31,
 17,
 36,
 3,
 9,
 5620,
 517,
 23,
 52,
 40,
 5,
 461,
 2818,
 6,
 8,
 9244,
 349,
 31,
 7,
 750,
 23909,
 6,
 7482,
 63,
 18786,
 6,
 2162,
 2549,
 5417,
 38,
 8,
 166,
 664,
 96,
 254,
 1890,
 279,
 32,
 63,
 121,
 30,
 160,
 4601,
 543,
 5,
 5417,
 6,
 3,
 9,
 1003,
 18,
 1201,
 18,
 1490,
 96,
 25149,
 9244,
 2377,
 976,
 708,
 338,
 9244,
 163,
 3,
 9,
 215,
 977,
 68,
 65,
 641,
 183,
 3974,
 26,
 72,
 145,
 314,
 17093,
 10076,
 30,
 4601,
 6,
 8,
 454,
 2999,
 6029,
 1844,
 2279,
 5,
 2150,
 12,
 2449,
 6,
 5417,
 56,
 2385,
 16,
 1424,
 6,
 2281,
 6,
 11,
 1125,
 6543,
 21,
 96,
 5231,
 7263,
 107,
 1]

## Train

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! Load T5 with [AutoModelForSeq2SeqLM](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSeq2SeqLM):


At this point, only three steps remain:

1. Define your training hyperparameters in [Seq2SeqTrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the ROUGE metric and save the training checkpoint.
2. Pass the training arguments to [Seq2SeqTrainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.

In [21]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer


# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

In [22]:
def print_number_of_trainable_model_parameters(model, tag="original_model", to_wandb=False):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()

    if to_wandb:
      print({f'{tag}': {"trainable_model_params":trainable_model_params}})
      print({f'{tag}': {"all_model_params":all_model_params}})
      print({f'{tag}': {"percentage_of_trainable_model_parameters": 100 * trainable_model_params}} )

    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params}%"

In [23]:
print_number_of_trainable_model_parameters(model, to_wandb=True)

{'original_model': {'trainable_model_params': 737668096}}
{'original_model': {'all_model_params': 737668096}}
{'original_model': {'percentage_of_trainable_model_parameters': 73766809600}}


'trainable model parameters: 737668096\nall model parameters: 737668096\npercentage of trainable model parameters: 100.0%'

In [24]:
repository_id = f"{checkpoint.split('/')[1]}-{DATASET}"
repository_id

't5-large-multi_news'

In [25]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [26]:
import evaluate

rouge = evaluate.load("rouge")

In [27]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [28]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
dataset_id = "multi_news"
# Hugging Face repository id
repository_id = f"{checkpoint.split('/')[1]}-{DATASET}"

In [29]:
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [30]:
# Pick a name that you like and reflects the nature of the runs that you will be recording to the experiment.
mlflow.set_experiment("Fine_tuning_summarization")

<Experiment: artifact_location='mlflow-artifacts:/177008978387414440', creation_time=1732131782203, experiment_id='177008978387414440', last_update_time=1732131782203, lifecycle_stage='active', name='Fine_tuning_summarization', tags={}>

In [31]:
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

2009

In [32]:
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,

    learning_rate=2e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=False, #change to bf16=True for XPU
    bf16=True,
    logging_dir=f"{repository_id}/logs",

# Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=3000,
    save_strategy="steps",
    save_steps=3000,
    logging_steps=500,
    load_best_model_at_end=True,
    report_to="mlflow",
)



In [33]:
PATH ="/content/drive/MyDrive/MODELS"

In [34]:
!rm -rf /content/drive/MyDrive/MODELS/flan-T5-fine-tune

In [35]:
!mkdir /content/drive/MyDrive/MODELS/flan-T5-fine-tune
custom_path = "/content/drive/MyDrive/MODELS/flan-T5-fine-tune"

In [36]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,

)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
model.generation_config.max_new_tokens = 128 # generate exactly 128 tokens when predicting
model.generation_config.min_new_tokens = 128

# Pipelines / Tasks Types
https://huggingface.co/docs/transformers/en/main_classes/pipelines

In [37]:
with mlflow.start_run(run_name="fine_tune_model") as run:
    train_results = trainer.train()
    print(train_results.metrics)
    trainer.model.save_pretrained(custom_path)
    trainer.data_collator.tokenizer.save_pretrained(custom_path)

    transformers_model = {"model": trainer.model, "tokenizer": trainer.data_collator.tokenizer}
    task = "summarization"
    model_info = mlflow.transformers.log_model(
        transformers_model=transformers_model,
        artifact_path="text_summarizer",
        task=task,
    )
    print(model_info.metadata)

{'loss': 3.2325, 'grad_norm': 2.484375, 'learning_rate': 1.966648879402348e-05, 'epoch': 0.06670224119530416}
{'loss': 2.6066, 'grad_norm': 2.09375, 'learning_rate': 1.933297758804696e-05, 'epoch': 0.13340448239060831}
{'loss': 2.5412, 'grad_norm': 1.8984375, 'learning_rate': 1.899946638207044e-05, 'epoch': 0.2001067235859125}
{'loss': 2.4997, 'grad_norm': 7.25, 'learning_rate': 1.866595517609392e-05, 'epoch': 0.26680896478121663}
{'loss': 2.486, 'grad_norm': 1.7421875, 'learning_rate': 1.8332443970117397e-05, 'epoch': 0.3335112059765208}
{'loss': 2.4926, 'grad_norm': 2.09375, 'learning_rate': 1.799893276414088e-05, 'epoch': 0.400213447171825}
{'eval_loss': 2.27348256111145, 'eval_rouge1': 0.3784, 'eval_rouge2': 0.1219, 'eval_rougeL': 0.2184, 'eval_rougeLsum': 0.2184, 'eval_gen_len': 128.0, 'eval_runtime': 5241.7065, 'eval_samples_per_second': 1.073, 'eval_steps_per_second': 0.179, 'epoch': 0.400213447171825}
{'loss': 2.4892, 'grad_norm': 1.96875, 'learning_rate': 1.7665421558164354e-0

README.md:   0%|          | 0.00/8.47k [00:00<?, ?B/s]



None
🏃 View run fine_tune_model at: http://127.0.0.1:5000/#/experiments/177008978387414440/runs/64ae3c4e52854bcf82de3c50f93a55f3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/177008978387414440


In [80]:
MLFLOW_TRACKING_URI="databricks"
# Specify the workspace hostname and token
DATABRICKS_HOST="https://adb-2467347032368999.19.azuredatabricks.net/"
DATABRICKS_TOKEN=userdata.get('DATABRCKS_TTOKEN')

In [81]:

if "MLFLOW_TRACKING_URI" not in os.environ:
    os.environ["MLFLOW_TRACKING_URI"] = MLFLOW_TRACKING_URI
if "DATABRICKS_HOST" not in os.environ:
    os.environ["DATABRICKS_HOST"] = DATABRICKS_HOST
if "DATABRICKS_TOKEN" not in os.environ:
    os.environ["DATABRICKS_TOKEN"] = DATABRICKS_TOKEN

In [88]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [89]:
mlflow.set_experiment("/Users/***REMOVED***/summarization_fine_tuning")

2024/11/21 17:01:36 INFO mlflow.tracking.fluent: Experiment with name '/Users/***REMOVED***/summarization_fine_tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/837187481682972', creation_time=1732208495910, experiment_id='837187481682972', last_update_time=1732208495910, lifecycle_stage='active', name='/Users/***REMOVED***/summarization_fine_tuning', tags={'mlflow.experiment.sourceName': '/Users/***REMOVED***/summarization_fine_tuning',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': '***REMOVED***',
 'mlflow.ownerId': '1331640755799986'}>

In [124]:
mlflow.end_run()
del trainer

In [96]:
with mlflow.start_run() as run :
  model_info = mlflow.transformers.log_model(
          transformers_model=transformers_model,
          artifact_path="text_summarizer",
          task=task,
      )



Uploading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]



🏃 View run indecisive-cub-192 at: https://adb-2467347032368999.19.azuredatabricks.net/ml/experiments/837187481682972/runs/a3cb2376e04b48c8b654ac42401dd387
🧪 View experiment at: https://adb-2467347032368999.19.azuredatabricks.net/ml/experiments/837187481682972


In [97]:
print(model_info.metadata)

None


In [98]:
run.to_dictionary()


{'info': {'artifact_uri': 'dbfs:/databricks/mlflow-tracking/837187481682972/a3cb2376e04b48c8b654ac42401dd387/artifacts',
  'end_time': None,
  'experiment_id': '837187481682972',
  'lifecycle_stage': 'active',
  'run_id': 'a3cb2376e04b48c8b654ac42401dd387',
  'run_name': 'indecisive-cub-192',
  'run_uuid': 'a3cb2376e04b48c8b654ac42401dd387',
  'start_time': 1732208655077,
  'status': 'RUNNING',
  'user_id': ''},
 'data': {'metrics': {},
  'params': {},
  'tags': {'mlflow.runColor': '#da4c4c',
   'mlflow.runName': 'indecisive-cub-192',
   'mlflow.source.name': '/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py',
   'mlflow.source.type': 'LOCAL',
   'mlflow.user': '1331640755799986'}}}

In [99]:
run.data

<RunData: metrics={}, params={}, tags={'mlflow.runColor': '#da4c4c',
 'mlflow.runName': 'indecisive-cub-192',
 'mlflow.source.name': '/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': '1331640755799986'}>

In [100]:
import transformers
from mlflow.models import infer_signature
from mlflow.transformers import generate_signature_output
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [126]:
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

0

In [127]:
model_info.artifact_path


'text_summarizer'

In [128]:
 model_info.model_uri

'runs:/a3cb2376e04b48c8b654ac42401dd387/text_summarizer'

In [104]:
summarization_components = mlflow.transformers.load_model(
    model_info.model_uri, return_type="components"
)

Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]

2024/11/21 17:06:06 INFO mlflow.transformers: 'runs:/a3cb2376e04b48c8b654ac42401dd387/text_summarizer' resolved as 'dbfs:/databricks/mlflow-tracking/837187481682972/a3cb2376e04b48c8b654ac42401dd387/artifacts/text_summarizer'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [120]:
del summarization_components

In [105]:
summarization_components.keys()

dict_keys(['task', 'framework', 'device', 'torch_dtype', 'model', 'tokenizer'])

In [106]:
import torch
from tqdm.auto import tqdm

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [107]:

! pip install  evaluate  textstat  -q

In [122]:
del reconstructed_pipeline

In [52]:
reconstructed_pipeline = transformers.pipeline(**summarization_components)

In [74]:
test1= dataset['test'][100]['document']

In [108]:
reconstructed_pipeline(test1)

[{'summary_text': '0:58 KC\'s worst floods 1:54 KC floods: a history of the city Pause 1:55 KC Floods: A history of Kansas City 1:56 KC Firefighters recall worst flood in history 1:57 KC firefighters remember worst flood 1:59 Kansas City\'s most devastating floods Pause 2:07 KC Police say they\'re still looking for Kara Kopetsky, 17, and Jessica Runions, 21, who went missing in September 2016. The families of both women have been in the area. The Missouri State Highway Patrol said Tuesday afternoon. "'}]

In [109]:
dataset['test'][100].keys()

dict_keys(['document', 'summary'])

In [110]:
df_test = dataset['test'].to_pandas()

In [111]:
df_test.columns = ['inputs', 'summary']

In [112]:
df_test.head()

Unnamed: 0,inputs,summary
0,GOP Eyes Gains As Voters In 11 States Pick Gov...,– It's a race for the governor's mansion in 11...
1,\n \n \n \n UPDATE: 4/19/2001 Read Richard Met...,– It turns out Facebook is only guilty of abou...
2,It's the Golden State's latest version of the ...,– Not a big fan of Southern California? Neithe...
3,The seed for this crawl was a list of every ho...,– Why did Microsoft buy Nokia's phone business...
4,After a year in which liberals scored impressi...,– The Supreme Court is facing a docket of high...


In [113]:
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

52

In [115]:

with mlflow.start_run() as run:

    results = mlflow.evaluate(
         model_info.model_uri,
         df_test[:100],
        targets="summary",  # specify which column corresponds to the expected output
        model_type="text-summarization",  # model type indicates which metrics are relevant for this task
        evaluators="default",
    )
results.metrics

Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

2024/11/21 17:13:33 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.


🏃 View run painted-doe-711 at: https://adb-2467347032368999.19.azuredatabricks.net/ml/experiments/837187481682972/runs/f558b0651c554a7185dd8eec734e829b
🧪 View experiment at: https://adb-2467347032368999.19.azuredatabricks.net/ml/experiments/837187481682972


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.13 GiB. GPU 0 has a total capacity of 39.56 GiB of which 1.08 GiB is free. Process 765051 has 38.48 GiB memory in use. Of the allocated memory 37.25 GiB is allocated by PyTorch, and 746.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)