In [1]:
!pip install peft
!pip install accelerate
!pip install bitsandbytes
!pip install datasets
!pip install trl
!pip install wandb

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->peft)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.13.0->peft)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.13.0->peft)
  Using cached nvidia_cufft_cu12-11.

In [2]:
!pip freeze > requirements.txt

In [None]:
from google.colab import drive
import torch
import pandas as pd
import json
import os
from datasets import Dataset
from tqdm import tqdm
from collections import Counter
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, AutoProcessor, LlavaNextProcessor, LlavaNextForConditionalGeneration
from PIL import Image
from contextlib import nullcontext
from trl.commands.cli_utils import init_zero_verbose, SFTScriptArguments, TrlParser
from trl.env_utils import strtobool
from trl import (
    ModelConfig,
    RichProgressCallback,
    SFTConfig,
    SFTTrainer,
    get_peft_config,
    get_quantization_config,
    get_kbit_device_map,
)

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
with open('/content/drive/MyDrive/ML-Quiz-XRay-ReportGeneration/data/annotation_quiz_all.json', 'r') as f:
  data = json.load(f)

In [None]:
def merger(report):
  temp = ''
  for key, value in report.items():
    if value and len(value) > 0:
      temp += f"{key}: {value},"
  return temp[:-1]

In [None]:
def process_data(data):
  merged_data = []
  for item in data:
    merged_data.append({"id": item['id'], "report": merger(item['report'])})
  return pd.DataFrame(merged_data)

In [None]:
train_df = process_data(data['train'])

In [None]:
file_list = [os.path.join(dp, f) for dp, dn, filenames in os.walk('/content/drive/MyDrive/ML-Quiz-XRay-ReportGeneration/data/images') for f in filenames if os.path.splitext(f)[1] == '.png']

In [None]:
file_id_mapper = dict()
for file_path in file_list:
  id = file_path.split('/')[-2]
  file_id_mapper[file_path] = id

In [None]:
train_dataset = []
for key, value in file_id_mapper.items():
  result = train_df[train_df['id'] == value.strip()]
  if len(result) > 0:
    report = result['report'].values.tolist()[0]
    data = [
          {

              "role": "user",
              "content": [
                  {"type": "image"},
                  {"type": "text", "text": """Describe the image in more details from clinical perspective. Your description should focuss on five anatomical regions: lungs, heart, mediastinal, bone, others. If the part of the report (one or many line) is about lungs than put into "lungs", so as heart, bones, and mediastinal. If one line is part of two regions, add it into both the regions. However, if you can not put any part of the report into these four regions than put it into "others\. You will follow this format name of anatomical region : your_report_for_that_region, followed by comma. An example is  \"bone: No acute bony abnormality.,heart: Normal cardiac contours.,lung: No focal consolidation. No large pleural effusion. No pneumothorax.,mediastinal: Normal mediastinal contours.\" """},
              ],
          },
          {
            "role": "assistant",
            "content": [{"type": "text", "text": report},]
         }
    ]
  train_dataset.append({
      "id": value,
      "messages": data,
      "images":key
  })

In [None]:
lora_r = 16
lora_alpha = 16
lora_dropout = 0.1
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

In [None]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)


In [None]:
LLAVA_CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""

In [None]:
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto",quantization_config=bnb_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path, trust_remote_code=True, use_fast=True)
tokenizer.chat_template = LLAVA_CHAT_TEMPLATE
processor = AutoProcessor.from_pretrained(model.config._name_or_path, trust_remote_code=True)
processor.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def find_all_linear_names(model):
    cls = torch.nn.Linear
    lora_module_names = set()
    multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler']
    for name, module in model.named_modules():
        if any(mm_keyword in name for mm_keyword in multimodal_keywords):
            continue
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [None]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    target_modules= find_all_linear_names(model),
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
model = prepare_model_for_kbit_training(model)
lora_model = get_peft_model(model, peft_config)

In [None]:
lora_model.print_trainable_parameters()

trainable params: 44,515,328 || all params: 7,611,262,976 || trainable%: 0.5849


In [None]:
class LLavaDataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, examples):
        texts = []
        images = []
        for example in examples:
            #print(example)
            messages = example["messages"]
            text = self.processor.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=False
            )
            texts.append(text)
            image = Image.open(example["images"])
            images.append(image)

        batch = self.processor(texts, images, return_tensors="pt", padding=True)

        labels = batch["input_ids"].clone()
        if self.processor.tokenizer.pad_token_id is not None:
            labels[labels == self.processor.tokenizer.pad_token_id] = -100
        batch["labels"] = labels

        return batch

In [None]:
data_collator = LLavaDataCollator(processor)

In [None]:
dataset = Dataset.from_list(train_dataset)

In [None]:
TRL_USE_RICH = strtobool(os.getenv("TRL_USE_RICH", "0"))
init_context = nullcontext() if not TRL_USE_RICH else console.status("[bold green]Initializing the SFTTrainer...")
save_context = (
    nullcontext()
    if not TRL_USE_RICH
    else console.status(f"[bold green]Training completed! Saving the model to {training_args.output_dir}")
)


In [None]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/ML-Quiz-XRay-ReportGeneration/Model/",
    learning_rate=1.4e-5,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    gradient_accumulation_steps=5,
    logging_steps=5,
    save_steps = 20,
    report_to="wandb",
    push_to_hub=True,
    gradient_checkpointing=True,
    remove_unused_columns=False,
    fp16=True
)

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
with init_context:
  trainer = SFTTrainer(
      model=model,
      args=training_args,
      train_dataset=dataset,
      dataset_text_field="text",  # need a dummy field
      tokenizer=tokenizer,
      peft_config=peft_config,
      callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
      data_collator=data_collator,
      dataset_kwargs={"skip_prepare_dataset": True},
  )

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Step,Training Loss
5,2.7959
10,2.5214
15,2.2183
20,1.8741
25,1.3971
30,1.2962
35,0.9235
40,0.8886
45,0.8358
50,0.8929




Step,Training Loss
5,2.7959
10,2.5214
15,2.2183
20,1.8741
25,1.3971
30,1.2962
35,0.9235
40,0.8886
45,0.8358
50,0.8929




KeyboardInterrupt: 