# Change Cache Directory

In [None]:
# https://saturncloud.io/blog/how-to-change-huggingface-transformers-default-cache-directory-a-stepbystep-guide/
import os

os.environ['HF_HOME'] = '/content/cache'

# Initialization
* WandB init

In [None]:
!pip install --upgrade torch transformers

In [None]:
!pip install --upgrade wandb

In [None]:
!pip install --upgrade deepspeed

In [None]:
!pip install bert_score

In [None]:
!pip install --upgrade peft

In [None]:
import numpy as np
import pandas as pd
import torch
import wandb
import datetime
import typing

In [None]:
# Log in to WandB account
wandb.login()

# change model, dataset size, etc.
"""
model: LLaVA, MiniGPT4
dataset_size: 1K, 2K, ..., 7K
"""

unique_id = datetime.datetime.now().strftime("%Y.%m.%d_%H.%M.%S")

# Initialize a new WandB run
# wandb.init(project="synergistic-dataset", name=unique_id)

# Load Dataset
* generated by LLaVA-v1.3-13b, Vicuna-7b-v1.5 and SDXL

## Load

In [None]:
!pip install datasets

In [None]:
!git lfs install
!git clone https://huggingface.co/datasets/MaoXun/Synergy-General-MultimodalPairs

In [None]:
!mv Synergy-General-MultimodalPairs SDXL

## Construct JSON

In [None]:
# unzip each dataset
import os
from zipfile import ZipFile

# Root directory of the cloned dataset repository.
folder_path = '/content/SDXL'

# Unpack every top-level ".zip" archive in place (contents land next to the archive).
zip_names = [name for name in os.listdir(folder_path) if name.endswith(".zip")]
for filename in zip_names:
    with ZipFile(os.path.join(folder_path, filename), 'r') as archive:
        archive.extractall(folder_path)
        print(f"Extracted: {filename}")

In [None]:
# remove zip files
!rm -rf /content/SDXL/*.zip

In [None]:
# Layout parameters of the dataset: round r (1-based) is stored in the folder
# "<r>_<batches[r-1]>_<m>_<n>" below `folder_path`.
rounds = 7
batches = [20, 20, 20, 20, 15, 20, 20]
m = 10
n = 5

for r in range(rounds):
    b = batches[r]
    unzip_path = f"{folder_path}/{r+1}_{b}_{m}_{n}"

    # Every archive found in the round folder is extracted into its "images" subfolder.
    images_dir = unzip_path + '/images'
    archive_names = [name for name in os.listdir(unzip_path) if name.endswith('.zip')]
    for archive_name in archive_names:
        with ZipFile(os.path.join(unzip_path, archive_name), 'r') as zip_ref:
            zip_ref.extractall(images_dir)

In [None]:
%cd /content

import json
import ast

"""
Multi-round, multi-batch data is organized
in the following way:
{
    "id": "{r}_{i}_{j}",
    "image": "SDXL/{r}_{b}_{m}_{n}/images/{i}/{j}.jpg",
    "conversations": [
      {
        "from": "human",
        "value": "{I}\n<image>"
      },
      {
        "from": "gpt",
        "value": "{ans}"
      }
    ],
    ...
}
"""
# Layout parameters of the dataset: round r (1-based) is stored in
# "/content/SDXL/<r>_<batches[r-1]>_<m>_<n>".
rounds = 7
batches = [20, 20, 20, 20, 15, 20, 20]
m = 10
n = 5

# Instruction placed in the "human" turn of every training sample.
fixed_instruction = "Please describe this image in detail."

datas = []
for r in range(rounds):
    b = batches[r]
    base_file_path = f"/content/SDXL/{r+1}_{b}_{m}_{n}"
    S_df = pd.read_csv(f"{base_file_path}/S.csv")

    for i, row in S_df.iterrows():
        # j is the column position within the row; it also counts the skipped
        # "D_init" column, so ids are numbered by position in S.csv.
        for j, (column, value) in enumerate(row.items()):
            if column == "D_init":
                continue

            # Each remaining cell is parsed as a Python literal holding (image path, text).
            img, output = ast.literal_eval(value)
            relative_img = img.replace("/content/images/", "")

            datas.append({
                "id": f"{r+1}_{b}_{i}_{j}",
                "conversations": [
                    {"from": "human", "value": f"{fixed_instruction}\n<image>"},
                    {"from": "gpt", "value": output},
                ],
                "image": f"{base_file_path}/images/{relative_img}",
            })

# Write all samples to one JSON file in the current working directory.
file_name = "data_lora.json"
with open(file_name, "w") as json_file:
    json.dump(datas, json_file)

In [None]:
datas[0]

## Helper Methods

In [None]:
def save_data_by_size(
    size,
    src_path="/content/data_lora.json",
    dst_path="/content/train_data_lora.json",
):
    """Write the first `size` entries of a JSON list to a training file.

    e.g. size = 3000 -> the first 3000 entries of `src_path` are saved to `dst_path`.
    If the source holds fewer than `size` entries, all of them are written.

    Args:
        size: Number of leading entries to keep; must be >= 0.
        src_path: JSON file containing the full list of samples.
        dst_path: JSON file to (over)write with the selected samples.

    Raises:
        ValueError: If `size` is negative (a negative slice bound would silently
            drop entries from the end instead of selecting a prefix).
    """
    if size < 0:
        raise ValueError(f"size must be non-negative, got {size}")

    with open(src_path, "r") as json_file:
        datas = json.load(json_file)

    with open(dst_path, "w") as json_file:
        json.dump(datas[:size], json_file)

# Train and Evaluate Models on Different Sizes of this Dataset
* 1K, 2K, ..., 7K
* training and evaluating
* log response to WandB
---
* Models:
  * LLaVA
  * MiniGPT4
  

In [None]:
size = [1000, 2000, 3000, 4000, 5000, 6000, 7000]

## LLaVA

### Setup

In [None]:
!git clone https://github.com/haotian-liu/LLaVA.git

In [None]:
!cd /content/LLaVA

In [None]:
!pip install --upgrade pip
!pip install -e .

In [None]:
!pip install -e ".[train]"
!pip install flash-attn --no-build-isolation

### Train

In [None]:
import torch

torch.cuda.empty_cache()

In [None]:
%cd /content
%mkdir checkpoints
%cd checkpoints

In [None]:
# download the multimodal projector weights
!wget https://huggingface.co/liuhaotian/llava-336px-pretrain-vicuna-7b-v1.3/resolve/main/mm_projector.bin

In [None]:
unique_id = datetime.datetime.now().strftime("%Y.%m.%d_%H.%M.%S")

config = {
    "model": "LLaVA",
    "dataset_size": 3000,
    "project_name": "synergistic-dataset",
    "name": unique_id
}

In [None]:
# WandB settings are handed to the training subprocess through environment variables.
# SECURITY: never hard-code the API key in the notebook. Any key that has been
# committed or shared must be treated as leaked and revoked in the WandB settings.
if not os.environ.get("WANDB_API_KEY"):
    import getpass

    # Prompt without echoing so the key does not end up in the cell output.
    os.environ["WANDB_API_KEY"] = getpass.getpass("WandB API key: ")
os.environ["WANDB_ENTITY"] = "codingmaoxun"
os.environ["WANDB_PROJECT"] = config["project_name"]
os.environ["WANDB_NAME"] = config["name"]

In [None]:
# using LLaVA LoRA fine-tune script
# 1. freeze the original weights
# 2. calculate trainable weights and split into A, B
# 3. training the LoRA weights
# 4. upload lora weights

# note:
# from LLaVA page, it seems to fine-tune in two stages
# LLaVa connects pre-trained CLIP ViT-L/14 visual encoder and large language model Vicuna, using a simple projection matrix. We consider a two-stage instruction-tuning procedure:
# Stage 1: Pre-training for Feature Alignment. Only the projection matrix is updated, based on a subset of CC3M.
# Stage 2: Fine-tuning End-to-End. Both the projection matrix and LLM are updated (the LLaVA page lists two different use scenarios).

"""
config:
    dataset_size
"""

def write_fine_tune_script(bash_script_path, config):
    """Write an executable bash script that launches the LLaVA LoRA fine-tuning run.

    Args:
        bash_script_path: Where the script is written (overwritten if present).
        config: Mapping that must contain "dataset_size"; the value is embedded
            in the --output_dir name so each size gets its own checkpoint folder.

    The file is given mode 0o755 after writing.
    """
    header_lines = [
        "#!/bin/bash",
        "################## VICUNA ##################",
        "PROMPT_VERSION=v1",
        'MODEL_VERSION="vicuna-7b-v1.3"',
        "################## VICUNA ##################",
        "",
    ]
    launch_args = [
        "deepspeed /content/LLaVA/llava/train/train_mem.py",
        "--deepspeed /content/LLaVA/scripts/zero2.json",
        "--lora_enable True",
        "--model_name_or_path $MODEL_VERSION",
        "--version $PROMPT_VERSION",
        "--data_path /content/train_data_lora.json",
        '--image_folder ""',
        "--vision_tower openai/clip-vit-large-patch14",
        "--pretrain_mm_mlp_adapter /content/checkpoints/mm_projector.bin",
        "--mm_vision_select_layer -2",
        "--mm_use_im_start_end False",
        "--mm_use_im_patch_token False",
        "--bf16 True",
        f'--output_dir /content/checkpoints/llava-$MODEL_VERSION-{config["dataset_size"]}-finetune_lora',
        "--num_train_epochs 1",
        "--per_device_train_batch_size 16",
        "--per_device_eval_batch_size 4",
        "--gradient_accumulation_steps 1",
        '--evaluation_strategy "no"',
        '--save_strategy "steps"',
        "--save_steps 50000",
        "--save_total_limit 1",
        "--learning_rate 2e-5",
        "--weight_decay 0.",
        "--warmup_ratio 0.03",
        '--lr_scheduler_type "cosine"',
        "--logging_steps 1",
        "--tf32 True",
        "--model_max_length 2048",
        "--gradient_checkpointing True",
        "--lazy_preprocess True",
        "--dataloader_num_workers 4",
        "--report_to wandb",
    ]

    # The launch command is emitted as ONE shell line: the pieces are separated by
    # five spaces (one trailing space plus a four-space indent, with no line breaks).
    script_text = "\n".join(header_lines) + "\n" + "     ".join(launch_args) + "\n"

    with open(bash_script_path, 'w') as script_file:
        script_file.write(script_text)

    # rwxr-xr-x so the script can be executed directly.
    os.chmod(bash_script_path, 0o755)

In [None]:
import torch

print(torch.cuda.is_available())

In [None]:
%cd /content/LLaVA

In [None]:
import subprocess
import json

# The LoRA weights of each run are written to
# "/content/checkpoints/llava-vicuna-7b-v1.3-<dataset_size>-finetune_lora"
# (see --output_dir in write_fine_tune_script).

# Fine-tune once per dataset size: regenerate the launch script and the truncated
# training file, then run the script.
size = [1000, 2000, 3000, 4000, 5000, 6000, 7000]
bash_path = "/content/lora_fine_tune.sh"

for dataset_size in size:
    write_fine_tune_script(bash_path, {"dataset_size": dataset_size})
    save_data_by_size(dataset_size)

    # Give every run its own WandB name; the subprocess inherits os.environ.
    os.environ["WANDB_NAME"] = config["name"] + "_size_" + str(dataset_size)

    # Pass an argument list WITHOUT shell=True: combining a list with shell=True
    # hands only the first item to the shell as the command string.
    result = subprocess.run(["bash", bash_path], capture_output=True, text=True)

    # Print the standard output and error (if any)
    print("STDOUT:", result.stdout)
    print("STDERR:", result.stderr)

    # Stop instead of silently moving on: later cells expect a checkpoint per size.
    if result.returncode != 0:
        raise RuntimeError(
            f"Fine-tuning for dataset size {dataset_size} failed "
            f"with exit code {result.returncode}"
        )

### Get the responses for the 100 test images

In [None]:
fixed_instruction = "Please describe the image in detail."

In [None]:
# save response path
response_base_bath = "/content/response"

In [None]:
img_paths = [
    f"/content/testing/random_images/image_{i}.jpg"
    for i in range(100)
]

### LLaVA Helper

In [None]:
!pip install --upgrade pip setuptools wheel

In [None]:
!pip install --upgrade sentencepiece

In [None]:
%cd /content/LLaVA

In [None]:
import argparse
import torch

from llava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    IMAGE_PLACEHOLDER,
)
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import (
    process_images,
    tokenizer_image_token,
    get_model_name_from_path,
)

from PIL import Image

import requests
from PIL import Image
from io import BytesIO
import re


def image_parser(args):
    """Split ``args.image_file`` on ``args.sep`` and return the resulting list."""
    joined_paths = args.image_file
    return joined_paths.split(args.sep)


def load_image(image_file):
    """Open an image from a URL or a local path and return it converted to RGB.

    Args:
        image_file: An "http://" / "https://" URL, or a filesystem path.

    Returns:
        The image produced by ``Image.open(...).convert("RGB")``.

    Raises:
        requests.HTTPError: If the URL answers with an error status.
    """
    # Match full URL schemes only, so a local file whose name merely starts with
    # "http" is still opened from disk.
    if image_file.startswith(("http://", "https://")):
        # Without a timeout a stalled server would block the notebook forever.
        response = requests.get(image_file, timeout=30)
        # Surface HTTP errors here rather than as an image-decoding failure below.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file
    return Image.open(source).convert("RGB")


def load_images(image_files):
    """Load every entry of ``image_files`` with ``load_image``, keeping the order."""
    return [load_image(path) for path in image_files]


def eval_model(args):
    """Load a LLaVA model and generate one text response for a query about image(s).

    Reads from ``args``: model_path, model_base, query, conv_mode, image_file, sep,
    temperature, top_p, num_beams, max_new_tokens. ``args.conv_mode`` is overwritten
    with the inferred mode when it is None (or already equal to it).

    Returns:
        The first decoded generation, stripped of surrounding whitespace.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name
    )

    # Choose the image token form from the model config, then either substitute it
    # for the placeholder in the query or put it on its own line before the query.
    if model.config.mm_use_im_start_end:
        image_token = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
    else:
        image_token = DEFAULT_IMAGE_TOKEN

    query = args.query
    if IMAGE_PLACEHOLDER in query:
        query = re.sub(IMAGE_PLACEHOLDER, image_token, query)
    else:
        query = image_token + "\n" + query

    # Infer the conversation template from the model name. Order matters: the first
    # matching marker wins (e.g. "v1.6-34b" must be tested before the broader "v1").
    lowered_name = model_name.lower()
    conv_mode = "llava_v0"
    for marker, mode in (
        ("llama-2", "llava_llama_2"),
        ("mistral", "mistral_instruct"),
        ("v1.6-34b", "chatml_direct"),
        ("v1", "llava_v1"),
        ("mpt", "mpt"),
    ):
        if marker in lowered_name:
            conv_mode = mode
            break

    # An explicitly requested mode wins over the inferred one, with a warning.
    if args.conv_mode is None or args.conv_mode == conv_mode:
        args.conv_mode = conv_mode
    else:
        print(
            "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
                conv_mode, args.conv_mode, args.conv_mode
            )
        )

    # Build the prompt: the user turn carries the query, the reply turn is left open.
    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], query)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    images = load_images(image_parser(args))
    image_sizes = [image.size for image in images]
    processed = process_images(images, image_processor, model.config)
    images_tensor = processed.to(model.device, dtype=torch.float16)

    prompt_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
    )
    input_ids = prompt_ids.unsqueeze(0).cuda()

    # Sampling is enabled only for a positive temperature.
    use_sampling = True if args.temperature > 0 else False
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images_tensor,
            image_sizes=image_sizes,
            do_sample=use_sampling,
            temperature=args.temperature,
            top_p=args.top_p,
            num_beams=args.num_beams,
            max_new_tokens=args.max_new_tokens,
            use_cache=True,
        )

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0].strip()

In [None]:
from typing import List

def generate_description_of_image(
    lora_model_path: str,
    model_base: str,
    img_paths: List[str],
) -> str:
    """Describe the given image(s) with a LoRA checkpoint and return the text.

    The prompt is the module-level ``fixed_instruction``; the generated text is
    printed and also returned.

    Args:
        lora_model_path: Path of the fine-tuned LoRA checkpoint.
        model_base: Base model identifier passed on to ``eval_model``.
        img_paths: Image paths; they are joined with spaces, so a path that itself
            contains a space would be split apart by ``eval_model``.
    """
    model_name = get_model_name_from_path(lora_model_path)

    arg_values = {
        "model_path": lora_model_path,
        "model_base": model_base,
        "model_name": model_name,
        "query": fixed_instruction,
        "conv_mode": None,
        # eval_model splits "image_file" on "sep", so both use the same separator.
        "image_file": " ".join(img_paths),
        "sep": " ",
        "temperature": 0.7,
        "top_p": 0.9,
        "num_beams": 3,
        "max_new_tokens": 150,
    }
    # Lightweight stand-in for an argparse result: attribute access over the dict.
    args = type('Args', (), arg_values)()

    description = eval_model(args)
    print(description)

    return description

### Merge LoRA and Generate Response

In [None]:
%cd /content/LLaVA

In [None]:
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path

In [None]:
def construct_llava_lora_model_path(datasize):
    """Return the checkpoint directory of the LoRA run trained on `datasize` samples."""
    return "/content/checkpoints/llava-vicuna-7b-v1.3-{}-finetune_lora".format(datasize)

In [None]:
llava_response = {
    "1000": [],
    "2000": [],
    "3000": [],
    "4000": [],
    "5000": [],
    "6000": [],
    "7000": []
}

In [None]:
model_base = "lmsys/vicuna-7b-v1.3"

# For every dataset size, describe each test image with that size's LoRA checkpoint
# and collect the texts under the size's string key.
for dataset_size in size:
    lora_model_path = construct_llava_lora_model_path(dataset_size)
    collected = llava_response[str(dataset_size)]

    for img_path in img_paths:
        # One image per call; the helper takes a list of paths.
        response = generate_description_of_image(
            lora_model_path=lora_model_path,
            model_base=model_base,
            img_paths=[img_path],
        )
        collected.append(response)

    # Saving the model is currently disabled:
    # save_model_path = "/content/checkpoints/checkpoint-llava-7-20-10-5-vicuna-7b-v1.3/"
    # model.save_pretrained(save_model_path)
    # tokenizer.save_pretrained(save_model_path)

In [None]:
print(llava_response["1000"][40]+"\n"+llava_response["7000"][40])

In [None]:
len(llava_response["1000"])

In [None]:
# Persist the responses as JSON Lines: one {"<dataset size>": [responses...]} object per line.
import json

llava_response_folder = f"{response_base_bath}/llava"
llava_response_path = f"{response_base_bath}/llava/llava_response.jsonl"
os.makedirs(llava_response_folder, exist_ok=True)

with open(llava_response_path, "w") as f:
    f.writelines(
        json.dumps({key: value}) + '\n'
        for key, value in llava_response.items()
    )

## MiniGPT4

In [None]:
# save response
minigpt4_response_path = f"{response_base_bath}/minigpt4/minigpt4_response.jsonl"


# Compute the BERTScore with Benchmark Model
* run the "Download 100 testing images" section first
* benchmark model: GPT4(need money), LLaVA-34B-v1.6(too big), LLaVA-13B-v1.6

## GPT4-vision Generate Response (COST!!)

In [None]:
# 100 images from GPT4 responses
# pasted by asking GPT4
gpt_responses = []

In [None]:
import base64
import requests

# OpenAI API Key
api_key = "YOUR OPEN AI TOKEN"

headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
}

# Function to encode the image
def encode_image(image_path):
    """Read the file at `image_path` and return its bytes as a base64 string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Ask the vision model to describe every test image; one (billed) request per image.
for img_path in img_paths:
    # Getting the base64 string
    base64_image = encode_image(img_path)

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Please describe the image in detail"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 1000
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=120,
    )
    # Fail with the HTTP status (bad key, rate limit, ...) instead of a KeyError
    # on "choices" further down.
    response.raise_for_status()
    data = response.json()
    response_text = data["choices"][0]["message"]["content"]
    gpt_responses.append(response_text)

    print(f"Usage: { data['usage'] }")
    print(f"Preview response: { response_text[:20] }")


In [None]:
len(gpt_responses), gpt_responses[0]

In [None]:
import json

# Persist the GPT-4 answers as JSON Lines: one JSON-encoded string per line,
# in the same order as the images were queried.
gpt4_response_folder = f"{response_base_bath}/gpt4"
gpt4_response_path = f"{response_base_bath}/gpt4/gpt4_response.jsonl"
os.makedirs(gpt4_response_folder, exist_ok=True)

with open(gpt4_response_path, 'w') as file:
    file.writelines(json.dumps(text) + '\n' for text in gpt_responses)

## Helper

In [None]:
def read_jsonl_to_list(file_path):
    """Parse a JSON Lines file into a list with one decoded value per line.

    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
    instead of raising a JSON decoding error.

    Args:
        file_path: Path of the ``.jsonl`` file.

    Returns:
        The decoded values in file order.
    """
    with open(file_path, 'r') as file:
        return [json.loads(line) for line in file if line.strip()]

In [None]:
def read_jsonl_to_dict(file_path):
    """Merge the JSON objects of a JSON Lines file into a single dict.

    Every non-blank line must decode to an object; its items are copied into the
    result, so a key that appears on several lines keeps the value from the last
    one. Blank or whitespace-only lines are skipped.

    Args:
        file_path: Path of the ``.jsonl`` file.

    Returns:
        The merged dictionary.
    """
    data_dict = {}
    with open(file_path, 'r') as file:
        for line in file:
            if not line.strip():
                continue
            data_dict.update(json.loads(line).items())
    return data_dict

In [None]:
from bert_score import BERTScorer

# Scorers are kept per device so repeated calls reuse one instance instead of
# constructing a new BERTScorer every time.
_BERT_SCORER_CACHE = {}


def _get_bert_scorer(device):
    """Return the cached scorer for `device`, constructing it on first use."""
    cache_key = str(device)
    if cache_key not in _BERT_SCORER_CACHE:
        _BERT_SCORER_CACHE[cache_key] = BERTScorer(
            "microsoft/deberta-xlarge-mnli", use_fast_tokenizer=True, device=device
        )
    return _BERT_SCORER_CACHE[cache_key]


def calculate_bertscore(cands, refs):
    """Return the BERTScore recall of each candidate against its reference.

    Args:
        cands: List of candidate texts.
        refs: List of reference texts, paired with `cands` by position.

    Returns:
        A NumPy array holding the recall values returned by the scorer.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    scorer = _get_bert_scorer(device)

    _, recall, _ = scorer.score(cands, refs)

    # use recall score as reward
    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid should the
    # scorer ever hand back a tensor on the GPU.
    return recall.cpu().numpy()

## LLaVA

In [None]:
# read jsonl
llava_responses = read_jsonl_to_dict("/content/response/llava/llava_response.jsonl")

In [None]:
gpt_responses = read_jsonl_to_list("/content/response/gpt4/gpt4_response.jsonl")
gpt_responses = [gpt_response.replace('\n',' ') for gpt_response in gpt_responses]

In [None]:
from collections import defaultdict

bert_score_llava_gpt = defaultdict(list)

In [None]:
# Score every LLaVA response against the GPT-4 response for the same image.
# The loop variable is deliberately NOT called `size`: that name holds the list of
# dataset sizes used by other cells and must not be overwritten with a string key.
for dataset_size, llava_res in llava_responses.items():
    print(f"processing size {dataset_size}...")
    for i, cand in enumerate(llava_res):
        # Responses are paired by position: the i-th entry of both lists
        # belongs to the i-th test image.
        ref = gpt_responses[i]

        score = calculate_bertscore([cand], [ref])[0]
        bert_score_llava_gpt[dataset_size].append(score)

In [None]:
# Save the scores to a pickle file.
import pickle

score_path = '/content/response/score/llava_score.pickle'
# The "score" folder is not created anywhere else, so create it before opening the file.
os.makedirs(os.path.dirname(score_path), exist_ok=True)

with open(score_path, 'wb') as file:
    pickle.dump(bert_score_llava_gpt, file)

In [None]:
# get from pickle
import pickle

with open('/content/response/score/llava_score.pickle', 'rb') as file:
    bert_score_llava_gpt = pickle.load(file)

print(bert_score_llava_gpt)

In [None]:
# Print mean and standard deviation of the scores per dataset size.
# The loop variable is deliberately NOT called `size`: that name holds the list of
# dataset sizes used by other cells and must not be overwritten with a string key.
for dataset_size, scores in bert_score_llava_gpt.items():
    print(f"Dataset Size: {dataset_size}\n")
    print(f"Mean: {np.mean(scores)}")
    print(f"Std: {np.std(scores)}")
    print("="*10)

In [None]:
# log BERTScore to wandb
name = "llava_gpt_bert_score"
wandb.init(
    project="synergistic-dataset",
    entity="codingmaoxun",
    name=name
)

In [None]:
# Summarise the scores per dataset size (mean and standard deviation) and log each
# pair to WandB, using the dataset size itself as the step.
dataset_sizes = [int(key) for key in list(bert_score_llava_gpt.keys())]
mean_scores = [np.mean(scores) for scores in bert_score_llava_gpt.values()]
std_devs = [np.std(scores) for scores in bert_score_llava_gpt.values()]

for step, mean_value, std_value in zip(dataset_sizes, mean_scores, std_devs):
    wandb.log(
        {
            "llava_score/mean": mean_value,
            "llava_score/std": std_value,
        },
        step=step,
    )

wandb.finish()

## MiniGPT4

# Download 100 testing images (Executed Once)

In [None]:
# Destination folder for the downloaded test images.
# NOTE: this rebinds `folder_path`, which the dataset-unzipping cell sets to '/content/SDXL'.
folder_path = "/content/testing/random_images"

# exist_ok=True lets the cell be re-run without failing on the existing folder.
os.makedirs(folder_path, exist_ok=True)

In [None]:
import requests

def download_random_images(num_images, folder_path):
    """Download `num_images` 300x300 images into `folder_path`.

    Image i is requested from "https://picsum.photos/300/300?random=<i>" and saved
    as "<folder_path>/image_<i>.jpg"; the folder must already exist.

    Args:
        num_images: How many images to fetch (indices 0 .. num_images - 1).
        folder_path: Existing directory that receives the files.

    Raises:
        requests.HTTPError: If a download answers with an error status, so that
            an error page is never stored under a ".jpg" name.
    """
    for i in range(num_images):
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(f"https://picsum.photos/300/300?random={i}", timeout=30)
        response.raise_for_status()

        filename = f"{folder_path}/image_{i}.jpg"
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {filename}")

# Example usage
num_images = 100
download_random_images(num_images, folder_path)