Tutorial 4 (OpenAI vision API)
======================


## About

In this part of the assignment you will explore the OpenAI Vision API.

**Note**: This tutorial is **optional**. You will need OpenAI API keys to complete it. If you are using your personal account, completing this tutorial will cost you some money, but not much: one round of experiments (as outlined here) will cost less than $1.


<hr> 

* The <b><font color='red'>red</font></b> color indicates the task that should be done, like <b><font color='red'>[TODO]</font></b>: ...
* Additional comments and hints are in <b><font color='blue'>blue</font></b>. For example <b><font color='blue'>[HINT]</font></b>: ...

### Useful links

https://platform.openai.com/docs/guides/vision

https://platform.openai.com/docs/guides/fine-tuning/vision

https://openai.com/api/pricing/

## Preliminaries

In [None]:
# !pip install datasets
# !pip install fiftyone
# !pip install scikit-learn
# !pip install tensorboard jupyter-tensorboard
# !pip install tqdm
# !pip install openai
# !pip install gdown
# !pip install ultralytics

In [None]:
import os
import gdown
import json
import zipfile

import base64
import requests
from io import BytesIO
from pathlib import Path

import numpy as np
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter

from datasets import load_dataset
from datasets import Dataset, DatasetDict

import fiftyone as fo

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Use a larger serif font so that figures are easier to read
plt.rcParams.update({'font.size': 18, 'font.family': 'serif'})

## Auxiliary functions

### Data related

In [None]:
def create_mini_dataset(
    dataset: Dataset,
    classes_list: list,
    used_indexes: list[int] | None = None,
    blacklist_classes: list[int] | None = None,
    n_times: int | None = None,
    max_size: int | None = None
) -> tuple[list[dict], list]:
    """Select a small, class-balanced subset of *dataset*.

    Samples are visited in index order and taken greedily until every class
    has ``n_times`` samples or ``max_size`` samples were collected.

    Args:
        dataset: Indexable collection of samples; each sample is a mapping
            with an integer ``'label'`` entry.
        classes_list: Class names; only its length is used, labels are
            expected to be in ``range(len(classes_list))``.
        used_indexes: Dataset indexes that must not be selected again.
        blacklist_classes: Label ids that are skipped entirely.
        n_times: Maximum number of samples per class. When omitted (or 0) it
            defaults to ``max_size // len(classes_list)``, but at least 1 so
            that a ``max_size`` smaller than the number of classes still
            yields samples.
        max_size: Maximum total number of samples. When omitted (or 0) it
            defaults to the number of available (not yet used) samples.

    Returns:
        ``(selected_samples, selected_indexes)`` where the second list holds
        the dataset index of each selected sample, in the same order.
    """
    used_indexes = set(used_indexes or [])
    blacklist_classes = set(blacklist_classes or [])

    available_indices = [i for i in range(len(dataset)) if i not in used_indexes]

    max_size = max_size or len(available_indices)
    n_times = n_times or max(1, max_size // len(classes_list))

    counter = {label: 0 for label in range(len(classes_list))}
    # Classes that can still receive samples; lets the scan stop early
    open_labels = set(counter) - blacklist_classes
    selected_samples = []
    selected_indexes = []

    for i in available_indices:
        if len(selected_samples) >= max_size or not open_labels:
            break

        # Samples are read one at a time: reading a sample may be expensive
        # (e.g. image decoding), so only the visited ones are loaded
        sample = dataset[i]
        label = sample['label']
        if label in blacklist_classes or counter[label] >= n_times:
            continue

        selected_samples.append(sample)
        selected_indexes.append(i)
        counter[label] += 1
        if counter[label] >= n_times:
            open_labels.discard(label)

    return selected_samples, selected_indexes

In [None]:
def resize_image_to_largest_side(image: Image.Image, large_side: int = 512) -> Image.Image:
    """Resize *image* so that its longest side equals *large_side*.

    The aspect ratio is preserved. Note that the image is scaled up when its
    longest side is smaller than *large_side*.

    Args:
        image: Image to resize.
        large_side: Target length (in pixels) of the longest side.

    Returns:
        A resized copy of the image (LANCZOS resampling).
    """
    longest = max(image.size)

    # Integer arithmetic keeps the longest side at exactly `large_side` (no
    # float rounding), and the clamp to 1 prevents a zero-sized dimension for
    # images with an extreme aspect ratio
    new_dimensions = tuple(
        max(1, int(dim * large_side // longest)) for dim in image.size
    )

    return image.resize(new_dimensions, Image.Resampling.LANCZOS)

In [None]:
def create_hf_cocoo_dataset(path_coco_o: str, path_data: str, seed: int = 42, test_ratio: float = 0.3):
    """Build train/test datasets from the COCO-O folder.

    The expected layout is ``<path_coco_o>/<class>/val2017/<image>``. When
    *path_coco_o* does not exist, the archive is downloaded into *path_data*
    and extracted there.

    Args:
        path_coco_o: Folder with one sub-folder per class.
        path_data: Folder used for the downloaded archive and its extraction.
        seed: Seed of the train/test shuffle.
        test_ratio: Fraction of all images assigned to the test split.

    Returns:
        ``(DatasetDict, classes_list)`` where the dataset has ``'train'`` and
        ``'test'`` splits with ``'image'`` and ``'label'`` columns, and
        ``classes_list[label]`` is the class (folder) name.
    """
    def load_image(example):
        example['image'] = Image.open(example['image_path'])
        return example

    if not os.path.exists(path_coco_o):
        url = 'https://drive.google.com/uc?id=1aBfIJN0zo_i80Hv4p7Ch7M8pRzO37qbq'
        zip_file_path = os.path.join(path_data, 'ood_coco.zip')
        gdown.download(url, zip_file_path, quiet=False)
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(path_data)

    # os.listdir returns entries in arbitrary order: sort so that the
    # label <-> class mapping and the split are reproducible across machines.
    # Entries without a 'val2017' folder (stray files) are ignored.
    cocoo_classes_list = sorted(
        label for label in os.listdir(path_coco_o)
        if os.path.isdir(os.path.join(path_coco_o, label, 'val2017'))
    )
    all_elements_coco = [
        (os.path.join(path_coco_o, label, 'val2017', img), index)
        for index, label in enumerate(cocoo_classes_list)
        for img in sorted(os.listdir(os.path.join(path_coco_o, label, 'val2017')))
    ]

    # A local generator gives a seeded shuffle without reseeding NumPy's
    # global random state
    rng = np.random.RandomState(seed)
    indices = np.arange(len(all_elements_coco))
    rng.shuffle(indices)
    n_test = int(len(indices) * test_ratio)

    train_indices, test_indices = indices[n_test:], indices[:n_test]
    datasets = {}

    for split, split_indices in zip(['train', 'test'], [train_indices, test_indices]):
        # Built column by column so that an empty split does not fail
        image_paths = [all_elements_coco[i][0] for i in split_indices]
        labels = [all_elements_coco[i][1] for i in split_indices]
        dataset = Dataset.from_dict({'image_path': image_paths, 'label': labels})
        datasets[split] = dataset.map(load_image, remove_columns=['image_path'])

    return DatasetDict(datasets), cocoo_classes_list

### Prompts and all that

In [None]:
# Prompt template of the classification request. The `{class_names_list}`
# placeholder is filled in with `str.format` in `generate_request_with_image`.
base_prompt = '''
Please analyze the provided image and determine its class.
    
The eligible classes are as follows:
{class_names_list}
    
Select the class that best represents the image from this list. 
Do not include any additional information or commentary in your response. 
Ensure the predicted class is among the eligible classes, and respond with only the class name.
'''

In [None]:
def encode_image_by_path(image_path: str | Path) -> str:
    with open(image_path, 'rb') as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def encode_image(pil_image: Image.Image) -> str:
    """Return *pil_image*, converted to RGB and saved as JPEG, as a base64 string."""
    rgb_image = pil_image.convert('RGB')
    with BytesIO() as jpeg_buffer:
        rgb_image.save(jpeg_buffer, format='JPEG')
        jpeg_bytes = jpeg_buffer.getvalue()
    return base64.b64encode(jpeg_bytes).decode('utf-8')

def parse_response(
    response: object, 
    class_labels: list[str]
) -> str | None:
    
    try:
        if response.ok:
            data = response.json()
            pred = data.get('choices', [{}])[0].get('message', {}).get('content')
            return pred if pred in class_labels else None
    except ValueError as e:
        print(f'Error parsing JSON: {e}')
        return None

def generate_request_with_image(
    base64_image: str,
    classes_list: list[str],     
    true_label: str | None = None,
    openai_model_version: str = 'gpt-4o-2024-08-06'
) -> dict[str, object]:
    """Build one chat request that asks for the class of an image.

    Args:
        base64_image: Base64-encoded image, embedded as a JPEG data URL.
        classes_list: Eligible class names, inserted into ``base_prompt``.
        true_label: When given (non-empty), appended as the assistant answer.
        openai_model_version: Value of the ``'model'`` entry.

    Returns:
        Dict with the ``'messages'`` list and the ``'model'`` name.
    """
    prompt_text = base_prompt.format(class_names_list=classes_list)
    image_part = {
        'type': 'image_url',
        'image_url': {'url': f'data:image/jpeg;base64,{base64_image}'},
    }

    conversation = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': prompt_text},
        {'role': 'user', 'content': [image_part]},
    ]
    if true_label:
        conversation.append({'role': 'assistant', 'content': true_label})

    return {'messages': conversation, 'model': openai_model_version}

def generate_requests(
    data_set: list[dict[str, object]], 
    class_labels: list[str],
    mode: str = 'test',
    model_version: str = 'gpt-4o-2024-08-06',
    max_side: int = 512
) -> list[dict[str, object]]:
    """Build one chat request per sample of *data_set*.

    Args:
        data_set: Samples with an ``'image'`` (PIL image) or ``'image_path'``
            entry and, for ``mode='train'``, an integer ``'label'`` entry.
        class_labels: Class names; ``class_labels[label]`` is the true class.
        mode: With ``'train'`` the true label is added to each request as the
            assistant answer; any other value produces unlabeled requests.
        model_version: Model name stored in each request.
        max_side: Images whose longest side exceeds this many pixels are
            downscaled to it before encoding.

    Returns:
        List of request dicts, in the order of *data_set*.
    """
    def encode_resized(pil_image: Image.Image) -> str:
        # Only downscale: smaller images are sent as they are
        if max(pil_image.size) > max_side:
            pil_image = resize_image_to_largest_side(pil_image, large_side=max_side)
        return encode_image(pil_image)

    def process_sample(sample: dict[str, object]) -> dict[str, object]:
        if 'image' in sample:
            encoded_image = encode_resized(sample['image'])
        else:
            # Close the file as soon as the image has been encoded
            with Image.open(sample['image_path']) as opened_image:
                encoded_image = encode_resized(opened_image)

        true_label = class_labels[sample['label']] if mode == 'train' else None

        return generate_request_with_image(
            encoded_image,
            class_labels,
            true_label=true_label,
            openai_model_version=model_version
        )

    return [process_sample(sample) for sample in data_set]

## Load data

In [None]:
# Local folder for the downloaded datasets and the generated files
# (created if it does not exist yet)
path_data = './data'
os.makedirs(path_data, exist_ok=True)

In [None]:
# Load the CIFAR-10 dataset; its image column is renamed from 'img' to 'image'
cifar10_dataset = load_dataset('cifar10', cache_dir=path_data).rename_column(
    original_column_name='img',
    new_column_name='image',
)
cifar10_classes_list = cifar10_dataset['train'].features['label'].names

In [None]:
# Load the DTD dataset and read the class names from its label feature
dtd_dataset = load_dataset('tanganke/dtd', cache_dir=path_data)
dtd_classes_list = dtd_dataset['train'].features['label'].names

In [None]:
# Load the COCO-O dataset
# (create_hf_cocoo_dataset downloads it into path_data when the folder is missing)
path_coco_o = os.path.join(path_data, 'ood_coco')
cocoo_dataset, cocoo_classes_list = create_hf_cocoo_dataset(path_coco_o, path_data)

## Create and select 'mini' testsets

In [None]:
# Class-balanced subsets of at most 100 test samples per dataset;
# the second returned value lists the dataset indexes of the selected samples
cifar10_mini_test, cifar10_mini_test_indexes = create_mini_dataset(cifar10_dataset['test'], cifar10_classes_list, max_size=100)
dtd_mini_test, dtd_mini_test_indexes = create_mini_dataset(dtd_dataset['test'], dtd_classes_list, max_size=100)
cocoo_mini_test, cocoo_mini_test_indexes = create_mini_dataset(cocoo_dataset['test'], cocoo_classes_list, max_size=100)

In [None]:
# Choose the test set (and its class names) used in the experiments below:
# keep exactly one of these lines uncommented
#mini_testset, mini_class_labels = cifar10_mini_test, cifar10_classes_list
#mini_testset, mini_class_labels = dtd_mini_test, dtd_classes_list
mini_testset, mini_class_labels = cocoo_mini_test, cocoo_classes_list

## Zero shot 

In [None]:
from openai import OpenAI

# Create the OpenAI client and take the API key from it, or provide the key by hand below.
# NOTE(review): presumably the client reads the key from the OPENAI_API_KEY
# environment variable - confirm against the openai library docs
client = OpenAI()
api_key = client.api_key
#api_key = ...

# HTTP headers of the raw chat-completions requests sent with `requests`
headers = {
  'Content-Type': 'application/json',
  'Authorization': f'Bearer {api_key}'
}

In [None]:
# Generate one request per test sample (mode='test': no true label is attached)
test_requests = generate_requests(mini_testset, mini_class_labels, mode='test')

In [None]:
# Query the model for every test sample. Only samples with a valid answer are
# kept, so both lists stay aligned; failures lower the response success rate.
labels_ground_truth = []
labels_predictations = []
for req_json, sample in tqdm(zip(test_requests, mini_testset), total=len(test_requests)):

    # Send request; a timeout or network error counts as a failed response
    # instead of hanging or aborting the whole experiment
    try:
        response = requests.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            json=req_json,
            timeout=60,
        )
    except requests.RequestException as e:
        print(f'Request failed: {e}')
        continue

    # Parse response
    pred = parse_response(response, mini_class_labels)
    if pred:
        labels_predictations.append(pred)
        labels_ground_truth.append(mini_class_labels[sample['label']])

In [None]:
# Report
print(f'* Response success rate: {len(labels_predictations) / len(mini_testset)}\n')
print('* Classification report:\n')
# The labels are class-name strings: passing them via `labels` fixes the row
# order and naming, and keeps the report working when a class is missing from
# the collected answers (such rows are reported with zeros)
print(classification_report(
    labels_ground_truth,
    labels_predictations,
    labels=mini_class_labels,
    zero_division=0,
))

## Fine-tuning

### Prepare data

In [None]:
# Create the train and validation sets.
# We use small subsets to keep fine-tuning fast and cheap.

In [None]:
# Both subsets are drawn from the COCO-O 'train' split; the validation subset
# skips the indexes already selected for training, so the two do not overlap
mini_train, mini_train_indexes = create_mini_dataset(cocoo_dataset['train'], cocoo_classes_list, max_size=500)
mini_val, _ = create_mini_dataset(cocoo_dataset['train'], cocoo_classes_list, max_size=200, used_indexes=mini_train_indexes)
mini_classes_list = cocoo_classes_list

In [None]:
# Inspect one sample 
# indx = 0
# print(mini_train[indx]['label'])
# print(mini_classes_list[mini_train[indx]['label']])
# mini_train[indx]['image']

### Cleaning the data 

As of today (Oct 2024), OpenAI has restrictions on images containing:

* People
* Faces
* Children
* CAPTCHAs

You can read more [here](https://platform.openai.com/docs/guides/fine-tuning/content-moderation-policy).

To remove people-related content we will use a YOLO detector. See the [Ultralytics documentation](https://docs.ultralytics.com/) if you would like to learn more about this library (we will also cover YOLO during the 'object detection' part of the course).

In [None]:
# Load the YOLO detection model from the 'yolo11n.pt' weights
from ultralytics import YOLO
model_det = YOLO('yolo11n.pt')

In [None]:
def is_person_present(image):
    """Return True when the detector `model_det` finds a 'person' in *image*."""
    # Only the first result returned by the detector is inspected
    first_result = model_det(image, verbose=False)[0]
    id_to_name = first_result.names
    found_names = {id_to_name[int(class_id.item())] for class_id in first_result.boxes.cls}
    return 'person' in found_names

In [None]:
# Drop every sample in which the detector finds a person
mini_train = [sample for sample in tqdm(mini_train) if not is_person_present(sample['image'])]
mini_val = [sample for sample in tqdm(mini_val) if not is_person_present(sample['image'])]

### Stats of train and validation sets

In [None]:
# Number of samples left in each subset
print('* Sizes:')
print('train: ', len(mini_train))
print('val:   ', len(mini_val))

# Names of the classes that occur in each subset, in alphabetical order
print('\n* Classes:')
print('train: ', sorted(mini_classes_list[label_id] for label_id in {row['label'] for row in mini_train}))
print('val:   ', sorted(mini_classes_list[label_id] for label_id in {row['label'] for row in mini_val}))

# Number of samples per class name, in alphabetical order
print('\n* Distritubions:')
print('train: ', sorted(Counter(mini_classes_list[row['label']] for row in mini_train).items()))
print('val:   ', sorted(Counter(mini_classes_list[row['label']] for row in mini_val).items()))

### Fine-tune

In [None]:
# Generate the fine-tuning examples for OpenAI.
# mode='train' is used for both subsets so that every example, including the
# validation ones, carries its true label as the assistant answer
train_requests = generate_requests(mini_train, mini_classes_list, mode='train')
val_requests = generate_requests(mini_val, mini_classes_list, mode='train')

In [None]:
# Prepare files: each request is written as one JSON document per line (JSON Lines)
fname_train = os.path.join(path_data, 'data_train_openai.jsonl')
with open(fname_train, 'w') as file:
    for entry in train_requests:
        json_line = json.dumps(entry)
        file.write(json_line + '\n')
print(f'Data written to {fname_train}')

fname_val = os.path.join(path_data, 'data_val_openai.jsonl')
with open(fname_val, 'w') as file:
    for entry in val_requests:
        json_line = json.dumps(entry)
        file.write(json_line + '\n')

print(f'Data written to {fname_val}')

In [None]:
# Just to check that the files are there and that their sizes are reasonable
!ls -alh ./data/*.jsonl

In [None]:
# Send files to openAI
# (each file is opened in a `with` block so that it is closed after the upload)
with open(fname_train, 'rb') as train_file:
    training_file_upload_response = client.files.create(
      file=train_file,
      purpose='fine-tune'
    )
print(training_file_upload_response)

with open(fname_val, 'rb') as val_file:
    validation_file_upload_response = client.files.create(
      file=val_file,
      purpose='fine-tune'
    )
print(validation_file_upload_response)

In [None]:
# Launch the fine-tuning job on the two uploaded files (referenced by their ids)
# You can monitor the status here: https://platform.openai.com/finetune/

fine_tuning_response = client.fine_tuning.jobs.create(
    training_file=training_file_upload_response.id,
    validation_file=validation_file_upload_response.id,
    suffix='cocoo',
    model='gpt-4o-2024-08-06'
)
fine_tuning_response

In [None]:
# After a while we can probe how things are going:
# re-run this cell to fetch the current state of the job
status_response=client.fine_tuning.jobs.retrieve(fine_tuning_response.id)
status_response

### Predict

In [None]:
# Get the model name from 'status_response' once fine-tuning is finished or from https://platform.openai.com/finetune/ 
# It must be filled in by hand before running the cells below
model_name_finetuned = ''

In [None]:
# Generate the same test requests as before, now addressed to the fine-tuned model
test_requests = generate_requests(mini_testset, mini_class_labels, mode='test', model_version=model_name_finetuned)

In [None]:
# Query the model for every test sample. Only samples with a valid answer are
# kept, so both lists stay aligned; failures lower the response success rate.
labels_ground_truth = []
labels_predictations = []
for req_json, sample in tqdm(zip(test_requests, mini_testset), total=len(test_requests)):

    # Send request; a timeout or network error counts as a failed response
    # instead of hanging or aborting the whole experiment
    try:
        response = requests.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            json=req_json,
            timeout=60,
        )
    except requests.RequestException as e:
        print(f'Request failed: {e}')
        continue

    # Parse response
    pred = parse_response(response, mini_class_labels)
    if pred:
        labels_predictations.append(pred)
        labels_ground_truth.append(mini_class_labels[sample['label']])

In [None]:
# Report
print(f'* Response success rate: {len(labels_predictations) / len(mini_testset)}\n')
print('* Classification report:\n')
# The labels are class-name strings: passing them via `labels` fixes the row
# order and naming, and keeps the report working when a class is missing from
# the collected answers (such rows are reported with zeros)
print(classification_report(
    labels_ground_truth,
    labels_predictations,
    labels=mini_class_labels,
    zero_division=0,
))

## Another dataset

<b><font color="red">[TODO]</font></b>: Conduct zero-shot experiments for the DTD and CIFAR-10 datasets. Compare the accuracy to the previous experiments.