# 1. Install Dependencies 🙅


In [None]:
# !pip install transformers trl datasets bitsandbytes peft qwen-vl-utils wandb accelerate torch torchvision torchaudio
!pip install litellm datasets
!pip install python-dotenv

Collecting litellm
  Downloading litellm-1.62.1-py3-none-any.whl.metadata (37 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting python-dotenv>=0.2.0 (from litellm)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting tiktoken>=0.7.0 (from litellm)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading litellm-1.62.1-py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.2-py3-non

In [None]:
# Load environment variables (e.g. the API keys read later via os.getenv) from a .env file
from dotenv import load_dotenv

load_dotenv()

# 2. Load RAW DRAMA Annotation Dataset 📁




In [None]:
import pandas as pd

# Location of the raw DRAMA annotation export
file_path = 'data/drama_annotation.json'

# Parse the JSON annotations into a DataFrame, then preview the first rows
df = pd.read_json(path_or_buf=file_path)
df.head(n=5)

Unnamed: 0,s3_fileUrl,s3_instructionReference,Risk,Suggestions (to the ego car),Intention of Ego-vehicle,Caption,Scene Classifier,geometry,Agent-classifier,Attributes of pedestrian,...,vehicle_motion_direction,other_vehicle_motion_direction,vehicle_reason_for_behaviour,other_vehicle_reason_for_behaviour,infrastructure_name,other_infrastructure_name,infrastructure_location,other_infrastructure_location,infrastructure_state,other_infrastructure_state
0,https://s3-us-west-2.amazonaws.com/hrius.scale...,https://s3-us-west-2.amazonaws.com/hrius.scale...,Yes,"be aware or cautious (of the important object,...",Straight,"There is a red cone indicating no stopping, on...",,"[[718, 975], [816, 975], [816, 1199], [718, 11...",Infrastructure,[],...,[],,,,traffic cone,,on the left side,,other,Indicating no stopping
1,https://s3-us-west-2.amazonaws.com/hrius.scale...,https://s3-us-west-2.amazonaws.com/hrius.scale...,Yes,Slow down,,"There is a pedestrian wearing a white T-shirt,...",Narrow lane,"[[1313, 716], [1409, 716], [1409, 1003], [1313...",Pedestrian,[wearing white t-shirt],...,[],,,,,,,,,
2,https://s3-us-west-2.amazonaws.com/hrius.scale...,https://s3-us-west-2.amazonaws.com/hrius.scale...,Yes,"be aware or cautious (of the important object,...",Straight,"There is a pedestrian wearing white clothing, ...",Narrow lane,"[[1238, 757], [1302, 757], [1302, 905], [1238,...",Pedestrian,[],...,[],,,,,,,,,
3,https://s3-us-west-2.amazonaws.com/hrius.scale...,https://s3-us-west-2.amazonaws.com/hrius.scale...,Yes,"be aware or cautious (of the important object,...",Straight,There is a silver sedan parked on the left sid...,Narrow lane,"[[850, 782], [1093, 782], [1093, 962], [850, 9...",Vehicle,[],...,[],,,,,,,,,
4,https://s3-us-west-2.amazonaws.com/hrius.scale...,https://s3-us-west-2.amazonaws.com/hrius.scale...,Yes,Yield,Straight,"There is a pedestrian wearing black clothing, ...",Intersection,"[[1452, 738], [1560, 738], [1560, 960], [1452,...",Pedestrian,[],...,[],,,,,,,,,


# 3. Prepare LLM For Constructing QA pairs 📎

> Define LLM Prompt

In [None]:
# Instruction text sent to the model together with each image.
# It asks for exactly five Q&A pairs, each carrying a category topic, a question,
# a chain of thought and an answer, returned as JSON under the top-level key "qa_pairs".
# The example schema names the reasoning field "chain_of_thought".
PROMPT = """
You are an advanced AI model that analyzes images from a traffic ego car’s perspective. You will receive an image showing a traffic scene. Please do the following:

1. **Observe the image carefully** and note the key elements (vehicles, pedestrians, road markings, signs, etc.).
2. **Generate exactly five (5) Q&A pairs** about the scene.
3. For each Q&A pair:
   - Provide a **Question** that a person might ask about the scene.
   - Identify and include the **Category Topic** for the question, based on the following predefined list:
     - Object Detection and Recognition
     - Spatial Relationships and Positioning
     - Traffic Rules and Compliance
     - Dynamic Elements and Movement
     - Scene Context and Prediction
     - Weather and Visibility Conditions
     - Road Infrastructure and Features
   - Provide a **Chain-of-Thought** (reasoning steps or thought process leading to the answer).
   - Provide an **Answer** that directly addresses the question.

4. **Format your output** in valid JSON, structured as follows:

{
  "qa_pairs": [
    {
      "category_topic": "Object Detection and Recognition",
      "question": "Question 1",
      "chain_of_thought": "Reasoning steps.",
      "answer": "final answer."
    },
    {
      "category_topic": "Spatial Relationships and Positioning",
      "question": "Question 2",
      "chain_of_thought": "...",
      "answer": "..."
    },
    ...
  ]
}

Make sure each "chain_of_thought" is clear, and each "answer" is accurate. Do not include extra commentary beyond these fields.
"""

> Define the LLM invocation helper using litellm

In [None]:
from litellm import completion
import time

def invoke_llm(message: list[dict], model: str, temperature=0.7, max_attempts: int = 2, retry_delay: float = 10):
    """Send a chat request through litellm and return the text of the first choice.

    The call is attempted up to ``max_attempts`` times. Every failure is printed,
    and the function sleeps ``retry_delay`` seconds before the next attempt.
    The defaults (2 attempts, 10 s pause) reproduce the previous hard-coded
    behaviour.

    Args:
        message: Chat messages forwarded as ``messages`` to ``completion``.
        model: Model identifier forwarded to ``completion``.
        temperature: Sampling temperature forwarded to ``completion``.
        max_attempts: Total number of attempts; values below 1 are treated as 1.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The ``content`` string of the first choice, or a fixed error sentence
        when every attempt raised. No exception is propagated to the caller.
    """
    last_attempt = max(1, max_attempts)

    for attempt in range(1, last_attempt + 1):
        try:
            response = completion(
                model=model,
                messages=message,
                temperature=temperature,
                retry_strategy="exponential_backoff_retry",
            )
            return response["choices"][0]["message"]["content"]
        except Exception as e:
            print(f"An error occurred during LLM invocation: {e}")
            # Only pause when another attempt will follow.
            if attempt < last_attempt:
                time.sleep(retry_delay)

    return "An error occurred while processing your request. Please try again later."


# 4. Prompt LLM for Constructing QA Pairs ❓

In [None]:
import os
import base64
import json
from tqdm import tqdm
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Model identifier handed to invoke_llm for every image
model = "gpt-4o"

# Accumulators filled while the rows are processed:
# parsed QA records, (index, message) error tuples, and raw LLM responses
parsed_data, errors, responses = [], [], []

# Folder holding the image_<idx>.png inputs
image_directory = 'images/'

# Function to process each row
def process_row(idx, row):
    """Ask the LLM for QA pairs about image ``idx`` and flatten them into records.

    The image ``image_<idx>.png`` is read from ``image_directory``, base64-encoded
    into a data URL, and sent together with ``PROMPT`` through ``invoke_llm``.
    The reply is stripped of Markdown code fences, parsed as JSON, and every
    entry of its ``qa_pairs`` list becomes one record.

    Args:
        idx: Row index; also used to build the image file name.
        row: The DataFrame row for ``idx`` (currently unused).

    Returns:
        A dict with three keys:
        ``data`` is a list of records (one per QA pair), or None on failure;
        ``error`` is None on success, otherwise ``(idx, message)``;
        ``response`` is ``{'index': idx, 'response': <raw text>}`` whenever the
        LLM was reached, even if parsing failed afterwards, otherwise None.
        Exceptions are caught and reported through ``error``.
    """
    # Kept outside the try so a reply that fails to parse is still returned.
    response_entry = None

    try:
        image_path = os.path.join(image_directory, f'image_{idx}.png')
        with open(image_path, 'rb') as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode('utf-8')
        image_base64_url = f"data:image/png;base64,{image_base64}"

        message_template = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": PROMPT
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_base64_url
                        }
                    }
                ]
            }
        ]

        # Call the LLM
        llm_response = invoke_llm(message=message_template, model=model)

        # Store the LLM response
        response_entry = {'index': idx, 'response': llm_response}

        # Drop Markdown code fences before parsing the JSON payload
        x = llm_response.replace('```json', '').replace('```', '')
        raw_json = json.loads(x)['qa_pairs']

        # One flat record per QA pair. PROMPT asks for "chain_of_thought";
        # "thinking" is still accepted as an alternative key.
        row_data = [
            {
                'file_name': f'image_{idx}.png',
                'image_id': idx,
                'category_topic': qa.get('category_topic'),
                'question': qa.get('question'),
                'thinking': qa.get('chain_of_thought', qa.get('thinking')),
                'answer': qa.get('answer'),
            }
            for qa in raw_json
        ]

        return {'data': row_data, 'error': None, 'response': response_entry}

    except Exception as e:
        print(f"Error invoking LLM for index {idx}: {e}")
        return {'data': None, 'error': (idx, str(e)), 'response': response_entry}

# Upper bound on concurrent process_row calls (tune as needed)
num_workers = 8

# Fan the rows out over a thread pool and collect each result as soon as it finishes.
# NOTE(review): `df2` is not defined in any visible cell; confirm which subset of the
# annotation frame it is meant to be, otherwise this raises NameError on a fresh kernel.
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    future_to_idx = {
        executor.submit(process_row, idx, row): idx
        for idx, row in df2.iterrows()
    }

    for future in tqdm(as_completed(future_to_idx), total=len(future_to_idx)):
        result = future.result()
        rows, failure, raw = result.get('data'), result.get('error'), result.get('response')
        if rows:
            parsed_data.extend(rows)
        if failure:
            errors.append(failure)
        if raw:
            responses.append(raw)

# Build the QA table from the collected records
# (process_row already puts `file_name` into every record)
qa_df2 = pd.DataFrame(data=parsed_data)

# Preview the first rows
qa_df2.head()


100%|██████████| 10/10 [00:10<00:00,  1.09s/it]


Unnamed: 0,file_name,image_id,conversation
0,image_5.png,5,[{'category_topic': 'Object Detection and Reco...
1,image_7.png,7,[{'category_topic': 'Object Detection and Reco...
2,image_2.png,2,[{'category_topic': 'Object Detection and Reco...
3,image_1.png,1,[{'category_topic': 'Object Detection and Reco...
4,image_4.png,4,[{'category_topic': 'Object Detection and Reco...


In [None]:
# The QA table is flat (one row per question), so there is no `conversation` column.
# Show every QA record generated for the first image in the table instead.
first_image_id = qa_df2['image_id'].iloc[0]
qa_df2.loc[
    qa_df2['image_id'] == first_image_id,
    ['category_topic', 'question', 'thinking', 'answer'],
].to_dict('records')

[{'category_topic': 'Object Detection and Recognition',
  'question': 'What types of vehicles are present in the scene?',
  'chain_of_thought': 'I observe a parked car on the left side of the street and several bicycles positioned near the buildings. This indicates the presence of both a motor vehicle and bicycles.',
  'answer': 'There is a parked car and several bicycles.'},
 {'category_topic': 'Spatial Relationships and Positioning',
  'question': 'Where are the pedestrians located in relation to the road?',
  'chain_of_thought': 'The pedestrians are positioned on the road, with some on the left side and others walking towards the camera, indicating they are crossing the street.',
  'answer': 'The pedestrians are on the road, some crossing towards the camera.'},
 {'category_topic': 'Traffic Rules and Compliance',
  'question': 'Are the pedestrians following traffic rules in this scene?',
  'chain_of_thought': 'The pedestrians are walking in a crosswalk area, which suggests they are a

In [None]:

# Write the QA table to disk in CSV format.
# NOTE(review): the target path has no `.csv` extension and the DataFrame index is written
# as an extra column; confirm whether the later imagefolder load expects
# `images/metadata.csv` written with `index=False`.
qa_directory = './images/hazardqa'

qa_df2.to_csv(path_or_buf=qa_directory)

# Publish QA Dataset to Hugging Face 🤗

In [None]:
import os

from huggingface_hub import login

# google.colab only exists inside Colab; tolerate its absence so this cell also runs locally.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

# Prefer the HF_API_KEY environment variable; fall back to a Colab secret of the same name.
hf_token = os.getenv('HF_API_KEY')
if not hf_token and userdata is not None:
    try:
        hf_token = userdata.get('HF_API_KEY')
    except Exception as exc:
        print(f"Could not read HF_API_KEY from Colab secrets: {exc}")

# Without a token, login() falls back to its interactive prompt.
login(hf_token)

In [None]:
import random
import polars as pl
import datasets

# 0. Load the QA dataset so this cell does not depend on a variable created later in the
#    notebook (same call as the pandas-based split cell below).
dataset = datasets.load_dataset("imagefolder", data_dir='images/')['train']

# 1. Convert the HF dataset to Polars
df = dataset.to_polars()

# 2. Extract unique image_ids.
#    Polars' unique() does not guarantee any order, so sort first; otherwise the seeded
#    shuffle below would start from a different order on every run.
unique_ids = df.select(pl.col("image_id").unique().sort()).to_series().to_list()

# 3. Shuffle and create splits (train/val/test) at image level, so every QA row of an
#    image ends up in the same split
random.seed(42)
random.shuffle(unique_ids)

total_ids = len(unique_ids)
train_size = int(0.7 * total_ids)
val_size = int(0.15 * total_ids)
test_size = total_ids - train_size - val_size  # remainder after the two truncated sizes

train_ids = set(unique_ids[:train_size])
val_ids   = set(unique_ids[train_size:train_size + val_size])
test_ids  = set(unique_ids[train_size + val_size:])

# 4. Filter the Polars DataFrame by each split
df_train = df.filter(pl.col("image_id").is_in(train_ids))
df_val   = df.filter(pl.col("image_id").is_in(val_ids))
df_test  = df.filter(pl.col("image_id").is_in(test_ids))

# 5. Convert filtered Polars DataFrame back to Hugging Face Datasets
train_dataset = datasets.Dataset.from_pandas(df_train.to_pandas())
val_dataset   = datasets.Dataset.from_pandas(df_val.to_pandas())
test_dataset  = datasets.Dataset.from_pandas(df_test.to_pandas())

# 6. Create the DatasetDict
split_dataset = datasets.DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# 7. The three id sets are consecutive slices of one list of unique ids, so they cannot
#    overlap and together cover every id.

print(split_dataset)


NameError: name 'dataset' is not defined

In [None]:
import datasets
import random

qa_directory = 'images/'

# Load the dataset
dataset = datasets.load_dataset("imagefolder", data_dir=qa_directory)['train']

# Step 1: Extract unique 'image_id's as a plain Python list
# (order of first appearance, so the seeded shuffle below is reproducible)
df = dataset.to_pandas()
unique_ids = df['image_id'].unique().tolist()

# Step 2: Shuffle and split 'image_id's so that all QA rows of an image share a split
random.seed(42)  # Set seed for reproducibility
random.shuffle(unique_ids)

total_ids = len(unique_ids)
train_size = int(0.7 * total_ids)
val_size = int(0.15 * total_ids)
test_size = total_ids - train_size - val_size  # remainder after the two truncated sizes

train_ids = unique_ids[:train_size]
val_ids = unique_ids[train_size:train_size+val_size]
test_ids = unique_ids[train_size+val_size:]

# Step 3: Verify no overlaps and completeness before doing the expensive filtering
train_set = set(train_ids)
val_set = set(val_ids)
test_set = set(test_ids)

assert train_set.isdisjoint(val_set), "Train and validation sets have overlapping image_ids."
assert train_set.isdisjoint(test_set), "Train and test sets have overlapping image_ids."
assert val_set.isdisjoint(test_set), "Validation and test sets have overlapping image_ids."

total_unique = set(unique_ids)
splits_union = train_set.union(val_set).union(test_set)
assert splits_union == total_unique, "Some image_ids are missing from the splits."

# Step 4: Filter the original dataset.
# Sets give constant-time membership tests, and input_columns hands only `image_id`
# to the predicate instead of the whole row.
train_dataset = dataset.filter(lambda image_id: image_id in train_set, input_columns=['image_id'])
val_dataset = dataset.filter(lambda image_id: image_id in val_set, input_columns=['image_id'])
test_dataset = dataset.filter(lambda image_id: image_id in test_set, input_columns=['image_id'])

# Create a DatasetDict
split_dataset = datasets.DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Print the resulting DatasetDict
print(split_dataset)


Resolving data files:   0%|          | 0/17785 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/17784 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [None]:
# Upload all three splits to the Hugging Face Hub dataset repository
split_dataset.push_to_hub(repo_id="Tami3/HazardQA-Reasoning-v0.3")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/1662 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Map:   0%|          | 0/1662 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Map:   0%|          | 0/1661 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1065 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1075 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Tami3/HazardQA-Reasoning-v0.3/commit/78b451db48bd08c8d13fa4bc59c85b92c96be207', commit_message='Upload dataset', commit_description='', oid='78b451db48bd08c8d13fa4bc59c85b92c96be207', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Tami3/HazardQA-Reasoning-v0.3', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Tami3/HazardQA-Reasoning-v0.3'), pr_revision=None, pr_num=None)