In [1]:
# === Config & imports ===
import os, io, json, time, random, traceback, base64
from pathlib import Path
import requests
from typing import List, Dict, Any

# Paths (relative to this notebook's folder)
NB_DIR = Path.cwd()
VLAT_JSON = (NB_DIR.parent / 'data' / 'VLAT' / 'vlat_skip.json')
CALVI_JSON = (NB_DIR.parent / 'data' / 'CALVI' / 'calvi.json')

# Ollama chat endpoint / model
API_URL = 'http://127.0.0.1:11434/api/chat'  # instead of localhost
REQUEST_TIMEOUT = 600  # seconds; first few calls can be slow
MODEL   = 'llava:7b'

# Experiment options
NUM_RUNS     = 3         # number of repetitions over the dataset
MAX_RETRIES  = 3         # retries for transient API errors
RETRY_DELAY  = 2         # seconds
TEMPERATURE  = 0.0       # deterministic
MAX_TOKENS   = 300       # response cap (if your server honors it)

# Prompts
VLAT_PROMPT = (
"""I am about to show you an image and ask you a multiple choice question about that image. 
Please structure your response in the following format:
Answer: [Enter the exact text of your chosen option]
Explanation: [Provide your reasoning]
Select the BEST answer, based only on the chart and not external knowledge. DO NOT GUESS.
If you are not sure about your answer or your answer is based on a guess, select "Omit".
Choose your answer ONLY from the provided options."""
)
CALVI_PROMPT = (
"""I am about to show you an image and ask you a multiple choice question about that image. 
Please structure your response in the following format:
Answer: [Enter the exact text of your chosen option(s)]
Explanation: [Provide your reasoning]
Select the BEST answer, based only on the chart and not external knowledge.
Choose your answer ONLY from the provided options."""
)

print('Notebook directory:', NB_DIR)
print('VLAT JSON expected at:', VLAT_JSON)
print('CALVI JSON expected at:', CALVI_JSON)


Notebook directory: C:\Users\Melita\CSE 4001\VLM-Eval-Research\scripts
VLAT JSON expected at: C:\Users\Melita\CSE 4001\VLM-Eval-Research\data\VLAT\vlat_skip.json
CALVI JSON expected at: C:\Users\Melita\CSE 4001\VLM-Eval-Research\data\CALVI\calvi.json


In [2]:
# --- CSV helper: minimal, single-row append with header on first write ---
import os, csv

def append_to_csv(csv_path, data_dict, fieldnames=None):
    """
    Append one row (dict) to a CSV.
    - Creates file with header on first write.
    - Uses utf-8 and newline='' to avoid blank lines on Windows.
    - fieldnames: optional explicit column order. Defaults to data_dict.keys().
    """
    file_exists = os.path.exists(csv_path)
    with open(csv_path, mode="a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=(fieldnames or list(data_dict.keys())))
        if not file_exists:
            writer.writeheader()
        writer.writerow(data_dict)

In [3]:
# === Utilities: ping server, image encoding, robust JSON loading ===
def ping_ollama(url: str = API_URL) -> bool:
    try:
        r = requests.get(url.replace('/api/chat','/api/tags'), timeout=5)
        return r.ok
    except Exception:
        return False

def b64_from_file(path: Path) -> str:
    with open(path, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')

def resolve_image_paths(json_path: Path, questions: List[Dict[str,Any]]) -> List[Dict[str,Any]]:
    base = json_path.parent
    for q in questions:
        raw = q.get('image_path', '')
        p = Path(raw)
        if not p.is_absolute():
            candidates = [
                base / p,                       # same folder as JSON
                base / 'images' / p.name,       # a common layout
                base.parent / p,                # one level up
                Path.cwd() / p                  # notebook CWD (fallback)
            ]
            chosen = None
            for c in candidates:
                if c.exists():
                    chosen = c
                    break
            if chosen is None:
                # keep best-guess (relative to JSON folder) so errors are informative
                chosen = (base / p)
            q['image_path'] = str(chosen)
        else:
            q['image_path'] = str(p)
    return questions

def load_questions_file(file_path: Path) -> List[Dict[str,Any]]:
    file_path = Path(file_path)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # expected schema: top-level key 'questions'
    items = data.get('questions', data)
    if not isinstance(items, list):
        raise ValueError('Expected a list under key "questions" or the file to be a list of items.')
    return resolve_image_paths(file_path, items)

def call_llava(image_path: Path, prompt: str) -> str:
    # encode image
    img_b64 = b64_from_file(image_path)
    payload = {
        'model': MODEL,
        'messages': [
            {
                'role': 'user',
                'content': prompt,
                'images': [img_b64]
            }
        ],
        'stream': False,
        'options': {
            'temperature': TEMPERATURE,
        }
    }
    for attempt in range(1, MAX_RETRIES+1):
        try:
            r = requests.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            data = r.json()
            return data.get('message',{}).get('content','')
        except Exception as e:
            if attempt == MAX_RETRIES:
                raise
            time.sleep(RETRY_DELAY)
    return ''


In [4]:
# === Sanity checks ===
print('Ollama running? ->', ping_ollama())
print('VLAT JSON exists? ->', VLAT_JSON.exists())
print('CALVI JSON exists? ->', CALVI_JSON.exists())

# If these return False, fix the paths above or move your files accordingly.


Ollama running? -> True
VLAT JSON exists? -> True
CALVI JSON exists? -> True


In [5]:
# === Peek at the first question and verify its image path resolution ===
try:
    vlat_qs = load_questions_file(VLAT_JSON)
    if not vlat_qs:
        print('VLAT has 0 questions!')
    else:
        q0 = vlat_qs[0]
        print('Sample VLAT item:')
        for k in ['id','question','image_path']:
            print(' ', k+':', q0.get(k))
        print('Image exists? ->', Path(q0['image_path']).exists())
except Exception as e:
    print('Error loading VLAT:', e)

try:
    calvi_qs = load_questions_file(CALVI_JSON)
    if not calvi_qs:
        print('CALVI has 0 questions!')
    else:
        q0 = calvi_qs[0]
        print('Sample CALVI item:')
        for k in ['id','question','image_path']:
            print(' ', k+':', q0.get(k))
        print('Image exists? ->', Path(q0['image_path']).exists())
except Exception as e:
    print('Error loading CALVI:', e)


Sample VLAT item:
  id: 1
  question: What was the price of a barrel of oil in February 2015?
  image_path: C:\Users\Melita\CSE 4001\VLM-Eval-Research\data\VLAT\images\LineChart.png
Image exists? -> True
Sample CALVI item:
  id: 43
  question: What is the trend sales in gift shop A from Jan to Dec?
  image_path: C:\Users\Melita\CSE 4001\VLM-Eval-Research\data\CALVI\images\question43.png
Image exists? -> True


In [6]:

# === Main experiment runner ===
import csv
from time import perf_counter

def _opt_str(opts):
    """Convert list/dict options into a readable string for CSV."""
    if isinstance(opts, (list, tuple)):
        return " | ".join(str(x) for x in opts)
    if isinstance(opts, dict):
        return " | ".join(f"{k}) {v}" for k, v in opts.items())
    return "" if opts is None else str(opts)

def run_experiment(name: str, dataset_path: Path, prompt: str, out_dir: Path = NB_DIR / "results"):
    out_dir.mkdir(parents=True, exist_ok=True)
    questions = load_questions_file(dataset_path)
    print(f"Starting {name} with {len(questions)} questions...")

    for run_idx in range(1, NUM_RUNS + 1):
        log_path = out_dir / f"{name}_run{run_idx}.csv"
        print(f"\n--- Run {run_idx}/{NUM_RUNS} --- writing to {log_path}")

        with open(log_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(
                f,
                fieldnames=[
                    "test_name",
                    "question",
                    "options",
                    "correct_answer",
                    "llava_answer",
                    "raw_response",
                    "misleader",
                    "is_correct",
                    "elapsed_seconds",
                ],
            )
            writer.writeheader()

            for i, q in enumerate(questions, 1):
                qid = q.get("id", f"{name}_{i}")
                img = Path(q.get("image_path", ""))
                question_text = q.get("question", "")
                options = q.get("options", q.get("choices", ""))
                correct = q.get("correct_answer", q.get("answer", ""))
                misleader = q.get("misleader", "")

                try:
                    # --- measure LLaVA call time ---
                    t0 = perf_counter()
                    raw_response = call_llava(img, prompt + "\n\nQuestion: " + question_text)
                    elapsed = round(perf_counter() - t0, 3)

                    # Use raw text as llava_answer (or plug in your parser if you have one)
                    llava_answer = raw_response.strip()
                    is_correct = str(llava_answer).strip().lower() == str(correct).strip().lower()

                    writer.writerow({
                        "test_name": name,
                        "question": question_text,
                        "options": _opt_str(options),
                        "correct_answer": correct,
                        "llava_answer": llava_answer,
                        "raw_response": raw_response,
                        "misleader": misleader,
                        "is_correct": is_correct,
                        "elapsed_seconds": elapsed,
                    })

                except Exception as e:
                    # Still log the row to keep format consistent
                    writer.writerow({
                        "test_name": name,
                        "question": question_text,
                        "options": _opt_str(options),
                        "correct_answer": correct,
                        "llava_answer": "",
                        "raw_response": f"[exception] {e}",
                        "misleader": misleader,
                        "is_correct": False,
                        "elapsed_seconds": None,
                    })
                    print("Error:", e)

        print(f"✓ Done writing {log_path}")

    print("All runs completed for", name)


In [7]:
import requests, base64, json
# simple empty chat to check response
test = requests.post(
    'http://127.0.0.1:11434/api/chat',
    json={"model":"llava:7b","messages":[{"role":"user","content":"hi im Melita how are you"}]},
    timeout=60
)
print(test.status_code, test.text)

200 {"model":"llava:7b","created_at":"2025-10-08T19:48:17.4829704Z","message":{"role":"assistant","content":" Hello"},"done":false}
{"model":"llava:7b","created_at":"2025-10-08T19:48:17.7673098Z","message":{"role":"assistant","content":" Mel"},"done":false}
{"model":"llava:7b","created_at":"2025-10-08T19:48:17.8683749Z","message":{"role":"assistant","content":"ita"},"done":false}
{"model":"llava:7b","created_at":"2025-10-08T19:48:17.9655613Z","message":{"role":"assistant","content":"!"},"done":false}
{"model":"llava:7b","created_at":"2025-10-08T19:48:18.0646795Z","message":{"role":"assistant","content":" I"},"done":false}
{"model":"llava:7b","created_at":"2025-10-08T19:48:18.1608062Z","message":{"role":"assistant","content":"'"},"done":false}
{"model":"llava:7b","created_at":"2025-10-08T19:48:18.2581237Z","message":{"role":"assistant","content":"m"},"done":false}
{"model":"llava:7b","created_at":"2025-10-08T19:48:18.3541425Z","message":{"role":"assistant","content":" just"},"done":fals

In [8]:
# === Run experiments ===
run_experiment('VLAT', VLAT_JSON, VLAT_PROMPT)
run_experiment('CALVI', CALVI_JSON, CALVI_PROMPT)

Starting VLAT with 53 questions...

--- Run 1/3 --- writing to C:\Users\Melita\CSE 4001\VLM-Eval-Research\scripts\results\VLAT_run1.csv


KeyboardInterrupt: 