In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project source take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [9]:
import os
import sys

# Resolve the project root (the parent of the current working directory) and
# make it importable. The membership check keeps this cell idempotent:
# re-running it does not append a duplicate entry to sys.path.
APP_ROOT = os.path.abspath('..')
if APP_ROOT not in sys.path:
    sys.path.append(APP_ROOT)

from chart_reasoning.pipeline import ChartReasoningPipeline

# Input data and output locations, both relative to the project root.
vistext_data_dir = os.path.join(APP_ROOT, 'data', 'vistext-data')
output_dir = os.path.join(APP_ROOT, 'output', 'chart-reasoning-trail-run')

# Pipeline restricted to the listed chart types.
pipeline = ChartReasoningPipeline(
    vistext_data_dir,
    output_dir,
    chart_type_list=['unaligned_rule', 'color'],
)
# Note: the `table` chart type needs an executable Chrome in the system.

In [3]:
# Prepare the VisText data via the pipeline. The recorded run of this cell
# processed 8822 items in roughly one minute.
pipeline.refine_vistext_data()

100%|██████████| 8822/8822 [01:07<00:00, 130.53it/s]


In [10]:
# Task generation stage, one sample per pass.
# Raise the range() argument to run more passes; the original note says extra
# passes are meant for cases skipped because of errors (presumably they are
# retried on a later pass -- confirm against the pipeline implementation).
for _pass_index in range(1):
    pipeline.task_generation(sample_size=1)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  5.99it/s]


In [11]:
# Visual chart reasoning stage.
# Raise the range() argument to run more passes; the original note says extra
# passes are meant for cases skipped because of errors (presumably they are
# retried on a later pass -- confirm against the pipeline implementation).
for _pass_index in range(1):
    pipeline.visual_chart_reasoning()

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:09<00:00,  9.82s/it]


In [13]:
# Grading stage using the text agent.
# Raise the range() argument to run more passes; the original note says extra
# passes are meant for cases skipped because of errors (presumably they are
# retried on a later pass -- confirm against the pipeline implementation).
for _pass_index in range(1):
    pipeline.grade_with_text_agent()

100%|██████████| 2/2 [00:00<00:00, 543.34it/s]


In [None]:
# Write the graded results to a Markdown file for easy viewing.
pipeline.graded_output_to_md()

# Evaluate GPT-4o

In [None]:
from tqdm import tqdm
import json
from copy import deepcopy

# Collect the text-agent grading results, grouped by chart type.
#
# Produces two dicts keyed by chart type:
#   text_student_judgement_list[chart_type]
#       -> list of lower-cased correctness labels, one per graded question
#   text_student_answer_correctness_dict_by_chart_type[chart_type][label]
#       -> list of ("<task_id>_<question_index>", task_type) tuples

# code_graded_dir = os.path.join(output_dir, '06-code-assistant-grading-output')
text_graded_dir = os.path.join(output_dir, '04-text-grading-output')

# Alternative ways to obtain the task ids: list the grading output directories.
# code_graded_task_ids = [fname.split(".")[0] for fname in os.listdir(code_graded_dir) if fname.endswith('.grade.json')]
# graded_task_ids = [fname.split(".")[0] for fname in os.listdir(text_graded_dir) if fname.endswith('.grade.json')]

# Task ids are read from a fixed list, one id per line. The context manager
# closes the file handle (the previous bare open() never did).
with open("reported_ids_1000.txt", "r") as id_file:
    graded_task_ids = [one_id.strip() for one_id in id_file]

# Chart types to evaluate; switch the active line to select an experiment.
# NOTE(review): a grade file is looked up per (task id, chart type), so these
# must be chart types that were actually graded into text_graded_dir;
# otherwise every lookup below reports "File not found".
# RQ2 exp1
chart_types = ['line', 'scatter', 'bar']
# RQ2 exp2
# chart_types = ['pie', 'table', 'bar_anno', 'line_anno', 'scatter_anno']

# RQ1 exp1
# chart_types = ['unaligned_rule', 'color', 'size', 'scatter']
# RQ1 exp2
# chart_types = ['rule', 'scatter_size', 'bar', 'bar_color']


text_student_answer_correctness_dict_by_chart_type = {chart_type: dict() for chart_type in chart_types}
text_student_judgement_list = {chart_type: [] for chart_type in chart_types}

# De-duplicate the ids, drop blank lines, and sort so that the iteration order
# (and therefore the order of the collected examples) is reproducible.
valid_task_ids = sorted({one_id for one_id in graded_task_ids if one_id})
for task_id in tqdm(valid_task_ids):
    for chart_type in chart_types:
        text_graded_task_file = os.path.join(text_graded_dir, f'{task_id}.{chart_type}.grade.json')

        if not os.path.exists(text_graded_task_file):
            print(f"File not found: {text_graded_task_file}")
            continue

        with open(text_graded_task_file, 'r') as f:
            text_graded_task = json.load(f)

        for question_index, question_dict in enumerate(text_graded_task):
            # Normalise the label once and use it both as the list entry and
            # as the bucket key, so the lookup and the stored key always agree.
            judgement = question_dict['student_answer_correctness'].lower()
            text_student_judgement_list[chart_type].append(judgement)

            # setdefault() creates the bucket on first sight AND lets us append
            # in the same step; previously the first example of every label
            # was dropped because only the `else` branch appended.
            text_student_answer_correctness_dict_by_chart_type[chart_type].setdefault(judgement, []).append(
                (question_dict['task_id'] + "_" + str(question_index), question_dict['task_type'])
            )

In [None]:
# Print, for each chart type, the distribution of correctness labels, and
# collect the percentage of 'correct' answers into correct_list.
# correct_list always receives exactly one entry per chart type (0.0 when there
# is no 'correct' bucket or nothing was graded), so it stays positionally
# aligned with chart_types.
correct_list = []
for chart_type in chart_types:
    buckets = text_student_answer_correctness_dict_by_chart_type[chart_type]

    print(f"Chart Type: {chart_type}")
    print("Total number of questions: ", len(text_student_judgement_list[chart_type]))
    print("Correctness Distribution:")

    total = sum(len(id_list) for id_list in buckets.values())
    for key, id_list in buckets.items():
        # Guard against total == 0 (buckets present but all empty).
        share = round(len(id_list) / total * 100, 2) if total else 0.0
        print(key, ":", len(id_list), f"({share}%)")

    correct_cnt = len(buckets.get('correct', []))
    correct_list.append(round(correct_cnt / total * 100, 2) if total else 0.0)
    print("\n\n")


print("Correctness:", correct_list)


In [None]:


# Per-task-type accuracy: for every task type, print one ';'-separated value
# per chart type (in chart_types order) giving the percentage of that task
# type's questions that were graded 'correct'.
task_types = ['Find Anomalies', 'Find Correlation', 'Determine Range', 'Order', 'Filter', 'Compute Derived Value', 'Find Extremum', 'Retrieve Value', 'Find Clusters', 'Characterize Distribution']

for task_type in task_types:
    print(f"Task Type: {task_type}")
    for chart_type in chart_types:
        buckets = text_student_answer_correctness_dict_by_chart_type[chart_type]

        # Each example is a (question id, task type) tuple; example[1] is the
        # task type. .get() tolerates chart types with no 'correct' bucket.
        correct_cnt = sum(1 for example in buckets.get('correct', []) if example[1] == task_type)

        # Count the examples of this task type across all correctness buckets.
        all_example_cnt = sum(
            1
            for example_list in buckets.values()
            for example in example_list
            if example[1] == task_type
        )

        if all_example_cnt:
            print(f"{round(correct_cnt/all_example_cnt * 100, 2)}", end=';')
        else:
            # No example of this task type for this chart type: the ratio is
            # undefined, so emit a placeholder instead of dividing by zero.
            print("N/A", end=';')
    print("\n\n")