In [1]:
import os
import json
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib.gridspec as gridspec
from math import pi



# Global matplotlib defaults applied to every figure drawn after this cell runs.
rcParams['figure.dpi'] = 500
rcParams['savefig.dpi'] = 500
rcParams['font.family'] = 'Arial'
rcParams['axes.labelsize'] = 18
rcParams['axes.titlesize'] = 18
rcParams['legend.fontsize'] = 18
rcParams['figure.titlesize'] = 18
rcParams['markers.fillstyle'] = 'none'


# Map each problem code to its complexity class.
# Used by yield_data to fill the "problem_type" column.
# The raw results use the code 'BSP' for the SAS problem; the rename to 'SAS'
# happens only after the data has been loaded and summarized, so the raw code
# is the key here.
problem_nphard_mapper = {
    'SPP': 'p', 'MFP': 'p', 'BSP': 'p', 'EDP': 'p',
    'TSP_D': 'np-cmp', 'GCP_D': 'np-cmp', 'KSP': 'np-cmp',
    'TSP': 'np-hard', 'GCP': 'np-hard', 'MSP': 'np-hard',
}

# Sort order of the complexity classes (passed to plot_line_chart as x_order_mapper).
nphard_order_mapper = {'np-hard': 2, 'np-cmp': 1, 'p': 0}


# Order of the problems in the graphs; plot_heatmap uses value + 1 as the
# subplot position inside its 3x3 grid.
problem_order_mapper = {
    'GCP': 0, 'TSP': 1, 'MSP': 2, 'GCP_D': 3, 'TSP_D': 4, 'KSP': 5, 'SAS': 6, 'EDP': 7, 'SPP': 8
}

# Order of the models in the graphs (also covers the aggregate legend entries
# "Close models", "Open models" and "All models").
model_order_mapper = {"Gemini": 0, "GPT4V": 1, "LLaVa":2, "Otter":3 ,"Qwen-VL": 4, 
                       "CogVLM":5 ,"BLIP-2": 6, "Fuyu-8b": 7, "Kosmos2": 8,
                       "Close models":9, "Open models" : 10, "All models":11}

# Names of the result sub-directories: <DATA_PATH>/<model type>/<prompt type>.
model_types = ["Open", "Close"]
prompt_types = ["figure+full_text", "figure+limited_text", "full_text_only", "recognition"]



def yield_data(data):
    """Expand one summarized experiment record into one row per difficulty level.

    Parameters
    ----------
    data : dict
        A record with the scalar keys "model_name", "problem_name",
        "model_type" and "prompt_type", plus the per-level sequences
        "recognized", "parsable", "correctness", "recognized_and_parsable"
        and "level" (all indexed by the same level position).

    Returns
    -------
    list of dict
        One flat row per entry of ``data["level"]``; "model_type" is encoded as
        1 for a close model and 0 for an open model.

    Raises
    ------
    KeyError
        If a required key is missing or the problem name has no entry in
        ``problem_nphard_mapper``.
    """
    model_name = data["model_name"]
    problem_name = data["problem_name"]
    model_type = data["model_type"]
    prompt_type = data["prompt_type"]

    # Fail with a message that names the offending record instead of printing
    # debug output and then hitting a bare KeyError further down.
    if problem_name not in problem_nphard_mapper:
        raise KeyError(
            f"Unknown problem name {problem_name!r} "
            f"(model {model_name!r}, prompt type {prompt_type!r}); "
            f"expected one of {sorted(problem_nphard_mapper)}"
        )
    problem_type = problem_nphard_mapper[problem_name]

    # 1 is close model and 0 is open model
    model_type_flag = int(model_type == "Close")

    # One row per difficulty level; the number of levels follows the record
    # itself rather than a hard-coded 10.
    return [
        {
            "model_name": model_name,
            "problem_name": problem_name,
            "problem_type": problem_type,
            "model_type": model_type_flag,
            "prompt_type": prompt_type,
            "recognized": data["recognized"][i],
            "parsable": data["parsable"][i],
            "correctness": data["correctness"][i],
            "recognized_and_parsable": data["recognized_and_parsable"][i],
            "level": data["level"][i]
        } for i in range(len(data["level"]))
    ]

In [2]:
class ResultsInfo:
    """Load every result JSON under DATA_PATH and expose it as one dataframe.

    Expected layout: ``<DATA_PATH>/<model type>/<prompt type>/<problem>_<model>_results.json``.
    After construction ``self.dataframe`` holds one row per
    (model, problem, prompt type, difficulty level).
    """

    # Number of questions evaluated at each difficulty level; the flat result
    # lists are folded into rows of this length before being summed.
    QUESTIONS_PER_LEVEL = 10

    def __init__(self, DATA_PATH, model_types, prompt_types, level_range, save_path=None):
        """Discover the result directories, parse them and build the dataframe.

        Parameters
        ----------
        DATA_PATH : str
            Root directory of the results; must exist.
        model_types : iterable of str
            Candidate model-type sub-directories (e.g. "Open", "Close").
        prompt_types : iterable of str
            Candidate prompt-type sub-directories below each model type.
        level_range : sequence
            Difficulty level labels, stored unchanged as the "level" entry of
            every summarized record.
        save_path : str, optional
            When given, the final dataframe is written there as CSV.
        """
        assert os.path.exists(DATA_PATH), f"{DATA_PATH} does not exist"
        self.main_dir = DATA_PATH
        self.model_type_dirs = []
        self.prompt_type_dirs = []
        self.data = []
        self.level_range = level_range
        self.fetch_model_types(model_types)
        self.fetch_prompt_types(prompt_types)
        self.fetch_exist_results()
        self.dataframe = None
        self.convert_to_dataframe()
        self.rename_model_prompt_type()
        # save the dataframe
        if save_path:
            self.dataframe.to_csv(save_path, index=False)

    def fetch_model_types(self, model_types):
        """Record the model type directories (Open or Close) that exist on disk."""
        for model_type in model_types:
            if os.path.exists(os.path.join(self.main_dir, model_type)):
                self.model_type_dirs.append(model_type)

    def fetch_prompt_types(self, prompt_types):
        """For each model type, record the prompt type directories that exist."""
        for model_type_dir in self.model_type_dirs:
            for prompt_type in prompt_types:
                datapath = os.path.join(self.main_dir, model_type_dir, prompt_type)
                if os.path.exists(datapath):
                    self.prompt_type_dirs.append((model_type_dir, prompt_type))

    def fetch_exist_results(self):
        """Iterate through the directories and load every ``*.json`` result file."""
        for model_type, prompt_type in tqdm(self.prompt_type_dirs):
            datapath = os.path.join(self.main_dir, model_type, prompt_type)
            # os.listdir order is arbitrary; sort so that the row order of the
            # dataframe (and of the saved CSV) is reproducible across runs.
            for file in sorted(os.listdir(datapath)):
                if file.endswith(".json"):
                    self.fetch_json_result(file, model_type, prompt_type, datapath)

    def fetch_json_result(self, file, model_type, prompt_type, datapath):
        """Load one JSON file, summarize it and append the record to ``self.data``.

        Errors are reported and the file is skipped (deliberate best-effort),
        so that a single run lists every problematic file.
        """
        metadata = None
        try:
            # Parsing the name is inside the try block so that a JSON file that
            # does not follow the naming scheme is reported, not fatal.
            metadata = self.parse_json_filename(file)
            metadata.update({"model_type": model_type, "prompt_type": prompt_type})

            with open(os.path.join(datapath, file), "r", encoding="utf-8") as f:
                data = json.load(f)

            if prompt_type == "recognition":
                data_info = self.recognition_data_info_extractor(data, metadata)
            else:
                data_info = self.data_info_extractor(data, metadata)
            data_info.update(metadata)
            self.data.append(data_info)
        except Exception as e:
            # continue to run and find all the errored files
            print(f"Error in fetching {file}: {e}")
            print(f"metadata: {metadata}")
            print()

    def parse_json_filename(self, filename):
        """Extract the model and problem names from a result filename.

        Assumes the naming ``<question name>_<model name>_results.json`` where
        the question name is ``<question name>`` or ``<question name>_D``.

        Returns
        -------
        dict
            ``{"model_name": ..., "problem_name": ...}``; BLIP and Gemini
            spellings are normalized here, the remaining models are renamed by
            ``rename_model_prompt_type``.
        """
        parts = filename.split(".")[0].split("_")
        raw_model_name = parts[-2]
        raw_problem_name = "_".join(parts[:-2])

        # rename blip and gemini model
        if raw_model_name in ("Blip", "blip", "blip2"):
            raw_model_name = "BLIP-2"
        elif raw_model_name == "gemini":
            raw_model_name = "Gemini"
        return {"model_name": raw_model_name, "problem_name": raw_problem_name}

    def data_postprocessor(self, data, metadata):
        """Normalize raw result entries into a common format.

        Each kept entry gains the boolean fields "recognized" and "parsable",
        and "correctness" is reduced to its first element when it is a list.
        The input entries are left untouched; copies are returned.
        """
        standard_data = []
        for x in data:
            # The Gemini output may include an extra API-metadata dict; it is
            # identified by carrying both an "id" and an "object" key.
            # (`"id" and "object" in x` would only test "object".)
            # A valid result only contains "output", "correctness", "reasoning".
            if "id" in x and "object" in x:
                continue

            record = dict(x)
            correctness = record["correctness"]
            # the dtype of "correctness" could be bool or [bool, "comment on output"]
            # NOTE(review): a None correctness marks the entry as unrecognized but
            # is passed on unchanged, so data_summary will presumably fail to sum
            # it and the file gets skipped by fetch_json_result -- confirm intent.
            record["recognized"] = correctness is not None
            record["correctness"] = correctness[0] if isinstance(correctness, list) else correctness
            # when there is output, the result is regarded as parsable
            record["parsable"] = (record["output"] != "")

            standard_data.append(record)
        return standard_data

    def recognition_data_info_extractor(self, data, metadata):
        """Summarize a recognition result file.

        The "mean" field of every entry is summed per difficulty level
        (QUESTIONS_PER_LEVEL consecutive entries form one level) and the same
        per-level array is reported under all four statistic keys.
        """
        recognized_summary = (
            np.array([x["mean"] for x in data])
            .reshape(-1, self.QUESTIONS_PER_LEVEL)
            .sum(axis=-1)
        )
        return {
            "recognized": recognized_summary,
            "parsable": recognized_summary,
            "correctness": recognized_summary,
            "recognized_and_parsable": recognized_summary,
            "level": self.level_range
        }

    def data_info_extractor(self, data, metadata):
        """Normalize the raw entries of one file and summarize them per level."""
        standard_data = self.data_postprocessor(data, metadata)
        return self.data_summary(standard_data)

    def data_summary(self, data):
        """Count recognized / parsable / correct answers per difficulty level.

        ``data`` is a flat list of normalized entries; QUESTIONS_PER_LEVEL
        consecutive entries form one level.
        """
        per_level = (-1, self.QUESTIONS_PER_LEVEL)
        recognized_data = np.array([x["recognized"] for x in data]).reshape(per_level)
        parsable_data = np.array([x["parsable"] for x in data]).reshape(per_level)
        correctness_data = np.array([x["correctness"] for x in data]).reshape(per_level)

        # sum of stats of a question at a difficulty level
        return {
            "recognized": np.sum(recognized_data, axis=-1),
            "parsable": np.sum(parsable_data, axis=-1),
            "correctness": np.sum(correctness_data, axis=-1),
            "recognized_and_parsable": np.sum(recognized_data * parsable_data, axis=-1),
            "level": self.level_range
        }

    def convert_to_dataframe(self):
        """Flatten ``self.data`` with yield_data and store it as ``self.dataframe``."""
        try:
            all_data = [a for x in self.data for a in yield_data(x)]
            self.dataframe = pd.DataFrame(all_data)
        except Exception as e:
            # more informative error before propagating
            print("error in convert_to_dataframe")
            print(f"e is  {e}")
            print("all data:")
            print(self.data)
            # bare raise keeps the original traceback intact
            raise

    def rename_model_prompt_type(self):
        """Replace the raw model names in the dataframe with their display names."""
        model_replace_dict = {
            "fuyu": "Fuyu-8b",
            "qwen": "Qwen-VL",
            "gemini": "Gemini",
            "Blip2": "BLIP-2",
            "blip2": "BLIP-2",
            "Llava": "LLaVa",
            "llava": "LLaVa",
            "Cogvlm": "CogVLM",
            "cogvlm": "CogVLM",
            "Gpt4V": "GPT4V",
            "Gpt4v": "GPT4V",
            "gpt4v": "GPT4V",
            "kosmos2": "Kosmos2",
            "otter": "Otter"
        }

        self.dataframe["model_name"] = self.dataframe["model_name"].replace(model_replace_dict)


In [3]:
class SummarizeHelper:
    """Helper class to summarize the data to prepare for the visualization.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to summarize; it is never modified, every call works on a copy.
    summary_func : dict
        Maps a column name to the aggregation applied to it by ``summarize``.
    """
    def __init__(self, dataframe, summary_func):
        self.dataframe = dataframe
        self.summary_func = summary_func

    def summarize(self, summary_columns=None, groupby_columns=None, pre_func=None, post_func=None):
        """Run ``pre_func`` -> optional group-by aggregation -> ``post_func``.

        Parameters
        ----------
        summary_columns : list of str, optional
            Columns to aggregate. When omitted while ``groupby_columns`` is
            given, every column of ``summary_func`` that is present in the
            frame and is not a group key is aggregated.
        groupby_columns : list of str, optional
            Group keys. When omitted no aggregation takes place.
        pre_func, post_func : callable, optional
            ``frame -> frame`` hooks applied before / after the aggregation.

        Returns
        -------
        pandas.DataFrame
            With aggregation: an "index" column, the group keys and the
            aggregated columns. Without: the (pre/post-processed) copy.
        """
        dataframe = self.dataframe.copy()
        if pre_func:
            dataframe = pre_func(dataframe)
        if groupby_columns:
            groupby_columns = list(groupby_columns)
            if summary_columns is None:
                # Previously this combination crashed on `list + None`.
                summary_columns = [
                    col for col in self.summary_func
                    if col in dataframe.columns and col not in groupby_columns
                ]
            else:
                summary_columns = list(summary_columns)
            aggregations = {col: self.summary_func[col] for col in summary_columns}
            # The trailing reset_index() adds an "index" column because the
            # group keys are already columns (as_index=False); it is kept so
            # the returned shape and the saved CSVs stay unchanged.
            dataframe = (
                dataframe[groupby_columns + summary_columns]
                .groupby(groupby_columns, as_index=False)
                .agg(aggregations)
                .reset_index()
            )
        if post_func:
            dataframe = post_func(dataframe)
        return dataframe


def pre_func_for_agg_acc(dataframe, questions_per_level=10):
    """Derive the per-row accuracy metrics for every non-recognition prompt row.

    Added columns:

    * ``AA``  - correctness / recognized_and_parsable (denominator clipped to
      ``[0.1, questions_per_level]`` so that a zero count does not divide by zero)
    * ``FR``  - 1 - parsable / questions_per_level
    * ``Acc`` - AA * (1 - FR)
    * ``RA``  - correctness / questions_per_level of the "recognition" row with
      the same model, problem and level; 1 when there is no such row
    * ``weighted_acc`` - Acc * RA

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Detailed results containing both recognition and prompt rows.
    questions_per_level : int, default 10
        Number of questions asked at each difficulty level.

    Returns
    -------
    pandas.DataFrame
        A copy of the non-recognition rows (original index kept) with the
        columns above appended. The input frame is not modified.
    """
    match_keys = ["model_name", "problem_name", "level"]

    is_recognition = dataframe["prompt_type"] == "recognition"
    dataframe_prompt = dataframe[~is_recognition].copy()
    dataframe_recognition = dataframe[is_recognition].copy()

    # Obtain the RA from the recognition data set
    dataframe_recognition["RA"] = dataframe_recognition["correctness"] / questions_per_level
    # Obtain the Acc, correctness in recognized and parsable question
    dataframe_prompt["AA"] = dataframe_prompt["correctness"] / dataframe_prompt["recognized_and_parsable"].clip(0.1, questions_per_level)
    dataframe_prompt["FR"] = 1 - dataframe_prompt["parsable"] / questions_per_level
    dataframe_prompt["Acc"] = dataframe_prompt["AA"] * (1 - dataframe_prompt["FR"])

    # Match the recognition data to its experiment data with a single left
    # join instead of one filtered scan per row. When several recognition rows
    # share a key the first one wins, exactly like the previous row loop.
    is_duplicate = dataframe_recognition.duplicated(subset=match_keys, keep="first")
    recognition_lookup = dataframe_recognition.loc[~is_duplicate, match_keys + ["RA"]]
    matched = dataframe_prompt[match_keys].merge(
        recognition_lookup, on=match_keys, how="left", indicator=True
    )
    has_match = matched["_merge"] == "both"

    n_unmatched = int((~has_match).sum())
    n_duplicates = int(is_duplicate.sum())
    if n_unmatched or n_duplicates:
        # One summary line instead of dumping the whole dataframe per row.
        print(
            f"error matching: {n_unmatched} prompt rows without a recognition "
            f"result (RA set to 1), {n_duplicates} duplicated recognition rows ignored"
        )
        print()

    # A left merge keeps the row order of the left frame, so the values can be
    # written back positionally; rows without a recognition result get RA = 1.
    dataframe_prompt["RA"] = matched["RA"].where(has_match, 1.0).to_numpy()
    dataframe_prompt["weighted_acc"] = dataframe_prompt["Acc"] * dataframe_prompt["RA"]
    return dataframe_prompt



# Weight each row's accuracy by its difficulty level before aggregation.
def pre_func_common(data):
    """Return a copy of ``data`` whose "weighted_acc" is multiplied by "level"."""
    level_weighted = data["weighted_acc"] * data["level"]
    return data.assign(weighted_acc=level_weighted)

# The level weight is applied as a weighted mean: after aggregation the
# level-weighted accuracy is divided by the aggregated level.
def post_func_common(data):
    """Return a copy of ``data`` whose "weighted_acc" is divided by "level".

    The input frame is left untouched, mirroring ``pre_func_common``.
    """
    normalized = data.copy()
    normalized["weighted_acc"] = normalized["weighted_acc"] / normalized["level"]
    return normalized

In [4]:
class DrawHelper:
    """Helper class to draw the results from the dataframe.

    The plotting methods only read ``self.dataframe``; none of them replaces
    or modifies it.
    """
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def plot_bar_chart(self, x_name, y_name, hue_name):
        """Draw a grouped bar chart of ``y_name`` per ``x_name``, coloured by ``hue_name``."""
        sns.barplot(data=self.dataframe, x=x_name, y=y_name, hue=hue_name)
        plt.title("")
        plt.ylabel("Aggregated Accuracy")
        plt.legend(title='Prompt Type',title_fontsize = 'x-large', fontsize='medium',markerscale=1)
        plt.xlabel("")

    def plot_heatmap_recognition(self):
        """Draw a model x problem heatmap of the "RA" column on the current axes."""
        # Work on a local selection; the helper's dataframe stays intact so
        # other plots can still be drawn from the same instance.
        recognition = self.dataframe[["model_name", 'problem_name', "RA"]]
        pivot_table = recognition.pivot(index='model_name', columns='problem_name', values='RA')
        question_order = ['GCP', 'TSP', 'MSP', 'GCP_D', 'TSP_D', 'KSP', 'SAS', 'EDP', 'SPP']
        pivot_table = pivot_table[question_order]
        pivot_table = pivot_table.sort_index(key=lambda x: x.map(model_order_mapper))
        sns.heatmap(pivot_table, annot=True, vmin=0, vmax=1, cmap='Blues', fmt='.2f')
        plt.xlabel(None)
        plt.ylabel(None)
        plt.tight_layout()

    def plot_heatmap(self, x_name, y_name, z_name, col_name, problem_order_mapper):
        """Draw one heatmap per distinct ``z_name`` value into a 3x3 subplot grid.

        Parameters
        ----------
        x_name : str
            Column shown along the horizontal axis (e.g. "level").
        y_name : str
            Column shown along the vertical axis (e.g. "model_name"); rows are
            ordered with ``model_order_mapper``.
        z_name : str
            Column whose distinct values each get their own subplot.
        col_name : str
            Column holding the cell values; duplicates are averaged and
            missing combinations are shown as 0.
        problem_order_mapper : dict
            Maps a ``z_name`` value to its zero-based position in the grid.
        """
        for z in self.dataframe[z_name].unique():
            subset = self.dataframe[self.dataframe[z_name] == z]

            # Pivot on x_name and transpose so that x_name runs along the
            # horizontal axis and y_name along the vertical one.
            grid = (
                subset
                .pivot_table(index=x_name, columns=y_name, values=col_name, aggfunc='mean')
                .fillna(0)
                .transpose()
                .sort_index(key=lambda x: x.map(model_order_mapper))
            )

            pos = problem_order_mapper[z] + 1
            plt.subplot(3, 3, pos)
            sns.heatmap(grid, annot=True, vmin=0, vmax=1, cmap='Blues', fmt='.2f', cbar=False)
            plt.title(z.upper())
            # clear the axis labels seaborn derives from the pivot's axis names
            plt.xlabel(None)
            plt.ylabel(None)
            plt.tight_layout()

    def plot_spider(self, *metrics):
        """Draw one radar chart per dataframe row (at most 9) in a 3x3 grid.

        ``metrics`` are the column names plotted on the spokes of every chart.
        """
        dataframe = self.dataframe.copy()
        my_dpi = 100
        plt.figure(figsize=(3000/my_dpi, 3000/my_dpi), dpi=my_dpi)

        cols = 3
        rows = 3

        gs = gridspec.GridSpec(rows, cols, wspace=0.5, hspace=0.5)

        for i, (_, row_data) in enumerate(dataframe.iterrows()):
            if i >= rows * cols:
                break
            ax = plt.subplot(gs[i], polar=True)  # Specify polar=True here
            self.plot_sub_spider(row_data, ax, *metrics)

    def plot_sub_spider(self, row_data, ax, *metrics):
        """Draw the radar chart of one row onto the polar axes ``ax``.

        Parameters
        ----------
        row_data : mapping
            Must provide "model_name" (used as title) and one value per metric.
        ax : matplotlib polar Axes
            Target axes; every setting is applied to it directly.
        *metrics : str
            Column names plotted on the spokes, clockwise from the top.

        Raises
        ------
        ValueError
            If no metric is given.
        """
        metrics = list(metrics)
        if not metrics:
            raise ValueError("plot_sub_spider needs at least one metric")

        # One spoke per metric, evenly spaced and shifted by half a step so
        # that no spoke sits exactly at the top (identical to the former
        # fixed five-spoke layout when five metrics are passed).
        n_metrics = len(metrics)
        offset = pi / n_metrics
        angles = [(n / float(n_metrics) * 2 * pi + offset) % (2 * pi) for n in range(n_metrics)]
        # repeat the first angle to close the polygon
        angles += angles[:1]

        ax.set_theta_offset(pi / 2)
        ax.set_theta_direction(-1)

        # Break the long labels over several lines.
        multiline_labels = {
            'AA NP Hard': 'AA \nNP \nHard',
            'AA P': 'AA \nP',
            'AA NP Complete': 'AA \nNP Complete',
        }
        metrics_tag = [multiline_labels.get(metric, metric) for metric in metrics]

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(metrics_tag, color='black', size=25)
        ax.tick_params(axis='x', pad=30)

        ax.set_rlabel_position(0)
        ax.set_yticks([0,0.25,0.5,0.75,1])
        ax.set_yticklabels(['0','0.25','0.5','0.75','1'], color="grey", size=18)
        ax.set_ylim(0,1)

        model_name = row_data['model_name']
        row_values = [row_data[metric] for metric in metrics]
        row_values += row_values[:1]

        ax.plot(angles, row_values, linewidth=2, linestyle='solid')
        ax.fill(angles, row_values, alpha=0.2)

        ax.set_title(model_name, size=45, y=1.1)  # Correctly set the title for the subplot
        

# Currently unused.
def gather_final_summary(base_info, cl_info, p_acc_info, npc_acc_info, nph_acc_info):
    """Attach the per-model CL and per-complexity accuracies to ``base_info``.

    For every row of ``base_info`` the first row with the same "model_name" is
    looked up in each of the four info frames and its value is stored in the
    columns "cl", "p_acc", "npc_acc" and "nph_acc" of a copy of ``base_info``.
    An IndexError is raised when a model is missing from one of the frames.
    """
    # (target column, frame to look up, column read from that frame)
    sources = (
        ("cl", cl_info, "cl"),
        ("p_acc", p_acc_info, "weighted_acc"),
        ("npc_acc", npc_acc_info, "weighted_acc"),
        ("nph_acc", nph_acc_info, "weighted_acc"),
    )

    summary = base_info.copy()
    for row_label, record in summary.iterrows():
        name = record["model_name"]
        # resolve all four values first, then write them back in order
        looked_up = [
            (target, frame.loc[frame["model_name"] == name, source].values[0])
            for target, frame, source in sources
        ]
        for target, value in looked_up:
            summary.loc[row_label, target] = value
    return summary

In [5]:
def plot_line_chart(self, col_name, x_name, x_order_mapper, ax):
    """Plot ``col_name`` against ``x_name`` per model plus group averages on ``ax``.

    Every model is drawn as unconnected points (squares for open models,
    triangles for close models); three lines show the mean over the close
    models, the open models and all models.

    Parameters
    ----------
    col_name : str
        Column plotted on the y axis. 'weighted_acc' produces panel "a."
        without a legend, any other column produces panel "b." with the
        legend placed outside the axes.
    x_name : str
        Column plotted on the x axis.
    x_order_mapper : dict
        Maps the values of ``x_name`` to their sort order.
    ax : matplotlib Axes
        Target axes. All titles, labels and limits are set on this axes
        object, so it does not have to be pyplot's current axes.
    """
    tmp_df = self.dataframe.copy()
    tmp_df['x_order'] = tmp_df[x_name].map(x_order_mapper)
    tmp_df = tmp_df.sort_values(by=['x_order', 'model_type'])

    # model_type: 0 is open model and 1 is close model
    open_model_df = tmp_df[tmp_df['model_type'] == 0]
    close_model_df = tmp_df[tmp_df['model_type'] == 1]

    number_of_close_models = close_model_df['model_name'].nunique()
    number_of_open_models = open_model_df['model_name'].nunique()

    # One colour per model, ordered by (red - blue) so that the open models
    # take the blue end of the palette and the close models the red end.
    palette = sns.color_palette('tab20', n_colors=number_of_close_models + number_of_open_models)
    palette = sorted(palette, key=lambda x: x[0] - x[2])

    sns.pointplot(data=open_model_df, ax=ax, x=x_name,
                  y=col_name, hue='model_name', linestyle='',
                  alpha=0.8, marker='s', palette=palette[:number_of_open_models])

    sns.pointplot(data=close_model_df, ax=ax, x=x_name, y=col_name,
                  hue='model_name', linestyle='', alpha=0.8, marker='^',
                  palette=palette[number_of_open_models:])

    sns.lineplot(data=close_model_df, ax=ax, x=x_name, y=col_name, color='darkred',
                 marker='o', markersize=10, fillstyle='full', label='Close models', errorbar=None)

    sns.lineplot(data=open_model_df, ax=ax, x=x_name, y=col_name, color='red',
                 marker='o', markersize=10, fillstyle='full', label='Open models', errorbar=None)

    sns.lineplot(data=tmp_df, ax=ax, x=x_name, y=col_name, color='black',
                 marker='o', markersize=10, fillstyle='full', label='All models', errorbar=None)

    leg = ax.legend()

    if col_name == 'weighted_acc':
        ax.set_title("a.", loc='left')
        ax.set_ylabel('Aggregate Accuracy')
        leg.remove()
    else:
        ax.set_title("b.", loc='left')
        ax.set_ylabel('Instruction Following \n Effective Rate')
        # get the labels and handles and sort them based on model_order_mapper
        handles, labels = ax.get_legend_handles_labels()
        sorted_labels_handles = sorted(zip(labels, handles), key=lambda lh: model_order_mapper.get(lh[0], 0))
        sorted_labels, sorted_handles = zip(*sorted_labels_handles)
        ax.legend(sorted_handles, sorted_labels, title='Model', bbox_to_anchor=(1.05, 1),
                  markerscale=1.3, fontsize='large', loc='upper left', title_fontsize='x-large')

    ax.set_xlabel('Complexity')
    ax.set_xticklabels(['P','NP-Complete','NP-Hard'])
    # slightly above 1 so that markers sitting at 1.0 are not clipped
    ax.set_ylim(0, 1.05)


# Attach the module-level plot_line_chart function to DrawHelper so that it can
# be called as a method on DrawHelper instances.
DrawHelper.plot_line_chart = plot_line_chart

In [17]:
# Assume a new env is created: the results live in ../Results
base_dir = os.path.dirname(os.getcwd())
DATA_PATH = os.path.join(base_dir, 'Results')

# If not, use the following instead and place the results next to this notebook
# DATA_PATH = 'Results'


model_types = ["Open", "Close"]
prompt_types = ["figure+full_text", "figure+limited_text", "full_text_only", "recognition"]
level_range = np.arange(1, 11)
data = ResultsInfo(DATA_PATH, model_types, prompt_types, level_range, save_path="results.csv")
# The raw results call the SAS problem 'BSP' (results.csv keeps the raw name).
data.dataframe['problem_name'] = data.dataframe['problem_name'].replace('BSP', 'SAS')


# assume that for each groupby, it has the same number of results. Otherwise, it will be problematic
# the raw weighted_acc does not apply the level weight
summary_func = {"level": "mean", "FR": "mean", "RA": "mean", "weighted_acc": "mean"}
baseSummary = SummarizeHelper(data.dataframe, summary_func)

base_summary_info = baseSummary.summarize(pre_func=pre_func_for_agg_acc)
base_summary_info["effective_rate"] = 1 - base_summary_info["FR"]
base_summary_info.to_csv("base_summary_info.csv", index=False)

summary1 = SummarizeHelper(base_summary_info, summary_func)


def summarize_by(groupby_columns, csv_path):
    """Aggregate base_summary_info over ``groupby_columns`` and save it as CSV.

    The accuracy is level-weighted (pre_func_common / post_func_common), rows
    are ordered by model_order_mapper and an "effective_rate" column
    (1 - FR) is appended.
    """
    summary = summary1.summarize(
        groupby_columns=groupby_columns,
        summary_columns=["RA", "FR", "weighted_acc", "level"],
        pre_func=pre_func_common,
        post_func=post_func_common
    ).sort_values(by="model_name", key=lambda x: x.map(model_order_mapper))
    summary["effective_rate"] = 1 - summary["FR"]
    summary.to_csv(csv_path, index=False)
    return summary


# per model and problem
summary_info1 = summarize_by(["model_name", "problem_name", "problem_type"], "summary_info1.csv")
# per prompt type and model
summary_info2 = summarize_by(["prompt_type", "model_name"], "summary_info2.csv")
# per model and complexity class
summary_info3 = summarize_by(["model_name", "problem_type", "model_type"], "summary_info3.csv")

100%|██████████| 8/8 [00:00<00:00, 44.67it/s]


In [7]:
# Create the output folder for the figures next to the notebook.
folder_name = "figures"
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, folder_name)

# Remember whether the directory was there, then create it without a
# check-then-create race. A plain file called "figures" now fails loudly here
# instead of being reported as an existing folder.
folder_existed = os.path.isdir(folder_path)
os.makedirs(folder_path, exist_ok=True)
if folder_existed:
    print(f"Folder '{folder_name}' already exists.")
else:
    print(f"Folder '{folder_name}' created.")

Folder 'figures' already exists.


In [None]:
# plot spider graph
# Overall RA / FR per model.
summary_info_radar_model = summary1.summarize(
    groupby_columns=["model_name"],
    summary_columns=["RA", "FR", "weighted_acc", "level"],
    pre_func=pre_func_common,
    post_func=post_func_common
).sort_values(by="model_name", key=lambda x: x.map(model_order_mapper))
summary_info_radar_model["effective_rate"] = 1 - summary_info_radar_model["FR"]

# Level-weighted accuracy per model, one column per complexity class.
# pivot only keeps model_name / problem_type / weighted_acc and re-sorts the
# models, so the ordering is applied once, after the pivot.
summary_info_radar_problem_type = (
    summary1.summarize(
        groupby_columns=["model_name", "problem_type"],
        summary_columns=["RA", "FR", "weighted_acc", "level"],
        pre_func=pre_func_common,
        post_func=post_func_common
    )
    .pivot(index='model_name', columns='problem_type', values='weighted_acc')
    .reset_index()
    .sort_values(by='model_name', key=lambda x: x.map(model_order_mapper))
)

# join the overall following rate and RA with the AA of each difficulty level
# (.loc returns a new frame, so the columns below are not set on a slice)
summary_info_radar_join = (
    pd.merge(summary_info_radar_model, summary_info_radar_problem_type, on="model_name")
    .loc[:, ["model_name", "FR", "p", "np-cmp", "np-hard", "RA"]]
    # the radar shows the effective rate (ER), i.e. 1 - failure rate
    .assign(FR=lambda frame: 1 - frame["FR"])
    .rename(columns={'p': "AA P", "np-cmp": 'AA NP Complete', 'np-hard': 'AA NP Hard', "FR": "ER"})
)

spider_helper = DrawHelper(summary_info_radar_join)
spider_helper.plot_spider("ER","AA P", 'AA NP Complete', 'AA NP Hard',"RA")
plt.savefig('figures/spider.png', bbox_inches='tight')

In [None]:
## rq0 question hardness level recognition
# One RA heatmap (model x level) per problem, laid out on a 3x3 grid.

fig, _ = plt.subplots(3, 3, figsize=(15, 12))
rq1_1_drawer = DrawHelper(base_summary_info)
# plot_heatmap selects the grid slot of each problem via problem_order_mapper
rq1_1_drawer.plot_heatmap('level', 'model_name', 'problem_name', 'RA', problem_order_mapper)
# Leave room on the right for a single colorbar shared by all panels; the
# heatmap in the first axes serves as the mappable (all panels use vmin=0, vmax=1).
fig.subplots_adjust(right=0.95)
cbar_ax = fig.add_axes([0.96, 0.7, 0.02, 0.27])
fig.colorbar(ax=fig.axes, cax=cbar_ax, mappable=fig.axes[0].collections[0], orientation='vertical')
plt.savefig('figures/recognition_heatmap.png', bbox_inches='tight')

In [None]:
# rq0.1 recognition question level
# Single model x problem heatmap of the RA values in summary_info1.
fig, _ = plt.subplots(figsize=(15, 12))
rq1_drawer = DrawHelper(summary_info1)
# draws on the current axes, i.e. the one created by plt.subplots above
rq1_drawer.plot_heatmap_recognition()
plt.savefig('figures/agg_recognition_heatmap.png', bbox_inches='tight')

In [None]:
# rq1
rq1_drawer = DrawHelper(summary_info3)
## rq1.1
# Create an empty figure: plt.subplots() would add a full-size Axes that stays
# visible underneath the two GridSpec panels on matplotlib >= 3.8, where
# overlapping Axes are no longer removed automatically.
fig = plt.figure(figsize=(12, 6))
gs = gridspec.GridSpec(1, 2, width_ratios=[1,1], height_ratios=[1],wspace=0.30)

# panel a: level-weighted accuracy per complexity class
ax1 = plt.subplot(gs[0, 0])
rq1_drawer.plot_line_chart('weighted_acc', 'problem_type', nphard_order_mapper,ax1)
ax1.set_aspect(2)

# panel b: effective rate per complexity class (carries the legend)
ax2 = plt.subplot(gs[0, 1])
rq1_drawer.plot_line_chart('effective_rate', 'problem_type', nphard_order_mapper,ax2)
ax2.set_aspect(2)

plt.savefig('figures/weighted_accuracy_effective_rate.png', bbox_inches='tight')

In [None]:
## rq1.2
# One weighted_acc heatmap (model x level) per problem, laid out on a 3x3 grid.
fig, _ = plt.subplots(3, 3, figsize=(15, 12))
rq1_1_drawer = DrawHelper(base_summary_info)
# plot_heatmap has no return statement, so `temp` is None
temp = rq1_1_drawer.plot_heatmap('level', 'model_name', 'problem_name', 'weighted_acc', problem_order_mapper)
# Leave room on the right for a single colorbar shared by all panels; the
# heatmap in the first axes serves as the mappable (all panels use vmin=0, vmax=1).
fig.subplots_adjust(right=0.95)
cbar_ax = fig.add_axes([0.96, 0.7, 0.02, 0.27])
fig.colorbar(ax=fig.axes, cax=cbar_ax, mappable=fig.axes[0].collections[0], orientation='vertical')
plt.savefig('figures/zeroshot_heatmap.png', bbox_inches='tight')


In [None]:
# rq2
# Display names of the prompt types shown in the legend.
prompt_type_labels = {
    'figure+limited_text': 'Figure + Limited Text',
    'full_text_only': 'Full Text Only',
    'figure+full_text': 'Figure + Full Text'
}

# Build a relabelled copy instead of overwriting summary_info2: mapping in
# place turned every label into NaN when the cell was run a second time.
# (The former `rename(columns=...)` call discarded its result and is dropped;
# the bar chart addresses the column as 'prompt_type'.)
summary_info2_labeled = summary_info2.assign(
    prompt_type=summary_info2['prompt_type'].map(prompt_type_labels)
)
rq2_drawer = DrawHelper(summary_info2_labeled)
fig = plt.figure()
rq2_drawer.plot_bar_chart('model_name', 'weighted_acc', 'prompt_type')
plt.savefig('figures/weighted_accuracy_prompt_type.png', bbox_inches='tight')

In [18]:
# rq3: finetune
# it is not used yet
# datapaths = ["baseline", "finetune-1", "finetune-2", "finetune-3", "finetune-4", "finetune-5"]
# number_of_tunes = len(datapaths)
# number_of_problems = 9
# number_of_models = 2
# model_problem_cl_values = np.zeros((number_of_tunes, number_of_models, number_of_problems))
# problem_names = problem_order_mapper.keys()
# for i, datapath in enumerate(datapaths):
#     data = ResultsInfo(datapath, model_types, prompt_types, level_range)
#     tmp_summary_info = baseSummary.summarize(
#         summary_columns=["RA", "FR", "weighted_acc", "level"],
#         pre_func=pre_func_for_agg_acc,
#     )
#     tmp_summary = SummarizeHelper(tmp_summary_info, summary_func)
#     tmp_summary_info1 = tmp_summary.summarize(
#         groupby_columns=["model_name", "problem_name"],
#         summary_columns=["weighted_acc", "level"],
#         pre_func=pre_func_common,
#         post_func=post_func_common
#     ).sort_values(by="model_name", key=lambda x: x.map(model_order_mapper))
#     tmp_pivot = tmp_summary_info1.pivot(index="model_name", columns="problem_name", values="weighted_acc")
#     tmp_pivot = tmp_pivot[problem_names]
#     model_problem_cl_values[i] = tmp_pivot.values

# model_problem_cl = np.mean(np.diff(model_problem_cl_values, axis=0), axis=0)
# model_problem_cl_df = pd.DataFrame(model_problem_cl, columns=problem_names, index=model_order_mapper.keys())
# fig = plt.figure()
# sns.heatmap(model_problem_cl_df, annot=True, cmap="Blues", fmt=".2f")
# plt.title("Continual Learnability (CL)")
# plt.ylabel("Model")
# plt.xlabel("Problem")
# plt.tight_layout()
# plt.savefig('figures/continual_learnability.png', bbox_inches='tight')