In [11]:
import pandas as pd
import os

USE_MODEL = 'Mistral'
PATH = f'../{USE_MODEL}/Results'

DATASETS = ['amazon', 'dadjokes', 'headlines', 'one_liners', 'yelp_reviews']
PATH_TEMPLATE = PATH + '/{dataset}-models/'
TABLES_PATH = PATH + '/Tables/'

LOO_DATASETS = [f'loo_{dataset}' for dataset in DATASETS]
LOO_PATH = PATH + '/leave-one-out/'

In [12]:
# LOO_EVAL = False
LOO_EVAL = False

TRAIN_DATASETS = LOO_DATASETS if LOO_EVAL else DATASETS

In [13]:
# Create the score table for regular dataset
if not LOO_EVAL:
    scores_df = pd.DataFrame()
    for dataset_name in TRAIN_DATASETS:
        root_dir = PATH_TEMPLATE.format(dataset=dataset_name)
        for _, dirs, _ in os.walk(root_dir):
            for dir_name in dirs:
                inner_dir = os.path.join(root_dir, dir_name)
                score_file_path = os.path.join(inner_dir, f'{dataset_name}_scores.csv')
                df = pd.read_csv(score_file_path)
                # for the first df, set its columns to be the columns of the overall dataframe (scores_df)
                if scores_df.empty:
                    scores_df = pd.DataFrame(columns=df.columns)
                scores_df = pd.concat([scores_df, df])

  scores_df = pd.concat([scores_df, df])


In [14]:
# Create the score table for loo dataset (only one model per dataset)
if LOO_EVAL:
    scores_df = pd.DataFrame()
    for dataset_name in TRAIN_DATASETS:
        score_file_path = os.path.join(LOO_PATH, f'{dataset_name}_scores.csv')
        df = pd.read_csv(score_file_path)
        # for the first df, set its columns to be the columns of the overall dataframe (scores_df)
        if scores_df.empty:
            scores_df = pd.DataFrame(columns=df.columns)
        scores_df = pd.concat([scores_df, df])

In [16]:
METRICS = ['accuracy', 'f1', 'recall', 'precision']
base_model = 'flan-t5-base'
base_model = 'llama-2-7b'
base_model = 'mistral-7b'
# models_name = [glob.glob(f'{models_path}/{base_model}_on_{dataset}*')[0] for dataset in dataset_names]

result_df = pd.read_excel(TABLES_PATH + 'result_template.xlsx')
result_df.fillna(method='ffill', axis=0, inplace=True)
result_df.set_index(['metric', 'model', 'trained on'], inplace=True)

  result_df.fillna(method='ffill', axis=0, inplace=True)


In [17]:
# calculate metrics' mean and std for all pairs of datasets
for train_dataset in TRAIN_DATASETS:
    metrics_dict = {metric_name: {} for metric_name in METRICS}

    for eval_dataset in DATASETS:
        df = scores_df[(scores_df['train_dataset'] == train_dataset) & (scores_df['evaluate_dataset'] == eval_dataset)]
        # print(train_dataset, eval_dataset)
        for metric in METRICS:
            # print(metric)
            values = df[metric]
            mean, std = values.mean(), values.std()
            metrics_dict[metric][eval_dataset] = float("%.4f" % mean)
            # print(mean, std)

    for metric in METRICS:
        result_df.loc[(metric, base_model, train_dataset)] = metrics_dict[metric]

In [18]:
from datetime import datetime
# save performance to output file
date = datetime.now().date()
i = 1
while os.path.exists(TABLES_PATH + f'humor_results_{date}_{i}*.xlsx'):
    i += 1

result_df.to_excel(TABLES_PATH + f'humor_results_{date}_{i}.xlsx')