In [64]:
import pandas as pd
import os

USE_MODEL = 'Llama-2'
PATH = f'../{USE_MODEL}/Results'

DATASETS = ['amazon', 'dadjokes', 'headlines', 'one_liners', 'yelp_reviews']
# DATASETS = ['amazon']
PATH_TEMPLATE = PATH + '/{dataset}-models/'
TABLES_PATH = PATH + '/Tables/'

LOO_DATASETS = [f'loo_{dataset}' for dataset in DATASETS]
LOO_PATH = PATH + '/leave-one-out/'

In [65]:
# LOO_EVAL = False
LOO_EVAL = False

TRAIN_DATASETS = LOO_DATASETS if LOO_EVAL else DATASETS

In [66]:
# Create the score table for regular dataset
if not LOO_EVAL:
    scores_df = pd.DataFrame()
    for dataset_name in TRAIN_DATASETS:
        root_dir = PATH_TEMPLATE.format(dataset=dataset_name)
        for _, dirs, _ in os.walk(root_dir):
            for dir_name in dirs:
                inner_dir = os.path.join(root_dir, dir_name)
                score_file_path = os.path.join(inner_dir, f'{dataset_name}_scores.csv')
                df = pd.read_csv(score_file_path)
                # for the first df, set its columns to be the columns of the overall dataframe (scores_df)
                if scores_df.empty:
                    scores_df = pd.DataFrame(columns=df.columns)
                scores_df = pd.concat([scores_df, df])

  scores_df = pd.concat([scores_df, df])


In [5]:
# Create the score table for loo dataset (only one model per dataset)
if LOO_EVAL:
    scores_df = pd.DataFrame()
    for dataset_name in TRAIN_DATASETS:
        score_file_path = os.path.join(LOO_PATH, f'{dataset_name}_scores.csv')
        df = pd.read_csv(score_file_path)
        # for the first df, set its columns to be the columns of the overall dataframe (scores_df)
        if scores_df.empty:
            scores_df = pd.DataFrame(columns=df.columns)
        scores_df = pd.concat([scores_df, df])

In [5]:
METRICS = ['accuracy', 'f1', 'recall', 'precision']
base_model = 'flan-t5-base'
base_model = 'llama-2-7b'
base_model = 'mistral-7b'
# models_name = [glob.glob(f'{models_path}/{base_model}_on_{dataset}*')[0] for dataset in dataset_names]

result_df = pd.read_excel(TABLES_PATH + 'result_template.xlsx')
result_df.fillna(method='ffill', axis=0, inplace=True)
result_df.set_index(['metric', 'model', 'trained on'], inplace=True)

  result_df.fillna(method='ffill', axis=0, inplace=True)


In [4]:
# calculate metrics' mean and std for all pairs of datasets
for train_dataset in TRAIN_DATASETS:
    metrics_dict = {metric_name: {} for metric_name in METRICS}

    for eval_dataset in DATASETS:
        df = scores_df[(scores_df['train_dataset'] == train_dataset) & (scores_df['evaluate_dataset'] == eval_dataset)]
        # print(train_dataset, eval_dataset)
        for metric in METRICS:
            # print(metric)
            values = df[metric]
            mean, std = values.mean(), values.std()
            metrics_dict[metric][eval_dataset] = float("%.4f" % mean)
            # print(mean, std)

    for metric in METRICS:
        result_df.loc[(metric, base_model, train_dataset)] = metrics_dict[metric]

NameError: name 'METRICS' is not defined

In [66]:
from datetime import datetime
# save performance to output file
date = datetime.now().date()
i = 1
while os.path.exists(TABLES_PATH + f'humor_results_{date}_{i}*.xlsx'):
    i += 1

result_df.to_excel(TABLES_PATH + f'humor_results_{date}_{i}.xlsx')

## Create table by best median/mean/acc

In [72]:
def find_best_accuracies(df, trained_dataset_name, which_best='All'):
    accuracies = {}
    best_trained, best_median, best_mean = (-1, None), (-1, None), (-1, None)

    for params, rows in df.groupby(by=params_column_names):  # level=0 refers to 'letter'
        # print(f"Index: {name}")
        # print(group)
        row_trained = rows[rows['evaluate_dataset'] == trained_dataset_name]
        row_others = rows[rows['evaluate_dataset'] != trained_dataset_name]
        curr_trained_accuracy = row_trained['accuracy'].values[0]
        others_accuracy_median = row_others['accuracy'].median()
        others_accuracy_mean = row_others['accuracy'].mean()

        best_trained = (curr_trained_accuracy, params) if curr_trained_accuracy > best_trained[0] else best_trained
        best_median = (others_accuracy_median, params) if others_accuracy_median > best_median[0] else best_median
        best_mean = (others_accuracy_mean, params) if others_accuracy_mean > best_mean[0] else best_mean


        curr_accs = {'trained': curr_trained_accuracy,
                     'median': others_accuracy_median,
                     'mean': others_accuracy_mean}

        accuracies[params] = curr_accs

    # print(f'Accuracies for dataset: {trained_dataset_name}')
    # print(f'Best trained accuracy = {best_trained}')
    # print(f'Best median accuracy = {best_median}')
    # print(f'Best mean accuracy = {best_mean}')

    if which_best == 'All':
        return {'best_trained': best_trained,
                'best_median': best_median,
                'best_mean': best_mean}

    # Return only best median
    if which_best == 'Median':
        return {'best_median': best_median}

In [13]:
params_column_names = ['seed', 'learning_rate', 'per_device_train_batch_size',
       'per_device_eval_batch_size', 'max_steps', 'lora_rank', 'lora_alpha',]

In [14]:
METRICS = ['accuracy', 'f1', 'recall', 'precision']
# base_model = 'flan-t5-base'
base_model = 'llama-2-7b'
# base_model = 'mistral-7b'
# models_name = [glob.glob(f'{models_path}/{base_model}_on_{dataset}*')[0] for dataset in dataset_names]

result_param_df = pd.read_excel(TABLES_PATH + 'result_param_metric_template.xlsx')
result_param_df.fillna(method='ffill', axis=0, inplace=True)
result_param_df.set_index(['metric', 'model', 'trained on', 'param_metric'], inplace=True)

  result_param_df.fillna(method='ffill', axis=0, inplace=True)


In [67]:
def get_split_avg_df(df):
    # Iterate over the existing parameters combinations and create df of the average cv splits results
    df_avg_splits = pd.DataFrame(columns=df.columns)
    for params, rows in df.groupby(by=params_column_names):
        for dataset_name in DATASETS:
            rows_of_dataset = rows[rows['evaluate_dataset'] == dataset_name]

            # Columns you want to average
            columns_to_average = ['accuracy', 'precision', 'recall', 'f1']

            # Calculate the mean for the relevant columns
            mean_values = rows_of_dataset[columns_to_average].mean()

            # Prepare a new row with average values and other relevant info
            # For example, 'split' can be labeled as 'average'
            new_row = rows_of_dataset.iloc[0].copy(deep=True)

            new_row.update(mean_values)
            new_row.update({'split_num': 'average'})
            new_row_df = pd.DataFrame([new_row], columns=df.columns)

            df_avg_splits = pd.concat([df_avg_splits, new_row_df], axis=0)

    return df_avg_splits

### Get dataframe of the averages of the cv splits:

In [82]:
df_avg_splits_all = pd.DataFrame()
for dataset_name in TRAIN_DATASETS:
    # For now taking only the first split (until I'll have all the splits results)
    # and then I'll need to average on the splits
    df_curr_trained = scores_df[(scores_df['train_dataset'] == dataset_name)]


    params_vals = {param_name: list(df_curr_trained[param_name].unique()) for param_name in params_column_names}

    df_curr_trained.set_index(params_column_names, inplace=True)

    df_curr_trained_avg_splits = get_split_avg_df(df_curr_trained)
    df_avg_splits_all = pd.concat([df_curr_trained_avg_splits, df_avg_splits_all])

    df_avg_splits_all.index.names = df_curr_trained.index.names

  df_avg_splits = pd.concat([df_avg_splits, new_row_df], axis=0)


ValueError: Length of new names must be 1, got 7

In [86]:
len(df_curr_trained.index.names)

7

In [88]:
df_avg_splits_all.index

Index([   (42, 5e-06, 2, 2, 150, 32, 8),    (42, 5e-06, 2, 2, 150, 32, 8),
          (42, 5e-06, 2, 2, 150, 32, 8),    (42, 5e-06, 2, 2, 150, 32, 8),
          (42, 5e-06, 2, 2, 150, 32, 8),   (42, 5e-06, 2, 2, 150, 32, 16),
         (42, 5e-06, 2, 2, 150, 32, 16),   (42, 5e-06, 2, 2, 150, 32, 16),
         (42, 5e-06, 2, 2, 150, 32, 16),   (42, 5e-06, 2, 2, 150, 32, 16),
       ...
        (42, 0.0003, 2, 2, 200, 128, 8),  (42, 0.0003, 2, 2, 200, 128, 8),
        (42, 0.0003, 2, 2, 200, 128, 8),  (42, 0.0003, 2, 2, 200, 128, 8),
        (42, 0.0003, 2, 2, 200, 128, 8), (42, 0.0003, 2, 2, 200, 128, 64),
       (42, 0.0003, 2, 2, 200, 128, 64), (42, 0.0003, 2, 2, 200, 128, 64),
       (42, 0.0003, 2, 2, 200, 128, 64), (42, 0.0003, 2, 2, 200, 128, 64)],
      dtype='object', length=225)

In [89]:
df_curr_trained.index

MultiIndex([(42, 0.0003, 2, 2, 200,  64, 32),
            (42, 0.0003, 2, 2, 200,  64, 32),
            (42, 0.0003, 2, 2, 200,  64, 32),
            (42, 0.0003, 2, 2, 200,  64, 32),
            (42, 0.0003, 2, 2, 200,  64, 32),
            (42,  5e-05, 2, 2, 150, 128, 16),
            (42,  5e-05, 2, 2, 150, 128, 16),
            (42,  5e-05, 2, 2, 150, 128, 16),
            (42,  5e-05, 2, 2, 150, 128, 16),
            (42,  5e-05, 2, 2, 150, 128, 16),
            ...
            (42, 0.0003, 2, 2, 200,  64,  8),
            (42, 0.0003, 2, 2, 200,  64,  8),
            (42, 0.0003, 2, 2, 200,  64,  8),
            (42, 0.0003, 2, 2, 200,  64,  8),
            (42, 0.0003, 2, 2, 200,  64,  8),
            (42,  1e-05, 2, 2, 200, 128,  8),
            (42,  1e-05, 2, 2, 200, 128,  8),
            (42,  1e-05, 2, 2, 200, 128,  8),
            (42,  1e-05, 2, 2, 200, 128,  8),
            (42,  1e-05, 2, 2, 200, 128,  8)],
           names=['seed', 'learning_rate', 'per_device_train_ba

In [91]:
# Convert the single-level index into a MultiIndex
df_avg_splits_all.index = pd.MultiIndex.from_tuples(df_avg_splits_all.index, names=df_curr_trained.index.names)

# Now df_avg_splits_all will have a MultiIndex with 7 levels
print(df_avg_splits_all.index.names)


['seed', 'learning_rate', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'max_steps', 'lora_rank', 'lora_alpha']


In [93]:
dataset_accs = {}
for dataset_name in TRAIN_DATASETS:
    # For now taking only the first split (until I'll have all the splits results)
    # and then I'll need to average on the splits
    df_curr_trained = df_avg_splits_all[(df_avg_splits_all['train_dataset'] == dataset_name)]

    # params_vals = {param_name: list(df_curr_trained[param_name].unique()) for param_name in params_column_names}

    df_curr_trained.set_index(params_column_names, inplace=True)
    best_accs = find_best_accuracies(df_curr_trained, dataset_name, which_best='Median')
    dataset_accs[dataset_name] = best_accs
    # Iterate over the best accuracies and save all of them
    # calculate metrics' mean and std for all pairs of datasets
    for param_mertic, acc_and_params in best_accs.items():
        params = acc_and_params[1]
        df_params = df_curr_trained.at[params]
        metrics_dict = {metric_name: {} for metric_name in METRICS}

        for eval_dataset in DATASETS:
            df = df_params[(df_params['evaluate_dataset'] == eval_dataset)]
            # print(train_dataset, eval_dataset)
            for metric in METRICS:
                # print(metric)
                values = df[metric]
                mean, std = values.mean(), values.std()
                metrics_dict[metric][eval_dataset] = float("%.4f" % mean)
                # print(mean, std)

        for metric in METRICS:
            result_param_df.loc[(metric, base_model, dataset_name, param_mertic)] = metrics_dict[metric]

KeyError: "None of ['seed', 'learning_rate', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'max_steps', 'lora_rank', 'lora_alpha'] are in the columns"

In [80]:
from datetime import datetime
# save performance to output file
date = datetime.now().date()
i = 1
while os.path.exists(TABLES_PATH + f'humor_results_params_{date}_{i}*.xlsx'):
    i += 1

result_param_df.to_excel(TABLES_PATH + f'humor_results_params_{date}_{i}.xlsx')