# Tables

## RQ1 Table: Difference between N=0 and N=2

In [None]:
import os
import numpy as np
import pandas as pd
import utils_general

# Name of the single ROUGE score column that the RQ1 table averages per model/transformation.
ROUGE_TYPE = "rougeL" # options are: "rouge1", "rouge2", "rougeL", "rougeLsum"
# ROUGE score columns kept as values when the fine-tuned RQ2 results are pivoted.
ROUGE_LIST = ["rouge1", "rouge2", "rougeL"]

In [None]:
# Build the RQ1 table: for every inference model and transformation ("trf"),
# compare the mean ROUGE_TYPE score of the rows with N_parameter == N against
# the mean score in that model's "0" score file.
N = 2
N_DECIMAL_PLACES = 3

# every transformation has its own "<trf>__<model>__ROUGE_SCORES.csv" file
transformations = ["repeats", "interjections", "false-starts", "interjections-and-false-starts", "repeats-and-false-starts", "repeats-and-interjections",  "all-3"]

table_df = []
for m in utils_general.INFERENCE_MODEL_LIST:

    # calculate the mean at 0 for this model; the baseline file does not depend
    # on the transformation, so it is read once per model instead of once per trf
    df_zero = pd.read_csv(os.path.join(utils_general.PATH_TO_CSV, "0" + "__" + m + "__ROUGE_SCORES.csv"))
    mean_at_zero = df_zero[ROUGE_TYPE].mean()

    for t in transformations:

        # calculate the mean at N for this model/trf
        df_n = pd.read_csv(os.path.join(utils_general.PATH_TO_CSV, t + "__" + m + "__ROUGE_SCORES.csv"))
        df_n = df_n.loc[df_n["N_parameter"] == N]
        mean_at_n = df_n[ROUGE_TYPE].mean()

        # get the difference
        difference_in_mean = mean_at_n - mean_at_zero

        # NOTE: the "mean_at_2" key is kept as-is (even though N is a variable)
        # because the percent-change computation below looks the column up by name
        table_df.append({"model": m,
                         "trf": t,
                         "mean_at_2": mean_at_n,
                         "mean_at_0": mean_at_zero,
                         "difference_in_mean": difference_in_mean})

table_df = pd.DataFrame(table_df)
table_df["percent_change"] = (((table_df["mean_at_2"] - table_df["mean_at_0"]) / table_df["mean_at_0"]) * 100)

# rounding at the end, so the percent change is computed from unrounded means
table_df = table_df.round(N_DECIMAL_PLACES)

# one row per (model, statistic) and one column per transformation;
# add "mean_at_2" to `values` to also show the raw mean at N
display(table_df.pivot(index=["model"], columns=["trf"], values=["mean_at_0", "difference_in_mean", "percent_change"]).stack(0))


## RQ2 Table: Inference Only

In [None]:
# Build the RQ2 inference-only table: one row of mean ROUGE scores for every
# (model, test version) pair, read from "<test_version>__<model>__ROUGE_SCORES.csv".
rows = []

for model_name in ["bart", "t5", "pegasus"]:
    for test_version in ["tagged", "-1", "0"]:

        csv_path = os.path.join(utils_general.PATH_TO_CSV, f"{test_version}__{model_name}__ROUGE_SCORES.csv")
        scores = pd.read_csv(csv_path)

        # identifying columns first, then the per-metric means in a fixed order
        row = {"model_name": model_name, "test_version": test_version}
        for metric in ["rouge1", "rouge2", "rougeL", "rougeLsum"]:
            row[metric] = scores[metric].mean()

        rows.append(row)

df = pd.DataFrame(rows)

# scale the numeric columns by 100 and round them for display
numeric_cols = df.select_dtypes(include=np.number).columns
scaled_scores = df[numeric_cols] * 100
df[numeric_cols] = scaled_scores.round(decimals=N_DECIMAL_PLACES)

df = df.sort_values(by=["model_name", "test_version"])
display(df)

output_path = os.path.join(utils_general.PATH_TO_CSV, "RQ2_Table_Inference_Only.csv")
df.to_csv(output_path, index=False)

## RQ2 Table: Fine-Tuning on Annotated Transcripts

In [None]:
# Build one RQ2 table per fine-tuned model: mean ROUGE scores for every
# (train_version, test_version) combination, averaged over the seeds.
print("TABLE FOR {BART, T5, PEGASUS} FINE-TUNED SCORES\n\n\n")

for model_name in ["fine-tuned__bart", "fine-tuned__pegasus", "fine-tuned__t5"]:

    df = []

    # read the rouge score csv of every test version / seed / train version
    # of this FINE-TUNED model and record its per-metric means
    for test_version in ["tagged", "-1", "0"]:
        for seed_number in [0, 10, 20, 30, 40]:
            for train_version in ["0", "-1", "tagged"]:

                train = f"train_{train_version}"
                test = f"test_{test_version}"
                csv_name = f"{model_name}__seed_{seed_number}__{train}__{test}__ROUGE_SCORES.csv"

                temp_df = pd.read_csv(os.path.join(utils_general.PATH_TO_CSV, csv_name))

                row = {"model_name": model_name,
                       "test_version": test_version,
                       "train_version": train_version,
                       "seed_number": seed_number}
                for metric in ["rouge1", "rouge2", "rougeL", "rougeLsum"]:
                    row[metric] = temp_df[metric].mean()

                df.append(row)

    df = pd.DataFrame(df)

    # average over the seeds; only the metrics named in ROUGE_LIST are kept,
    # and train_version/test_version become the (row) index of the result
    df = pd.pivot_table(df, values=ROUGE_LIST, index=["train_version", "test_version"], aggfunc="mean")

    # format the numeric columns
    numeric_cols = df.select_dtypes(include=np.number).columns
    df[numeric_cols] = (df[numeric_cols]*100).round(decimals=N_DECIMAL_PLACES)

    print(model_name)
    display(df)
    print("\n\n\n")

    # The index must be written: after the pivot it is the only place that holds
    # train_version/test_version, so index=False would save unlabeled rows.
    df.to_csv(os.path.join(utils_general.PATH_TO_CSV, f"RQ2_Table_Fine-Tuned_{model_name}.csv"))