In [1]:
import glob, yaml
import pandas as pd

from helper_functions import get_aggregated_asa_df

### Results file paths 

These output files **contain the predictions** of the best model (with the highest $F_1$) on the test subsets of the folds (4 folds in total).

In [2]:
# Finnish experiments: experiment name -> directory (under ex0_base) that
# holds the per-fold output files of that experiment.
_FI_EXPERIMENT_DIRS = {
    "BASE": "asa_ex0_no_augment_no_drop",
    "BASE_CCL": "asa_ex4_base+base_ccl",
    "BASE_CCL_2": "asa_ex4_base+base_ccl_2",
    "BASE_CCL_UM": "asa_ex4_base+base_ccl_um",
    "BASE_CCL_2_UM": "asa_ex4_base+base_ccl_2_um",

    "OS": "asa_ex2_resample_cefr",
    "OS_CCL": "asa_ex4_base+os_augment_ccl",
    "OS_CCL_2": "asa_ex4_base+os_augment_ccl_2",
    "OS_CCL_UM": "asa_ex4_base+os_augment_ccl_um",
    "OS_CCL_2_UM": "asa_ex4_base+os_augment_ccl_2_um",
}

# Experiment name -> list of output files, one per fold ("?" matches the
# single-character fold suffix of the file name).
# glob.glob returns its matches in arbitrary order, so every list is sorted
# to keep the fold order identical across machines and re-runs.
# NOTE(review): a wrong or missing directory silently yields an empty list.
result_paths_fi = {
    exp_name: sorted(
        glob.glob(f"../../experiments/ex0_base/{exp_dir}/asa_output_?.out")
    )
    for exp_name, exp_dir in _FI_EXPERIMENT_DIRS.items()
}

In [3]:
# Finland Swedish experiments: experiment name -> directory (under sv_base)
# that holds the per-fold output files of that experiment.
_SV_EXPERIMENT_DIRS = {
    "BASE": "asa_base",
    "BASE_CCL": "asa_base_ccl",
    "BASE_CCL_2": "asa_base_ccl_2",
    "BASE_CCL_UM": "asa_base_ccl_um",
    "BASE_CCL_2_UM": "asa_base_ccl_2_um",

    "OS": "asa_os_augment",
    "OS_CCL": "asa_os_augment_ccl",
    "OS_CCL_2": "asa_os_augment_ccl_2",
    "OS_CCL_UM": "asa_os_augment_ccl_um",
    "OS_CCL_2_UM": "asa_os_augment_ccl_2_um",
}

# Experiment name -> list of output files, one per fold ("?" matches the
# single-character fold suffix of the file name).
# glob.glob returns its matches in arbitrary order, so every list is sorted
# to keep the fold order identical across machines and re-runs.
# NOTE(review): a wrong or missing directory silently yields an empty list.
result_paths_sv = {
    exp_name: sorted(
        glob.glob(f"../../experiments/sv_base/{exp_dir}/asa_output_?.out")
    )
    for exp_name, exp_dir in _SV_EXPERIMENT_DIRS.items()
}

### Load the dfs 
These dataframes **do not contain** the **model predictions** yet; the predictions are added as columns further below.

In [4]:
# Read the training configuration; its "data_args" section stores the
# locations of the two CSV files.
with open("../../config.yml") as file:
    train_config = yaml.safe_load(file)

data_args = train_config["data_args"]
csv_path_fi = data_args["csv_fi"]
csv_path_sv = data_args["csv_sv"]

# Columns that are read from both CSV files.
usecols = [
    'sample',
    'student',
    'recording_path',
    'cefr_mean',
    'split',
    'transcript_normalized',
]

# Each file contributes one extra column on top of the shared ones:
# "task_id" for df_fi and "task" for df_sv.
df_fi = pd.read_csv(csv_path_fi, usecols=[*usecols, "task_id"])
df_sv = pd.read_csv(csv_path_sv, usecols=[*usecols, "task"])

df_fi.head()

Unnamed: 0,sample,student,task_id,recording_path,cefr_mean,split,transcript_normalized
0,782,1,1,/m/teamwork/t40511_asr/c/digitala/DigiTala_201...,5,1,jos saisin valita tärkeimmän paikan minulle se...
1,181,3,1,/m/teamwork/t40511_asr/c/digitala/DigiTala_201...,4,1,minä kertoon minun kesämökistä öö se on paras ...
2,913,4,1,/m/teamwork/t40511_asr/c/digitala/DigiTala_201...,5,3,tärkeä paikka minulle on makuuhuoneeni siellä ...
3,1822,5,1,/m/teamwork/t40511_asr/c/digitala/DigiTala_201...,5,2,minulle tärkeä paikka on mun olohuone koska vo...
4,12,7,1,/m/teamwork/t40511_asr/c/digitala/DigiTala_201...,6,1,tämä paikka on minulle tosi tärkeä koska se an...


### Gather the predictions of each experiment into the same df

In [5]:
# For every experiment, the prediction of the best model is added to the
# dataframe of its language as a new column named after the experiment.
#
# get_aggregated_asa_df is a helper function that puts the model
# predictions found in the output files into a column and returns the
# resulting df; that column is read here as "Prediction".

language_runs = (
    (df_fi, result_paths_fi),  # 1. Finnish
    (df_sv, result_paths_sv),  # 2. Finland Swedish
)

for target_df, paths_by_experiment in language_runs:
    for exp_name, result_paths in paths_by_experiment.items():
        aggregated = get_aggregated_asa_df(target_df, result_paths, [])
        aggregated = aggregated.sort_index()
        target_df[exp_name] = aggregated["Prediction"]

In [6]:
# Preview the first five rows of df_fi after the experiment columns
# have been added.
df_fi.head(n=5)

Unnamed: 0,sample,student,task_id,recording_path,cefr_mean,split,transcript_normalized,BASE,BASE_CCL,BASE_CCL_2,BASE_CCL_UM,BASE_CCL_2_UM,OS,OS_CCL,OS_CCL_2,OS_CCL_UM,OS_CCL_2_UM
0,782,1,1,/m/teamwork/t40511_asr/c/digitala/DigiTala_201...,5,1,jos saisin valita tärkeimmän paikan minulle se...,5,5,5,5,5,5,5,5,5,6
1,181,3,1,/m/teamwork/t40511_asr/c/digitala/DigiTala_201...,4,1,minä kertoon minun kesämökistä öö se on paras ...,4,5,4,4,5,4,4,4,4,4
2,913,4,1,/m/teamwork/t40511_asr/c/digitala/DigiTala_201...,5,3,tärkeä paikka minulle on makuuhuoneeni siellä ...,5,6,5,5,5,5,6,6,6,6
3,1822,5,1,/m/teamwork/t40511_asr/c/digitala/DigiTala_201...,5,2,minulle tärkeä paikka on mun olohuone koska vo...,5,5,6,6,5,6,5,6,5,5
4,12,7,1,/m/teamwork/t40511_asr/c/digitala/DigiTala_201...,6,1,tämä paikka on minulle tosi tärkeä koska se an...,4,5,5,5,5,4,5,5,5,5


In [7]:
# Write both dataframes to CSV files in the current working directory.
csv_exports = (
    (df_fi, "model_predictions_fi.csv"),
    (df_sv, "model_predictions_sv.csv"),
)
for frame, csv_name in csv_exports:
    frame.to_csv(csv_name)