In [46]:
# Importing required libraries
import pandas as pd

# Loading the CSV file
file_path = 'cosine_distance.csv'
data = pd.read_csv(file_path)

# Displaying the first few rows to understand the structure
data.head(10)

Unnamed: 0,Name,test_exact_match,test_f1,Runtime,Notes,State,Tags,eval/loss,eval/runtime,eval/samples_per_second,...,train/global_step,train/learning_rate,train/loss,train/total_flos,train/train_loss,train/train_runtime,train/train_samples_per_second,train/train_steps_per_second,val_exact_match,val_f1
0,th_wordnet_aug_1.0,39.575662,54.099859,3011,-,finished,,2.364525,5.105,441.916,...,5640,0,0.1326,76607636381752320,0.790639,2867.3967,125.842,1.967,51.329787,62.947724
1,th_wordnet_aug_0.9,40.294582,54.295754,2871,-,finished,,2.357939,5.1034,442.058,...,5360,0,0.162,72777679169894400,0.836683,2727.1321,125.7,1.965,49.379433,61.372919
2,th_wordnet_aug_0.8,38.997019,53.889524,2732,-,finished,,2.198472,5.1039,442.015,...,5080,0,0.1241,68947721958036480,0.841739,2589.0343,125.437,1.962,49.64539,61.390519
3,th_wordnet_aug_0.7,40.522532,54.269615,2596,-,finished,,2.172062,5.1066,441.779,...,4800,0,0.175,65117764746178560,0.906674,2455.6063,124.906,1.955,50.975177,61.952266
4,th_wordnet_aug_0.6,37.03314,52.975213,2459,-,finished,,2.220396,5.1042,441.99,...,4520,0,0.1403,61287807534320640,0.845333,2320.3382,124.413,1.948,50.487589,62.225778
5,th_wordnet_aug_0.5,37.874803,52.672719,2313,-,finished,,2.052335,5.1119,441.321,...,4220,0,0.1229,57318791454737280,0.86233,2175.7934,124.378,1.94,49.689716,61.418748
6,th_wordnet_aug_0.4,39.733474,53.050482,2174,-,finished,,2.084451,5.1134,441.19,...,3940,0,0.1738,53490108064568450,0.903963,2038.8332,123.885,1.932,50.664894,62.222471
7,th_wordnet_aug_0.3,38.804138,53.73571,2029,-,finished,,1.965081,5.1116,441.35,...,3660,0,0.2114,49661424674399620,0.976911,1900.2045,123.429,1.926,51.108156,62.652332
8,th_wordnet_aug_0.2,38.909346,53.22636,1896,-,finished,,1.92808,5.1099,441.497,...,3380,0,0.2209,45832741284230780,1.009166,1763.6275,122.758,1.917,49.64539,61.364477
9,th_wordnet_aug_0.1,37.78713,52.773388,1754,-,finished,,1.860637,5.1073,441.717,...,3100,0,0.2612,42004057894061950,1.084156,1625.0496,122.126,1.908,50.221631,62.119381


In [47]:
pretty_names = {
    "th_qcpg_0.8_llm_gec_aug": "QCPG (0.8) + LLM GEC",
    "th_qcpg_0.5_llm_gec_aug": "QCPG (0.5) + LLM GEC",
    "th_qcpg_0.2_llm_gec_aug": "QCPG (0.2) + LLM GEC",
    "th_qcpg_0.8_aug": "QCPG (0.8)",
    "th_qcpg_0.5_aug": "QCPG (0.5)",
    "th_qcpg_0.2_aug": "QCPG (0.2)",
    "th_aug": "Backtranslation",
    "th_fasttext_aug": "FastText",
    "th_llm_gec_aug": "LLM GEC",
    "th_llm_paraphrase_aug": "LLM Paraphrase",
    "th_ltw2v_aug": "LTW2Vec",
    "th_thai2fit_aug": "Thai2Fit",
    "th_wordnet_aug": "WordNet",
}

In [48]:
# Function to extract the augmentation type from the name
def get_augmentation_type(name):
    if name == "original":
        return "original"
    return name.rsplit("_", 2)[0]

# Function to extract the augmentation ratio from the name
def get_augment_ratio(name):
    if name == "original":
        return "N/A" # Not applicable for the original model
    return name.rsplit("_", 1)[-1]

# Function to extract the pretty name (augmentation name without ratio) from the name
def get_pretty_name(name):
    if name == "original":
        return "Original"
    base_name = name.rsplit("_", 1)[0]
    return pretty_names[base_name]


# Apply the function to create a new column with the augmentation type
data['augmentation_type'] = data['Name'].apply(get_augmentation_type)

# Find the best performing model for each augmentation type based on the "test_exact_match" metric
best_models = data.loc[data.groupby('augmentation_type')['test_exact_match'].idxmax()]

# Resetting the index
best_models.reset_index(drop=True, inplace=True)

# Apply the functions to create the new columns
best_models['augment_ratio'] = best_models['Name'].apply(get_augment_ratio)
best_models['pretty_name'] = best_models['Name'].apply(get_pretty_name)

# Displaying the results
best_models[['Name', 'pretty_name', 'test_exact_match']]

Unnamed: 0,Name,pretty_name,test_exact_match
0,original,Original,39.242504
1,th_aug_1.0,Backtranslation,41.381729
2,th_fasttext_aug_0.8,FastText,40.803086
3,th_llm_gec_aug_0.7,LLM GEC,41.08364
4,th_llm_paraphrase_aug_0.5,LLM Paraphrase,40.540067
5,th_ltw2v_aug_1.0,LTW2Vec,40.189374
6,th_qcpg_0.2_aug_0.4,QCPG (0.2),39.41785
7,th_qcpg_0.2_llm_gec_aug_0.2,QCPG (0.2) + LLM GEC,40.259513
8,th_qcpg_0.5_aug_0.7,QCPG (0.5),40.119235
9,th_qcpg_0.5_llm_gec_aug_0.4,QCPG (0.5) + LLM GEC,40.504997


In [49]:
# Selecting columns that begin with "test" along with the "Name" column
test_columns = [col for col in data.columns if col.startswith("test")]
selected_columns = ['pretty_name', 'augment_ratio', "val_exact_match", "val_f1", "test_tydiqa_exact_match", "test_tydiqa_f1", "test_xquad_exact_match", "test_xquad_f1"]

# Selecting the relevant columns from the best models
best_models_test_metrics = best_models[selected_columns]

# Sorting the results by "test_exact_match" in descending order
best_models_test_metrics_sorted = best_models_test_metrics.sort_values(by='test_tydiqa_exact_match', ascending=True)

# Displaying the sorted results
best_models_test_metrics_sorted

Unnamed: 0,pretty_name,augment_ratio,val_exact_match,val_f1,test_tydiqa_exact_match,test_tydiqa_f1,test_xquad_exact_match,test_xquad_f1
0,Original,,50.35461,62.278984,40.636746,55.101966,33.898305,47.915171
6,QCPG (0.2),0.4,49.601064,60.829974,41.123148,54.287219,32.881356,45.600339
8,QCPG (0.5),0.7,49.64539,61.112328,41.720097,54.983745,33.983051,47.679517
7,QCPG (0.2) + LLM GEC,0.2,50.35461,62.080071,41.963299,55.280244,33.728814,47.37128
5,LTW2Vec,1.0,51.196809,62.484083,42.007517,55.898205,33.220339,47.849854
12,Thai2Fit,1.0,51.241135,63.06627,42.2065,56.134349,32.79661,45.984167
4,LLM Paraphrase,0.5,50.886525,62.352358,42.317046,55.670707,33.728814,47.019141
13,WordNet,0.7,50.975177,61.952266,42.383374,55.979663,33.389831,47.714915
9,QCPG (0.5) + LLM GEC,0.4,49.29078,61.64428,42.471811,56.061706,32.966102,46.328062
10,QCPG (0.8),0.7,50.177305,62.113025,42.692903,55.391331,34.915254,47.937018


In [50]:
def add_monolingual_col(df):
    df = df.copy()
    new_col = []

    for i in range(len(df)):
        if "QCPG" in df["pretty_name"].iloc[i]:
            new_col.append(False)
        elif "LLM" in df["pretty_name"].iloc[i]:
            new_col.append(False)
        elif "Back" in df["pretty_name"].iloc[i]:
            new_col.append(False)
        else:
            new_col.append(True)
    
    df["monolingual"] = new_col
    return df

best_models_test_metrics_sorted = add_monolingual_col(best_models_test_metrics_sorted)
best_models_test_metrics_sorted

Unnamed: 0,pretty_name,augment_ratio,val_exact_match,val_f1,test_tydiqa_exact_match,test_tydiqa_f1,test_xquad_exact_match,test_xquad_f1,monolingual
0,Original,,50.35461,62.278984,40.636746,55.101966,33.898305,47.915171,True
6,QCPG (0.2),0.4,49.601064,60.829974,41.123148,54.287219,32.881356,45.600339,False
8,QCPG (0.5),0.7,49.64539,61.112328,41.720097,54.983745,33.983051,47.679517,False
7,QCPG (0.2) + LLM GEC,0.2,50.35461,62.080071,41.963299,55.280244,33.728814,47.37128,False
5,LTW2Vec,1.0,51.196809,62.484083,42.007517,55.898205,33.220339,47.849854,True
12,Thai2Fit,1.0,51.241135,63.06627,42.2065,56.134349,32.79661,45.984167,True
4,LLM Paraphrase,0.5,50.886525,62.352358,42.317046,55.670707,33.728814,47.019141,False
13,WordNet,0.7,50.975177,61.952266,42.383374,55.979663,33.389831,47.714915,True
9,QCPG (0.5) + LLM GEC,0.4,49.29078,61.64428,42.471811,56.061706,32.966102,46.328062,False
10,QCPG (0.8),0.7,50.177305,62.113025,42.692903,55.391331,34.915254,47.937018,False


In [51]:
# Round all numerical columns to 2 decimal places
best_models_test_metrics_sorted = best_models_test_metrics_sorted.round(2)
best_models_test_metrics_sorted

Unnamed: 0,pretty_name,augment_ratio,val_exact_match,val_f1,test_tydiqa_exact_match,test_tydiqa_f1,test_xquad_exact_match,test_xquad_f1,monolingual
0,Original,,50.35,62.28,40.64,55.1,33.9,47.92,True
6,QCPG (0.2),0.4,49.6,60.83,41.12,54.29,32.88,45.6,False
8,QCPG (0.5),0.7,49.65,61.11,41.72,54.98,33.98,47.68,False
7,QCPG (0.2) + LLM GEC,0.2,50.35,62.08,41.96,55.28,33.73,47.37,False
5,LTW2Vec,1.0,51.2,62.48,42.01,55.9,33.22,47.85,True
12,Thai2Fit,1.0,51.24,63.07,42.21,56.13,32.8,45.98,True
4,LLM Paraphrase,0.5,50.89,62.35,42.32,55.67,33.73,47.02,False
13,WordNet,0.7,50.98,61.95,42.38,55.98,33.39,47.71,True
9,QCPG (0.5) + LLM GEC,0.4,49.29,61.64,42.47,56.06,32.97,46.33,False
10,QCPG (0.8),0.7,50.18,62.11,42.69,55.39,34.92,47.94,False


In [52]:
from IPython.display import Markdown
# Function to create a markdown table row (corrected)
def create_md_row(row, best_values, ignore_bestval_columns=["pretty_name", "augment_ratio"]):
    md_row = "| "
    for col, value in row.items():  # Using 'items' instead of 'iteritems'
        if value == best_values[col] and col not in ignore_bestval_columns:
            md_row += f"**{value}** | "
        else:
            md_row += f"{value} | "
    return md_row

best_values = best_models_test_metrics_sorted.max()

# Creating markdown table rows (corrected)
md_table_corrected = "| " + " | ".join(selected_columns) + " |\n"
md_table_corrected += "| " + " | ".join(["-" * len(col) for col in selected_columns]) + " |\n"
for _, row in best_models_test_metrics_sorted.iterrows():
    md_table_corrected += create_md_row(row, best_values) + "\n"

# Displaying the corrected markdown table
Markdown(md_table_corrected)

| pretty_name | augment_ratio | val_exact_match | val_f1 | test_tydiqa_exact_match | test_tydiqa_f1 | test_xquad_exact_match | test_xquad_f1 |
| ----------- | ------------- | --------------- | ------ | ----------------------- | -------------- | ---------------------- | ------------- |
| Original | N/A | 50.35 | 62.28 | 40.64 | 55.1 | 33.9 | 47.92 | **True** | 
| QCPG (0.2) | 0.4 | 49.6 | 60.83 | 41.12 | 54.29 | 32.88 | 45.6 | False | 
| QCPG (0.5) | 0.7 | 49.65 | 61.11 | 41.72 | 54.98 | 33.98 | 47.68 | False | 
| QCPG (0.2) + LLM GEC | 0.2 | 50.35 | 62.08 | 41.96 | 55.28 | 33.73 | 47.37 | False | 
| LTW2Vec | 1.0 | 51.2 | 62.48 | 42.01 | 55.9 | 33.22 | 47.85 | **True** | 
| Thai2Fit | 1.0 | **51.24** | **63.07** | 42.21 | 56.13 | 32.8 | 45.98 | **True** | 
| LLM Paraphrase | 0.5 | 50.89 | 62.35 | 42.32 | 55.67 | 33.73 | 47.02 | False | 
| WordNet | 0.7 | 50.98 | 61.95 | 42.38 | 55.98 | 33.39 | 47.71 | **True** | 
| QCPG (0.5) + LLM GEC | 0.4 | 49.29 | 61.64 | 42.47 | 56.06 | 32.97 | 46.33 | False | 
| QCPG (0.8) | 0.7 | 50.18 | 62.11 | 42.69 | 55.39 | 34.92 | 47.94 | False | 
| FastText | 0.8 | 50.27 | 61.91 | 42.85 | 56.14 | 32.97 | 47.22 | **True** | 
| LLM GEC | 0.7 | 50.31 | 61.84 | 42.91 | **56.38** | 34.07 | 48.08 | False | 
| Backtranslation | 1.0 | 50.93 | 62.7 | 42.96 | 56.26 | **35.34** | 48.64 | False | 
| QCPG (0.8) + LLM GEC | 1.0 | 50.53 | 62.47 | **43.25** | 56.08 | 34.92 | **49.03** | False | 


In [31]:
def compute_relative_scores(df):
    df = df.copy()

    # Create a new DataFrame to store the results
    new_df = pd.DataFrame(columns=['Augmentation', 'Ratio', 'Val EM/F1', 'TyDiQA EM/F1', 'XQuAD EM/F1'])

    # Adding the original scores as the first row without difference calculation
    original_row = df.iloc[0]
    new_df.loc[0] = [original_row['pretty_name'], original_row['augment_ratio']] + [f"{original_row[i]:.2f} / {original_row[i + 1]:.2f}" for i in range(2, len(original_row), 2)]

    # Assuming the first row contains the original scores
    original_scores = original_row[2:]

    # Iterate through the remaining rows, combining the EM and F1 columns
    for i in range(1, len(df)):
        row = df.iloc[i]
        combined_scores = []
        for j in range(2, len(row), 2):
            em_score = row[j] - original_scores[j - 2]
            f1_score = row[j + 1] - original_scores[j - 1]
            
            em_arrow = "\\greenarrowup" if em_score > 0 else "\\redarrowdown" if em_score < 0 else ""
            f1_arrow = "\\greenarrowup" if f1_score > 0 else "\\redarrowdown" if f1_score < 0 else ""
            
            if em_arrow == f1_arrow:
                combined = f"{em_arrow}{{{em_score:.2f} / {f1_score:.2f}}}" if em_arrow else f"{em_score:.2f} / {f1_score:.2f}"
            else:
                combined = f"{em_arrow}{{{em_score:.2f}}} / {f1_arrow}{{{f1_score:.2f}}}"
            
            combined_scores.append(combined)

        # Add the combined scores to the new DataFrame
        new_df.loc[i] = [row['pretty_name'], row['augment_ratio']] + combined_scores
    
    return new_df

def export_latex_tables(df, filename):
    # Assuming that 'monolingual' column is a boolean column indicating whether the augmentation is monolingual or not
    # Adjust the condition according to your DataFrame
    monolingual_df = df[df['monolingual']]
    cross_lingual_df = df[~df['monolingual']]

    # Drop the 'monolingual' column as it was only used for filtering
    monolingual_df = monolingual_df.drop(columns=['monolingual'])
    cross_lingual_df = cross_lingual_df.drop(columns=['monolingual'])

    # Convert both DataFrames to LaTeX format
    monolingual_latex = monolingual_df.to_latex(index=False)
    cross_lingual_latex = cross_lingual_df.to_latex(index=False)

    # Combine the LaTeX tables and add a midline between them
    combined_latex = monolingual_latex + "\n\\midrule\n" + cross_lingual_latex

    # Save the combined LaTeX tables to the specified file
    with open(filename, 'w') as file:
        file.write(combined_latex)


export_latex_tables(
        compute_relative_scores(best_models_test_metrics_sorted).to_latex(
            escape=False,
            index=False,
            float_format="{:0.2f}".format,
        ),
        "cosine_distance.tex"
)

# SLEM

In [32]:
# Importing required libraries
import pandas as pd

# Loading the CSV file
file_path = 'slem.csv'
data = pd.read_csv(file_path)

# Displaying the first few rows to understand the structure
data.head(10)

Unnamed: 0,Name,test_exact_match,test_f1,Runtime,Notes,State,Tags,eval/loss,eval/runtime,eval/samples_per_second,...,train/global_step,train/learning_rate,train/loss,train/total_flos,train/train_loss,train/train_runtime,train/train_samples_per_second,train/train_steps_per_second,val_exact_match,val_f1
0,th_wordnet_aug_1.0,41.101175,54.713284,4566,-,finished,slem,2.396067,7.486,301.362,...,5640,4.432624e-08,0.1107,76607636381752320,0.797324,4217.3024,85.562,1.337,51.595745,62.900263
1,th_wordnet_aug_0.9,38.593723,53.270918,4383,-,finished,slem,2.3119,7.5081,300.477,...,5360,5.597015e-08,0.1424,72777679169894400,0.839547,4040.7511,84.836,1.326,49.689716,61.851385
2,th_wordnet_aug_0.8,39.224969,52.982076,4195,-,finished,slem,2.241398,7.5166,300.134,...,5080,4.429134e-08,0.1569,68947721958036480,0.854922,3844.4154,84.476,1.321,47.916667,60.225599
3,th_wordnet_aug_0.7,39.049623,53.857083,3960,-,finished,slem,2.163667,7.4843,301.431,...,4800,4.6875e-08,0.1598,65117764746178560,0.906016,3628.1722,84.538,1.323,51.41844,63.044351
4,th_wordnet_aug_0.6,40.873225,54.903598,3795,-,finished,slem,2.073708,7.4701,302.002,...,4520,4.977876e-08,0.1623,61287807534320640,0.91499,3454.6001,83.564,1.308,51.108156,62.643491
5,th_wordnet_aug_0.5,38.874277,53.396272,3569,-,finished,slem,2.011191,7.4747,301.819,...,4220,4.739336e-08,0.1843,57318791454737280,0.965117,3248.3382,83.31,1.299,49.955674,61.758513
6,th_wordnet_aug_0.4,40.575136,54.565553,3374,-,finished,slem,1.937988,7.482,301.525,...,3940,6.979695e-08,0.2443,53490108064568450,0.988335,3045.6076,82.933,1.294,51.329787,63.377183
7,th_wordnet_aug_0.3,39.751008,54.464701,3160,-,finished,slem,1.92606,7.5284,299.666,...,3660,5.464481e-08,0.2253,49661424674399620,1.022299,2850.1654,82.29,1.284,50.044326,61.79041
8,th_wordnet_aug_0.2,38.593723,53.001801,2960,-,finished,slem,1.92975,7.4688,302.058,...,3380,5.91716e-08,0.2402,45832741284230780,1.050339,2655.0688,81.542,1.273,48.758865,60.852043
9,th_wordnet_aug_0.1,39.873751,54.604449,2798,-,finished,slem,1.726772,7.4838,301.451,...,3100,8.064516e-08,0.3456,42004057894061950,1.155032,2468.6859,80.391,1.256,51.329787,62.564199


In [33]:
# filter out name that has 1.0 inside
data = data[data['Name'].str.contains('1.0') == False]
data

Unnamed: 0,Name,test_exact_match,test_f1,Runtime,Notes,State,Tags,eval/loss,eval/runtime,eval/samples_per_second,...,train/global_step,train/learning_rate,train/loss,train/total_flos,train/train_loss,train/train_runtime,train/train_samples_per_second,train/train_steps_per_second,val_exact_match,val_f1
1,th_wordnet_aug_0.9,38.593723,53.270918,4383,-,finished,slem,2.311900,7.5081,300.477,...,5360,5.597015e-08,0.1424,72777679169894400,0.839547,4040.7511,84.836,1.326,49.689716,61.851385
2,th_wordnet_aug_0.8,39.224969,52.982076,4195,-,finished,slem,2.241398,7.5166,300.134,...,5080,4.429134e-08,0.1569,68947721958036480,0.854922,3844.4154,84.476,1.321,47.916667,60.225599
3,th_wordnet_aug_0.7,39.049623,53.857083,3960,-,finished,slem,2.163667,7.4843,301.431,...,4800,4.687500e-08,0.1598,65117764746178560,0.906016,3628.1722,84.538,1.323,51.418440,63.044351
4,th_wordnet_aug_0.6,40.873225,54.903598,3795,-,finished,slem,2.073708,7.4701,302.002,...,4520,4.977876e-08,0.1623,61287807534320640,0.914990,3454.6001,83.564,1.308,51.108156,62.643491
5,th_wordnet_aug_0.5,38.874277,53.396272,3569,-,finished,slem,2.011191,7.4747,301.819,...,4220,4.739336e-08,0.1843,57318791454737280,0.965117,3248.3382,83.310,1.299,49.955674,61.758513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,th_aug_0.5,37.208487,52.723649,3670,-,finished,slem,1.948586,7.5235,299.861,...,4220,5.331754e-08,0.2553,57318791454737280,1.066411,3301.6425,81.965,1.278,48.448582,61.015501
126,th_aug_0.4,40.171839,53.781333,3426,-,finished,slem,1.935652,7.4867,301.334,...,3940,5.710660e-08,0.2839,53490108064568450,1.110444,3075.4920,82.127,1.281,50.354610,62.052183
127,th_aug_0.3,39.400316,53.765047,3265,-,finished,slem,1.736972,7.5043,300.629,...,3660,6.147541e-08,0.3137,49661424674399620,1.152881,2918.1682,80.372,1.254,50.044326,62.075772
128,th_aug_0.2,40.224443,53.541545,3034,-,finished,slem,1.771218,7.5009,300.764,...,3380,6.656805e-08,0.2955,45832741284230780,1.163375,2688.3533,80.533,1.257,50.664894,62.534593


In [34]:
pretty_names = {
    "th_qcpg_0.8_llm_gec_aug": "QCPG (0.8) + LLM GEC",
    "th_qcpg_0.5_llm_gec_aug": "QCPG (0.5) + LLM GEC",
    "th_qcpg_0.2_llm_gec_aug": "QCPG (0.2) + LLM GEC",
    "th_qcpg_0.8_aug": "QCPG (0.8)",
    "th_qcpg_0.5_aug": "QCPG (0.5)",
    "th_qcpg_0.2_aug": "QCPG (0.2)",
    "th_aug": "Backtranslation",
    "th_fasttext_aug": "FastText",
    "th_llm_gec_aug": "LLM GEC",
    "th_llm_paraphrase_aug": "LLM Paraphrase",
    "th_ltw2v_aug": "LTW2Vec",
    "th_thai2fit_aug": "Thai2Fit",
    "th_wordnet_aug": "WordNet",
}

In [35]:
# Function to extract the augmentation type from the name
def get_augmentation_type(name):
    if name == "original":
        return "original"
    return name.rsplit("_", 2)[0]

# Function to extract the augmentation ratio from the name
def get_augment_ratio(name):
    if name == "original":
        return "N/A" # Not applicable for the original model
    return name.rsplit("_", 1)[-1]

# Function to extract the pretty name (augmentation name without ratio) from the name
def get_pretty_name(name):
    if name == "original":
        return "Original"
    base_name = name.rsplit("_", 1)[0]
    return pretty_names[base_name]


# Apply the function to create a new column with the augmentation type
data['augmentation_type'] = data['Name'].apply(get_augmentation_type)

# Find the best performing model for each augmentation type based on the "test_exact_match" metric
best_models = data.loc[data.groupby('augmentation_type')['test_exact_match'].idxmax()]

# Resetting the index
best_models.reset_index(drop=True, inplace=True)

# Apply the functions to create the new columns
best_models['augment_ratio'] = best_models['Name'].apply(get_augment_ratio)
best_models['pretty_name'] = best_models['Name'].apply(get_pretty_name)

# Displaying the results
best_models[['Name', 'pretty_name', 'test_exact_match']]

Unnamed: 0,Name,pretty_name,test_exact_match
0,th_aug_0.6,Backtranslation,41.34666
1,th_fasttext_aug_0.7,FastText,40.838155
2,th_llm_gec_aug_0.5,LLM GEC,40.785551
3,th_llm_paraphrase_aug_0.7,LLM Paraphrase,40.838155
4,th_ltw2v_aug_0.7,LTW2Vec,41.57461
5,th_qcpg_0.2_aug_0.3,QCPG (0.2),39.90882
6,th_qcpg_0.2_llm_gec_aug_0.7,QCPG (0.2) + LLM GEC,41.048571
7,th_qcpg_0.5_aug_0.5,QCPG (0.5),39.891285
8,th_qcpg_0.5_llm_gec_aug_0.2,QCPG (0.5) + LLM GEC,39.6458
9,th_qcpg_0.8_aug_0.8,QCPG (0.8),40.610205


In [36]:
# Selecting columns that begin with "test" along with the "Name" column
test_columns = [col for col in data.columns if col.startswith("test")]
selected_columns = ['pretty_name', 'augment_ratio', "val_exact_match", "val_f1", "test_tydiqa_exact_match", "test_tydiqa_f1", "test_xquad_exact_match", "test_xquad_f1"]

# Selecting the relevant columns from the best models
best_models_test_metrics = best_models[selected_columns]

# Sorting the results by "test_exact_match" in descending order
best_models_test_metrics_sorted = best_models_test_metrics.sort_values(by='test_tydiqa_exact_match', ascending=False)

# Displaying the sorted results
best_models_test_metrics_sorted

Unnamed: 0,pretty_name,augment_ratio,val_exact_match,val_f1,test_tydiqa_exact_match,test_tydiqa_f1,test_xquad_exact_match,test_xquad_f1
12,WordNet,0.6,51.108156,62.643491,43.577272,57.353464,30.508475,45.513136
4,LTW2Vec,0.7,51.019504,62.361371,43.488835,56.457985,34.237288,47.769619
0,Backtranslation,0.6,51.241135,62.652818,43.090869,56.846774,34.661017,48.081519
6,QCPG (0.2) + LLM GEC,0.7,50.576241,61.826054,42.936104,55.735174,33.813559,47.41625
11,Thai2Fit,0.4,50.576241,62.25253,42.891886,56.086161,33.898305,47.179939
1,FastText,0.7,50.842199,62.349252,42.648685,55.970625,33.898305,47.055486
3,LLM Paraphrase,0.7,50.265957,61.958406,42.516029,56.076793,34.40678,48.045
2,LLM GEC,0.5,52.304965,63.541967,42.49392,55.754316,34.237288,47.499111
9,QCPG (0.8),0.8,51.108156,62.468492,42.361265,56.223504,33.898305,47.880145
7,QCPG (0.5),0.5,50.35461,61.682652,41.941189,55.332033,32.033898,45.787066


In [37]:
# Round all numerical columns to 2 decimal places
best_models_test_metrics_sorted = best_models_test_metrics_sorted.round(2)
best_models_test_metrics_sorted

Unnamed: 0,pretty_name,augment_ratio,val_exact_match,val_f1,test_tydiqa_exact_match,test_tydiqa_f1,test_xquad_exact_match,test_xquad_f1
12,WordNet,0.6,51.11,62.64,43.58,57.35,30.51,45.51
4,LTW2Vec,0.7,51.02,62.36,43.49,56.46,34.24,47.77
0,Backtranslation,0.6,51.24,62.65,43.09,56.85,34.66,48.08
6,QCPG (0.2) + LLM GEC,0.7,50.58,61.83,42.94,55.74,33.81,47.42
11,Thai2Fit,0.4,50.58,62.25,42.89,56.09,33.9,47.18
1,FastText,0.7,50.84,62.35,42.65,55.97,33.9,47.06
3,LLM Paraphrase,0.7,50.27,61.96,42.52,56.08,34.41,48.04
2,LLM GEC,0.5,52.3,63.54,42.49,55.75,34.24,47.5
9,QCPG (0.8),0.8,51.11,62.47,42.36,56.22,33.9,47.88
7,QCPG (0.5),0.5,50.35,61.68,41.94,55.33,32.03,45.79


In [38]:
from IPython.display import Markdown
# Function to create a markdown table row (corrected)
def create_md_row(row, best_values, ignore_bestval_columns=["pretty_name", "augment_ratio"]):
    md_row = "| "
    for col, value in row.items():  # Using 'items' instead of 'iteritems'
        if value == best_values[col] and col not in ignore_bestval_columns:
            md_row += f"**{value}** | "
        else:
            md_row += f"{value} | "
    return md_row

best_values = best_models_test_metrics_sorted.max()

# Creating markdown table rows (corrected)
md_table_corrected = "| " + " | ".join(selected_columns) + " |\n"
md_table_corrected += "| " + " | ".join(["-" * len(col) for col in selected_columns]) + " |\n"
for _, row in best_models_test_metrics_sorted.iterrows():
    md_table_corrected += create_md_row(row, best_values) + "\n"

# Displaying the corrected markdown table
Markdown(md_table_corrected)

| pretty_name | augment_ratio | val_exact_match | val_f1 | test_tydiqa_exact_match | test_tydiqa_f1 | test_xquad_exact_match | test_xquad_f1 |
| ----------- | ------------- | --------------- | ------ | ----------------------- | -------------- | ---------------------- | ------------- |
| WordNet | 0.6 | 51.11 | 62.64 | **43.58** | **57.35** | 30.51 | 45.51 | 
| LTW2Vec | 0.7 | 51.02 | 62.36 | 43.49 | 56.46 | 34.24 | 47.77 | 
| Backtranslation | 0.6 | 51.24 | 62.65 | 43.09 | 56.85 | **34.66** | **48.08** | 
| QCPG (0.2) + LLM GEC | 0.7 | 50.58 | 61.83 | 42.94 | 55.74 | 33.81 | 47.42 | 
| Thai2Fit | 0.4 | 50.58 | 62.25 | 42.89 | 56.09 | 33.9 | 47.18 | 
| FastText | 0.7 | 50.84 | 62.35 | 42.65 | 55.97 | 33.9 | 47.06 | 
| LLM Paraphrase | 0.7 | 50.27 | 61.96 | 42.52 | 56.08 | 34.41 | 48.04 | 
| LLM GEC | 0.5 | **52.3** | **63.54** | 42.49 | 55.75 | 34.24 | 47.5 | 
| QCPG (0.8) | 0.8 | 51.11 | 62.47 | 42.36 | 56.22 | 33.9 | 47.88 | 
| QCPG (0.5) | 0.5 | 50.35 | 61.68 | 41.94 | 55.33 | 32.03 | 45.79 | 
| QCPG (0.8) + LLM GEC | 0.7 | 50.71 | 62.27 | 41.85 | 55.24 | 34.58 | 47.83 | 
| QCPG (0.2) | 0.3 | 49.56 | 61.23 | 41.83 | 55.42 | 32.54 | 46.7 | 
| QCPG (0.5) + LLM GEC | 0.2 | 50.84 | 62.3 | 41.26 | 55.41 | 33.47 | 47.04 | 


In [39]:
with open("slem.tex", "w") as tf:
    tf.write(
        compute_relative_scores(best_models_test_metrics_sorted).to_latex(
            escape=False,
            index=False,
            float_format="{:0.2f}".format,
        )
    )