In [1]:
# Importing required libraries
import pandas as pd

# Read the exported run metrics from a CSV file located relative to the
# notebook's working directory.
file_path = 'cosine_distance.csv'
data = pd.read_csv(file_path)

# Preview the first ten rows to check which columns are available.
data.head(10)

Unnamed: 0,Name,test_exact_match,test_f1,Runtime,Notes,State,Tags,eval/loss,eval/runtime,eval/samples_per_second,...,train/global_step,train/learning_rate,train/loss,train/total_flos,train/train_loss,train/train_runtime,train/train_samples_per_second,train/train_steps_per_second,val_exact_match,val_f1
0,th_wordnet_aug_1.0,39.575662,54.099859,3011,-,finished,,2.364525,5.105,441.916,...,5640,0,0.1326,76607636381752320,0.790639,2867.3967,125.842,1.967,51.329787,62.947724
1,th_wordnet_aug_0.9,40.294582,54.295754,2871,-,finished,,2.357939,5.1034,442.058,...,5360,0,0.162,72777679169894400,0.836683,2727.1321,125.7,1.965,49.379433,61.372919
2,th_wordnet_aug_0.8,38.997019,53.889524,2732,-,finished,,2.198472,5.1039,442.015,...,5080,0,0.1241,68947721958036480,0.841739,2589.0343,125.437,1.962,49.64539,61.390519
3,th_wordnet_aug_0.7,40.522532,54.269615,2596,-,finished,,2.172062,5.1066,441.779,...,4800,0,0.175,65117764746178560,0.906674,2455.6063,124.906,1.955,50.975177,61.952266
4,th_wordnet_aug_0.6,37.03314,52.975213,2459,-,finished,,2.220396,5.1042,441.99,...,4520,0,0.1403,61287807534320640,0.845333,2320.3382,124.413,1.948,50.487589,62.225778
5,th_wordnet_aug_0.5,37.874803,52.672719,2313,-,finished,,2.052335,5.1119,441.321,...,4220,0,0.1229,57318791454737280,0.86233,2175.7934,124.378,1.94,49.689716,61.418748
6,th_wordnet_aug_0.4,39.733474,53.050482,2174,-,finished,,2.084451,5.1134,441.19,...,3940,0,0.1738,53490108064568450,0.903963,2038.8332,123.885,1.932,50.664894,62.222471
7,th_wordnet_aug_0.3,38.804138,53.73571,2029,-,finished,,1.965081,5.1116,441.35,...,3660,0,0.2114,49661424674399620,0.976911,1900.2045,123.429,1.926,51.108156,62.652332
8,th_wordnet_aug_0.2,38.909346,53.22636,1896,-,finished,,1.92808,5.1099,441.497,...,3380,0,0.2209,45832741284230780,1.009166,1763.6275,122.758,1.917,49.64539,61.364477
9,th_wordnet_aug_0.1,37.78713,52.773388,1754,-,finished,,1.860637,5.1073,441.717,...,3100,0,0.2612,42004057894061950,1.084156,1625.0496,122.126,1.908,50.221631,62.119381


In [2]:
# Display labels used in the report tables.
# Keys are run names with the trailing "_<ratio>" segment removed (this is the
# key that get_pretty_name looks up); values are the human-readable labels.
pretty_names = {
    "th_qcpg_0.8_llm_gec_aug": "QCPG (0.8) + LLM GEC",
    "th_qcpg_0.5_llm_gec_aug": "QCPG (0.5) + LLM GEC",
    "th_qcpg_0.2_llm_gec_aug": "QCPG (0.2) + LLM GEC",
    "th_qcpg_0.8_aug": "QCPG (0.8)",
    "th_qcpg_0.5_aug": "QCPG (0.5)",
    "th_qcpg_0.2_aug": "QCPG (0.2)",
    "th_aug": "Backtranslation",
    "th_fasttext_aug": "FastText",
    "th_llm_gec_aug": "LLM GEC",
    "th_llm_paraphrase_aug": "LLM Paraphrase",
    "th_ltw2v_aug": "LTW2Vec",
    "th_thai2fit_aug": "Thai2Fit",
    "th_wordnet_aug": "WordNet",
}

In [3]:
def get_augmentation_type(name):
    """Return the grouping key for a run name.

    The special name "original" is returned unchanged. For any other name the
    last two underscore-separated segments are dropped (for example
    "th_wordnet_aug_0.7" becomes "th_wordnet"); a name with fewer than two
    underscores yields its first segment.
    """
    return "original" if name == "original" else name.rsplit("_", maxsplit=2)[0]

def get_augment_ratio(name):
    """Return the text after the last underscore of a run name.

    The result is a string (e.g. "0.7" for "th_wordnet_aug_0.7"). The baseline
    run named "original" has no ratio, so "N/A" is returned for it. A name
    without any underscore is returned as-is.
    """
    if name != "original":
        # rpartition yields (head, sep, tail); tail is the whole string when
        # no underscore is present.
        _, _, tail = name.rpartition("_")
        return tail
    return "N/A"

def get_pretty_name(name):
    """Return the display label for a run name.

    "original" maps to "Original". Any other name has its final
    underscore-separated segment removed and the remainder is looked up in the
    module-level ``pretty_names`` dict; an unknown key raises ``KeyError``.
    """
    if name == "original":
        return "Original"
    base_name, *_ = name.rsplit("_", maxsplit=1)
    return pretty_names[base_name]


# Tag every run with its grouping key so runs of the same augmentation can be compared
data['augmentation_type'] = data['Name'].apply(get_augmentation_type)

# Row label of the highest "test_exact_match" within each augmentation type
best_row_labels = data.groupby('augmentation_type')['test_exact_match'].idxmax()

# Pull those rows, renumber them from zero, and attach the ratio / display-name columns
best_models = (
    data.loc[best_row_labels]
    .reset_index(drop=True)
    .assign(
        augment_ratio=lambda frame: frame['Name'].apply(get_augment_ratio),
        pretty_name=lambda frame: frame['Name'].apply(get_pretty_name),
    )
)

# Show the winning run per augmentation type
best_models[['Name', 'pretty_name', 'test_exact_match']]

Unnamed: 0,Name,pretty_name,test_exact_match
0,original,Original,39.242504
1,th_aug_1.0,Backtranslation,41.381729
2,th_fasttext_aug_0.8,FastText,40.803086
3,th_llm_gec_aug_0.7,LLM GEC,41.08364
4,th_llm_paraphrase_aug_0.5,LLM Paraphrase,40.540067
5,th_ltw2v_aug_1.0,LTW2Vec,40.189374
6,th_qcpg_0.2_aug_0.4,QCPG (0.2),39.41785
7,th_qcpg_0.2_llm_gec_aug_0.2,QCPG (0.2) + LLM GEC,40.259513
8,th_qcpg_0.5_aug_0.7,QCPG (0.5),40.119235
9,th_qcpg_0.5_llm_gec_aug_0.4,QCPG (0.5) + LLM GEC,40.504997


In [4]:
# Names of all columns starting with "test" (not used by the statements in this cell;
# the reported columns are listed explicitly below)
test_columns = [column for column in data.columns if column.startswith("test")]

# Columns that make up the report table, in display order
selected_columns = [
    'pretty_name',
    'augment_ratio',
    "val_exact_match",
    "val_f1",
    "test_tydiqa_exact_match",
    "test_tydiqa_f1",
    "test_xquad_exact_match",
    "test_xquad_f1",
]

# Restrict the best runs to the report columns
best_models_test_metrics = best_models[selected_columns]

# Order the rows by "test_tydiqa_exact_match", highest first
best_models_test_metrics_sorted = best_models_test_metrics.sort_values(
    by='test_tydiqa_exact_match',
    ascending=False,
)

# Show the ordered table
best_models_test_metrics_sorted

Unnamed: 0,pretty_name,augment_ratio,val_exact_match,val_f1,test_tydiqa_exact_match,test_tydiqa_f1,test_xquad_exact_match,test_xquad_f1
11,QCPG (0.8) + LLM GEC,1.0,50.531915,62.467403,43.245633,56.076068,34.915254,49.032274
1,Backtranslation,1.0,50.930851,62.698039,42.958214,56.260604,35.338983,48.6357
3,LLM GEC,0.7,50.310284,61.839691,42.913995,56.377384,34.067797,48.076127
2,FastText,0.8,50.265957,61.910283,42.847667,56.141118,32.966102,47.223199
10,QCPG (0.8),0.7,50.177305,62.113025,42.692903,55.391331,34.915254,47.937018
9,QCPG (0.5) + LLM GEC,0.4,49.29078,61.64428,42.471811,56.061706,32.966102,46.328062
13,WordNet,0.7,50.975177,61.952266,42.383374,55.979663,33.389831,47.714915
4,LLM Paraphrase,0.5,50.886525,62.352358,42.317046,55.670707,33.728814,47.019141
12,Thai2Fit,1.0,51.241135,63.06627,42.2065,56.134349,32.79661,45.984167
5,LTW2Vec,1.0,51.196809,62.484083,42.007517,55.898205,33.220339,47.849854


In [5]:
# Keep two decimal places in every numeric column; the rounded frame replaces
# the unrounded one under the same name, which the later cells read.
best_models_test_metrics_sorted = best_models_test_metrics_sorted.round(decimals=2)
best_models_test_metrics_sorted

Unnamed: 0,pretty_name,augment_ratio,val_exact_match,val_f1,test_tydiqa_exact_match,test_tydiqa_f1,test_xquad_exact_match,test_xquad_f1
11,QCPG (0.8) + LLM GEC,1.0,50.53,62.47,43.25,56.08,34.92,49.03
1,Backtranslation,1.0,50.93,62.7,42.96,56.26,35.34,48.64
3,LLM GEC,0.7,50.31,61.84,42.91,56.38,34.07,48.08
2,FastText,0.8,50.27,61.91,42.85,56.14,32.97,47.22
10,QCPG (0.8),0.7,50.18,62.11,42.69,55.39,34.92,47.94
9,QCPG (0.5) + LLM GEC,0.4,49.29,61.64,42.47,56.06,32.97,46.33
13,WordNet,0.7,50.98,61.95,42.38,55.98,33.39,47.71
4,LLM Paraphrase,0.5,50.89,62.35,42.32,55.67,33.73,47.02
12,Thai2Fit,1.0,51.24,63.07,42.21,56.13,32.8,45.98
5,LTW2Vec,1.0,51.2,62.48,42.01,55.9,33.22,47.85


In [6]:
from IPython.display import Markdown
def create_md_row(row, best_values, ignore_bestval_columns=("pretty_name", "augment_ratio")):
    """Render one table row as a Markdown pipe-delimited line.

    Parameters
    ----------
    row : object with ``.items()`` yielding ``(column, value)`` pairs
        For example a pandas Series produced by ``DataFrame.iterrows()``.
    best_values : mapping or Series indexed by column name
        The value that counts as "best" for each column.
    ignore_bestval_columns : collection of str
        Columns that are never bolded, even when the value equals the best
        value. The default is an immutable tuple so it cannot be shared and
        mutated between calls.

    Returns
    -------
    str
        ``"| "`` followed by ``"<cell> | "`` for every column; a cell equal to
        the column's best value is wrapped in ``**...**``. No newline is
        appended.
    """
    cells = []
    for col, value in row.items():
        # Check the ignore list first so ignored columns never trigger a
        # lookup or comparison against best_values.
        is_best = col not in ignore_bestval_columns and value == best_values[col]
        cells.append(f"**{value}**" if is_best else f"{value}")
    return "| " + "".join(f"{cell} | " for cell in cells)

# Column-wise maxima; create_md_row bolds cells equal to these
best_values = best_models_test_metrics_sorted.max()

# Header line, then a dash rule as wide as each column name
header_line = "| " + " | ".join(selected_columns) + " |\n"
divider_line = "| " + " | ".join("-" * len(col) for col in selected_columns) + " |\n"

# One rendered line per data row, each terminated by a newline
body_lines = [
    create_md_row(row, best_values) + "\n"
    for _, row in best_models_test_metrics_sorted.iterrows()
]

md_table_corrected = header_line + divider_line + "".join(body_lines)

# Render the assembled table as Markdown
Markdown(md_table_corrected)

| pretty_name | augment_ratio | val_exact_match | val_f1 | test_tydiqa_exact_match | test_tydiqa_f1 | test_xquad_exact_match | test_xquad_f1 |
| ----------- | ------------- | --------------- | ------ | ----------------------- | -------------- | ---------------------- | ------------- |
| QCPG (0.8) + LLM GEC | 1.0 | 50.53 | 62.47 | **43.25** | 56.08 | 34.92 | **49.03** | 
| Backtranslation | 1.0 | 50.93 | 62.7 | 42.96 | 56.26 | **35.34** | 48.64 | 
| LLM GEC | 0.7 | 50.31 | 61.84 | 42.91 | **56.38** | 34.07 | 48.08 | 
| FastText | 0.8 | 50.27 | 61.91 | 42.85 | 56.14 | 32.97 | 47.22 | 
| QCPG (0.8) | 0.7 | 50.18 | 62.11 | 42.69 | 55.39 | 34.92 | 47.94 | 
| QCPG (0.5) + LLM GEC | 0.4 | 49.29 | 61.64 | 42.47 | 56.06 | 32.97 | 46.33 | 
| WordNet | 0.7 | 50.98 | 61.95 | 42.38 | 55.98 | 33.39 | 47.71 | 
| LLM Paraphrase | 0.5 | 50.89 | 62.35 | 42.32 | 55.67 | 33.73 | 47.02 | 
| Thai2Fit | 1.0 | **51.24** | **63.07** | 42.21 | 56.13 | 32.8 | 45.98 | 
| LTW2Vec | 1.0 | 51.2 | 62.48 | 42.01 | 55.9 | 33.22 | 47.85 | 
| QCPG (0.2) + LLM GEC | 0.2 | 50.35 | 62.08 | 41.96 | 55.28 | 33.73 | 47.37 | 
| QCPG (0.5) | 0.7 | 49.65 | 61.11 | 41.72 | 54.98 | 33.98 | 47.68 | 
| QCPG (0.2) | 0.4 | 49.6 | 60.83 | 41.12 | 54.29 | 32.88 | 45.6 | 
| Original | N/A | 50.35 | 62.28 | 40.64 | 55.1 | 33.9 | 47.92 | 


In [7]:
def to_latex_with_bold(df):
    r"""Return a copy of ``df`` with each column's maximum wrapped in ``\textbf{}``.

    The first two columns are left untouched. For every column from the third
    onward, each cell equal to that column's maximum is replaced by the string
    ``"\textbf{<value>}"``; other cells keep their original value, so the
    affected columns end up holding a mix of numbers and strings.

    This function does not produce LaTeX itself: it returns a DataFrame, and
    the caller is expected to call ``.to_latex(escape=False)`` on it so the
    markup is not escaped. The input frame is not modified.
    """
    styled = df.copy()
    max_values = styled.iloc[:, 2:].max()

    for col in styled.columns[2:]:
        # Look the maximum up by column label. Indexing the Series with the
        # loop position treated an integer as a label-or-position key, which
        # pandas deprecates and which fails when column labels are integers.
        best = max_values[col]
        styled[col] = styled[col].apply(
            lambda x: f"\\textbf{{{x}}}" if x == best else x
        )

    return styled


# Write the table with bolded maxima to a .tex file. escape=False is passed so
# the \textbf{...} strings are emitted verbatim rather than escaped.
with open("cosine_distance.tex", "w") as tf:
    latex_headers = [
        "Augmentation",
        "Ratio",
        "Val EM",
        "Val F1",
        "TyDiQA EM",
        "TyDiQA F1",
        "XQuAD EM",
        "XQuAD F1",
    ]
    latex_table = to_latex_with_bold(best_models_test_metrics_sorted).to_latex(
        escape=False,
        index=False,
        header=latex_headers,
        float_format="{:0.2f}".format,
    )
    tf.write(latex_table)