# Create Figures for Paper

Ce notebook permet de créer les figures du papier.

In [1]:
# ---------------------------- PREPARING NOTEBOOK ---------------------------- #
# Autoreload
%load_ext autoreload
%autoreload 2

# Random seed
import numpy as np
np.random.seed(42)

# External modules
import os
from IPython.display import display, Markdown, Latex, clear_output
from tqdm import notebook as tqdm

# Set global log level
import logging
logging.basicConfig(level=logging.INFO)
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Define PWD as the current git repository
import git
repo = git.Repo('.', search_parent_directories=True)
pwd = repo.working_dir
os.chdir(pwd)

## Loading Data

### OpenAI Evaluation

In [2]:
import pandas as pd

# Load metrics
path = os.path.join(
    pwd, "results", "difficulty_estimation", "OpenAiEvaluation", "metrics.csv"
)
openai_classification_metrics = pd.read_csv(path)
openai_classification_metrics

Unnamed: 0,dataset,context,model,accuracy,f1 (macro),f1 (micro),precision (macro),precision (micro),recall (macro),recall (micro)
0,sentences,CECRL,gpt-3.5-turbo-1106,0.897917,0.89697,0.897917,0.897494,0.897917,0.897917,0.897917
1,sentences,empty,gpt-3.5-turbo-1106,0.866667,0.864868,0.866667,0.866231,0.866667,0.866667,0.866667
2,sentences,CECRL,davinci-002,0.814583,0.812253,0.814583,0.811908,0.814583,0.814583,0.814583
3,ljl,empty,gpt-3.5-turbo-1106,0.733656,0.74584,0.733656,0.74934,0.733656,0.746278,0.733656
4,ljl,CECRL,gpt-3.5-turbo-1106,0.723971,0.735612,0.723971,0.756418,0.723971,0.7255,0.723971
5,sentences,empty,davinci-002,0.825,0.61909,0.825,0.622441,0.825,0.61875,0.825
6,sentences,empty,babbage-002,0.8125,0.609588,0.8125,0.612527,0.8125,0.609375,0.8125
7,sentences,CECRL,babbage-002,0.8125,0.608993,0.8125,0.610264,0.8125,0.609375,0.8125
8,french-difficulty,CECRL,gpt-3.5-turbo-1106,0.498958,0.423531,0.498958,0.428613,0.498958,0.427679,0.498958
9,ljl,empty,davinci-002,0.585956,0.337429,0.585956,0.347085,0.585956,0.331162,0.585956


### Open-source Models Evaluation

In [3]:
# Load metrics
path = os.path.join(
    pwd,
    "results",
    "difficulty_estimation",
    "OpenSourceModelsEvaluation",
    "bert_metrics.csv",
)
bert_classification_metrics = pd.read_csv(path)
bert_classification_metrics

Unnamed: 0.1,Unnamed: 0,accuracy,f1,precision_macro,precision_micro,recall_macro,recall_micro
0,sentences,0.822917,0.821201,0.826776,0.822917,0.822917,0.822917
1,ljl,0.624697,0.631121,0.626671,0.624697,0.640783,0.624697
2,french_difficulty,0.522917,0.51258,0.528225,0.522917,0.522917,0.522917


In [4]:
# Load metrics
path = os.path.join(
    pwd,
    "results",
    "difficulty_estimation",
    "OpenSourceModelsEvaluation",
    "mistral_metrics.csv",
)
mistral_classification_metrics = pd.read_csv(path)
mistral_classification_metrics

Unnamed: 0,dataset,context,accuracy,f1,precision_macro,precision_micro,recall_macro,recall_micro
0,sentences,CECRL,0.7479,0.7347,0.7739,0.7479,0.7479,0.7479
1,ljl,CECRL,0.6368,0.6281,0.6805,0.6368,0.6056,0.6368
2,sentences,no-context,0.6312,0.6279,0.6505,0.6312,0.6312,0.6312
3,french-difficulty,CECRL,0.5125,0.5151,0.5212,0.5125,0.5125,0.5125
4,ljl,no-context,0.4722,0.3376,0.4739,0.4722,0.3556,0.4722
5,french-difficulty,no-context,0.3542,0.3063,0.448,0.3542,0.3542,0.3542


### Pairwise Mismatched Evaluation

In [5]:
# Load metrics
path = os.path.join(
    pwd,
    "results",
    "difficulty_estimation",
    "PairwiseMismatch",
    "readability_index_classification_metrics.csv",
)
readability_index_classification_metrics = pd.read_csv(path)
readability_index_classification_metrics

Unnamed: 0,dataset,model,accuracy,f1,precision,recall
0,ljl,gfi,0.4504,0.4119,0.4692,0.4051
1,ljl,fkgl,0.4213,0.4064,0.4704,0.3913
2,ljl,ari,0.3995,0.3229,0.4462,0.335
3,french-difficulty,fkgl,0.3458,0.3366,0.3386,0.3458
4,sentences,ari,0.3438,0.2797,0.2941,0.3438
5,french-difficulty,gfi,0.3417,0.3141,0.323,0.3417
6,french-difficulty,ari,0.3417,0.3325,0.334,0.3417
7,sentences,fkgl,0.3354,0.2856,0.293,0.3354
8,sentences,gfi,0.3229,0.2786,0.2932,0.3229


In [6]:
# Load metrics
path = os.path.join(
    pwd,
    "results",
    "difficulty_estimation",
    "PairwiseMismatch",
    "readability_index_pairwise_mismatch.csv",
)
readability_index_pairwise_mismatch = pd.read_csv(path)
readability_index_pairwise_mismatch

Unnamed: 0,dataset,model,pairwise_mismatch
0,ljl,gfi,44.0969
1,ljl,ari,47.0944
2,ljl,fkgl,54.5375
3,sentences,ari,88.2375
4,sentences,fkgl,99.3167
5,sentences,gfi,101.9583
6,french-difficulty,ari,111.5646
7,french-difficulty,fkgl,112.3771
8,french-difficulty,gfi,112.6271


In [7]:
# Load metrics
path = os.path.join(
    pwd,
    "results",
    "difficulty_estimation",
    "PairwiseMismatch",
    "bert_pairwise_mismatch.csv",
)
bert_pairwise_mismatch = pd.read_csv(path)
bert_pairwise_mismatch

Unnamed: 0.1,Unnamed: 0,Pairwise mismatch
0,ljl,13.9516
1,sentences,22.2125
2,french_difficulty,36.9812


In [8]:
# Load metrics
path = os.path.join(
    pwd,
    "results",
    "difficulty_estimation",
    "PairwiseMismatch",
    "openai_pairwise_mismatch.csv",
)
openai_pairwise_mismatch = pd.read_csv(path)
openai_pairwise_mismatch

Unnamed: 0,dataset,context,model,pairwise_mismatch
0,ljl,empty,gpt-3.5-turbo-1106,9.2736
1,ljl,CECRL,gpt-3.5-turbo-1106,11.0605
2,sentences,CECRL,gpt-3.5-turbo-1106,13.1125
3,sentences,empty,gpt-3.5-turbo-1106,18.1083
4,ljl,empty,babbage-002,18.3341
5,ljl,CECRL,davinci-002,18.6925
6,sentences,empty,davinci-002,20.9042
7,ljl,empty,davinci-002,21.0896
8,sentences,empty,babbage-002,21.2625
9,ljl,CECRL,babbage-002,23.7482


In [9]:
# Load metrics
path = os.path.join(
    pwd,
    "results",
    "difficulty_estimation",
    "PairwiseMismatch",
    "mistral_pairwise_mismatch.csv",
)
mistral_pairwise_mismatch = pd.read_csv(path)
mistral_pairwise_mismatch

Unnamed: 0.1,Unnamed: 0,Pairwise mismatch
0,ljl_CECRL,13.6223
1,ljl_no-context,29.9855
2,french_difficulty_no-context,39.3771
3,sentences_CECRL,39.7625
4,french_difficulty_CECRL,48.4146
5,sentences_no-context,61.7542


## Figures Creation

In [10]:
# Load metrics
metrics = {
    "readability_index_classification_metrics": readability_index_classification_metrics.copy(),
    "bert_classification_metrics": bert_classification_metrics.copy(),
    "mistral_classification_metrics": mistral_classification_metrics.copy(),
    "openai_classification_metrics": openai_classification_metrics.copy(),
    "readability_index_pairwise_mismatch": readability_index_pairwise_mismatch.copy(),
    "bert_pairwise_mismatch": bert_pairwise_mismatch.copy(),
    "mistral_pairwise_mismatch": mistral_pairwise_mismatch.copy(),
    "openai_pairwise_mismatch": openai_pairwise_mismatch.copy(),
}

# Format all dataframe to have the same columns in classification metrics
## Readability index
metrics["readability_index_classification_metrics"].columns = [
    "dataset",
    "model",
    "accuracy",
    "f1 (micro)",
    "precision (micro)",
    "recall (micro)",
]
metrics["readability_index_classification_metrics"]["context"] = "empty"
## CamemBERT
metrics["bert_classification_metrics"].drop(
    columns=["precision_macro", "recall_macro"], inplace=True
)
metrics["bert_classification_metrics"].columns = [
    "dataset",
    "accuracy",
    "f1 (micro)",
    "precision (micro)",
    "recall (micro)",
]
metrics["bert_classification_metrics"]["model"] = "CamemBERT"
metrics["bert_classification_metrics"]["context"] = "empty"
## Mistral
metrics["mistral_classification_metrics"].drop(
    columns=["precision_macro", "recall_macro"], inplace=True
)
metrics["mistral_classification_metrics"].columns = [
    "dataset",
    "context",
    "accuracy",
    "f1 (micro)",
    "precision (micro)",
    "recall (micro)",
]
metrics["mistral_classification_metrics"]["model"] = "Mistral-7B"
## OpenAI
metrics["openai_classification_metrics"].drop(
    columns=["f1 (micro)", "precision (micro)", "recall (micro)"], inplace=True
)
metrics["openai_classification_metrics"].columns = [
    "dataset",
    "context",
    "model",
    "accuracy",
    "f1 (micro)",
    "precision (micro)",
    "recall (micro)",
]

# Format all dataframe to have the same columns in pairwise mismatch
## Readability index
metrics["readability_index_pairwise_mismatch"]["context"] = "empty"
metrics["readability_index_pairwise_mismatch"].columns = [
    "dataset",
    "model",
    "pairwise mismatch",
    "context",
]
## CamemBERT
metrics["bert_pairwise_mismatch"].columns = [
    "dataset",
    "pairwise mismatch",
]
metrics["bert_pairwise_mismatch"]["model"] = "CamemBERT"
metrics["bert_pairwise_mismatch"]["context"] = "empty"
## Mistral
metrics["mistral_pairwise_mismatch"].iloc[:, 0] = (
    metrics["mistral_pairwise_mismatch"]
    .iloc[:, 0]
    .str.replace("french_difficulty", "french-difficulty")
)
metrics["mistral_pairwise_mismatch"]["dataset"] = (
    metrics["mistral_pairwise_mismatch"].iloc[:, 0].apply(lambda x: x.split("_")[0])
)
metrics["mistral_pairwise_mismatch"]["context"] = (
    metrics["mistral_pairwise_mismatch"].iloc[:, 0].apply(lambda x: x.split("_")[1])
)
metrics["mistral_pairwise_mismatch"].drop(
    columns=[metrics["mistral_pairwise_mismatch"].columns[0]], inplace=True
)
metrics["mistral_pairwise_mismatch"].columns = [
    "pairwise mismatch",
    "dataset",
    "context",
]
metrics["mistral_pairwise_mismatch"]["model"] = "Mistral-7B"
## OpenAI
metrics["openai_pairwise_mismatch"].columns = [
    "dataset",
    "context",
    "model",
    "pairwise mismatch",
]

# Merge classification metrics and pairwise mismatch
merged_metrics = {}
for key, metric in metrics.items():
    table = "_".join(key.split("_")[:-2])
    if table not in merged_metrics.keys():
        merged_metrics[table] = metric.copy()
    else:
        merged_metrics[table] = merged_metrics[table].merge(
            metric, on=["dataset", "model", "context"]
        )

# Concatenate all metrics
concatenated_metrics = pd.concat(merged_metrics.values(), ignore_index=True)

# Replace "no-context" by "empty" and "french_difficulty" by "french-difficulty"
concatenated_metrics["context"] = concatenated_metrics["context"].replace(
    "no-context", "empty"
)
concatenated_metrics["dataset"] = concatenated_metrics["dataset"].replace(
    "french_difficulty", "french-difficulty"
)

# Reorder columns
concatenated_metrics = concatenated_metrics[
    [
        "model",
        "context",
        "dataset",
        "pairwise mismatch",
        "accuracy",
        "f1 (micro)",
        "precision (micro)",
        "recall (micro)",
    ]
]

# Remove "babbage-002" model
concatenated_metrics = concatenated_metrics[
    concatenated_metrics["model"] != "babbage-002"
]

# Capitalize model names
concatenated_metrics["model"] = concatenated_metrics["model"].apply(
    lambda x: x.replace("gpt", "GPT")
    .replace("bert", "BERT")
    .replace("mistral", "Mistral")
    .replace("davinci", "Davinci")
    .replace("gfi", "GFI")
    .replace("ari", "ARI")
    .replace("fkgl", "FKGL")
)

# Round all metrics to 2 decimals
concatenated_metrics = concatenated_metrics.round(2)

# Sort by pairwise mismatch
concatenated_metrics.sort_values(by=["pairwise mismatch"], inplace=True)

# Replace context
concatenated_metrics["context"] = (
    concatenated_metrics["context"].replace("empty", "-").replace("CECRL", "\checkmark")
)

# Split by dataset
datasets_metrics = {}
for dataset in concatenated_metrics["dataset"].unique():
    # Split by dataset
    datasets_metrics[dataset] = (
        concatenated_metrics[concatenated_metrics["dataset"] == dataset]
        .copy()
        .drop(columns=["dataset"])
    ).reset_index(drop=True)

    # Set multi-index from model and context
    datasets_metrics[dataset].set_index(["model", "context"], inplace=True)


for dataset, metric in datasets_metrics.items():
    display(Markdown(f"### {dataset}"))
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        display(metric)

### ljl

Unnamed: 0_level_0,Unnamed: 1_level_0,pairwise mismatch,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GPT-3.5-turbo-1106,-,9.27,0.73,0.75,0.75,0.75
GPT-3.5-turbo-1106,\checkmark,11.06,0.72,0.74,0.76,0.73
Mistral-7B,\checkmark,13.62,0.64,0.63,0.64,0.64
CamemBERT,-,13.95,0.62,0.63,0.62,0.62
Davinci-002,\checkmark,18.69,0.61,0.31,0.32,0.31
Davinci-002,-,21.09,0.59,0.34,0.35,0.33
Mistral-7B,-,29.99,0.47,0.34,0.47,0.47
GFI,-,44.1,0.45,0.41,0.47,0.41
ARI,-,47.09,0.4,0.32,0.45,0.34
FKGL,-,54.54,0.42,0.41,0.47,0.39


### sentences

Unnamed: 0_level_0,Unnamed: 1_level_0,pairwise mismatch,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GPT-3.5-turbo-1106,\checkmark,13.11,0.9,0.9,0.9,0.9
GPT-3.5-turbo-1106,-,18.11,0.87,0.86,0.87,0.87
Davinci-002,-,20.9,0.82,0.62,0.62,0.62
CamemBERT,-,22.21,0.82,0.82,0.82,0.82
Davinci-002,\checkmark,23.82,0.81,0.81,0.81,0.81
Mistral-7B,\checkmark,39.76,0.75,0.73,0.75,0.75
Mistral-7B,-,61.75,0.63,0.63,0.63,0.63
ARI,-,88.24,0.34,0.28,0.29,0.34
FKGL,-,99.32,0.34,0.29,0.29,0.34
GFI,-,101.96,0.32,0.28,0.29,0.32


### french-difficulty

Unnamed: 0_level_0,Unnamed: 1_level_0,pairwise mismatch,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CamemBERT,-,36.98,0.52,0.51,0.52,0.52
Mistral-7B,-,39.38,0.35,0.31,0.35,0.35
Mistral-7B,\checkmark,48.41,0.51,0.52,0.51,0.51
GPT-3.5-turbo-1106,\checkmark,51.98,0.5,0.42,0.43,0.43
GPT-3.5-turbo-1106,-,61.12,0.49,0.29,0.3,0.29
Davinci-002,\checkmark,66.44,0.47,0.25,0.26,0.26
Davinci-002,-,76.52,0.47,0.28,0.29,0.28
ARI,-,111.56,0.34,0.33,0.33,0.34
FKGL,-,112.38,0.35,0.34,0.34,0.35
GFI,-,112.63,0.34,0.31,0.32,0.34


## Exporting as Latex

### Tables

In [11]:
# Mute pandas warning
import warnings
import seaborn as sns

warnings.simplefilter(action="ignore", category=FutureWarning)


# Bold best results
def highlight_best(x):
    # Create empty dataframe
    df = pd.DataFrame("", index=x.index, columns=x.columns)

    # Bold min of pairwise mismatch
    df["pairwise mismatch"].loc[
        x["pairwise mismatch"].idxmin()
    ] += "font-weight: bold; color: #FF9999;"

    # Bold max of accuracy	f1 (micro)	precision (micro)	recall (micro)
    for metric in ["accuracy", "f1 (micro)", "precision (micro)", "recall (micro)"]:
        df[metric].loc[x[metric].idxmax()] += "font-weight: bold; color: #FF9999;"

    return df


for dataset_name, df in datasets_metrics.items():
    styled_df = (
        df.style.background_gradient(cmap=sns.light_palette("green", as_cmap=True))
        .background_gradient(
            cmap=sns.light_palette("green", as_cmap=True, reverse=True),
            subset=["pairwise mismatch"],
        )
        .apply(highlight_best, axis=None)
        .applymap_index(lambda v: "font-weight: bold;", axis="columns")
        .applymap_index(lambda v: "font-weight: bold;", axis="rows")
        .format(decimal=",", thousands=".", precision=2)
    )
    display(styled_df)
    path = os.path.join(pwd, "figures", "difficulty_estimation", f"{dataset_name}.tex")
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    latex = styled_df.to_latex(
        caption=(f"Metrics for the {dataset_name} dataset"),
        clines="skip-last;data",
        convert_css=True,
        position_float="centering",
        multicol_align="|c|",
        hrules=True,
    )

    # Add \begin{adjustbox}{center}
    latex = latex.replace(
        "\\begin{tabular}", "\\begin{adjustbox}{center}\n\\begin{tabular}"
    ).replace("\\end{tabular}", "\\end{tabular}\n\\end{adjustbox}")

    # Add comment under the table
    latex = latex.replace(
        "\\end{adjustbox}",
        "\\end{adjustbox}\n\\begin{minipage}{12cm}\n\\vspace{0.1cm}\n\\hline\n\\vspace{0.1cm}\n\\begin{itemize}\n\\item The best results have been highlighted in bold and light red.\n\\item The color gradient is more intense for the best results.\n\\begin{itemize}\n\\item The smallest values for the \\textbf{pairwise mismatch} metric.\n\\item The largest values for the  \\textbf{accuracy}, \\textbf{f1}, \\textbf{precision} and \\textbf{recall} metrics.\n\\end{itemize}\n\\item The models are sorted by pairwise mismatch performance\n\n\item The column \\textit{context} indicates whether the model has been trained and evaluated with the context \\textit{CECRL} described above.\\end{itemize}\n\\end{minipage}",
    )

    # Force position of table
    latex = latex.replace("\\begin{table}", "\\begin{table}[!h]")

    print(latex)

Unnamed: 0_level_0,Unnamed: 1_level_0,pairwise mismatch,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GPT-3.5-turbo-1106,-,927,73,75,75,75
GPT-3.5-turbo-1106,\checkmark,1106,72,74,76,73
Mistral-7B,\checkmark,1362,64,63,64,64
CamemBERT,-,1395,62,63,62,62
Davinci-002,\checkmark,1869,61,31,32,31
Davinci-002,-,2109,59,34,35,33
Mistral-7B,-,2999,47,34,47,47
GFI,-,4410,45,41,47,41
ARI,-,4709,40,32,45,34
FKGL,-,5454,42,41,47,39


\begin{table}[!h]
\centering
\caption{Metrics for the ljl dataset}
\begin{adjustbox}{center}
\begin{tabular}{llrrrrr}
\toprule
 &  & \bfseries pairwise mismatch & \bfseries accuracy & \bfseries f1 (micro) & \bfseries precision (micro) & \bfseries recall (micro) \\
model & context &  &  &  &  &  \\
\midrule
\multirow[c]{2}{*}{\bfseries GPT-3.5-turbo-1106} & \bfseries - & {\cellcolor[HTML]{EBF3EB}} \color[HTML]{000000} {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 9,27 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0,73 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0,75 & {\cellcolor[HTML]{058205}} \color[HTML]{F1F1F1} 0,75 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0,75 \\
\bfseries  & \bfseries \checkmark & {\cellcolor[HTML]{E1EEE1}} \color[HTML]{000000} {\cellcolor[HTML]{098509}} \color[HTML]{F1F1F1} 11,06 & {\cellcolor[HTML]{068306}} \color[HTML]

Unnamed: 0_level_0,Unnamed: 1_level_0,pairwise mismatch,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GPT-3.5-turbo-1106,\checkmark,1311,90,90,90,90
GPT-3.5-turbo-1106,-,1811,87,86,87,87
Davinci-002,-,2090,82,62,62,62
CamemBERT,-,2221,82,82,82,82
Davinci-002,\checkmark,2382,81,81,81,81
Mistral-7B,\checkmark,3976,75,73,75,75
Mistral-7B,-,6175,63,63,63,63
ARI,-,8824,34,28,29,34
FKGL,-,9932,34,29,29,34
GFI,-,10196,32,28,29,32


\begin{table}[!h]
\centering
\caption{Metrics for the sentences dataset}
\begin{adjustbox}{center}
\begin{tabular}{llrrrrr}
\toprule
 &  & \bfseries pairwise mismatch & \bfseries accuracy & \bfseries f1 (micro) & \bfseries precision (micro) & \bfseries recall (micro) \\
model & context &  &  &  &  &  \\
\midrule
\multirow[c]{2}{*}{\bfseries GPT-3.5-turbo-1106} & \bfseries \checkmark & {\cellcolor[HTML]{EBF3EB}} \color[HTML]{000000} {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 13,11 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0,90 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0,90 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0,90 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0,90 \\
\bfseries  & \bfseries - & {\cellcolor[HTML]{DEEDDE}} \color[HTML]{000000} {\cellcolor[HTML]{0D860D}} \color[HTML]{F1F1F1} 18,11 & {

Unnamed: 0_level_0,Unnamed: 1_level_0,pairwise mismatch,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CamemBERT,-,3698,52,51,52,52
Mistral-7B,-,3938,35,31,35,35
Mistral-7B,\checkmark,4841,51,52,51,51
GPT-3.5-turbo-1106,\checkmark,5198,50,42,43,43
GPT-3.5-turbo-1106,-,6112,49,29,30,29
Davinci-002,\checkmark,6644,47,25,26,26
Davinci-002,-,7652,47,28,29,28
ARI,-,11156,34,33,33,34
FKGL,-,11238,35,34,34,35
GFI,-,11263,34,31,32,34


\begin{table}[!h]
\centering
\caption{Metrics for the french-difficulty dataset}
\begin{adjustbox}{center}
\begin{tabular}{llrrrrr}
\toprule
 &  & \bfseries pairwise mismatch & \bfseries accuracy & \bfseries f1 (micro) & \bfseries precision (micro) & \bfseries recall (micro) \\
model & context &  &  &  &  &  \\
\midrule
\bfseries CamemBERT & \bfseries - & {\cellcolor[HTML]{EBF3EB}} \color[HTML]{000000} {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 36,98 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0,52 & {\cellcolor[HTML]{088408}} \color[HTML]{F1F1F1} 0,51 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0,52 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0,52 \\
\cline{1-7}
\multirow[c]{2}{*}{\bfseries Mistral-7B} & \bfseries - & {\cellcolor[HTML]{E3EFE3}} \color[HTML]{000000} {\cellcolor[HTML]{078407}} \color[HTML]{F1F1F1} 39,38 & {\cellcolor[HTML]{DE

### Figures

In [12]:
from matplotlib import pyplot as plt
import matplotlib

matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "pdflatex",
        "font.family": "serif",
        "text.usetex": True,
        "pgf.rcfonts": False,
        "font.size": 10,
    }
)

# Bar plot of accuracy one bar per model and context on each dataset
df = concatenated_metrics.copy()
# Create model + context column
df["model + context"] = df["model"] + df["context"].replace("-", "").replace(
    "\checkmark", " + CECRL"
)
plt.figure()
barplot = sns.barplot(
    x="dataset",
    y="accuracy",
    hue="model + context",
    data=df,
    palette=sns.color_palette(
        [
            "#6baed6",
            "#3182bd",
            "#e6550d",
            "yellow",
            "#756bb1",
            "#9e9ac8",
            "#fd8d3c",
            "#31a354",
            "#74c476",
            "#a1d99b",
        ]
    ),
    ci=None,
)

# Itérer sur les barres du graphique et ajouter le texte à l'intérieur de chaque barre.
for p in barplot.patches:
    if p.get_height() > 0.0:
        barplot.annotate(
            "{}%".format(int(p.get_height() * 100)),  # Format pour la hauteur/accuracy
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="center",
            xytext=(0, 5),
            textcoords="offset points",
            fontsize=5,
        )

plt.legend(loc="upper right", fontsize=7)
plt.title(
    "Accuracy per model and context", fontsize=16, fontweight="bold"
)  # Ajout d'une police plus marquée pour le titre
plt.xlabel("Dataset", fontsize=12)
plt.ylabel("Accuracy", fontsize=12)
sns.despine()  # Retrait des bordures indésirables
plt.tight_layout()
# Set size inches
plt.gcf().set_size_inches(8, h=8 / 1.618)
# Reduce font size
plt.rcParams.update({"font.size": 6})

# Export to latex
path = os.path.join(pwd, "figures", "difficulty_estimation", "accuracy.pgf")
if not os.path.exists(os.path.dirname(path)):
    os.makedirs(os.path.dirname(path))
plt.savefig(path)

# Display latex
with open(path, "r") as file:
    latex = file.read()
print(latex)

%% Creator: Matplotlib, PGF backend
%%
%% To include the figure in your LaTeX document, write
%%   \input{<filename>.pgf}
%%
%% Make sure the required packages are loaded in your preamble
%%   \usepackage{pgf}
%%
%% Also ensure that all the required font packages are loaded; for instance,
%% the lmodern package is sometimes necessary when using math font.
%%   \usepackage{lmodern}
%%
%% Figures using additional raster images can only be included by \input if
%% they are in the same directory as the main LaTeX file. For loading figures
%% from other directories you can use the `import` package
%%   \usepackage{import}
%%
%% and then include the figures with
%%   \import{<path to file>}{<filename>.pgf}
%%
%% Matplotlib used the following preamble
%%   \def\mathdefault#1{#1}
%%   \everymath=\expandafter{\the\everymath\displaystyle}
%%   
%%   \makeatletter\@ifpackageloaded{underscore}{}{\usepackage[strings]{underscore}}\makeatother
%%
\begingroup%
\makeatletter%
\begin{pgfpicture}%
\pgfpa