# OpenAi models Evaluation

The aim of this notebook is to evaluate the performance of paid models provided by **OpenAi** on our text difficulty classification task. The models have theoretically already been fine-tuned in the previous notebook.

In [1]:
# ---------------------------- PREPARING NOTEBOOK ---------------------------- #
# Autoreload
%load_ext autoreload
%autoreload 2

# Random seed
import numpy as np
np.random.seed(42)

# External modules
import os
from IPython.display import display, Markdown, Latex, clear_output
from tqdm import notebook as tqdm

# Set global log level
import logging
logging.basicConfig(level=logging.INFO)
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Define PWD as the current git repository
import git
repo = git.Repo('.', search_parent_directories=True)
pwd = repo.working_dir
os.chdir(pwd)

In [2]:
# --------------------------- MODEL LIST DEFINITION -------------------------- #
MODEL_LIST = {
    "train_french_difficulty_empty_davinci-002_prepared_for_fine_tuning": "ft:davinci-002:university-of-lausanne::8QCZp0Y2",
    "train_french_difficulty_empty_babbage-002_prepared_for_fine_tuning": "ft:babbage-002:university-of-lausanne::8QCl5zSl",
    "train_french_difficulty_CECRL_davinci-002_prepared_for_fine_tuning": "ft:davinci-002:university-of-lausanne::8QD1Xbjf",
    "train_french_difficulty_CECRL_babbage-002_prepared_for_fine_tuning": "ft:babbage-002:university-of-lausanne::8QDDOxw0",
    "train_sentences_empty_davinci-002_prepared_for_fine_tuning": "ft:davinci-002:university-of-lausanne::8QDQINn1",
    "train_sentences_empty_babbage-002_prepared_for_fine_tuning": "ft:babbage-002:university-of-lausanne::8QDaQaCo",
    "train_sentences_CECRL_davinci-002_prepared_for_fine_tuning": "ft:davinci-002:university-of-lausanne::8QDoTY1V",
    "train_sentences_CECRL_babbage-002_prepared_for_fine_tuning": "ft:babbage-002:university-of-lausanne::8QDyhjW0",
    "train_ljl_empty_davinci-002_prepared_for_fine_tuning": "ft:davinci-002:university-of-lausanne::8QEESk7J",
    "train_ljl_empty_babbage-002_prepared_for_fine_tuning": "ft:babbage-002:university-of-lausanne::8QENQKTK",
    "train_ljl_CECRL_davinci-002_prepared_for_fine_tuning": "ft:davinci-002:university-of-lausanne::8QEc9bOU",
    "train_ljl_CECRL_babbage-002_prepared_for_fine_tuning": "ft:babbage-002:university-of-lausanne::8QEmXtlZ",
    "train_french_difficulty_empty_gpt-3.5-turbo-1106_prepared_for_fine_tuning": "ft:gpt-3.5-turbo-1106:university-of-lausanne::8S1DqR8V",
    "train_french_difficulty_CECRL_gpt-3.5-turbo-1106_prepared_for_fine_tuning": "ft:gpt-3.5-turbo-1106:university-of-lausanne::8S2AcQXl",
    "train_sentences_empty_gpt-3.5-turbo-1106_prepared_for_fine_tuning": "ft:gpt-3.5-turbo-1106:university-of-lausanne::8S3UMIx9",
    "train_sentences_CECRL_gpt-3.5-turbo-1106_prepared_for_fine_tuning": "ft:gpt-3.5-turbo-1106:university-of-lausanne::8S4tvVym",
    "train_ljl_empty_gpt-3.5-turbo-1106_prepared_for_fine_tuning": "ft:gpt-3.5-turbo-1106:university-of-lausanne::8S5w5UNK",
    "train_ljl_CECRL_gpt-3.5-turbo-1106_prepared_for_fine_tuning": "ft:gpt-3.5-turbo-1106:university-of-lausanne::8S75jPff",
}

In [3]:
# ---------------------------- COMPUTE PREDICTIONS --------------------------- #
from src.DifficultyEstimationModel import DifficultyEstimationModel
import pandas as pd

model_predictions = {}
for model_key, model_id in tqdm.tqdm(MODEL_LIST.items()):
    dataset = model_key.replace("french_difficulty_", "french-difficulty_").split("_")[
        1
    ]
    context = model_key.replace("french_difficulty_", "french-difficulty_").split("_")[
        2
    ]
    model_name = model_key.replace("french_difficulty_", "french-difficulty_").split(
        "_"
    )[3]

    # Try to load already computed predictions
    file_name = model_key.replace("train_", "test_")
    path = os.path.join(
        pwd,
        "results",
        "DifficultyEstimationModel",
        f"{file_name}_predictions.csv",
    )
    try:
        model_predictions[model_key] = {"predictions": pd.read_csv(path)}
    except:
        # Create model
        model = DifficultyEstimationModel(model=model_name, model_id=model_id)
        model_predictions[model_key] = {"predictions": model.predict(file_name)}

  0%|          | 0/18 [00:00<?, ?it/s]

In [4]:
# ---------------------------- COMPUTE METRICS ------------------------------- #
import sklearn.metrics
import pandas as pd

for model_key, model_predictions_df in model_predictions.items():
    dataset = model_key.split("_")[1]

    # Truncate predictions
    if dataset in ["sentences", "french"]:
        model_predictions[model_key]["predictions"]["predictions"] = model_predictions[
            model_key
        ]["predictions"]["predictions"].apply(lambda x: x[:2])
    elif dataset in ["ljl"]:
        model_predictions[model_key]["predictions"]["predictions"] = model_predictions[
            model_key
        ]["predictions"]["predictions"].apply(lambda x: x[:6])

    # Compute metrics
    metrics = pd.DataFrame(
        {
            "accuracy": [
                sklearn.metrics.accuracy_score(
                    model_predictions[model_key]["predictions"]["assistant"],
                    model_predictions[model_key]["predictions"]["predictions"],
                )
            ],
            "f1 (macro)": [
                sklearn.metrics.f1_score(
                    model_predictions[model_key]["predictions"]["assistant"],
                    model_predictions[model_key]["predictions"]["predictions"],
                    average="macro",
                )
            ],
            "f1 (micro)": [
                sklearn.metrics.f1_score(
                    model_predictions[model_key]["predictions"]["assistant"],
                    model_predictions[model_key]["predictions"]["predictions"],
                    average="micro",
                )
            ],
            "precision (macro)": [
                sklearn.metrics.precision_score(
                    model_predictions[model_key]["predictions"]["assistant"],
                    model_predictions[model_key]["predictions"]["predictions"],
                    average="macro",
                )
            ],
            "precision (micro)": [
                sklearn.metrics.precision_score(
                    model_predictions[model_key]["predictions"]["assistant"],
                    model_predictions[model_key]["predictions"]["predictions"],
                    average="micro",
                )
            ],
            "recall (macro)": [
                sklearn.metrics.recall_score(
                    model_predictions[model_key]["predictions"]["assistant"],
                    model_predictions[model_key]["predictions"]["predictions"],
                    average="macro",
                )
            ],
            "recall (micro)": [
                sklearn.metrics.recall_score(
                    model_predictions[model_key]["predictions"]["assistant"],
                    model_predictions[model_key]["predictions"]["predictions"],
                    average="micro",
                )
            ],
        }
    )
    model_predictions[model_key]["metrics"] = metrics

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# ------------------------------ DISPLAY METRICS ----------------------------- #

df = pd.concat(
    [model_predictions[model_key]["metrics"] for model_key in MODEL_LIST.keys()],
    keys=MODEL_LIST.keys(),
)

# Transform index to multi-index
df = df.reset_index()
df["level_0"] = df["level_0"].str.replace("french_difficulty_", "french-difficulty_")
df["dataset"] = df["level_0"].apply(lambda x: x.split("_")[1])
df["context"] = df["level_0"].apply(lambda x: x.split("_")[2])
df["model"] = df["level_0"].apply(lambda x: x.split("_")[3])
df = df.set_index(["dataset", "context", "model"]).drop(columns=["level_0", "level_1"])
df = df[
    [
        "accuracy",
        "f1 (macro)",
        "f1 (micro)",
        "precision (macro)",
        "precision (micro)",
        "recall (macro)",
        "recall (micro)",
    ]
]
df = df.sort_values(by="f1 (macro)", ascending=False)

# Save metrics
path = os.path.join(
    pwd, "results", "difficulty_estimation", "OpenAiEvaluation", "metrics.csv"
)
if not os.path.exists(os.path.dirname(path)):
    os.makedirs(os.path.dirname(path))
df.to_csv(path)

# Display metrics
df.style.background_gradient(cmap="viridis")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1 (macro),f1 (micro),precision (macro),precision (micro),recall (macro),recall (micro)
dataset,context,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sentences,CECRL,gpt-3.5-turbo-1106,0.897917,0.89697,0.897917,0.897494,0.897917,0.897917,0.897917
sentences,empty,gpt-3.5-turbo-1106,0.866667,0.864868,0.866667,0.866231,0.866667,0.866667,0.866667
sentences,CECRL,davinci-002,0.814583,0.812253,0.814583,0.811908,0.814583,0.814583,0.814583
ljl,empty,gpt-3.5-turbo-1106,0.733656,0.74584,0.733656,0.74934,0.733656,0.746278,0.733656
ljl,CECRL,gpt-3.5-turbo-1106,0.723971,0.735612,0.723971,0.756418,0.723971,0.7255,0.723971
sentences,empty,davinci-002,0.825,0.61909,0.825,0.622441,0.825,0.61875,0.825
sentences,empty,babbage-002,0.8125,0.609588,0.8125,0.612527,0.8125,0.609375,0.8125
sentences,CECRL,babbage-002,0.8125,0.608993,0.8125,0.610264,0.8125,0.609375,0.8125
french-difficulty,CECRL,gpt-3.5-turbo-1106,0.498958,0.423531,0.498958,0.428613,0.498958,0.427679,0.498958
ljl,empty,davinci-002,0.585956,0.337429,0.585956,0.347085,0.585956,0.331162,0.585956
