# Benchmark: Compare experiments and decide the champion architecture

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from datetime import datetime
import os
import pandas as pd
from sklearn.metrics import f1_score

from src import models, utils

# Parameters

In [None]:
# Run configuration
VERSION_ID = "202602"  # model_version_id
SCORE = "F1"

# Folder layout: interim data, one sub-folder per experiment, model folders
path_interim = os.path.join("data", "interim")
path_experiment1, path_experiment2 = (
    os.path.join(path_interim, experiment_dir)
    for experiment_dir in ("exp01_hpt_nb", "exp02_hpt_gbt")
)
path_model_prod, path_model_arch = (
    os.path.join("models", model_dir) for model_dir in ("prod", "archive")
)

# Input file names (joined with path_interim when the data is loaded)
file_train, file_test = "train.csv", "test.csv"

In [None]:
# Make sure the production and archive model folders exist.
# os.makedirs is used instead of os.mkdir so that the missing "models" parent
# is created as well (os.mkdir raises FileNotFoundError in that case), and
# exist_ok=True tolerates the folder appearing between the check and the call.
for path_model_dir in (path_model_prod, path_model_arch):
    if not os.path.exists(path_model_dir):
        print(f"Creating the folder: {path_model_dir}")
        os.makedirs(path_model_dir, exist_ok=True)

# Load data

## Train/Test Subsets

In [None]:
path_data_train = os.path.join(path_interim, file_train)

# Read the training CSV from the path built above and preview its first rows.
# NOTE(review): add sep/encoding/dtype arguments here if train.csv needs them.
df_train = pd.read_csv(path_data_train)
df_train.head(2)

In [None]:
path_data_test = os.path.join(path_interim, file_test)

# Read the test CSV from the path built above and preview its first rows.
# NOTE(review): add sep/encoding/dtype arguments here if test.csv needs them.
df_test = pd.read_csv(path_data_test)
df_test.head(2)

## Experiments results

In [None]:
# Summary table stored in the experiment 1 folder (df_exp_summary.csv)
path_summary_exp1 = os.path.join(path_experiment1, "df_exp_summary.csv")
df_cv_summary_exp1 = pd.read_csv(path_summary_exp1)
df_cv_summary_exp1.head(2)

In [None]:
# Summary table stored in the experiment 2 folder (df_exp_summary.csv)
path_summary_exp2 = os.path.join(path_experiment2, "df_exp_summary.csv")
df_cv_summary_exp2 = pd.read_csv(path_summary_exp2)
df_cv_summary_exp2.head(2)

# Benchmark

In [None]:
# Experiment 1: the five rows with the lowest rank_test_score, best rank first
df_cv_summary_exp1.sort_values("rank_test_score").head(5)

In [None]:
# Experiment 2: the five rows with the lowest rank_test_score, best rank first
df_cv_summary_exp2.sort_values("rank_test_score").head(5)

# Champion model

In [None]:
# Score columns plus the hyperparameter columns to inspect;
# set the params of your champion model here.
champion_columns = [
    "mean_test_score",
    "std_test_score",
    "param_fte__max_df",
    "param_fte__max_features",
    "param_fte__min_df",
]
is_top_ranked = df_cv_summary_exp1["rank_test_score"] == 1

# On a tie, prefer the model with the lowest std_test_score and, among those,
# the simplest one.
df_cv_summary_exp1.loc[is_top_ranked, champion_columns]

Go to `src/models.py` and implement `get_model()` for the champion architecture, following this template:


```python
def get_model(
    # your hyperparameters:
    min_df: int = 3,
    max_df: float = 0.5,
    ...
):
    """
    Builds and returns a scikit-learn Pipeline for Spanish text classification

    Args:
        min_df (int): Minimum document frequency for vectorizer.
        max_df (float): Maximum document frequency for vectorizer.
        max_features (int, optional): Maximum number of features to include.

    Returns:
        sklearn.pipeline.Pipeline: A pipeline with vectorizer and a classifier.
    """

    
    logging.info("Building pipeline...")

    # Your vectorization strategy
    dtm_transformer = XXXVectorizer(
        ...
    )

    # Your model
    clf = ...


    # Champion pipeline architecture
    skl_pl = Pipeline([
        ('fte', dtm_transformer),
        ('clf', clf)
    ])

    return skl_pl
```