In [1]:
%pip install ./humor-detection ipywidgets==8.1.5 --q

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from humor_detection.decoder import classification_model, detection_model
from humor_detection.test import test_classification, test_detection
from humor_detection.train import train_classification, train_detection
from humor_detection.predict import predict_classification, predict_detection
from humor_detection.utils import set_random_seeds
from IPython.display import display
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.training_args import TrainingArguments

model_name = "distilbert/distilgpt2"
save_path = "./models/distilgpt2"
default_arguments = {
    "bf16": True,
    "bf16_full_eval": True,
    "disable_tqdm": False,
    "per_device_eval_batch_size": 20,
    "per_device_train_batch_size": 40,
}
prompts = [
    "¿Cuál es el último animal que subió al arca de Noé? El del-fin.",
    "El otro día unas chicas llamarón a mi puerta y me pidieron una pequeña donación para una piscina local.\nLes di un garrafa de agua.",
    "The brain surgeon changed my life. He really opened my mind.",
    "djasndoasndoa",
    "jajaja",
]


# Para GPT2 es necesario asignar un pad_token y puede que para otros modelos también
def fix_tokenizer(tokenizer: PreTrainedTokenizerBase):
    tokenizer.pad_token = tokenizer.eos_token

In [3]:
def run_classification(full_dataset: bool, train: bool):
    set_random_seeds()

    def prompter(input: str):
        return f"Give a humor rating 1 to 5 for the following text:\n{input}\nScore:\n"

    arguments = TrainingArguments(
        num_train_epochs=4,
        optim="adamw_8bit",
        lr_scheduler_type="cosine_with_restarts",
        **default_arguments,
    )
    model, tokenizer = classification_model(
        model_name,
        lora_configuration=None,  # Hacer uso de configuración LoRA para causal LM ejemplo: LoraConfig(task_type="CAUSAL_LM")
        tokenizer_name=None,  # Nombre de tokenizador especial en caso de no tener tokenizador
        classes=None,  # Lista de tokens para clasificación en caso de usar distinto a 0-1 para detección y 1-5 para clasificación
    )
    fix_tokenizer(tokenizer)
    if train:
        train_logs, metrics = train_classification(
            model,
            tokenizer,
            arguments,
            prompter=prompter,  # Función para modificar los prompts, solo es útil en decoders
            full_dataset=full_dataset,
            class_weights=[1, 1.25, 1.25, 2, 4],
            save_path=f"{save_path}/classification" if full_dataset else None,
        )
        display(train_logs)
        display(metrics)
    if not full_dataset or not train:
        display(test_classification(model, tokenizer, arguments))
    display(predict_classification(model, tokenizer, prompts, arguments, prompter))

In [4]:
run_classification(True, False)

Map:   0%|          | 0/1178 [00:00<?, ? examples/s]

2025/05/07 11:03:28 INFO mlflow.tracking.fluent: Experiment with name 'test_distilbert/distilgpt2' does not exist. Creating a new experiment.


{'eval_loss': 1.3709943294525146,
 'eval_model_preparation_time': 0.2601,
 'eval_0_precision': 0.6657458563535912,
 'eval_0_recall': 0.926923076923077,
 'eval_0_f1-score': 0.77491961414791,
 'eval_0_support': 520.0,
 'eval_1_precision': 0.558974358974359,
 'eval_1_recall': 0.6072423398328691,
 'eval_1_f1-score': 0.582109479305741,
 'eval_1_support': 359.0,
 'eval_2_precision': 0.71875,
 'eval_2_recall': 0.2072072072072072,
 'eval_2_f1-score': 0.32167832167832167,
 'eval_2_support': 222.0,
 'eval_3_precision': 0.0,
 'eval_3_recall': 0.0,
 'eval_3_f1-score': 0.0,
 'eval_3_support': 65.0,
 'eval_4_precision': 0.0,
 'eval_4_recall': 0.0,
 'eval_4_f1-score': 0.0,
 'eval_4_support': 12.0,
 'eval_accuracy': 0.633276740237691,
 'eval_macro_avg_precision': 0.38869404306559,
 'eval_macro_avg_recall': 0.34827452479263066,
 'eval_macro_avg_f1-score': 0.3357414830263945,
 'eval_macro_avg_support': 1178.0,
 'eval_weighted_avg_precision': 0.5996792361423279,
 'eval_weighted_avg_recall': 0.63327674023

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Unnamed: 0,score_0,score_1,score_2,score_3,score_4,labels
0,0.479065,0.226294,0.137254,0.050493,0.106894,0
1,0.534707,0.196708,0.092918,0.056358,0.119309,0
2,0.610031,0.174777,0.082559,0.050074,0.082559,0
3,0.526283,0.193609,0.11743,0.071225,0.091454,0
4,0.526283,0.193609,0.11743,0.071225,0.091454,0


In [5]:
run_classification(True, True)

Map:   0%|          | 0/9234 [00:00<?, ? examples/s]

Map:   0%|          | 0/1178 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,0 Precision,0 Recall,0 F1-score,0 Support,1 Precision,1 Recall,1 F1-score,1 Support,2 Precision,2 Recall,2 F1-score,2 Support,3 Precision,3 Recall,3 F1-score,3 Support,4 Precision,4 Recall,4 F1-score,4 Support,Accuracy,Macro Avg Precision,Macro Avg Recall,Macro Avg F1-score,Macro Avg Support,Weighted Avg Precision,Weighted Avg Recall,Weighted Avg F1-score,Weighted Avg Support
1,1.584,1.483778,0.976852,0.405769,0.57337,520.0,0.585455,0.448468,0.507886,359.0,0.31441,0.972973,0.475248,222.0,0.0,0.0,0.0,65.0,0.0,0.0,0.0,12.0,0.499151,0.375343,0.365442,0.311301,1178.0,0.66888,0.499151,0.497443,1178.0
2,1.5634,1.550483,0.787879,0.05,0.094033,520.0,0.303681,0.275766,0.289051,359.0,0.247863,0.914414,0.39001,222.0,0.0,0.0,0.0,65.0,0.0,0.0,0.0,12.0,0.278438,0.267885,0.248036,0.154619,1178.0,0.487049,0.278438,0.203097,1178.0
3,1.5383,1.557374,0.760417,0.140385,0.237013,520.0,0.383013,0.665738,0.486267,359.0,0.325328,0.671171,0.438235,222.0,0.0,0.0,0.0,65.0,0.0,0.0,0.0,12.0,0.391341,0.293751,0.295459,0.232303,1178.0,0.513702,0.391341,0.335403,1178.0
4,1.5112,1.582221,0.775,0.178846,0.290625,520.0,0.373967,0.504178,0.429419,359.0,0.282723,0.72973,0.407547,222.0,1.0,0.015385,0.030303,65.0,0.0,0.0,0.0,12.0,0.370968,0.486338,0.285628,0.231579,1178.0,0.564532,0.370968,0.337633,1178.0


Unnamed: 0,train_loss,loss,accuracy,macro_f1,weighted_f1
0,1.584,1.483778,0.499151,0.311301,0.497443
1,1.5634,1.550483,0.278438,0.154619,0.203097
2,1.5383,1.557374,0.391341,0.232303,0.335403
3,1.5112,1.582221,0.370968,0.231579,0.337633


{'loss': 1.4837775230407715,
 '0_precision': 0.9768518518518519,
 '0_recall': 0.40576923076923077,
 '0_f1-score': 0.5733695652173914,
 '0_support': 520.0,
 '1_precision': 0.5854545454545454,
 '1_recall': 0.44846796657381616,
 '1_f1-score': 0.5078864353312302,
 '1_support': 359.0,
 '2_precision': 0.314410480349345,
 '2_recall': 0.972972972972973,
 '2_f1-score': 0.4752475247524752,
 '2_support': 222.0,
 '3_precision': 0.0,
 '3_recall': 0.0,
 '3_f1-score': 0.0,
 '3_support': 65.0,
 '4_precision': 0.0,
 '4_recall': 0.0,
 '4_f1-score': 0.0,
 '4_support': 12.0,
 'accuracy': 0.499151103565365,
 'macro_avg_precision': 0.3753433755311485,
 'macro_avg_recall': 0.36544203406320397,
 'macro_avg_f1-score': 0.31130070506021934,
 'macro_avg_support': 1178.0,
 'weighted_avg_precision': 0.6688796871126481,
 'weighted_avg_recall': 0.499151103565365,
 'weighted_avg_f1-score': 0.4974434250356575,
 'weighted_avg_support': 1178.0,
 'runtime': 9.4534,
 'samples_per_second': 124.612,
 'steps_per_second': 6.24

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Unnamed: 0,score_0,score_1,score_2,score_3,score_4,labels
0,0.229204,0.229204,0.294304,0.139019,0.108268,2
1,0.215195,0.276316,0.276316,0.130522,0.101651,1
2,0.253503,0.253503,0.253503,0.119746,0.119746,0
3,0.245164,0.245164,0.245164,0.1487,0.115807,0
4,0.2604,0.2604,0.2604,0.123004,0.095796,0


In [6]:
def run_detection(full_dataset: bool, train: bool, threshold: float | None):
    set_random_seeds()

    def prompter(input: str):
        return f"Detect if the following text is humor 1 or not 0:\n{input}\nScore:\n"

    arguments = TrainingArguments(
        num_train_epochs=3,
        **default_arguments,
    )
    model, tokenizer = detection_model(model_name)
    fix_tokenizer(tokenizer)
    if train:
        train_logs, metrics = train_detection(
            model,
            tokenizer,  
            arguments,
            prompter=prompter,
            full_dataset=full_dataset,
            sample="under",
            threshold=threshold,
            save_path=f"{save_path}/detection" if full_dataset else None,
        )
        display(train_logs)
        display(metrics)
    if not full_dataset or not train:
        display(test_detection(model, tokenizer, arguments, prompter, threshold))
    display(
        predict_detection(model, tokenizer, prompts, arguments, prompter, threshold)
    )

In [7]:
run_detection(True, False, None)

Map:   0%|          | 0/1991 [00:00<?, ? examples/s]

{'eval_loss': 0.8240367770195007,
 'eval_model_preparation_time': 0.2677,
 'eval_0_precision': 0.7584134615384616,
 'eval_0_recall': 0.7761377613776138,
 'eval_0_f1-score': 0.7671732522796353,
 'eval_0_support': 813.0,
 'eval_1_precision': 0.8429680759275238,
 'eval_1_recall': 0.8293718166383701,
 'eval_1_f1-score': 0.8361146769362431,
 'eval_1_support': 1178.0,
 'eval_accuracy': 0.8076343545956806,
 'eval_macro_avg_precision': 0.8006907687329927,
 'eval_macro_avg_recall': 0.802754789007992,
 'eval_macro_avg_f1-score': 0.8016439646079392,
 'eval_macro_avg_support': 1991.0,
 'eval_weighted_avg_precision': 0.8084412544818645,
 'eval_weighted_avg_recall': 0.8076343545956806,
 'eval_weighted_avg_f1-score': 0.8079633066470305,
 'eval_weighted_avg_support': 1991.0,
 'eval_runtime': 13.2344,
 'eval_samples_per_second': 150.441,
 'eval_steps_per_second': 7.556}

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Unnamed: 0,score_0,score_1,labels
0,0.562177,0.437823,0
1,0.562177,0.437823,0
2,0.437823,0.562177,1
3,0.5,0.5,0
4,0.5,0.5,0


In [8]:
run_detection(True, True, None)

Map:   0%|          | 0/18468 [00:00<?, ? examples/s]

Map:   0%|          | 0/1991 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,0 Precision,0 Recall,0 F1-score,0 Support,1 Precision,1 Recall,1 F1-score,1 Support,Accuracy,Macro Avg Precision,Macro Avg Recall,Macro Avg F1-score,Macro Avg Support,Weighted Avg Precision,Weighted Avg Recall,Weighted Avg F1-score,Weighted Avg Support
1,0.445,0.571397,0.776543,0.773678,0.775108,813.0,0.8442,0.84635,0.845273,1178.0,0.816675,0.810372,0.810014,0.810191,1991.0,0.816573,0.816675,0.816622,1991.0
2,0.3976,0.639883,0.843615,0.723247,0.778808,813.0,0.826121,0.90747,0.864887,1178.0,0.832245,0.834868,0.815359,0.821847,1991.0,0.833264,0.832245,0.829738,1991.0
3,0.3642,0.727536,0.871517,0.692497,0.771761,813.0,0.814126,0.929542,0.868014,1178.0,0.832747,0.842822,0.811019,0.819888,1991.0,0.837561,0.832747,0.828711,1991.0


Unnamed: 0,train_loss,loss,accuracy,macro_f1,weighted_f1
0,0.445,0.571397,0.816675,0.810191,0.816622
1,0.3976,0.639883,0.832245,0.821847,0.829738
2,0.3642,0.727536,0.832747,0.819888,0.828711


{'loss': 0.6398828625679016,
 '0_precision': 0.8436154949784792,
 '0_recall': 0.7232472324723247,
 '0_f1-score': 0.7788079470198676,
 '0_support': 813.0,
 '1_precision': 0.8261205564142194,
 '1_recall': 0.9074702886247877,
 '1_f1-score': 0.8648867313915858,
 '1_support': 1178.0,
 'accuracy': 0.832245102963335,
 'macro_avg_precision': 0.8348680256963493,
 'macro_avg_recall': 0.8153587605485562,
 'macro_avg_f1-score': 0.8218473392057266,
 'macro_avg_support': 1991.0,
 'weighted_avg_precision': 0.8332643962197157,
 'weighted_avg_recall': 0.832245102963335,
 'weighted_avg_f1-score': 0.8297375341569264,
 'weighted_avg_support': 1991.0,
 'runtime': 14.1847,
 'samples_per_second': 140.363,
 'steps_per_second': 7.05,
 'epoch': 2.0}

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Unnamed: 0,score_0,score_1,labels
0,0.268941,0.731059,1
1,0.5,0.5,0
2,0.952574,0.047426,0
3,0.998499,0.001501,0
4,0.999089,0.000911,0
