In [34]:
import altair as alt
import matplotlib.pyplot as plt
import pandas as pd
import os

In [35]:
# Exported chart images go into this folder under the current working directory.
visus_folder_name = 'visualizations'
visus_path = os.path.join(os.getcwd(), visus_folder_name)

# Create the output folder up front; an already existing folder is fine.
os.makedirs(visus_path, exist_ok=True)

In [36]:
# Aggregated evaluation metrics, read relative to the current working directory.
aggregate_metrics_path = './evaluation/aggregate_metrics.json'
df = pd.read_json(aggregate_metrics_path)

In [37]:
# Expand the "punchlines" row (one entry per model column of `df`) into a
# frame with one row per model.
punchlines_row = df.loc["punchlines"]
df_punchlines = pd.DataFrame(punchlines_row.tolist(), index=punchlines_row.index)
df_punchlines

Unnamed: 0,dice_similarity,jaccard_similarity,levenshtein_distance,hit_rate_pre_treatment,hit_rate
mixtral-8x7b-instruct-v01,0.543614,0.412302,25.267368,0.989474,0.989474
granite-3-3-8b-instruct,0.528261,0.399735,31.288421,0.842105,0.945263
gemini-2.5-flash,0.677834,0.560615,19.029474,0.993684,0.993684
gpt-4,0.728631,0.606435,24.04,0.997895,0.997895
llama-3-405b-instruct,0.590931,0.469324,24.128421,0.983158,0.993684


In [38]:
def _punchline_bar_chart(data, metric, order, y_title, chart_title):
    """Build a per-model bar chart of one punchline metric with value labels.

    Args:
        data: DataFrame with a 'model' column and a column named `metric`.
        metric: Name of the quantitative column drawn on the y axis.
        order: Model names in the left-to-right order used on the x axis.
        y_title: Title of the y axis.
        chart_title: Title of the whole chart.

    Returns:
        A layered Altair chart: coloured bars plus ".2f" text labels above them.
    """
    y_field = f'{metric}:Q'
    bars = alt.Chart(data).mark_bar(size=80).encode(
        x=alt.X('model:N', sort=order, title=' ',
                scale=alt.Scale(paddingInner=0.1), axis=alt.Axis(grid=False)),
        y=alt.Y(y_field, title=y_title, axis=alt.Axis(grid=False)),
        color=alt.Color('model:N', legend=None)
    )
    # The label layer repeats the same explicit x order so both layers agree.
    labels = alt.Chart(data).mark_text(
        align='center',
        baseline='bottom',
        dy=-2
    ).encode(
        x=alt.X('model:N', sort=order),
        y=y_field,
        text=alt.Text(y_field, format=".2f")
    )
    return alt.layer(bars, labels).properties(
        width=600,
        height=300,
        title=chart_title
    ).configure_axisX(
        labelAngle=0,
        labelAlign='center'
    )


df_dice = df_punchlines[['dice_similarity']].reset_index().rename(columns={'index': 'model'})
df_lev = df_punchlines[['levenshtein_distance']].reset_index().rename(columns={'index': 'model'})

# Best model first on both charts: Dice similarity is higher-is-better, so it
# is sorted descending; Levenshtein distance is lower-is-better, so it is
# sorted ascending (it was previously reversed, which put the worst model first).
dice_order = df_dice.sort_values('dice_similarity', ascending=False)['model'].tolist()
lev_order = df_lev.sort_values('levenshtein_distance')['model'].tolist()

chart_dice = _punchline_bar_chart(
    df_dice,
    'dice_similarity',
    dice_order,
    y_title='Média da Similaridade de Dice',
    chart_title='Sobreposição de Punchlines por Modelo (Similaridade de Dice)',
)

chart_lev = _punchline_bar_chart(
    df_lev,
    'levenshtein_distance',
    lev_order,
    y_title='Média da Distância de Levenshtein',
    chart_title='Distância de Punchlines por Modelo (Distância de Levenshtein)',
)

chart_dice.save(os.path.join(visus_path,"punchlines_dice.png"), scale_factor=3.0)
chart_lev.save(os.path.join(visus_path, "punchlines_levenshtein.png"), scale_factor=3.0)

chart_dice.show()
chart_lev.show()

In [39]:
# Show the "texts_explanations" row as a one-row frame (models as columns).
texts_explanations_row = df.loc["texts_explanations"]
df_texts_explanations = texts_explanations_row.to_frame().T
df_texts_explanations

Unnamed: 0,mixtral-8x7b-instruct-v01,granite-3-3-8b-instruct,gemini-2.5-flash,gpt-4,llama-3-405b-instruct
texts_explanations,3.362869,3.642105,4.635789,4.282105,3.972632


In [40]:
# Judge-model agreement per model, reshaped into a two-column frame for plotting.
texts_expl = df.loc["texts_explanations"].astype(float)
texts_df = texts_expl.reset_index()
texts_df.columns = ["Modelo", "Concordância Média"]

# Explicit x order (highest agreement first), shared by both layers.
# An encoding-based sort ('-y') may be dropped by Vega-Lite when it merges the
# scale domains of layered charts, so the order is computed here instead.
modelo_order = (
    texts_df.sort_values("Concordância Média", ascending=False, kind="stable")["Modelo"].tolist()
)

chart = alt.layer(
    alt.Chart(texts_df).mark_bar(size=80).encode(
        y=alt.Y('Concordância Média:Q',
                title='Nível médio de concordância (Modelo Juiz)',
                # Leave headroom above the tallest bar for its text label.
                scale=alt.Scale(domain=[0, texts_df["Concordância Média"].max() + 0.5]),
                axis=alt.Axis(grid=False)),
        x=alt.X('Modelo:N',
                sort=modelo_order,
                title='',
                axis=alt.Axis(grid=False, labelAngle=0)),
        color=alt.Color('Modelo:N', legend=None),
        tooltip=["Modelo", "Concordância Média"]
    ),
    alt.Chart(texts_df).mark_text(
        align='center',
        baseline='bottom',
        dy=-3
    ).encode(
        x=alt.X('Modelo:N', sort=modelo_order),
        y='Concordância Média:Q',
        text=alt.Text('Concordância Média:Q', format='.2f')
    )
).properties(
    width=600,
    height=400,
    title="Avaliação das Explicações dos Textos Humorísticos por Modelo"
)
chart.save(os.path.join(visus_path, "explicacoes_concordancia.png"), scale_factor=3.0)
chart

In [41]:
# Only models with a non-null "comic_styles" entry are kept; each remaining
# entry is expanded into one row of metrics.
comic_styles_row = df.loc["comic_styles"].dropna()
df_comic_styles = pd.DataFrame(comic_styles_row.tolist(), index=comic_styles_row.index)
df_comic_styles

Unnamed: 0,f1_score,precision,recall,accuracy,f1_macro,f1_micro,hamming_loss,hit_rate_pre_treatment,hit_rate
granite-3-3-8b-instruct,"{'fun': 0.0, 'humor': 0.22222222222222202, 'no...","{'fun': 0.0, 'humor': 0.833333333333333, 'nons...","{'fun': 0.0, 'humor': 0.128205128205128, 'nons...","{'fun': 0.23368421052631502, 'humor': 0.336842...",0.133263,0.164804,0.472105,0.0,1.0
gemini-2.5-flash,"{'fun': 0.7873563218390801, 'humor': 0.5658914...","{'fun': 0.825301204819277, 'humor': 0.88484848...","{'fun': 0.7527472527472521, 'humor': 0.4159544...","{'fun': 0.688421052631578, 'humor': 0.52842105...",0.569205,0.626368,0.359474,1.0,1.0
gpt-4,"{'fun': 0.851511169513797, 'humor': 0.55619047...","{'fun': 0.816120906801007, 'humor': 0.83908045...","{'fun': 0.8901098901098901, 'humor': 0.4159544...","{'fun': 0.7621052631578941, 'humor': 0.5094736...",0.574178,0.637314,0.346842,1.0,1.0
llama-3-405b-instruct,"{'fun': 0.838874680306905, 'humor': 0.73851030...","{'fun': 0.7846889952153111, 'humor': 0.8321428...","{'fun': 0.9010989010989011, 'humor': 0.6638176...","{'fun': 0.734736842105263, 'humor': 0.65263157...",0.613782,0.676004,0.365263,0.0,1.0


In [42]:
def _comic_style_bar_chart(data, metric, order, y_title, chart_title):
    """Build a per-model bar chart of one comic-style metric with value labels.

    The y axis is fixed to the [0, 1] range for every chart built here.

    Args:
        data: DataFrame with a 'model' column and a column named `metric`.
        metric: Name of the quantitative column drawn on the y axis.
        order: Model names in the left-to-right order used on the x axis.
        y_title: Title of the y axis.
        chart_title: Title of the whole chart.

    Returns:
        A layered Altair chart: coloured bars plus ".2f" text labels above them.
    """
    y_field = f'{metric}:Q'
    bars = alt.Chart(data).mark_bar(size=80).encode(
        x=alt.X('model:N', sort=order, title=' ',
                scale=alt.Scale(paddingInner=0.1), axis=alt.Axis(grid=False)),
        y=alt.Y(y_field, title=y_title,
                scale=alt.Scale(domain=[0, 1]),
                axis=alt.Axis(grid=False)),
        color=alt.Color('model:N', legend=None)
    )
    # The label layer repeats the same explicit x order so both layers agree.
    labels = alt.Chart(data).mark_text(
        align='center',
        baseline='bottom',
        dy=-2
    ).encode(
        x=alt.X('model:N', sort=order),
        y=y_field,
        text=alt.Text(y_field, format=".2f")
    )
    return alt.layer(bars, labels).properties(
        width=500,
        height=300,
        title=chart_title
    ).configure_axisX(
        labelAngle=0,
        labelAlign='center'
    )


df_f1 = df_comic_styles[['f1_macro']].reset_index().rename(columns={'index': 'model'})
df_hamming = df_comic_styles[['hamming_loss']].reset_index().rename(columns={'index': 'model'})

# Best model first on both charts: F1 is sorted descending, Hamming loss ascending.
f1_order = df_f1.sort_values('f1_macro')['model'].tolist()[::-1]
hamming_order = df_hamming.sort_values('hamming_loss')['model'].tolist()

chart_f1 = _comic_style_bar_chart(
    df_f1,
    'f1_macro',
    f1_order,
    y_title='F1-Macro',
    chart_title='Avaliação dos Modelos na Classificação de Estilos Cômicos',
)

chart_hamming = _comic_style_bar_chart(
    df_hamming,
    'hamming_loss',
    hamming_order,
    y_title='Média de Hamming Loss',
    chart_title='Erro dos Modelos na Classificação de Estilos Cômicos',
)

chart_f1.save(os.path.join(visus_path, "f1_score.png"), scale_factor=3.0)
chart_hamming.save(os.path.join(visus_path, "hamming_loss.png"), scale_factor=3.0)

chart_f1.show()
chart_hamming.show()

In [43]:
# Per-style F1 scores of the Granite model, looked up with a single
# row/column selection.
cs_special_granite_results = df_comic_styles.loc['granite-3-3-8b-instruct', 'f1_score']
cs_special_granite_results

{'fun': 0.0,
 'humor': 0.22222222222222202,
 'nonsense': 0.0,
 'wit': 0.305010893246187,
 'irony': 0.202453987730061,
 'satire': 0.113207547169811,
 'sarcasm': 0.16260162601626002,
 'cynicism': 0.060606060606060004}

In [44]:
# One row per comic style with Granite's F1 score for that style.
df_styles = pd.DataFrame(list(cs_special_granite_results.items()), columns=['style', 'score'])

# Explicit x order (highest score first), shared by both layers.
# An encoding-based sort ('-y') may be dropped by Vega-Lite when it merges the
# scale domains of layered charts; a stable sort keeps tied styles in their
# original order.
style_order = df_styles.sort_values('score', ascending=False, kind='stable')['style'].tolist()

chart = alt.layer(
    alt.Chart(df_styles).mark_bar().encode(
        x=alt.X('style:N', title=' ', sort=style_order, axis=alt.Axis(grid=False)),
        y=alt.Y('score:Q', title='F1-Score', axis=alt.Axis(grid=False)),
        color=alt.Color('style:N', legend=None)
    ),
    alt.Chart(df_styles).mark_text(
        align='center',
        baseline='bottom',
        dy=-2
    ).encode(
        x=alt.X('style:N', sort=style_order),
        y='score:Q',
        text=alt.Text('score:Q', format=".2f")
    )
).properties(
    title='Pontuação do Granite por Estilo Cômico',
    width=500,
    height=300
).configure_axisX(
    labelAngle=0
)

chart.save(os.path.join(visus_path, "granite_f1_per_style.png"), scale_factor=3.0)
chart.show()