In [3]:
import altair as alt
import cairosvg
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
df = pd.read_json('./evaluation/aggregate_metrics.json')

In [5]:
df_punchlines = pd.DataFrame(df.loc["punchlines"].tolist(), index=df.columns)
df_punchlines

Unnamed: 0,dice_similarity,jaccard_similarity,levenshtein_distance,hit_rate_pre_treatment,hit_rate
mixtral-8x7b-instruct-v01,0.543614,0.412302,25.267368,0.989474,0.989474
granite-3-3-8b-instruct,0.528261,0.399735,31.288421,0.842105,0.945263
gemini-2.5-flash,0.677834,0.560615,19.029474,0.993684,0.993684
gpt-4,0.728631,0.606435,24.04,0.997895,0.997895
llama-3-405b-instruct,0.590931,0.469324,24.128421,0.983158,0.993684


In [None]:
df_dice = df_punchlines[['dice_similarity']].reset_index().rename(columns={'index': 'model'})
df_lev = df_punchlines[['levenshtein_distance']].reset_index().rename(columns={'index': 'model'})

dice_order = df_dice.sort_values('dice_similarity')['model'].tolist()[::-1]
lev_order = df_lev.sort_values('levenshtein_distance')['model'].tolist()[::-1]

chart_dice = alt.Chart(df_dice).mark_bar(size=40).encode(
    x=alt.X('model:N', sort=dice_order, title=' ',
            scale=alt.Scale(paddingInner=0.1)),
    y=alt.Y('dice_similarity:Q', title='Dice Similarity'),
    color=alt.Color('model:N', legend=None)
).properties(
    width=600,
    height=300,
    title='Dice Similarity por Modelo'
).configure_axisX(
    labelAngle=0,
    labelAlign='center'
)

chart_lev = alt.Chart(df_lev).mark_bar(size=40).encode(
    x=alt.X('model:N', sort=lev_order, title=' ',
            scale=alt.Scale(paddingInner=0.1)),
    y=alt.Y('levenshtein_distance:Q', title='Levenshtein Distance'),
    color=alt.Color('model:N', legend=None)
).properties(
    width=600,
    height=300,
    title='Levenshtein Distance por Modelo'
).configure_axisX(
    labelAngle=0,
    labelAlign='center'
)

chart_dice.save("punchlines_dice.png", scale_factor=3.0)
chart_lev.save("punchlines_levenshtein.png", scale_factor=3.0)

chart_dice.show()
chart_lev.show()

In [7]:
df_texts_explanations = df.loc["texts_explanations"].to_frame().T
df_texts_explanations

Unnamed: 0,mixtral-8x7b-instruct-v01,granite-3-3-8b-instruct,gemini-2.5-flash,gpt-4,llama-3-405b-instruct
texts_explanations,3.362869,3.642105,4.635789,4.282105,3.972632


In [8]:
df_comic_styles = pd.DataFrame(df.loc["comic_styles"].dropna().tolist(), index=df.columns[df.loc["comic_styles"].notna()])
df_comic_styles

Unnamed: 0,f1_score,precision,recall,accuracy,f1_macro,f1_micro,hamming_loss,hit_rate_pre_treatment,hit_rate
granite-3-3-8b-instruct,"{'fun': 0.0, 'humor': 0.22222222222222202, 'no...","{'fun': 0.0, 'humor': 0.833333333333333, 'nons...","{'fun': 0.0, 'humor': 0.128205128205128, 'nons...","{'fun': 0.23368421052631502, 'humor': 0.336842...",0.133263,0.164804,0.472105,0.0,1.0
gemini-2.5-flash,"{'fun': 0.7873563218390801, 'humor': 0.5658914...","{'fun': 0.825301204819277, 'humor': 0.88484848...","{'fun': 0.7527472527472521, 'humor': 0.4159544...","{'fun': 0.688421052631578, 'humor': 0.52842105...",0.569205,0.626368,0.359474,1.0,1.0
gpt-4,"{'fun': 0.851511169513797, 'humor': 0.55619047...","{'fun': 0.816120906801007, 'humor': 0.83908045...","{'fun': 0.8901098901098901, 'humor': 0.4159544...","{'fun': 0.7621052631578941, 'humor': 0.5094736...",0.574178,0.637314,0.346842,1.0,1.0
llama-3-405b-instruct,"{'fun': 0.838874680306905, 'humor': 0.73851030...","{'fun': 0.7846889952153111, 'humor': 0.8321428...","{'fun': 0.9010989010989011, 'humor': 0.6638176...","{'fun': 0.734736842105263, 'humor': 0.65263157...",0.613782,0.676004,0.365263,0.0,1.0


In [9]:
# Extrai e prepara os dados
texts_expl = df.loc["texts_explanations"].astype(float)
texts_df = texts_expl.reset_index()
texts_df.columns = ["Modelo", "Concordância Média"]

# Gráfico de barras verticais
chart = alt.Chart(texts_df).mark_bar().encode(
    y=alt.Y('Concordância Média:Q',
            title='Nível médio de concordância (Modelo Juiz)',
            scale=alt.Scale(domain=[0, texts_df["Concordância Média"].max() + 0.5]),
            axis=alt.Axis(grid=False)),
    x=alt.X('Modelo:N',
            sort='-y',
            title='',
            axis=alt.Axis(grid=False, labelAngle = 0)),
    color=alt.Color('Modelo:N', legend=None),
    tooltip=["Modelo", "Concordância Média"]
).properties(
    width=600,
    height=400,
    title="Avaliação das Explicações dos Textos Humorísticos"
)
chart.save("explicacoes_concordancia.png", scale_factor=3.0)
chart

In [10]:
df = pd.DataFrame(df_punchlines, index=models)

# Normalizando a coluna 'levenshtein_distance' para escala 0-1 (invertendo, porque menor é melhor)
lev_min = df['levenshtein_distance'].min()
lev_max = df['levenshtein_distance'].max()
df['levenshtein_distance_norm'] = 1 - (df['levenshtein_distance'] - lev_min) / (lev_max - lev_min)

# Seleciona as colunas que vão para o gráfico (usa a normalizada no lugar da original)
df_plot = df.drop(columns=['levenshtein_distance']).rename(columns={'levenshtein_distance_norm': 'levenshtein_distance'})

# Plotando barras agrupadas
ax = df_plot.plot(kind='bar', figsize=(12, 6))

ax.set_ylabel('Valor (normalizado para Levenshtein)')
ax.set_title('Comparação de Métricas por Modelo')
ax.legend(title='Métricas')
plt.xticks(rotation=30, ha='right')
plt.tight_layout()
plt.show()

NameError: name 'models' is not defined