In [None]:
!pip install scikit-learn pandas numpy plotly

In [None]:
import json
import pandas as pd
import re

# Load the collected runs into the frame every later cell works on.
# NOTE(review): unpickling can execute arbitrary code, so 'all_runs.pkl' must come from a trusted source.
# Later cells read the columns "answer", "model", "prompt_id", "labeled_hateful" and "text" from this frame.
df = pd.read_pickle('all_runs.pkl')


def clean_text(text):
    """Normalise a raw answer string so it can be scanned for JSON-style fields.

    Steps, in order:
      1. every single quote is turned into a double quote;
      2. a backslash directly in front of ``_``, ``[``, ``]``, ``{`` or ``}`` is dropped
         (undoes Markdown-style escaping of those characters);
      3. the literal fence marker "```json" and the literal two-character sequence
         backslash + ``n`` are deleted (real newline characters are kept).

    Args:
        text: the raw answer; must be a ``str``.

    Returns:
        The cleaned string.
    """
    text = text.replace("'", '"')
    # Raw string: the previous non-raw literal relied on invalid escape sequences
    # (``\(``, ``\[``, ``\]``), which trigger a SyntaxWarning on recent Python versions.
    text = re.sub(r"\\([_\[\]{}])", r"\1", text)
    for deletion in ("```json", r"\n"):
        text = text.replace(deletion, "")
    return text


# Overwrite the "answer" column with its cleaned form before any field is extracted.
# NOTE(review): clean_text calls str methods, so a non-string answer (e.g. NaN) would raise here — confirm the column holds only strings.
df["answer"] = df["answer"].apply(clean_text)


def extract_str(json_string: str, key: str):
    """Return the double-quoted string stored under ``key`` in JSON-like text.

    The text is searched with a regular expression rather than parsed, so it may be
    malformed JSON. The value must be a ``"..."`` literal without embedded double
    quotes and must be followed by a comma, a newline, a closing brace or the end
    of the text (so the last entry of an object is found as well).

    Args:
        json_string: text to search.
        key: field name; matched literally, with an optional closing quote before the colon.

    Returns:
        The value without its surrounding quotes, or ``None`` when no such field is found.
    """
    # re.escape keeps regex metacharacters in the key from being interpreted as a pattern.
    pattern = re.escape(key) + r'"?:\s*("[^"]*")\s*(?:,|\n|\}|$)'
    match = re.search(pattern, json_string)
    if match is None:
        return None
    return match.group(1).strip().strip('"')


def extract_float(json_string: str, key: str):
    """Return the numeric value stored under ``key`` in JSON-like text.

    The text is searched with a regular expression rather than parsed. The value is
    everything after the colon up to the next comma, newline, closing brace or the
    end of the text, so the last entry of an object is found as well.

    Args:
        json_string: text to search.
        key: field name; matched literally, with an optional closing quote before the colon.

    Returns:
        The value as ``float``, or ``None`` when the field is absent or its text
        cannot be converted by ``float()``.
    """
    # re.escape keeps regex metacharacters in the key from being interpreted as a pattern.
    pattern = re.escape(key) + r'"?:\s*(.*?)\s*(?:,|\n|\}|$)'
    match = re.search(pattern, json_string)
    if match is None:
        return None
    value = match.group(1).strip()
    try:
        return float(value)
    except ValueError:
        # Not a number (e.g. a word or an empty value): treat as missing.
        return None


def extract_list(json_string: str, key: str):
    """Return the flat ``[...]`` list stored under ``key`` in JSON-like text.

    The text is searched with a regular expression rather than parsed. The list may
    not contain a nested ``]`` and must be followed by a comma, a newline, a closing
    brace or the end of the text (so the last entry of an object is found as well).

    Args:
        json_string: text to search.
        key: field name; matched literally, with an optional closing quote before the colon.

    Returns:
        The decoded list when the bracketed text is valid JSON, the raw bracketed
        text (``str``) when it is not, or ``None`` when no such field is found.
    """
    # Raw string: the previous non-raw f-string relied on invalid escape sequences
    # (``\[``, ``\]``). re.escape keeps metacharacters in the key literal.
    pattern = re.escape(key) + r'"?:\s*(\[[^\]]*\])\s*(?:,|\n|\}|$)'
    match = re.search(pattern, json_string)
    if match is None:
        return None
    value = match.group(1).strip()
    try:
        return json.loads(value)
    except ValueError:
        # json.JSONDecodeError is a ValueError: keep the undecodable text for inspection.
        return value


def extract_dict(json_string: str, key: str):
    """Find the flat ``{...}`` object stored under ``key`` in JSON-like text.

    The object is located by regular expression and ends at the first ``}``.

    Returns:
        The decoded object when the braced text is valid JSON, the raw braced text
        when decoding fails, or ``None`` when the field is not found.
    """
    found = re.search(key + r'"?: ?({[^}]*})', json_string)
    if found is None:
        return None
    raw = found.group(1).strip()
    try:
        decoded = json.loads(raw)
    except Exception:
        # Undecodable: hand back the text as it was matched.
        return raw
    return decoded


# Field names to pull out of the cleaned "answer" text, grouped by the kind of value they hold.
list_keys = [
    "relevant_facts_from_the_cultural_context",
    "contained_explicit_racist_vocabulary",
    "contained_explicit_sexist_vocabulary",
    "contained_explicit_toxic_vocabulary",
]
dict_keys = ["relevant_word_definitions"]
str_keys = [
    "argument_for_hate_speech",
    "argument_against_hate_speech",
]
float_keys = [
    "likelihood_of_presence_of_implicit_hate_speech",
    "likelihood_of_presence_of_explicit_hate_speech",
    "likelihood_of_ad_hominin_attack",
    "likelihood_of_minority_attack",
    "likelihood_of_takedown_on_social_media",
    "hate_speech_probability",
]

# Add one column per field, using the extractor that matches the field's kind.
# The groups are processed in the order list -> dict -> str -> float.
for keys, extractor in (
    (list_keys, extract_list),
    (dict_keys, extract_dict),
    (str_keys, extract_str),
    (float_keys, extract_float),
):
    for key in keys:
        df[key] = df['answer'].apply(lambda answer: extractor(answer, key))

In [None]:
# Show the rows whose answer text never mentions "hate_speech_probability".
lacks_probability_key = df['answer'].apply(lambda answer: "hate_speech_probability" not in str(answer))
df[lacks_probability_key]

In [None]:
# Tally the null entries of every column, print the tally, then display the frame.
missing_values = df.isna().sum()
print("Missing values for each column:", missing_values, sep="\n")

df

In [None]:
import numpy as np
import plotly.express as px

# Bucket the probability (scaled to 0-100) into 5-point-wide bins, then replace each bin by its string form.
five_point_edges = np.arange(0, 105, 5)
df["bin"] = pd.cut(df['hate_speech_probability'] * 100, bins=five_point_edges, include_lowest=True)
df['bin'] = df["bin"].apply(lambda interval: str(interval))

# Count rows per (bin, model, prompt, label) group; after count() the "text" column holds the
# number of non-null texts in each group, which is what the box plot shows on its y axis.
group_columns = ["bin", "model", "prompt_id", "labeled_hateful"]
df_grouped = df.groupby(group_columns).count().reset_index()
fig = px.box(df_grouped, x="bin", y="text", color="labeled_hateful", hover_name="model")
fig.show()

In [None]:
import numpy as np
import plotly.express as px

# Bucket the probability (scaled to 0-100) into 10-point-wide bins and store each bin's string form.
binned = pd.cut(df['hate_speech_probability'] * 100, bins=np.arange(0, 110, 10), include_lowest=True)
df["bin"] = binned.apply(lambda interval: str(interval))

# Keep only the rows not labelled hateful and count them per (bin, model, prompt) group;
# after count() the "text" column holds the number of non-null texts in each group.
true_df = df.query("not labeled_hateful")
df_grouped = (
    true_df
    .groupby(["bin", "model", "prompt_id"])
    .count()
    .reset_index()
)
fig = px.scatter(
    df_grouped,
    x="bin",
    y="text",
    color="model",
    hover_data="prompt_id",
    hover_name="model",
    title="Hate Speech Probability (only non-hatefull, our probability should be 0)",
)
fig.show()

In [None]:
import numpy as np
import plotly.express as px

# Bucket the probability (scaled to 0-100) into 10-point-wide bins and store each bin's string form.
ten_point_edges = np.arange(0, 110, 10)
scaled_probability = df['hate_speech_probability'] * 100
df["bin"] = pd.cut(scaled_probability, bins=ten_point_edges, include_lowest=True)
df['bin'] = df["bin"].apply(lambda interval: str(interval))

# Keep only the rows labelled hateful and count them per (bin, model, prompt) group;
# after count() the "text" column holds the number of non-null texts in each group.
true_df = df.query("labeled_hateful")
group_columns = ["bin", "model", "prompt_id"]
df_grouped = true_df.groupby(group_columns).count().reset_index()
scatter_title = "Hate Speech Probability (only hatefull, our probability should be 1)"
fig = px.scatter(df_grouped, x="bin", y="text", color="model", hover_data="prompt_id", hover_name="model",
                 title=scatter_title)
fig.show()

In [None]:
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss, f1_score, accuracy_score

# Score every (model, prompt) combination of the "mistral" models.
#   metrics            - one record per combination and probabilistic metric
#                        (ROC AUC, log loss, Brier score) computed on the raw probabilities.
#   classified_metrics - one record per combination and cutoff in 0.00, 0.01, ..., 0.99 with the
#                        F1 and accuracy obtained when "probability > cutoff" is the predicted label.
metrics = []
classified_metrics = []

# Both filters are the same for every model, so they are applied once up front:
# drop rows without a parsed probability and rows whose prompt_id lies in 2..4 (inclusive).
no_nans = df[df["hate_speech_probability"].notna()]
filtered_by_promt = no_nans[~no_nans["prompt_id"].between(2, 4)]

for model in df["model"].unique():
    if "mistral" not in model:
        continue
    filtered_by_model = filtered_by_promt[filtered_by_promt["model"] == model]

    for prompt_id in filtered_by_model["prompt_id"].unique():
        model_prompt_specific = filtered_by_model[filtered_by_model["prompt_id"] == prompt_id]
        y_true = model_prompt_specific['labeled_hateful']  # ground-truth labels
        y_pred = model_prompt_specific['hate_speech_probability']  # predicted probabilities
        label = f"{model} - {prompt_id}"

        print(f" {model} - {prompt_id} ".center(46, '-'))
        metrics.append({"color": label, "metric": "roc_auc_score", "value": roc_auc_score(y_true, y_pred)})
        metrics.append({"color": label, "metric": "log_loss", "value": log_loss(y_true, y_pred)})
        metrics.append({"color": label, "metric": "brier_score_loss", "value": brier_score_loss(y_true, y_pred)})

        for i in range(100):
            cutoff = i / 100
            # Vectorised comparison instead of a per-element lambda; same boolean result.
            classification = y_pred > cutoff
            classified_metrics.append({"x": cutoff,
                                       "color": label,
                                       "f1": f1_score(y_true, classification),
                                       "accuracy": accuracy_score(y_true, classification)})

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One bar per "model - prompt" label, one facet column per metric.
# `labels` is omitted: px.bar expects a dict mapping column names to display labels there,
# and the string that used to be passed could not supply any label.
fig = px.bar(metrics, x="color", y="value", barmode="relative", color="color",
             facet_col="metric", facet_col_spacing=0.05, height=700, width=800)

# Give the x and y axis of every facet the same title.
fig.update_xaxes(title_text="model - promt variation")
fig.update_yaxes(title_text="metric value")

# Horizontal legend, centred below the plot area.
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="top",
        y=-0.5,
        xanchor="center",
        x=0.5
    ))
fig.show()

In [None]:
import plotly

# Plot F1 (left) and accuracy (right) against the classification cutoff,
# one colour and one legend group per "model - prompt" label.
classified_stats = pd.DataFrame(classified_metrics)
fig = make_subplots(rows=1, cols=2, subplot_titles=("F1 scores", "Accuracy scores"))

palette = plotly.colors.DEFAULT_PLOTLY_COLORS
for i, label in enumerate(classified_stats["color"].unique()):
    filtered = classified_stats[classified_stats['color'] == label]
    # Wrap around the palette so more labels than palette entries no longer raise IndexError.
    color = palette[i % len(palette)]
    fig.add_trace(
        go.Scatter(x=filtered["x"], y=filtered["f1"], text=label, legendgroup=label, legendgrouptitle=dict(text=label),
                   mode='markers', name="f1 score", marker=go.scatter.Marker(color=color, symbol="x-thin-open")),
        row=1, col=1)
    fig.add_trace(
        go.Scatter(x=filtered["x"], y=filtered["accuracy"], legendgroup=label, legendgrouptitle=dict(text=label),
                   text=label, mode='markers', name="accuracy",
                   marker=go.scatter.Marker(color=color, symbol="cross-thin-open")),
        row=1, col=2)

fig.update_layout(xaxis=dict(title="by cutoff"), xaxis2=dict(title="by cutoff"), yaxis=dict(title="f1 score"),
                  yaxis2=dict(title="accuracy score"))
fig.show()