In [ ]:
!pip install scikit-learn pandas numpy plotly kaleido

In [ ]:
import json
import pandas as pd
import re

# Load the pickled DataFrame that every later cell works on.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# 'all_runs.pkl' should only ever come from a trusted source.
df = pd.read_pickle('all_runs.pkl')


def clean_text(text):
    """Normalise a raw answer string before the regex-based extraction.

    The following rewrites are applied, in this order:

    1. every single quote is replaced by a double quote;
    2. a backslash that directly precedes one of ``_ [ ] { }`` is dropped,
       leaving only the character itself;
    3. the literal marker "```json" and the literal two-character sequence
       backslash + ``n`` are deleted (real newline characters are kept).

    Args:
        text: the raw answer text (must be a ``str``).

    Returns:
        The cleaned string.
    """
    text = text.replace("'", '"')
    # Raw string: the pattern is "one backslash, then one of _ [ ] { }".
    # Written as a normal string literal it relied on invalid escape sequences
    # (\( \[ \]), which newer Python versions warn about.
    text = re.sub(r"\\([_\[\]{}])", r"\1", text)
    for deletion in ("```json", r"\n"):
        text = text.replace(deletion, "")
    return text


# Run clean_text over every entry of the "answer" column and store the result
# back in place, so the extractors defined below see the normalised text.
df["answer"] = df["answer"].apply(clean_text)


def extract_str(json_string: str, key: str):
    """Return the double-quoted string stored under *key*, or ``None``.

    The text is searched with a regular expression rather than parsed as JSON,
    so it also works on answers that are not valid JSON as a whole. The first
    occurrence of ``key`` (optionally followed by a closing quote), then
    ``": "`` and a double-quoted value wins.

    The value must be followed (after optional whitespace) by a comma, a
    newline, a closing brace or the end of the text. Accepting ``}`` and the
    end of the text lets a value that is the last entry of a compact object be
    found as well.

    Args:
        json_string: the text to search.
        key: the key name; it is matched literally (regex metacharacters in it
            are escaped).

    Returns:
        The value without its surrounding quotes, or ``None`` when the key is
        not found in the expected form.
    """
    pattern = rf'{re.escape(key)}"?: ("[^"]*")\s*(?:,|\n|}}|$)'
    match = re.search(pattern, json_string)
    if not match:
        return None
    return match.group(1).strip().strip('"')


def extract_float(json_string: str, key: str):
    """Return the number stored under *key* as a ``float``, or ``None``.

    The text is searched with a regular expression rather than parsed as JSON.
    Everything between ``key": `` and the next comma, newline, closing brace
    or end of the text is taken as the candidate value.

    Stopping at ``}`` / end of text matters for the last entry of a compact
    object such as ``{"p": 0.8}``: without it the value was either not found
    at all or picked up the brace and failed to parse.

    Args:
        json_string: the text to search.
        key: the key name; it is matched literally (regex metacharacters in it
            are escaped).

    Returns:
        The parsed float, or ``None`` when the key is missing or its value is
        not something ``float()`` accepts.
    """
    pattern = rf'{re.escape(key)}"?: (.*?)(?:,|\n|}}|$)'
    match = re.search(pattern, json_string)
    if not match:
        return None
    try:
        return float(match.group(1).strip())
    except ValueError:
        # Not a number (e.g. a word or a quoted value): treat as missing.
        return None


def extract_list(json_string: str, key: str):
    """Return the list stored under *key*, or ``None`` if it is not found.

    The text is searched with a regular expression rather than parsed as JSON.
    The candidate runs from the first ``[`` after ``key":`` up to the next
    ``]``, so a nested list is cut at its first closing bracket. The candidate
    must be followed (after optional whitespace) by a comma, a newline, a
    closing brace or the end of the text.

    Args:
        json_string: the text to search.
        key: the key name; it is matched literally (regex metacharacters in it
            are escaped).

    Returns:
        The decoded list when the bracketed text is valid JSON, the raw
        bracketed text when it is not, or ``None`` when nothing matched.
    """
    # Raw f-string so that \[ and \] reach the regex engine as written instead
    # of relying on invalid string escapes; "}}" is a literal closing brace.
    pattern = rf'{re.escape(key)}"?: ?(\[[^\]]*\])\s*(?:,|\n|}}|$)'
    match = re.search(pattern, json_string)
    if not match:
        return None
    value = match.group(1).strip()
    try:
        return json.loads(value)
    except ValueError:
        # Best effort: hand back the unparsed text rather than losing it.
        return value


def extract_dict(json_string: str, key: str):
    """Return the object stored under *key*, or ``None`` if it is not found.

    The candidate is the text from the first ``{`` after ``key":`` up to the
    next ``}``, so a nested object is cut at its first closing brace.

    Returns:
        The decoded dict when that text is valid JSON, the raw text when it is
        not, or ``None`` when nothing matched.
    """
    found = re.search(key + r'"?: ?({[^}]*})', json_string)
    if found is None:
        return None

    raw = found.group(1).strip()
    try:
        decoded = json.loads(raw)
    except Exception:
        # Keep the unparsed text instead of dropping the value.
        return raw
    return decoded


# Keys to pull out of each answer, grouped by the kind of value they hold.
list_keys = [
    "relevant_facts_from_the_cultural_context",
    "contained_explicit_racist_vocabulary",
    "contained_explicit_sexist_vocabulary",
    "contained_explicit_toxic_vocabulary",
]

dict_keys = [
    "relevant_word_definitions"
]

str_keys = [
    "argument_for_hate_speech",
    "argument_against_hate_speech",
]

float_keys = [
    "likelihood_of_presence_of_implicit_hate_speech",
    "likelihood_of_presence_of_explicit_hate_speech",
    "likelihood_of_ad_hominin_attack",
    "likelihood_of_minority_attack",
    "likelihood_of_takedown_on_social_media",
    "hate_speech_probability",
]

# Create one DataFrame column per key by running the matching extractor over
# every answer. Order is lists, dicts, strings, floats.
for extractor, keys in (
    (extract_list, list_keys),
    (extract_dict, dict_keys),
    (extract_str, str_keys),
    (extract_float, float_keys),
):
    for key in keys:
        # The lambda is consumed immediately by apply(), so it always sees the
        # current extractor/key pair.
        df[key] = df['answer'].apply(lambda answer: extractor(answer, key))

In [ ]:
# Display the rows whose answer text does not contain the substring
# "hate_speech_probability" at all (str() guards against non-string entries).
df[df['answer'].apply(lambda x: "hate_speech_probability" not in str(x))]

In [ ]:
# Count the null entries of every column, print the counts, then display the
# whole frame as the cell result.
missing_values = df.isnull().sum()
print("Missing values for each column:", missing_values, sep="\n")

df

In [ ]:
import numpy as np
import plotly.express as px

# Bucket the probabilities (scaled by 100) into 5-wide intervals from 0 to 100.
# include_lowest makes the first interval contain its left edge.
df["bin"] = pd.cut(
    df['hate_speech_probability'] * 100,
    bins=np.arange(0, 105, 5),
    include_lowest=True,
)
# Replace each interval by its string form.
df['bin'] = df["bin"].apply(lambda interval: str(interval))

# Count the rows of every (bin, model, prompt_id, labeled_hateful) group.
df_grouped = (
    df.groupby(["bin", "model", "prompt_id", "labeled_hateful"])
    .count()
    .reset_index()
)

# Box plot of the per-group counts (taken from the "text" column) for each bin,
# coloured by the label.
fig = px.box(
    df_grouped,
    x="bin",
    y="text",
    color="labeled_hateful",
    hover_name="model",
)
fig.update_layout(
    xaxis={"title": "our hate speech probability"},
    yaxis={"title": "number of scores by model"},
)
fig.write_image("labeled_all_runs_all_models.png", width=2000, height=600)
fig.show()

In [ ]:
import numpy as np
import plotly.express as px

# Bucket the probabilities (scaled by 100) into 10-wide intervals from 0 to 100.
df["bin"] = pd.cut(
    df['hate_speech_probability'] * 100,
    bins=np.arange(0, 110, 10),
    include_lowest=True,
)
# Replace each interval by its string form.
df['bin'] = df["bin"].apply(lambda interval: str(interval))

# Keep only the rows that are NOT labelled hateful, then count the rows of
# every (bin, model, prompt_id) group.
true_df = df.query("not labeled_hateful")
df_grouped = (
    true_df.groupby(["bin", "model", "prompt_id"])
    .count()
    .reset_index()
)

fig = px.scatter(
    df_grouped,
    x="bin",
    y="text",
    color="model",
    hover_data="prompt_id",
    hover_name="model",
    title="Hate Speech Probability (only non-hatefull, our probability should be 0)",
)
fig.update_layout(
    xaxis={"title": "our hate speech probability"},
    yaxis={"title": "number of scores by model"},
)
fig.write_image("hate_prob_per_model_only-non-hatefull.png", width=2000, height=600)
fig.show()

In [ ]:
import numpy as np
import plotly.express as px

# Bucket the probabilities (scaled by 100) into 10-wide intervals from 0 to 100.
df["bin"] = pd.cut(
    df['hate_speech_probability'] * 100,
    bins=np.arange(0, 110, 10),
    include_lowest=True,
)
# Replace each interval by its string form.
df['bin'] = df["bin"].apply(lambda interval: str(interval))

# Keep only the rows labelled hateful, then count the rows of every
# (bin, model, prompt_id) group.
true_df = df.query("labeled_hateful")
df_grouped = (
    true_df.groupby(["bin", "model", "prompt_id"])
    .count()
    .reset_index()
)

fig = px.scatter(
    df_grouped,
    x="bin",
    y="text",
    color="model",
    hover_data="prompt_id",
    hover_name="model",
    title="Hate Speech Probability (only hatefull, our probability should be 1)",
)
fig.update_layout(
    xaxis={"title": "our hate speech probability"},
    yaxis={"title": "number of scores by model"},
)
fig.write_image("hate_prob_per_model_only-hatefull.png", width=2000, height=600)
fig.show()

In [ ]:
from sklearn.metrics import auc


def draw_roc_curve(fpr, tpr, label):
    """Plot a ROC curve, save it as a PNG and show it.

    Args:
        fpr: false positive rates (x values of the curve).
        tpr: true positive rates (y values of the curve).
        label: text used in the plot title; with every " - " replaced by "_"
            it also forms the start of the output file name.
    """
    area = auc(fpr, tpr)
    axis_names = {"x": 'False Positive Rate', "y": 'True Positive Rate'}
    figure = px.line(
        x=fpr,
        y=tpr,
        title=f'ROC Curve {label} (AUC={area:.4f})',
        labels=axis_names,
        width=700,
        height=500,
    )

    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    figure.add_shape(type='line', line={"dash": 'dash'}, x0=0, x1=1, y0=0, y1=1)

    figure.update_layout(title_x=0.5)
    # Tie the y scale to the x scale so both axes use the same unit length.
    figure.update_yaxes(scaleanchor="x", scaleratio=1)
    figure.update_xaxes(constrain='domain')

    file_name = label.replace(" - ", "_") + "_roc_curve.png"
    figure.write_image(file_name)
    figure.show()

In [ ]:
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss, f1_score, accuracy_score, roc_curve

# metrics: one entry per (model/prompt label, metric name) with the score.
# classified_metrics: one entry per (model/prompt label, cutoff) with the F1
# and accuracy obtained when thresholding the probability at that cutoff.
metrics = []
classified_metrics = []

# Rows without an extracted probability cannot be scored. The filter does not
# depend on the model, so it is computed once instead of once per model.
no_nans = df[df["hate_speech_probability"].notna()]
# Optional extra filter (disabled):
#filtered_by_promt = no_nans[no_nans["prompt_id"].apply(lambda prompt_id:not (2<=prompt_id<=4))]

for model in df["model"].unique():
    # Only models whose name contains "mistral" are evaluated.
    if "mistral" not in model:
        continue
    filtered_by_model = no_nans[no_nans["model"] == model]

    for prompt_id in filtered_by_model["prompt_id"].unique():
        model_prompt_specific = filtered_by_model[filtered_by_model["prompt_id"] == prompt_id]
        # y_true: the reference labels, y_pred: the extracted probabilities.
        y_true = model_prompt_specific['labeled_hateful']
        y_pred = model_prompt_specific['hate_speech_probability']

        print(f" {model} - {prompt_id} ".center(46, '-'))
        label = f"{model} - {prompt_id}"
        metrics.append({"color": label, "metric": "roc_auc_score", "value": roc_auc_score(y_true, y_pred)})
        metrics.append({"color": label, "metric": "log_loss", "value": log_loss(y_true, y_pred)})
        metrics.append({"color": label, "metric": "brier_score_loss", "value": brier_score_loss(y_true, y_pred)})

        # Sweep the decision cutoff over 0.00, 0.01, ..., 0.99.
        for i in range(100):
            cuttoff = i / 100
            # Vectorized comparison: True where the probability exceeds the cutoff.
            classification = y_pred > cuttoff
            classified_metrics.append({"x": cuttoff,
                                       "color": label,
                                       "f1": f1_score(y_true, classification),
                                       "accuracy": accuracy_score(y_true, classification)})
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        draw_roc_curve(fpr, tpr, label)


In [ ]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Horizontal bars: the metric value runs along x, one bar per "model - prompt"
# label on y, and one facet column per metric.
# NOTE(review): `labels` is documented as a dict of column name -> display
# name; the bare string passed here looks unintended - confirm before changing.
fig = px.bar(metrics, x="value", y="color", barmode="relative", color="color",
             facet_col="metric", facet_col_spacing=0.05, labels="color")

# Title every facet's axes to match what they actually show: x holds the metric
# value and y the model/prompt label (the two titles used to be swapped).
# matches=None gives each facet its own x range.
fig.update_xaxes(title_text="metric value", matches=None)
fig.update_yaxes(title_text="model - promt variation")


fig.write_image("model_promt_variations.png", width=2000, height=600)
fig.show()

In [ ]:
import plotly

# Turn the collected per-cutoff records into a DataFrame and prepare a figure
# with two side-by-side panels: F1 on the left, accuracy on the right.
classified_stats = pd.DataFrame(classified_metrics)
fig = make_subplots(rows=1, cols=2, subplot_titles=("F1 scores", "Accuracy scores"))

for i, label in enumerate(classified_stats["color"].unique()):
    # Only labels containing "- 0" or "- 6" are plotted
    # (presumably the runs with prompt_id 0 and 6 - confirm).
    if "- 0" not in label and "- 6" not in label:
        continue
    filtered = classified_stats[classified_stats['color'] == label]
    # Pick a colour by the label's position, wrapping around the default palette.
    color = plotly.colors.DEFAULT_PLOTLY_COLORS[i % len(plotly.colors.DEFAULT_PLOTLY_COLORS)]
    # Left panel: F1 score against the cutoff.
    fig.add_trace(
        go.Scatter(x=filtered["x"], y=filtered["f1"], text=label, legendgroup=label, legendgrouptitle=dict(text=label),
                   mode='markers', name="f1 score", marker=go.scatter.Marker(color=color, symbol="x-thin-open")),
        row=1, col=1)
    # Right panel: accuracy against the cutoff, same colour and legend group.
    fig.add_trace(
        go.Scatter(x=filtered["x"], y=filtered["accuracy"], legendgroup=label, legendgrouptitle=dict(text=label),
                   text=label, mode='markers', name="accuracy",
                   marker=go.scatter.Marker(color=color, symbol="cross-thin-open")),
        row=1, col=2)

# Axis titles for both panels.
fig.update_layout(xaxis=dict(title="by cutoff"), xaxis2=dict(title="by cutoff"), yaxis=dict(title="f1 score"),
                  yaxis2=dict(title="accuracy score"))
# Horizontal legend, centred at x=0.5 and anchored by its top edge at y=-0.5.
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="top",
        y=-0.5,
        xanchor="center",
        x=0.5
    ))
# Fix the y range of every panel to [0, 1].
fig.update_yaxes(range=[0, 1])

fig.write_image("fi_accuracy.png", width=2000, height=700)
fig.show()

In [ ]:
# Keep only the columns needed to inspect individual misclassifications.
df = df[["prompt_id", "model", "hate_speech_probability", "labeled_hateful", "text"]]

# Both selections look at the same model / prompt combination.
is_target_run = (df['model'] == 'mistral-7b-instruct') & (df['prompt_id'] == 6)

# False Negatives: labelled hateful but scored below 0.2.
false_negatives = df[
    is_target_run &
    (df['labeled_hateful'] == True) &
    (df['hate_speech_probability'] < 0.2)
]

# False Positives: labelled not hateful but scored at 0.5 or above.
false_positives = df[
    is_target_run &
    (df['labeled_hateful'] == False) &
    (df['hate_speech_probability'] >= 0.5)
]

# Order both selections by the length of their text, shortest first.
false_negatives = false_negatives.sort_values(by=['text'], key=lambda x: x.str.len())
false_positives = false_positives.sort_values(by=['text'], key=lambda x: x.str.len())

# Show full cell contents instead of truncating long texts.
pd.set_option('display.max_colwidth', None)
print(len(false_negatives))
print(len(false_positives))

# Print the shortest false-negative text. Indexing [0] on an empty selection
# would raise IndexError, so that case is reported instead.
if not false_negatives.empty:
    print(false_negatives['text'].values[0])
else:
    print("No false negatives for this model/prompt combination.")
false_positives