In [1]:
!pip install scikit-learn pandas numpy plotly

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import json
import pandas as pd
import re

# Load the data frame stored in 'all_runs.pkl'.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it
# comes from a trusted source.
df = pd.read_pickle('all_runs.pkl')


def clean_text(text):
    """Normalise a raw answer string before the regex-based field extraction.

    Steps, in order:
      1. every single quote becomes a double quote;
      2. a backslash directly in front of ``_``, ``[``, ``]``, ``{`` or ``}`` is dropped;
      3. the fence opener "```json" and the literal two-character sequence
         backslash + ``n`` are removed (real newline characters are kept).

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    text = text.replace("'", "\"")
    # Raw string: the pattern previously relied on invalid escape sequences
    # (such as \( and \[) in a normal string literal, which newer Python versions warn about.
    text = re.sub(r"\\([_\[\]{}])", r"\1", text)
    for deletion in ["```json", r"\n"]:
        text = text.replace(deletion, "")
    return text


# Clean every raw answer in place; the field extraction below reads this column.
df["answer"] = df["answer"].apply(clean_text)


def extract_str(json_string: str, key: str):
    """Pull the double-quoted string stored under ``key`` out of JSON-like text.

    This is a regex search, not a JSON parse. A value is only recognised when
    the key (optionally followed by a closing quote) is followed by a colon,
    exactly one space, a double-quoted value without inner double quotes, and
    then directly a comma or a newline.

    Args:
        json_string: text to search.
        key: field name; matched literally.

    Returns:
        The value without its surrounding quotes, or None when nothing matches.
    """
    # re.escape keeps regex metacharacters in the key from being interpreted.
    pattern = re.escape(key) + '"?: ("[^"]*")(?:,|\n)'
    match = re.search(pattern, json_string)
    if not match:
        return None
    return match.group(1).strip().strip('"')


def extract_float(json_string: str, key: str):
    """Pull the number stored under ``key`` out of JSON-like text.

    This is a regex search, not a JSON parse. The text between ``key: `` (with
    an optional quote after the key) and the next comma or newline is captured,
    stripped of surrounding whitespace and handed to ``float``.

    Args:
        json_string: text to search.
        key: field name; matched literally.

    Returns:
        The parsed float, or None when the key is not found or the captured
        text is not a valid float literal.
    """
    # re.escape keeps regex metacharacters in the key from being interpreted.
    pattern = re.escape(key) + '"?: (.*?)(?:,|\n)'
    match = re.search(pattern, json_string)
    if not match:
        return None
    value = match.group(1).strip()
    try:
        return float(value)
    except ValueError:
        # The captured text is not a number; treat the field as missing.
        return None


def extract_list(json_string: str, key: str):
    """Pull the bracketed list stored under ``key`` out of JSON-like text.

    This is a regex search, not a JSON parse of the whole text. The list is
    taken from the opening bracket to the first closing bracket and must be
    directly followed by a comma or a newline.

    Args:
        json_string: text to search.
        key: field name; matched literally.

    Returns:
        The list decoded with ``json.loads``; the raw bracketed text when it
        cannot be decoded; None when the key is not found.
    """
    # Raw string: the pattern previously used invalid escape sequences in a normal
    # f-string. re.escape keeps regex metacharacters in the key from being interpreted.
    pattern = re.escape(key) + r'"?: ?(\[[^\]]*\])(?:,|\n)'
    match = re.search(pattern, json_string)
    if not match:
        return None
    value = match.group(1).strip()
    try:
        return json.loads(value)
    except Exception:
        # Best effort: keep the raw text when it is not valid JSON.
        return value


def extract_dict(json_string: str, key: str):
    """Pull the braced object stored under ``key`` out of JSON-like text.

    This is a regex search, not a JSON parse of the whole text. The object is
    taken from the opening brace to the first closing brace, so an object that
    contains a nested object is cut at the inner closing brace.

    Args:
        json_string: text to search.
        key: field name; matched literally.

    Returns:
        The dict decoded with ``json.loads``; the raw braced text when it
        cannot be decoded; None when the key is not found.
    """
    # re.escape keeps regex metacharacters in the key from being interpreted.
    pattern = re.escape(key) + r'"?: ?({[^}]*})'
    match = re.search(pattern, json_string)
    if not match:
        return None
    value = match.group(1).strip()
    try:
        return json.loads(value)
    except Exception:
        # Best effort: keep the raw text when it is not valid JSON.
        return value


# Field names, grouped by the extractor that parses them.
list_keys = [
    "relevant_facts_from_the_cultural_context",
    "contained_explicit_racist_vocabulary",
    "contained_explicit_sexist_vocabulary",
    "contained_explicit_toxic_vocabulary",
]
dict_keys = ["relevant_word_definitions"]
str_keys = [
    "argument_for_hate_speech",
    "argument_against_hate_speech",
]
float_keys = [
    "likelihood_of_presence_of_implicit_hate_speech",
    "likelihood_of_presence_of_explicit_hate_speech",
    # NOTE(review): the saved output reports far more missing values for this column
    # (8604) than for the other likelihood columns - confirm the key spelling matches
    # what the prompts ask for.
    "likelihood_of_ad_hominin_attack",
    "likelihood_of_minority_attack",
    "likelihood_of_takedown_on_social_media",
    "hate_speech_probability",
]

# One new column per field, filled from the cleaned answer text.
# Columns are appended in the order list -> dict -> str -> float.
for extractor, keys in (
    (extract_list, list_keys),
    (extract_dict, dict_keys),
    (extract_str, str_keys),
    (extract_float, float_keys),
):
    for key in keys:
        df[key] = df['answer'].apply(lambda answer: extractor(answer, key))

In [3]:
# Show the rows whose answer text never mentions "hate_speech_probability".
mentions_probability = df['answer'].apply(lambda answer: "hate_speech_probability" in str(answer))
df[~mentions_probability]

Unnamed: 0,prompt_id,model,sample_size,text,answer,labeled_hateful,relevant_facts_from_the_cultural_context,contained_explicit_racist_vocabulary,contained_explicit_sexist_vocabulary,contained_explicit_toxic_vocabulary,relevant_word_definitions,argument_for_hate_speech,argument_against_hate_speech,likelihood_of_presence_of_implicit_hate_speech,likelihood_of_presence_of_explicit_hate_speech,likelihood_of_ad_hominin_attack,likelihood_of_minority_attack,likelihood_of_takedown_on_social_media,hate_speech_probability
2,0,mixtral-8x7b-instruct,478,I was trying to find out if the Egyptians were...,Here is the analysis of the sentence you provi...,False,,,,,,,,,,,,,
9,0,mixtral-8x7b-instruct,478,I bet they wouldnt have done that crap to a bl...,"\n{\n ""relevant_facts"": [\n ""The sentence ...",True,,,,,,,,,,,,,
10,0,mixtral-8x7b-instruct,478,Those boys will use the plastic whisks as swor...,"\n{\n ""relevant_facts"": [\n ""The sentence ...",True,,,,,,,There is no use of explicit hate speech vocabu...,0.67,,,,,
11,0,mixtral-8x7b-instruct,478,The majority of them are as stupid as real Neg...,"\n{\n ""relevant_facts"": [\n ""The sentence ...",True,,,,,,The sentence uses a racial slur and makes a de...,There is no immediate call to violence or harm...,0.20,0.9,,,,
19,0,mixtral-8x7b-instruct,478,More than that I cringe every time I go to han...,"\n{\n ""relevant_facts"": [\n ""The statement...",False,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,3,mistral-7b-instruct,478,This guy seems to get it but otherwise : Sorry...,"{\n""relevant_facts_from_the_cultural_context"":...",True,[White supremacist forums in the US often use ...,[],[],[],"{\n""supremacist"": ""A belief that one""s own rac...",The sentence may not contain any explicit hate...,The sentence does not contain any overtly raci...,0.50,0.1,,0.05,0.3,
234,4,mistral-7b-instruct,478,They are beating each other to death everywher...,"{\n ""relevant_word_definitions"": {\n ""they...",True,,,,,"{\n ""they"": referring to a group of people,...",This statement suggests a sense of superiority...,The statement does not explicitly use derogato...,0.90,0.6,,1.00,,
320,1,fine-tuned-mistral-7b-v0.2-instruct,478,I'm not sure what sort of information you 're ...,"{\n""relevant_facts_from_the_cultural_context"":...",True,[],,,,,,,,,,,,
320,3,fine-tuned-mistral-7b-v0.2-instruct,478,I'm not sure what sort of information you 're ...,"{\n""relevant_facts_from_the_cultural_context"":...",True,[],,,,,,,,,,,,


In [4]:
# Count the missing entries of every column and print the counts.
missing_values = df.isna().sum()
print("Missing values for each column:", missing_values, sep="\n")

df

Missing values for each column:
prompt_id                                            0
model                                                0
sample_size                                          0
text                                                 0
answer                                               0
labeled_hateful                                      0
relevant_facts_from_the_cultural_context          6271
contained_explicit_racist_vocabulary              6239
contained_explicit_sexist_vocabulary              6692
contained_explicit_toxic_vocabulary               6646
relevant_word_definitions                         5290
argument_for_hate_speech                          2585
argument_against_hate_speech                      1859
likelihood_of_presence_of_implicit_hate_speech     254
likelihood_of_presence_of_explicit_hate_speech     392
likelihood_of_ad_hominin_attack                   8604
likelihood_of_minority_attack                      241
likelihood_of_takedown_on_social_

Unnamed: 0,prompt_id,model,sample_size,text,answer,labeled_hateful,relevant_facts_from_the_cultural_context,contained_explicit_racist_vocabulary,contained_explicit_sexist_vocabulary,contained_explicit_toxic_vocabulary,relevant_word_definitions,argument_for_hate_speech,argument_against_hate_speech,likelihood_of_presence_of_implicit_hate_speech,likelihood_of_presence_of_explicit_hate_speech,likelihood_of_ad_hominin_attack,likelihood_of_minority_attack,likelihood_of_takedown_on_social_media,hate_speech_probability
0,0,mistral-7b-instruct,478,So you would refuse even other Nordic whites f...,"{ \n ""relevant_facts"": [\n ""The sentence e...",True,,,,,,The sentence implies that people from certain ...,,0.56,0.12,,0.61,,0.51
1,0,mistral-7b-instruct,478,Just Google it and you 'll see what I mean .,"{ \n ""relevant_facts"": [\n ""The sentence c...",False,,,,,,The sentence does not contain any explicit hat...,,0.05,0.00,,0.00,,0.00
2,0,mistral-7b-instruct,478,"During the early punk scene , he was always se...","{ \n ""relevant_facts"": [\n ""The person in ...",False,,,,,,,,0.27,0.00,,0.10,,0.15
3,0,mistral-7b-instruct,478,"DuetschGirlTX, No I live in a tiny ass town wh...","{ \n ""relevant_facts"": [""The speaker is ident...",True,,,,,,The speaker expresses a sense of needing to ha...,The speaker is not directly targeting or attac...,0.19,0.01,,0.00,,0.06
4,0,mistral-7b-instruct,478,YouTube - Teacher Attacked for Being White 9 y...,"\n{ \n ""relevant_facts"": [\n ""A 9-year-old...",False,,,,,,,The incident occurred during a tantrum or outb...,0.29,0.95,,0.31,,0.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,4,fine-tuned-mistral-7b-v0.2-instruct,478,I have one word for you ....... MOVE . We chos...,"\n{\n ""relevant_word_definitions"": {\n ""wh...",False,,,,,"{\n ""white"": ""a person having European ance...",The speaker expresses a desire for their child...,,0.00,1.00,,0.00,1.0,1.00
474,4,fine-tuned-mistral-7b-v0.2-instruct,478,There are only eight people living within a mi...,"{\n""relevant_word_definitions"": {\n},\n""contai...",False,,,,,{},The text does not contain any explicit hate sp...,,0.00,0.00,,0.00,0.0,0.00
475,4,fine-tuned-mistral-7b-v0.2-instruct,478,you just say blacks hate the cold. the only re...,"\n{\n ""relevant_word_definitions"": {\n ""bl...",True,,,,,"{\n ""blacks"": ""a racial group of people ori...",The statement implies that black people are on...,The statement does not contain any explicit th...,1.00,0.00,,1.00,1.0,1.00
476,4,fine-tuned-mistral-7b-v0.2-instruct,478,"And all those holes below 40,000 feet are fill...","\n{\n ""relevant_word_definitions"": {\n ""ho...",False,,,,,"{\n ""holes"": ""an empty space or an opening ...",The text does not contain any explicit hate sp...,The text is a statement about the presence of ...,0.00,0.00,,0.00,0.0,0.00


In [5]:
import numpy as np
import plotly.express as px

# Put the predicted probability (scaled to 0-100) into bins of width 5 and replace
# each bin by its text form.
df["bin"] = pd.cut(df['hate_speech_probability'] * 100, bins=np.arange(0, 105, 5), include_lowest=True)
df['bin'] = df["bin"].apply(lambda interval: str(interval))

# Count rows per (bin, model, prompt_id, label).
# `observed=False` spells out the current default so the result stays the same if pandas
# changes it. NOTE(review): the warning in the saved output of this cell appears to be
# pandas' notice about that default changing - confirm.
df_grouped = (
    df.groupby(["bin", "model", "prompt_id", "labeled_hateful"], observed=False)
    .count()
    .reset_index()
)
fig = px.box(df_grouped, x="bin", y="text", color="labeled_hateful", hover_name="model")
fig.show()

  df_grouped = df.groupby(["bin", "model", "prompt_id", "labeled_hateful"]).count().reset_index()


In [6]:
import numpy as np
import plotly.express as px

# Put the predicted probability (scaled to 0-100) into bins of width 10 and replace
# each bin by its text form.
df["bin"] = pd.cut(df['hate_speech_probability'] * 100, bins=np.arange(0, 110, 10), include_lowest=True)
df['bin'] = df["bin"].apply(lambda interval: str(interval))

# Only the rows that are NOT labeled hateful.
true_df = df.query("not labeled_hateful")
# `bin` comes from pd.cut and may still be categorical here; `observed=False` spells out
# the current groupby default so the counts do not depend on pandas changing it.
df_grouped = true_df.groupby(["bin", "model", "prompt_id"], observed=False).count().reset_index()
fig = px.scatter(df_grouped, x="bin", y="text", color="model", hover_data="prompt_id", hover_name="model",
                 title="Hate Speech Probability (only non-hatefull, our probability should be 0)")
fig.show()





In [7]:
import numpy as np
import plotly.express as px

# Put the predicted probability (scaled to 0-100) into bins of width 10 and replace
# each bin by its text form.
df["bin"] = pd.cut(df['hate_speech_probability'] * 100, bins=np.arange(0, 110, 10), include_lowest=True)
df['bin'] = df["bin"].apply(lambda interval: str(interval))

# Only the rows that are labeled hateful.
true_df = df.query("labeled_hateful")
# `bin` comes from pd.cut and may still be categorical here; `observed=False` spells out
# the current groupby default so the counts do not depend on pandas changing it.
df_grouped = true_df.groupby(["bin", "model", "prompt_id"], observed=False).count().reset_index()
fig = px.scatter(df_grouped, x="bin", y="text", color="model", hover_data="prompt_id", hover_name="model",
                 title="Hate Speech Probability (only hatefull, our probability should be 1)")
fig.show()





In [8]:
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss, f1_score, accuracy_score

# `metrics`: one record per (model, prompt, probability metric).
# `classified_metrics`: one record per (model, prompt, cutoff) with F1 and accuracy.
metrics = []
classified_metrics = []

# These filters do not depend on the model, so they are applied once instead of once
# per model: keep rows that have a probability and whose prompt_id is outside 2..4.
no_nans = df[df["hate_speech_probability"].notna()]
filtered_by_promt = no_nans[~no_nans["prompt_id"].between(2, 4)]

for model in df["model"].unique():
    if "mistral" not in model:
        continue
    filtered_by_model = filtered_by_promt[filtered_by_promt["model"] == model]

    for prompt_id in filtered_by_model["prompt_id"].unique():
        model_prompt_specific = filtered_by_model[filtered_by_model["prompt_id"] == prompt_id]
        # y_true: the labels; y_pred: the predicted probabilities.
        y_true = model_prompt_specific['labeled_hateful']
        y_pred = model_prompt_specific['hate_speech_probability']
        label = f"{model} - {prompt_id}"

        print(f" {model} - {prompt_id} ".center(46, '-'))
        metrics.append({"color": label, "metric": "roc_auc_score", "value": roc_auc_score(y_true, y_pred)})
        metrics.append({"color": label, "metric": "log_loss", "value": log_loss(y_true, y_pred)})
        metrics.append({"color": label, "metric": "brier_score_loss", "value": brier_score_loss(y_true, y_pred)})

        # Turn the probabilities into classes at cutoffs 0.00, 0.01, ..., 0.99
        # (strictly greater than the cutoff counts as positive).
        for i in range(100):
            cuttoff = i / 100
            classification = y_pred > cuttoff
            classified_metrics.append({"x": cuttoff,
                                       "color": label,
                                       "f1": f1_score(y_true, classification),
                                       "accuracy": accuracy_score(y_true, classification)})

---------- mistral-7b-instruct - 0 -----------
---------- mistral-7b-instruct - 1 -----------
---------- mistral-7b-instruct - 5 -----------
---------- mistral-7b-instruct - 6 -----------
---------- mistral-7b-instruct - 7 -----------
---------- mistral-7b-instruct - 8 -----------
-- fine-tuned-mistral-7b-v0.2-instruct - 0 ---
-- fine-tuned-mistral-7b-v0.2-instruct - 1 ---


In [9]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One bar per "model - prompt" label, one facet column per metric.
fig = px.bar(
    metrics,
    x="color",
    y="value",
    color="color",
    barmode="relative",
    facet_col="metric",
    facet_col_spacing=0.05,
    labels="color",
    height=700,
    width=800,
)

# Give axes 1 to 3 the same titles.
for i in range(1, 4):
    axis_titles = {
        f"xaxis{i}": dict(title="model - promt variation"),
        f"yaxis{i}": dict(title="metric value"),
    }
    fig.update_layout(axis_titles)
fig.update_legends()

# Horizontal legend, anchored by its top centre at x=0.5, y=-0.5.
legend_layout = dict(orientation="h", yanchor="top", y=-0.5, xanchor="center", x=0.5)
fig.update_layout(legend=legend_layout)
fig.show()

In [10]:
import plotly

# F1 (left) and accuracy (right) as a function of the cutoff, one colour per
# "model - prompt" label.
classified_stats = pd.DataFrame(classified_metrics)
fig = make_subplots(rows=1, cols=2, subplot_titles=("F1 scores", "Accuracy scores"))

palette = plotly.colors.DEFAULT_PLOTLY_COLORS
for i, label in enumerate(classified_stats["color"].unique()):
    filtered = classified_stats[classified_stats['color'] == label]
    # Cycle through the palette so that having more labels than palette entries
    # cannot raise an IndexError.
    color = palette[i % len(palette)]
    fig.add_trace(
        go.Scatter(x=filtered["x"], y=filtered["f1"], text=label, legendgroup=label, legendgrouptitle=dict(text=label),
                   mode='markers', name="f1 score", marker=go.scatter.Marker(color=color, symbol="x-thin-open")),
        row=1, col=1)
    fig.add_trace(
        go.Scatter(x=filtered["x"], y=filtered["accuracy"], legendgroup=label, legendgrouptitle=dict(text=label),
                   text=label, mode='markers', name="accuracy",
                   marker=go.scatter.Marker(color=color, symbol="cross-thin-open")),
        row=1, col=2)

fig.update_layout(xaxis=dict(title="by cutoff"), xaxis2=dict(title="by cutoff"), yaxis=dict(title="f1 score"),
                  yaxis2=dict(title="accuracy score"))
fig.show()

In [62]:
# Keep only the columns inspected below. Note that this rebinds `df`.
df = df[["prompt_id", "model", "hate_speech_probability", "labeled_hateful", "text"]]

# Rows of the run under inspection: model "mistral-7b-instruct", prompt 6.
run_mask = (df['model'] == 'mistral-7b-instruct') & (df['prompt_id'] == 6)

# False Negatives: labeled hateful, probability below 0.2
false_negatives = df[run_mask & (df['labeled_hateful'] == True) & (df['hate_speech_probability'] < 0.2)]

# False Positives: labeled not hateful, probability of at least 0.5
false_positives = df[run_mask & (df['labeled_hateful'] == False) & (df['hate_speech_probability'] >= 0.5)]

# Order both frames by the length of the text, shortest first.
false_negatives, false_positives = (
    frame.sort_values(by=['text'], key=lambda texts: texts.str.len())
    for frame in (false_negatives, false_positives)
)

# Show full cell contents instead of truncating long texts.
pd.set_option('display.max_colwidth', None)
print(len(false_negatives), len(false_positives), sep="\n")

print(false_negatives['text'].values[0])
false_positives

52
20
He got what he deserved .


Unnamed: 0,prompt_id,model,hate_speech_probability,labeled_hateful,text
89,6,mistral-7b-instruct,0.1,True,He got what he deserved .
295,6,mistral-7b-instruct,0.05,True,Hell to have a teacher like this .
438,6,mistral-7b-instruct,0.0,True,They probably stole a bunch of stuff .
172,6,mistral-7b-instruct,0.0,True,Maybe spread some HATE and intolerance ?
56,6,mistral-7b-instruct,0.1,True,If we do n't act soon they will bread us out .
423,6,mistral-7b-instruct,0.0,True,To hell with what this creature thinks anyway .
29,6,mistral-7b-instruct,0.03,True,That 's to assume the gay ones are n't abusive .
443,6,mistral-7b-instruct,0.0,True,"Its simple , they want to create white guilt , ."
473,6,mistral-7b-instruct,0.1,True,"But as for them adopting kids , the answer is NO ."
199,6,mistral-7b-instruct,0.1,True,If that stopped the worthless thing would starve !
