In [33]:
!pip install scikit-learn pandas numpy plotly



In [34]:
import json
import pandas as pd
import re

df = pd.read_pickle('all_runs.pkl')
def clean_text(text):
    text = text.replace("'", "\"")
    text = re.sub("\\\([_\[\]{}])", r"\1",text)
    for deletion in ["```json", r"\n"]:
        text = text.replace(deletion, "")
    return text
df["answer"]=df["answer"].apply(clean_text)

def extract_str(json_string:str, key: str):
    pattern = f'{key}"?: ("[^"]*")(?:,|\n)'
    match = re.search(pattern, json_string)
    if not match:
        return None
    return match.group(1).strip().strip('"') if match else None

def extract_float(json_string:str, key: str):
    pattern = f'{key}"?: (.*?)(?:,|\n)'
    match = re.search(pattern, json_string)
    if not match:
        return None
    value = match.group(1).strip()
    try:
        return float(value)
    except Exception:
        return None

def extract_list(json_string:str, key: str):
    pattern = f'{key}"?: ?(\[[^\]]*\])(?:,|\n)'
    match = re.search(pattern, json_string)
    if not match:
        return None
    value = match.group(1).strip()
    try:
        return json.loads(value)
    except Exception:
        return value

def extract_dict(json_string:str, key: str):
    pattern = key+ r'"?: ?({[^}]*})'
    match = re.search(pattern, json_string)
    if not match:
        return None
    value = match.group(1).strip()
    try:
        return json.loads(value)
    except Exception:
        return value

list_keys=     [
    "relevant_facts_from_the_cultural_context",
    "contained_explicit_racist_vocabulary",
    "contained_explicit_sexist_vocabulary",
    "contained_explicit_toxic_vocabulary",
]

for key in list_keys:
    df[key] = df['answer'].apply(lambda x: extract_list(x, key))

dict_keys=[
    "relevant_word_definitions"    
]

for key in dict_keys:
    df[key] = df['answer'].apply(lambda x: extract_dict(x, key))

str_keys = [
    "argument_for_hate_speech",
    "argument_against_hate_speech",
]

for key in str_keys:
    df[key] = df['answer'].apply(lambda x: extract_str(x, key))
    
float_keys = [
    "likelihood_of_presence_of_implicit_hate_speech",
    "likelihood_of_presence_of_explicit_hate_speech",
    "likelihood_of_ad_hominin_attack",
    "likelihood_of_minority_attack",
    "likelihood_of_takedown_on_social_media",
    "hate_speech_probability",
]

for key in float_keys:
    df[key] = df['answer'].apply(lambda x: extract_float(x, key))

In [35]:
df[df['answer'].apply(lambda x: "hate_speech_probability" not in str(x))]

Unnamed: 0,prompt_id,model,sample_size,text,answer,labeled_hateful,relevant_facts_from_the_cultural_context,contained_explicit_racist_vocabulary,contained_explicit_sexist_vocabulary,contained_explicit_toxic_vocabulary,relevant_word_definitions,argument_for_hate_speech,argument_against_hate_speech,likelihood_of_presence_of_implicit_hate_speech,likelihood_of_presence_of_explicit_hate_speech,likelihood_of_ad_hominin_attack,likelihood_of_minority_attack,likelihood_of_takedown_on_social_media,hate_speech_probability
2,0,mixtral-8x7b-instruct,478,I was trying to find out if the Egyptians were...,Here is the analysis of the sentence you provi...,False,,,,,,,,,,,,,
9,0,mixtral-8x7b-instruct,478,I bet they wouldnt have done that crap to a bl...,"\n{\n ""relevant_facts"": [\n ""The sentence ...",True,,,,,,,,,,,,,
10,0,mixtral-8x7b-instruct,478,Those boys will use the plastic whisks as swor...,"\n{\n ""relevant_facts"": [\n ""The sentence ...",True,,,,,,,There is no use of explicit hate speech vocabu...,0.67,,,,,
11,0,mixtral-8x7b-instruct,478,The majority of them are as stupid as real Neg...,"\n{\n ""relevant_facts"": [\n ""The sentence ...",True,,,,,,The sentence uses a racial slur and makes a de...,There is no immediate call to violence or harm...,0.20,0.9,,,,
19,0,mixtral-8x7b-instruct,478,More than that I cringe every time I go to han...,"\n{\n ""relevant_facts"": [\n ""The statement...",False,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,1,mixtral-8x7b-instruct,478,I got sloppy seconds after some Muslims used t...,"\n{\n ""relevant_facts_from_the_cultural_conte...",True,"[\n ""In the USA, there is a history of Isla...",[],[sloppy seconds],,{'Muslims': 'Members of a monotheistic religio...,,,,,,,,
3,2,mistral-7b-instruct,478,The students should be able to express themsel...,I understand your request and I will do my bes...,False,,,,,,,,,,,,,
91,3,mistral-7b-instruct,478,You could try looking at the National socialis...,"I understand the format you""d like me to respo...",False,,,,,,,,,,,,,
305,3,mistral-7b-instruct,478,This guy seems to get it but otherwise : Sorry...,"{\n""relevant_facts_from_the_cultural_context"":...",True,[White supremacist forums in the US often use ...,[],[],[],"{\n""supremacist"": ""A belief that one""s own rac...",The sentence may not contain any explicit hate...,The sentence does not contain any overtly raci...,0.50,0.1,,0.05,0.3,


In [36]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values for each column:")
print(missing_values)

data_types = df.dtypes

Missing values for each column:
prompt_id                                            0
model                                                0
sample_size                                          0
text                                                 0
answer                                               0
labeled_hateful                                      0
relevant_facts_from_the_cultural_context          2925
contained_explicit_racist_vocabulary              2891
contained_explicit_sexist_vocabulary              3180
contained_explicit_toxic_vocabulary               3144
relevant_word_definitions                         2419
argument_for_hate_speech                          1464
argument_against_hate_speech                      1197
likelihood_of_presence_of_implicit_hate_speech     213
likelihood_of_presence_of_explicit_hate_speech     300
likelihood_of_ad_hominin_attack                   4302
likelihood_of_minority_attack                      237
likelihood_of_takedown_on_social_

In [37]:
import numpy as np
import plotly.express as px
df["bin"] = pd.cut(df['hate_speech_probability']*100, bins=np.arange(0, 105, 5),include_lowest=True)
df['bin'] = df["bin"].apply(lambda bin: str(bin))

df_grouped = df.groupby(["bin", "model", "labeled_hateful"]).count().reset_index()
fig = px.box(df_grouped, x="bin", y="text", color="labeled_hateful", hover_name="model")
fig.show()





In [38]:
import numpy as np
import plotly.express as px
df["bin"] = pd.cut(df['hate_speech_probability']*100, bins=np.arange(0, 110, 10),include_lowest=True)
df['bin'] = df["bin"].apply(lambda bin: str(bin))

true_df = df.query("not labeled_hateful")
df_grouped = true_df.groupby(["bin", "model", "prompt_id"]).count().reset_index()
fig = px.scatter(df_grouped, x="bin", y="text", color="model", hover_data="prompt_id", hover_name="model", title="Hate Speech Probability (only non-hatefull, our probability should be 0)")
fig.show()





In [43]:
import numpy as np
import plotly.express as px
df["bin"] = pd.cut(df['hate_speech_probability']*100, bins=np.arange(0, 110, 10),include_lowest=True)
df['bin'] = df["bin"].apply(lambda bin: str(bin))

true_df = df.query("labeled_hateful")
df_grouped = true_df.groupby(["bin", "model", "prompt_id"]).count().reset_index()
fig = px.scatter(df_grouped, x="bin", y="text", color="model", hover_data="prompt_id", hover_name="model", title="Hate Speech Probability (only hatefull, our probability should be 1)")
fig.show()





In [40]:
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss

for model in df["model"].unique():
    # Assuming y_true is your array of true labels and y_pred is your array of predicted probabilities
    no_nans = df[df["hate_speech_probability"].notna()]
    filtered_by_model = no_nans[no_nans["model"]==model]
    for prompt_id in filtered_by_model["prompt_id"].unique():
        model_prompt_specific = filtered_by_model[filtered_by_model["prompt_id"] == prompt_id]
        y_true = model_prompt_specific['labeled_hateful']
        y_pred = model_prompt_specific['hate_speech_probability']
        
        print(f" {model} - {prompt_id} ".center(46,'-'))
        print(f"Refusals {df[df['hate_speech_probability'].isna()]['hate_speech_probability'].count()}")
        print(f"ROC AUC Score: {roc_auc_score(y_true, y_pred)}")
        print(f"Log Loss:      {log_loss(y_true, y_pred)}")
        print(f"Brier Score:   {brier_score_loss(y_true, y_pred)}")

---------- mistral-7b-instruct - 0 -----------
Refusals 0
ROC AUC Score: 0.8663902242119554
Log Loss:      1.5364423370243363
Brier Score:   0.17990835117773019
---------- mistral-7b-instruct - 1 -----------
Refusals 0
ROC AUC Score: 0.7974544778892605
Log Loss:      1.43790559586025
Brier Score:   0.20107241379310345
---------- mistral-7b-instruct - 2 -----------
Refusals 0
ROC AUC Score: 0.835662668280694
Log Loss:      1.2815671586254054
Brier Score:   0.1900421841541756
---------- mistral-7b-instruct - 3 -----------
Refusals 0
ROC AUC Score: 0.7867479823917829
Log Loss:      1.387492181645839
Brier Score:   0.20950749464668092
---------- mistral-7b-instruct - 4 -----------
Refusals 0
ROC AUC Score: 0.7981905801054736
Log Loss:      2.2914927813506214
Brier Score:   0.2304957356076759
--------- mixtral-8x7b-instruct - 0 ----------
Refusals 0
ROC AUC Score: 0.8256066849816851
Log Loss:      0.5657028888427675
Brier Score:   0.1899998205741627
--------- mixtral-8x7b-instruct - 1 -----

In [41]:
df

Unnamed: 0,prompt_id,model,sample_size,text,answer,labeled_hateful,relevant_facts_from_the_cultural_context,contained_explicit_racist_vocabulary,contained_explicit_sexist_vocabulary,contained_explicit_toxic_vocabulary,relevant_word_definitions,argument_for_hate_speech,argument_against_hate_speech,likelihood_of_presence_of_implicit_hate_speech,likelihood_of_presence_of_explicit_hate_speech,likelihood_of_ad_hominin_attack,likelihood_of_minority_attack,likelihood_of_takedown_on_social_media,hate_speech_probability,bin
0,0,mistral-7b-instruct,478,So you would refuse even other Nordic whites f...,"{ \n ""relevant_facts"": [\n ""The sentence e...",True,,,,,,The sentence implies that people from certain ...,,0.56,0.12,,0.61,,0.51,"(50.0, 60.0]"
1,0,mistral-7b-instruct,478,Just Google it and you 'll see what I mean .,"{ \n ""relevant_facts"": [\n ""The sentence c...",False,,,,,,The sentence does not contain any explicit hat...,,0.05,0.00,,0.00,,0.00,"(-0.001, 10.0]"
2,0,mistral-7b-instruct,478,"During the early punk scene , he was always se...","{ \n ""relevant_facts"": [\n ""The person in ...",False,,,,,,,,0.27,0.00,,0.10,,0.15,"(10.0, 20.0]"
3,0,mistral-7b-instruct,478,"DuetschGirlTX, No I live in a tiny ass town wh...","{ \n ""relevant_facts"": [""The speaker is ident...",True,,,,,,The speaker expresses a sense of needing to ha...,The speaker is not directly targeting or attac...,0.19,0.01,,0.00,,0.06,"(-0.001, 10.0]"
4,0,mistral-7b-instruct,478,YouTube - Teacher Attacked for Being White 9 y...,"\n{ \n ""relevant_facts"": [\n ""A 9-year-old...",False,,,,,,,The incident occurred during a tantrum or outb...,0.29,0.95,,0.31,,0.92,"(90.0, 100.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,4,mistral-7b-instruct,478,Only scored a 120 .,"{\n""relevant_word_definitions"": {\n""scored"": ""...",False,,,,,{'scored': 'a metric used to evaluate performa...,,,0.30,0.00,,0.00,0.3,0.20,"(10.0, 20.0]"
474,4,mistral-7b-instruct,478,The White race is best because when the Creato...,"\n{\n ""relevant_word_definitions"": {\n ""Wh...",True,,,,,{'White': 'a racial categorization distinguish...,This statement directly expresses the belief t...,"The statement is simply expressing a belief, i...",0.30,0.90,,1.00,0.9,0.90,"(80.0, 90.0]"
475,4,mistral-7b-instruct,478,I tried to sign up there once & they refused t...,"{\n""relevant_word_definitions"": {\n},\n""contai...",False,,,,,{},This statement could be construed as suggestin...,,0.30,0.00,,0.10,0.5,0.10,"(-0.001, 10.0]"
476,4,mistral-7b-instruct,478,"I'm more of a grappler than a striker , I woul...","{\n""relevant_word_definitions"": {},\n""containe...",False,,,,,{},,The statement does not contain any explicit or...,0.00,0.00,,0.00,0.0,0.00,"(-0.001, 10.0]"
