In [None]:
import gc
import json
import os
import random
import re
import sys

import numpy as np
import pandas as pd
import torch
import outlines
from datasets import Dataset
from numba import cuda
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm

from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import TextClassificationPipeline
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import pipeline as transformers_pipeline

In [None]:
# IMPORT AND PREPARE DATASET
path = '' # Specify the path to your dataset file

# Fail fast with an actionable message instead of the opaque error pandas
# raises when asked to read an empty path.
if not path:
    raise ValueError("Set `path` to the dataset CSV file before running this cell.")

ds = pd.read_csv(path)

# Later cells read 'text' and 'label' when building the prompts, and
# 'target' and 'type' when storing the results; check for them up front so a
# wrong file is caught before any (slow) generation starts.
required_columns = {'text', 'label', 'target', 'type'}
missing_columns = required_columns - set(ds.columns)
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

dataset = Dataset.from_pandas(ds)
print(dataset)


In [None]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# SECURITY: access tokens must never be written into the notebook. A token was
# previously committed in this cell; treat it as compromised and revoke it.
# The token is now read from the HF_TOKEN environment variable. When the
# variable is unset, None is passed on.
# NOTE(review): with token=None the Hugging Face libraries are expected to fall
# back to a locally cached login (`huggingface-cli login`) - confirm for the
# installed version.
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
pipeline = transformers_pipeline(   "text-generation", 
                                    model = model_name,
                                    torch_dtype= torch.float32,
                                    device_map="auto",
                                    token=hf_token
                                )


In [None]:
# DEFINITIONS - STEP 1
# Maps a definition ID to the hate-speech definition text that runner2 prepends
# to the user message. "NO" is the no-definition baseline: runner2 sends the
# bare instructions when id_def == 'NO', so its empty string is never inserted.
# As visible in the texts below, the ID suffixes add clauses to the "HSB" text:
#   EDFoC -> "or communication expressed using language (written or spoken) or actions"
#   EDPC  -> "such as statements of inferiority, aversion, cursing, ..."
#   EDT   -> "which is, or thought to be, a member of that group"
# These strings are sent to the model verbatim; editing them changes the experiment.
def_step_1 = {    "NO":"",
                "FtW":"Hate is defined as abusive speech targeting specific group characteristics, such as ethnic origin, religion, gender, or sexual orientation.",
                "OL":"Hate Speech is considered any kind of content that conveys malevolent intentions toward a group or an individual.",
                "HSB":"Hate speech is considered any kind of content that conveys malevolent intentions toward a group or an individual, and motivated by inherent characteristics that are attributed to that group and shared among its members.",
                "HSB_EDFoC":"Hate speech is considered any kind of content or communication expressed using language (written or spoken) or actions,  that convey malevolent intentions toward a group or an individual, and motivated by inherent characteristics that are attributed to that group and shared among its members.",
                "HSB_EDPC":"Hate speech is considered any kind of content that conveys malevolent intentions such as statements of inferiority, aversion, cursing, calls for exclusion, threaten, harass or violence, and directed toward a group or an individual, and motivated by inherent characteristics that are attributed to that group and shared among its members.",
                "HSB_EDT":"Hate speech is considered any kind of content that conveys malevolent intentions toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members.",
                "HSB_EDFoC_EDT":"Hate speech is considered any kind of content or communication expressed using language (written or spoken) or actions,  that convey malevolent intentions toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members.",
                "HSB_EDFoC_EDPC":"Hate speech is considered any kind of content or communication expressed using language (written or spoken) or actions, that convey malevolent intentions such as statements of inferiority, aversion, cursing, calls for exclusion, threaten, harass or violence, and directed toward a group or an individual and motivated by inherent characteristics that are attributed to that group and shared among its members.",
                "HSB_EDT_EDPC":"Hate speech is considered any kind of content that conveys malevolent intentions such as statements of inferiority, aversion, cursing, calls for exclusion, threaten, harass or violence, toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members.",
                "HSB_EDFoC_EDPC_EDT":"Hate speech is considered any kind of content or communication expressed using language (written or spoken) or actions,  that conveys malevolent intentions such as statements of inferiority, aversion, cursing, calls for exclusion, threaten, harass or violence, toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members."
                }

# DEFINITIONS - STEP 2
# Maps a definition ID to the definition text that runner2 prepends to the user
# message. Every entry starts from the same base sentence, which ends with an
# explicit list of attributes ("such as race, color, ethnicity, ...").
# As visible in the texts below, the ID suffixes append further sentences:
#   PI  -> "The outcome of Hate Speech could be ..."
#   Exc -> "However, even if it is offensive, it is not considered Hate Speech ..."
#   IHS -> "Hate Speech can also be implicit, ..."
# NOTE(review): the "HSB_EDT_LAA_Exc" text has no trailing period, unlike every
# other entry that ends with the same sentence - confirm whether that is intended.
# These strings are sent to the model verbatim; editing them changes the experiment.
def_step_2 = {"HSB_EDT_LAA":"Hate speech is considered any kind of content that conveys malevolent intentions toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members such as race, color, ethnicity, gender, sexual orientation, nationality, religion, disability, social status, health conditions, or other characteristics.",
            "HSB_EDT_LAA_PI":"Hate speech is considered any kind of content that conveys malevolent intentions toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members such as race, color, ethnicity, gender, sexual orientation, nationality, religion, disability, social status, health conditions, or other characteristics. The outcome of Hate Speech could be the promotion of division among people, undermining of social cohesion in communities, inciting others to commit violence or discrimination, and could have consequences for individuals’ health and safety.",
            "HSB_EDT_LAA_Exc":"Hate speech is considered any kind of content that conveys malevolent intentions toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members such as race, color, ethnicity, gender, sexual orientation, nationality, religion, disability, social status, health conditions, or other characteristics. However, even if it is offensive, it is not considered Hate Speech any content that attacks a person’s personality traits, ideas, or opinions",
            "HSB_EDT_LAA_IHS":"Hate speech is considered any kind of content that conveys malevolent intentions toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members such as race, color, ethnicity, gender, sexual orientation, nationality, religion, disability, social status, health conditions, or other characteristics. Hate Speech can also be implicit, portrayed as an indirect or coded language that uses Irony, Stereotypes, or Misinformation.",
            "HSB_EDT_LAA_PI_Exc":"Hate speech is considered any kind of content that conveys malevolent intentions toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members such as race, color, ethnicity, gender, sexual orientation, nationality, religion, disability, social status, health conditions, or other characteristics. The outcome of Hate Speech could be the promotion of division among people, undermining of social cohesion in communities, inciting others to commit violence or discrimination, and could have consequences for individuals’ health and safety. However, even if it is offensive, it is not considered Hate Speech any content that attacks a person’s personality traits, ideas, or opinions.",
            "HSB_EDT_LAA_Exc_IHS":"Hate speech is considered any kind of content that conveys malevolent intentions toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members such as race, color, ethnicity, gender, sexual orientation, nationality, religion, disability, social status, health conditions, or other characteristics. Hate Speech can also be implicit, portrayed as an indirect or coded language that uses Irony, Stereotypes, or Misinformation. However, even if it is offensive, it is not considered Hate Speech any content that attacks a person’s personality traits, ideas, or opinions.",
            "HSB_EDT_LAA_PI_IHS":"Hate speech is considered any kind of content that conveys malevolent intentions toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members such as race, color, ethnicity, gender, sexual orientation, nationality, religion, disability, social status, health conditions, or other characteristics. The outcome of Hate Speech could be the promotion of division among people, undermining of social cohesion in communities, inciting others to commit violence or discrimination, and could have consequences for individuals’ health and safety. Hate Speech can also be implicit, portrayed as an indirect or coded language that uses Irony, Stereotypes, or Misinformation.",
            "HSB_EDT_LAA_PI_IHS_Exc":"Hate speech is considered any kind of content that conveys malevolent intentions toward a group or an individual which is, or thought to be, a member of that group, and motivated by inherent characteristics that are attributed to that group and shared among its members such as race, color, ethnicity, gender, sexual orientation, nationality, religion, disability, social status, health conditions, or other characteristics. The outcome of Hate Speech could be the promotion of division among people, undermining of social cohesion in communities, inciting others to commit violence or discrimination, and could have consequences for individuals’ health and safety. Hate Speech can also be implicit, portrayed as an indirect or coded language that uses Irony, Stereotypes, or Misinformation. However, even if it is offensive, it is not considered Hate Speech any content that attacks a person’s personality traits, ideas, or opinions."
            }



In [None]:
# PREPARE DATAPOINTS
# Build one prompt per dataset row by substituting the row's text into the
# instruction template, and encode the gold label as 1 for 'hate' and 0 for
# any other value.

instructions = (
    "Classify if the following text is considered hate speech or not hate speech:\n"
    "[TEXT]\n"
    "Please answer ONLY with HS if it is hate speech or NHS if it is not."
)

model_inputs, labels = [], []
for row in tqdm(dataset):
    prompt = instructions.replace("[TEXT]", row['text'])
    labels.append(1 if row['label'] == 'hate' else 0)
    model_inputs.append(prompt)

# Show the second prompt as a quick sanity check of the substitution.
print(model_inputs[1])


In [None]:
def metriche(predictions,labels):
    """Compute precision, recall and F1 for a list of predictions.

    Entries equal to the string 'NaN' (answers that could not be parsed) are
    scored as class 0, i.e. as a negative prediction.

    Args:
        predictions: sequence of numeric class values, with the string 'NaN'
            marking a non-answer.
        labels: gold labels aligned with `predictions`.

    Returns:
        Tuple (precision, recall, f1). Precision and recall are computed with
        average='binary', whereas F1 is computed with average='macro'; all
        three use zero_division=0.
    """
    # Turn the 'NaN' markers into real NaNs so the list fits a float array.
    as_float = np.array(
        [float('nan') if pred == 'NaN' else pred for pred in predictions],
        dtype=float,
    )
    # Non-answers fall back to class 0.
    scored = np.where(np.isnan(as_float), 0.0, as_float)

    return (
        precision_score(labels, scored, average='binary', zero_division=0),
        recall_score(labels, scored, average='binary', zero_division=0),
        f1_score(labels, scored, average='macro', zero_division=0),
    )
    

In [None]:
#DEFINITION IN USER PROMPT

def _parse_hs_label(response):
    """Map a raw model answer to 1 (contains HS), 0 (contains NHS) or 'NaN' (neither)."""
    cleaned_response = response.strip().upper()
    if re.search(r'\bHS\b', cleaned_response):
        return 1
    if re.search(r'\bNHS\b', cleaned_response):
        return 0
    return 'NaN'


def _build_llama3_prompt(system_prompt, user_message):
    """Wrap a system prompt and a user message in the Llama 3 Instruct chat markup."""
    # The previous template kept the documentation's literal "{{ ... }}"
    # placeholder braces and the source indentation inside the prompt, and
    # omitted the blank line after each header; the text is now inserted bare.
    # NOTE(review): the tokenizer may prepend its own <|begin_of_text|> when the
    # pipeline encodes this string, duplicating the one below - confirm.
    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )


def runner2(model_inputs, labels, num_runs, id_def, definition, sample):
    """Classify every input with the LLM `num_runs` times and score each run.

    The definition (if any) is placed in the user message, in front of the
    instructions; the system prompt is left empty. Generation uses sampling
    (do_sample=True, temperature=0.95), so runs can differ from one another.

    Relies on the notebook-level `pipeline`, `tokenizer`, `dataset`,
    `model_name` and `metriche`.

    Args:
        model_inputs: list of instruction prompts, one per datapoint.
        labels: gold labels (1/0) aligned with `model_inputs`.
        num_runs: number of times the whole set is classified.
        id_def: identifier of the definition; 'NO' means no definition is added.
        definition: definition text prepended to each prompt unless id_def == 'NO'.
        sample: 'all' to use every input, or an int N to use only the first N.

    Returns:
        A list with one dict per run holding the keys 'Run', 'Model', 'ID_def',
        'Accuracy', 'NaNs', 'Acc_No_Nans', 'Precision', 'Recall', 'F1', 'Input',
        'Responses', 'Predictions', 'Labels', 'Definition', 'Target', 'Type'.
    """
    # Select the subset once, up front. Slicing past the end is harmless, so no
    # hard-coded dataset size is needed to recognise "use everything".
    if sample != 'all':
        model_inputs = model_inputs[:sample]
        labels = labels[:sample]
    n_items = len(model_inputs)

    # Trim the metadata columns so they stay aligned with the evaluated subset.
    texts = list(dataset['text'])[:n_items]
    targets = list(dataset['target'])[:n_items]
    types = list(dataset['type'])[:n_items]

    system_prompt = ''
    if id_def == 'NO':
        definition_prefix = ''
    else:
        definition_prefix = 'Given the following definition of Hate Speech: \n' + str(definition) + '\n'

    results = []
    for j in range(num_runs):
        responses = []
        for inputtone in tqdm(model_inputs):
            model_in = _build_llama3_prompt(system_prompt, definition_prefix + inputtone)
            completion = pipeline(
                model_in,
                do_sample=True,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id,
                max_new_tokens=20,
                temperature=0.95,
                return_full_text=False,
            )

            response = completion[0]['generated_text']
            # Keep only what follows an "Answer" marker when the model emits one.
            if "Answer" in response:
                response = response.split("Answer")[1]
            responses.append(response.strip(": \n"))

        # Parse once per run, after all answers are in (this used to be redone
        # for every single input, and was skipped entirely for empty input).
        predictions = [_parse_hs_label(response) for response in responses]

        num = j + 1
        print('Run number:', num, 'With definition: ', id_def)
        numeratore = sum(1 for p, l in zip(predictions, labels) if p == l)
        denominatore = len(predictions)
        acc = numeratore / denominatore if denominatore else 0
        print('Accuracy: ', acc)
        nans = sum(1 for p in predictions if p == 'NaN')
        print('Numbers of non-answer: ', nans)
        if denominatore == nans:
            acc_no_nans = 0
        else:
            acc_no_nans = numeratore / (denominatore - nans)
        print('Accuracy without no-answers: ', acc_no_nans)

        # With nothing to score, report zeros instead of calling the scorers
        # on empty input.
        if denominatore:
            precision, recall, f1 = metriche(predictions, labels)
        else:
            precision = recall = f1 = 0

        print('Precision: ', precision)
        print('Recall: ', recall)
        print('F1 Score: ', f1)

        results.append({'Run': num,
                        'Model': model_name,
                        'ID_def': id_def,
                        'Accuracy': acc,
                        'NaNs': nans,
                        'Acc_No_Nans': acc_no_nans,
                        'Precision': precision,
                        'Recall': recall,
                        'F1': f1,
                        'Input': texts,
                        'Responses': responses,
                        'Predictions': predictions,
                        'Labels': labels,
                        'Definition': definition,
                        'Target': targets,
                        'Type': types})

    return results


In [None]:
# ALL THE DEFINITIONS
# Run the classification for every definition of the selected step and collect
# one result record per (definition, run).
sample = 'all'
num_runs = 3
step = 1

results = []

# `step` may be written as an int or a string. It used to be compared against
# the strings '1'/'2' only, so the int value above matched neither branch and
# `def_dict` was never defined.
step_definitions = {
    '1': ('step1', def_step_1),
    '2': ('step2', def_step_2),
}
if str(step) not in step_definitions:
    raise ValueError(f"Unknown step {step!r}; expected 1 or 2.")
step_name, def_dict = step_definitions[str(step)]

for def_id, definition in def_dict.items():
    result = runner2(model_inputs, labels, num_runs=num_runs, id_def=def_id, definition=definition, sample=sample)
    results.extend(result)

In [None]:
# CREATE DF AND SAVE
# One row per (definition, run); list-valued fields are written as their string form.
df = pd.DataFrame(results)

csv_file_path = '../Outputs/LLama3_FtW_step'+ str(step) +'.csv'

# to_csv does not create missing folders, so make sure the output directory
# exists before writing.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

df.to_csv(csv_file_path, index=False)  # index=False keeps row numbers out of the CSV file

In [None]:
import pandas as pd

# Reload the results file written for the current step.
csv_file_path = f"../Outputs/LLama3_FtW_step{step}.csv"
df = pd.read_csv(csv_file_path)

# Mean F1 over the runs of each definition ID. The variable names say
# "accuracy", but the metric being averaged here is F1.
avg_accuracy_per_id = (
    df.groupby('ID_def')['F1']
    .mean()
    .reset_index()
)

# Best-scoring definition first.
avg_accuracy_per_id_sorted = avg_accuracy_per_id.sort_values('F1', ascending=False)

# Show the ranking of definitions by mean F1.
print(avg_accuracy_per_id_sorted)

