In [1]:
import math
import time
import datetime
!pip install numpy
!pip install pandas
!pip install tables
import pandas as pd
import requests
import os
from dotenv import load_dotenv
import json



In [2]:
# Load the preprocessed dataset from disk.
df = pd.read_pickle('processed_data.pkl')
# Split it using the per-row "train" / "test" flag columns as row masks.
training_df = df.loc[df["train"]]
testing_df = df.loc[df["test"]]

In [3]:
# Display the loaded dataset as the cell output (the notebook renders a truncated preview).
df

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,hate,text,train,test
0,12834217_1,572066,1346,0,False,"As of March 13th , 2014 , the booklet had been...",False,False
1,12834217_2,572066,1346,0,False,In order to help increase the booklets downloa...,True,False
2,12834217_3,572066,1346,0,False,( Simply copy and paste the following text int...,False,False
3,12834217_4,572066,1346,0,True,Click below for a FREE download of a colorfull...,True,False
4,12834217_5,572066,1346,0,False,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,False,False
...,...,...,...,...,...,...,...,...
10939,33676864_5,734541,1388,0,False,"Billy - `` That guy would n't leave me alone ,...",False,False
10940,33677019_1,735154,1388,0,False,Wish we at least had a Marine Le Pen to vote f...,False,False
10941,33677019_2,735154,1388,0,False,Its like the choices are white genocide candid...,False,False
10942,33677053_1,572266,1388,0,True,Why White people used to say that sex was a si...,True,False


In [4]:
# Load every prompt variation from disk. A prompt's position in SYSTEM_PROMPT is
# the prompt id that run_model uses to look it up.
file_names = ['prompt-variations/v0.txt', 'prompt-variations/v1.txt']
SYSTEM_PROMPT = []
for file_name in file_names:
    # Read as UTF-8 explicitly so the prompt text does not depend on the
    # platform's default locale encoding.
    with open(file_name, 'r', encoding='utf-8') as file:
        SYSTEM_PROMPT.append(file.read())

In [7]:
# Load environment variables via python-dotenv before reading the key below.
load_dotenv()

# API key sent as the bearer token by call_api; None when PPLX_KEY is not set.
PERPLEXITY_API_KEY = os.environ.get("PPLX_KEY")

# Model names that run_model accepts.
VALID_MODELS = [
    'codellama-34b-instruct',
    'llama-2-70b-chat',
    'mistral-7b-instruct',
    'mixtral-8x7b-instruct',
    'pplx-7b-chat',
    'pplx-70b-chat',
]


def call_api(model: str, promt: str, text: str, timeout: float = 120.0) -> requests.Response:
    """Send one chat-completion request to the Perplexity API.

    Args:
        model: Model name, sent as the ``model`` field of the payload.
        promt: System prompt, sent as the ``system`` message.
        text: Input text, sent as the ``user`` message.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would block the calling worker forever.

    Returns:
        The raw ``requests.Response``. The status code is NOT checked here;
        callers decide how to handle non-200 responses.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            ``timeout`` is exceeded.
    """
    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": promt
            },
            {
                "role": "user",
                "content": text
            }
        ]
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}"
    }

    return requests.post(
        "https://api.perplexity.ai/chat/completions",
        json=payload,
        headers=headers,
        timeout=timeout,
    )


def run_model(prompt_id, model, sample_size, print_results=False, random_state=None):
    """Query one model with one system prompt over a random sample of the test set.

    Args:
        prompt_id: Index into ``SYSTEM_PROMPT`` selecting the system prompt.
        model: Model name; must be one of ``VALID_MODELS``.
        sample_size: Number of rows to draw from ``testing_df``.
        print_results: When true, print each input text followed by the answer
            lines containing ``hate_speech_probability`` (or the whole answer
            if no line contains it).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            drawn sample is reproducible. ``None`` keeps the unseeded draw.

    Returns:
        A DataFrame with one row per sampled text and the columns
        ``prompt_id``, ``model``, ``sample_size``, ``text``, ``answer`` and
        ``labeled_hateful`` (taken from the row's ``hate`` column).

    Raises:
        ValueError: If ``model`` is not in ``VALID_MODELS``.
        IndexError: If ``prompt_id`` does not index ``SYSTEM_PROMPT``.
    """
    if model not in VALID_MODELS:
        raise ValueError(f'Invalid model {model}. Valid options are {VALID_MODELS}')

    # Resolve the prompt once, up front, so a bad prompt_id fails before any
    # API call is made rather than in the middle of the run.
    system_prompt = SYSTEM_PROMPT[prompt_id]

    sample_df = testing_df.sample(n=sample_size, random_state=random_state)
    # Loop-invariant; len() matches the number of rows the loop visits.
    total = len(sample_df)

    results = []
    start_time = datetime.datetime.now()
    for i, (_, row) in enumerate(sample_df.iterrows()):
        elapsed = datetime.datetime.now() - start_time
        percentage = (i + 1) / total
        if i:
            # i generations are finished at this point; extrapolate from them.
            per_generation = elapsed / i
            remaining = per_generation * (total - i)
            rate = f'{per_generation.total_seconds():.2f}'
        else:
            # Nothing finished yet: no rate to extrapolate (and no division by zero).
            remaining = rate = '?'
        print(f'[{elapsed}<{remaining}, {rate}s/generations] '
              f'{model} - promt {prompt_id}: {i + 1}/{total} | {percentage * 100:.2f}%')

        # Retry until the API answers with 200, backing off up to 60s between tries.
        backoff_time = 10
        while True:
            try:
                response = call_api(model, system_prompt, row["text"])
            except requests.exceptions.RequestException as error:
                # Treat transient network failures like HTTP errors: log, wait, retry.
                print(
                    f"{model} (Promt {prompt_id}): {type(error).__name__} => sleeping for {backoff_time}s: {error}")
            else:
                if response.status_code == 200:
                    break
                print(
                    f"{model} (Promt {prompt_id}): Error {response.status_code} => sleeping for {backoff_time}s: {response.text}")
            time.sleep(backoff_time)
            backoff_time = min(int(1.2 * backoff_time), 60)

        data = json.loads(response.text)
        answer = data['choices'][0]['message']['content']
        results.append({
            'prompt_id': prompt_id,
            'model': model,
            'sample_size': sample_size,
            "text": row["text"],
            "answer": answer,
            "labeled_hateful": row["hate"]
        })
        if print_results:
            print(row["text"])
            if lines := [line for line in answer.split("\n") if "hate_speech_probability" in line]:
                print(lines)
            else:
                print(answer)

    return pd.DataFrame(results)

In [8]:
from concurrent.futures import ThreadPoolExecutor

# File that accumulates the results of every run across notebook sessions.
ALL_RUNS_PATH = "all_runs.pkl"

total_samples = len(testing_df)
# Resume from earlier results when the file exists; start empty on a first run
# instead of failing with FileNotFoundError.
all_runs = pd.read_pickle(ALL_RUNS_PATH) if os.path.exists(ALL_RUNS_PATH) else pd.DataFrame()

# Each entry is (prompt_id, model, sample_size), unpacked positionally into run_model.
# NOTE(review): prompt id 3 indexes SYSTEM_PROMPT, but the prompt-loading cell
# lists only two files (v0, v1) — confirm the prompt list or this id.
CONFIG = [
    (3, 'mistral-7b-instruct', total_samples),
]
with ThreadPoolExecutor(max_workers=3) as executor:
    for run in executor.map(lambda config: run_model(*config), CONFIG):
        all_runs = run if all_runs.empty else pd.concat([all_runs, run])
        # Checkpoint after every finished run so completed work survives a later failure.
        all_runs.to_pickle(ALL_RUNS_PATH)

[0:00:00.001642] mixtral-8x7b-instruct (Promt 0): 1/478 generations => 0.21%
[0:00:00.002999] pplx-70b-chat (Promt 0): 1/478 generations => 0.21%
[0:00:00.001350] pplx-7b-chat (Promt 0): 1/478 generations => 0.21%
[0:00:02.448209] pplx-7b-chat (Promt 0): 2/478 generations => 0.42%
[0:00:03.371108] mixtral-8x7b-instruct (Promt 0): 2/478 generations => 0.42%
[0:00:04.809358] pplx-7b-chat (Promt 0): 3/478 generations => 0.63%
[0:00:04.814241] pplx-70b-chat (Promt 0): 2/478 generations => 0.42%
[0:00:05.625866] pplx-7b-chat (Promt 0): 4/478 generations => 0.84%
[0:00:06.957002] mixtral-8x7b-instruct (Promt 0): 3/478 generations => 0.63%
[0:00:07.943561] pplx-7b-chat (Promt 0): 5/478 generations => 1.05%
[0:00:08.595559] mixtral-8x7b-instruct (Promt 0): 4/478 generations => 0.84%
[0:00:09.412905] pplx-70b-chat (Promt 0): 3/478 generations => 0.63%
[0:00:10.230944] pplx-7b-chat (Promt 0): 6/478 generations => 1.26%
[0:00:11.049429] pplx-7b-chat (Promt 0): 7/478 generations => 1.46%
[0:00:13.

In [9]:
# Display the accumulated results of all runs as the cell output.
all_runs

Unnamed: 0,prompt_id,model,sample_size,text,answer,labeled_hateful
0,0,mistral-7b-instruct,478,So you would refuse even other Nordic whites f...,"{ \n ""relevant_facts"": [\n ""The sentence e...",True
1,0,mistral-7b-instruct,478,Just Google it and you 'll see what I mean .,"{ \n ""relevant_facts"": [\n ""The sentence c...",False
2,0,mistral-7b-instruct,478,"During the early punk scene , he was always se...","{ \n ""relevant_facts"": [\n ""The person in ...",False
3,0,mistral-7b-instruct,478,"DuetschGirlTX, No I live in a tiny ass town wh...","{ \n ""relevant_facts"": [""The speaker is ident...",True
4,0,mistral-7b-instruct,478,YouTube - Teacher Attacked for Being White 9 y...,"```json\n{ \n ""relevant_facts"": [\n ""A 9-y...",False
...,...,...,...,...,...,...
473,1,mixtral-8x7b-instruct,478,"God bless them all , to hell with the blacks .","{\n""relevant\_facts\_from\_the\_cultural\_cont...",True
474,1,mixtral-8x7b-instruct,478,Interesting article about what to do with thes...,"Sure, I understand the importance of accurate ...",True
475,1,mixtral-8x7b-instruct,478,Lets keep it that way and not destroy it Check...,I am an AI and do not have the capacity to fee...,False
476,1,mixtral-8x7b-instruct,478,Agreed! rep will have to wait though .. Cheap ...,"{\n""relevant\_facts\_from\_the\_cultural\_cont...",False
