In [None]:
import datetime
import time
!pip install numpy
!pip install pandas
!pip install tables
import pandas as pd
import requests
import os
from dotenv import load_dotenv
import json

In [None]:
# Load the preprocessed dataset, then derive the two splits by using the
# "train" and "test" columns as row masks.
df = pd.read_pickle('processed_data.pkl')
training_df, testing_df = (df[df[split]] for split in ("train", "test"))

In [None]:
import re
from pathlib import Path

PROMT_DIR_PATH = Path("prompt-variations")

# Prompt files are expected to be named "v<number>.txt".
_PROMPT_FILE_PATTERN = re.compile(r"v(\d+)\.txt")


def prompt_version(file_name):
    """Return the integer version encoded in a prompt file name.

    Args:
        file_name: Bare file name (no directory), e.g. ``"v10.txt"``.

    Returns:
        The number between the leading ``v`` and the ``.txt`` suffix, or
        ``None`` when the name does not have exactly that shape.
    """
    match = _PROMPT_FILE_PATTERN.fullmatch(file_name)
    return int(match.group(1)) if match else None


# Sort numerically (v2 before v10). Files that match the glob but are not
# "v<number>.txt" (e.g. "variations.txt") are skipped instead of crashing int().
PROMT_PATHS = sorted(
    (path for path in PROMT_DIR_PATH.glob("v*.txt") if prompt_version(path.name) is not None),
    key=lambda path: prompt_version(path.name),
)

# Read with an explicit encoding so the prompts do not depend on the platform default.
# NOTE(review): run_model indexes SYSTEM_PROMPT by list position, so a prompt_id
# equals the file version only if versions start at 0 without gaps - confirm.
SYSTEM_PROMPT = [path.read_text(encoding="utf-8") for path in PROMT_PATHS]

In [None]:
# Let python-dotenv populate the environment, then read the API key from it
# (None when PPLX_KEY is not set).
load_dotenv()
PERPLEXITY_API_KEY = os.environ.get("PPLX_KEY")

# Model identifiers that run_model accepts.
VALID_MODELS = [
    'codellama-34b-instruct',
    'llama-2-70b-chat',
    'mistral-7b-instruct',
    'mixtral-8x7b-instruct',
    'pplx-7b-chat',
    'pplx-70b-chat',
]


def call_api(model: str, promt: str, text: str) -> requests.Response:
    """Send a single chat-completion request to the Perplexity API.

    Args:
        model: Model identifier placed in the request payload.
        promt: Content of the "system" message.
        text: Content of the "user" message.

    Returns:
        The raw ``requests.Response``. The status code is not inspected here;
        that is left to the caller.
    """
    request_headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
    }
    conversation = [
        {"role": "system", "content": promt},
        {"role": "user", "content": text},
    ]
    return requests.post(
        "https://api.perplexity.ai/chat/completions",
        json={"model": model, "messages": conversation},
        headers=request_headers,
    )


def run_model(prompt_id, model, sample_size, print_results=False, random_state=None):
    """Query one model/prompt pair for a random sample of ``testing_df``.

    Each sampled row's ``"text"`` is sent through ``call_api`` together with
    ``SYSTEM_PROMPT[prompt_id]``. Progress is printed once per row.

    Args:
        prompt_id: Index into ``SYSTEM_PROMPT`` selecting the system prompt.
        model: Model identifier; must be one of ``VALID_MODELS``.
        sample_size: Number of rows to draw from ``testing_df``.
        print_results: When true, print each text followed by the answer lines
            containing ``"hate_speech_probability"`` (or the whole answer if
            no line contains it).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so a run
            can be reproduced. ``None`` (the default) keeps the sample random.

    Returns:
        A ``pandas.DataFrame`` with one row per sampled text and the columns
        ``prompt_id``, ``model``, ``sample_size``, ``text``, ``answer`` and
        ``labeled_hateful`` (copied from the row's ``"hate"`` value).

    Raises:
        ValueError: If ``model`` is not in ``VALID_MODELS``.
        requests.HTTPError: If the API answers with a 4xx status that a retry
            cannot fix (raised via ``Response.raise_for_status``).
    """
    if model not in VALID_MODELS:
        raise ValueError(f'Invalid model {model}. Valid options are {VALID_MODELS}')

    # 4xx codes that describe a transient condition (timeout, too early, rate limit).
    # Every other 4xx (bad key, malformed request, ...) would otherwise be retried forever.
    retryable_client_errors = (408, 425, 429)

    sample_df = testing_df.sample(n=sample_size, random_state=random_state)
    # Number of rows actually iterated; computed once, and unlike .count() it
    # does not skip rows whose text is missing.
    total = len(sample_df)

    results = []
    start_time = datetime.datetime.now()
    for counter, (_, row) in enumerate(sample_df.iterrows(), start=1):
        elapsed = datetime.datetime.now() - start_time
        percentage = counter / total
        s_per_gen = elapsed / counter
        print(f'[{elapsed}<{s_per_gen * (total - counter)}, {s_per_gen}s/generations] '
              f'{model} - promt {prompt_id}: {counter}/{total} | {percentage * 100:.2f}%')

        # Retry until the API answers 200; the wait starts at 10s, grows by a
        # factor of 1.2 (truncated to whole seconds) and is capped at 60s.
        backoff_time = 10
        while True:
            response = call_api(model, SYSTEM_PROMPT[prompt_id], row["text"])
            if response.status_code == 200:
                break
            if 400 <= response.status_code < 500 and response.status_code not in retryable_client_errors:
                print(f"{model} (Promt {prompt_id}): Error {response.status_code} is not retryable: {response.text}")
                response.raise_for_status()
            print(
                f"{model} (Promt {prompt_id}): Error {response.status_code} => sleeping for {backoff_time}s: {response.text}")
            time.sleep(backoff_time)
            backoff_time = min(int(1.2 * backoff_time), 60)

        answer = response.json()['choices'][0]['message']['content']
        results.append({
            'prompt_id': prompt_id,
            'model': model,
            'sample_size': sample_size,
            "text": row["text"],
            "answer": answer,
            "labeled_hateful": row["hate"]
        })
        if print_results:
            print(row["text"])
            if lines := [line for line in answer.split("\n") if "hate_speech_probability" in line]:
                print(lines)
            else:
                print(answer)

    return pd.DataFrame(results)

In [None]:
from concurrent.futures import ThreadPoolExecutor

# Single place for the results file so the load and save below cannot drift apart.
ALL_RUNS_PATH = "all_runs.pkl"

total_samples = len(testing_df)

# Resume from earlier results when the file exists; on a first run there is
# nothing to load yet, so start from an empty frame instead of crashing.
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
all_runs = pd.read_pickle(ALL_RUNS_PATH) if os.path.exists(ALL_RUNS_PATH) else pd.DataFrame()

# One (prompt_id, model, sample_size) tuple per run; unpacked into run_model.
CONFIG = [
    (5, 'mistral-7b-instruct', total_samples),
    (6, 'mistral-7b-instruct', total_samples),
    (7, 'mistral-7b-instruct', total_samples),
    (8, 'mistral-7b-instruct', total_samples),
]
with ThreadPoolExecutor() as executor:
    # Runs execute concurrently; results are consumed in CONFIG order and the
    # accumulated frame is saved after each one so finished runs are not lost.
    for run in executor.map(lambda config: run_model(*config), CONFIG):
        all_runs = run if all_runs.empty else pd.concat([all_runs, run])
        all_runs.to_pickle(ALL_RUNS_PATH)

In [None]:
# Show the accumulated results of all runs as this cell's output.
all_runs