In [3]:
import getpass
import json
import os

import pandas as pd
import tiktoken
import tqdm
from openai import OpenAI
from sklearn.metrics import f1_score, precision_score, recall_score

from data_processing import pers_labels

ModuleNotFoundError: No module named 'tiktoken'

In [None]:
# The key is typed in at run time, so it is never stored in the notebook itself.
client = OpenAI(
    api_key=getpass.getpass(prompt="please enter your openai api key"),
)

In [None]:
# Root directory of the workspace; every data path in this notebook is built from it.
# Raises KeyError when the WORKSPACE_PATH environment variable is not set.
PWD = os.environ["WORKSPACE_PATH"]

# Name of the dataset file (without extension) under {PWD}/data/datasets.
# The same string also selects the typology and the trait dimension, see below.
data_type = "big 5_4_top_lbl"

# For every typology: dimension index -> the two poles of that dimension,
# each pole given as (label letter, human-readable description).
pers_defs = {
    pers_labels.MBPT: {
        0: (("I", "introverted"), ("E", "extroverted")),
        1: (("S", "sensing"), ("N", "intuitive")),
        2: (("F", "feeling"), ("T", "thinking")),
        3: (("J", "judging"), ("P", "perceiving")),
    },
    pers_labels.BIG_5: {
        0: (("S", "social"), ("R", "reserved")),
        1: (("L", "limbic"), ("C", "calm")),
        2: (("O", "organized"), ("U", "unstructured")),
        3: (("A", "agreeable"), ("E", "egocentric")),
        4: (("N", "non-curious"), ("I", "inquisitive")),
    },
}

# MBPT is chosen when its lower-cased name occurs in the dataset name; otherwise Big Five.
label_mode = pers_labels.MBPT if pers_labels.MBPT.lower() in data_type else pers_labels.BIG_5

# The dimension is the smallest index defined for the chosen typology whose digit
# occurs anywhere in the dataset name. This is a plain substring match: the "5" in
# "big 5" is harmless only because 5 is not a defined dimension index.
index = next(
    (idx for idx in sorted(pers_defs[label_mode]) if str(idx) in data_type),
    None,
)
if index is None:
    raise ValueError(
        f"data_type '{data_type}' names none of the dimensions "
        f"{sorted(pers_defs[label_mode])} defined for {label_mode}"
    )

# Label letters and descriptions of the two poles being classified.
((label1, label1_def), (label2, label2_def)) = pers_defs[label_mode][index]

## loading data

In [None]:
# Load the per-character metadata. The file is only read, so it is opened read-only:
# "r+" would additionally require write permission on the data file.
with open(f"{PWD}/data/cornell_movies/speakers.json", "r", encoding="utf-8") as fp:
    fp_parsed = json.load(fp)

chars_meta = {}       # character id -> metadata dict
chars_meta_rows = []  # the same metadata dicts, in file order
for char, entry in fp_parsed.items():
    # The metadata dict is updated in place, so fp_parsed sees the changes as well.
    meta = entry["meta"]
    meta["character_name"] = meta["character_name"].lower()
    meta["char_id"] = char
    chars_meta[char] = meta
    chars_meta_rows.append(meta)

In [None]:
# The dataset is stored as JSON Lines: one JSON record per line of the file.
dataset_path = f"{PWD}/data/datasets/{data_type}.jsonl"
dataset = pd.read_json(dataset_path, lines=True)

## Query GPT

In [None]:
def classify_text(text, char, max_prompt_tokens=4000):
    """Ask the chat model which of the two configured labels fits ``char``.

    The first line of ``text`` is dropped and the rest is embedded in the prompt
    as the scenes. The prompt is built from the module-level ``label1``,
    ``label1_def``, ``label2``, ``label2_def`` and ``label_mode`` and is sent
    through the module-level ``client``.

    Args:
        text: Text whose first line is discarded; it must contain a newline.
        char: Character name inserted into the prompt.
        max_prompt_tokens: The prompt is cut to at most this many tokens
            before it is sent.

    Returns:
        A ``(prompt, predicted_label)`` tuple: the (possibly truncated) prompt
        that was sent, and the model's reply with surrounding whitespace
        removed. The reply is an empty string when the API returned no content.

    Raises:
        ValueError: If ``text`` contains no newline.
    """
    _, scene = text.split("\n", 1)

    # NOTE(review): the tokeniser is looked up for gpt-3.5-turbo while the request
    # below goes to gpt-4-turbo-preview; both are expected to use the same
    # encoding - confirm before switching to another model.
    enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

    # NOTE(review): {label1} is unquoted while "{label2}" is quoted, and "Response"
    # should probably read "Respond". The wording is left untouched because any
    # change to the prompt changes what the experiment measures.
    prompt = f"""
Read the scenes below and then categorize {char}'s personality as {label1} for {label1_def} or "{label2}" for {label2_def}, according to the {label_mode.lower()} personality typology. Response with only one word.

scenes:
    {scene}
    """

    # The instruction sits at the start of the prompt, so truncation drops the
    # tail of the scenes and never the question itself.
    prompt = enc.decode(enc.encode(prompt)[:max_prompt_tokens])

    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1,  # the expected answer is a single label letter
        n=1,  # one completion per request
        temperature=0,
    )

    # The message content may be None; return "" so callers always get a string.
    content = response.choices[0].message.content
    predicted_label = content.strip() if content else ""
    return prompt, predicted_label

In [None]:
def verify_output(pred, valid_labels=None):
    """Check that a model reply is one of the accepted labels.

    The comparison ignores letter case and surrounding whitespace.

    Args:
        pred: The reply to check. Anything that is not a string is rejected.
        valid_labels: Collection of accepted upper-case labels. When omitted,
            the module-level ``label1`` and ``label2`` are used.

    Returns:
        The normalised (stripped, upper-cased) label.

    Raises:
        ValueError: If ``pred`` is not a string or is not an accepted label.
    """
    if valid_labels is None:
        valid_labels = (label1, label2)

    normalised = pred.strip().upper() if isinstance(pred, str) else None
    if normalised not in valid_labels:
        raise ValueError(f"Invalid response: '{pred}'")
    return normalised

In [None]:
# Character ids in the order they appear in chars_meta.
chars = list(chars_meta)

# Both dicts are keyed by character id. Running this cell again empties them.
preds, labels = {}, {}

In [None]:
# Classify every character that has no prediction yet. Results are keyed by
# char_id, so running this cell again after an interruption skips finished ones.
for row in tqdm.tqdm(dataset.itertuples(index=False), total=len(dataset)):
    if row.char_id in preds:
        continue

    char_name = chars_meta[row.char_id]["character_name"]
    prompt, response = classify_text(row.text, char_name)

    # Normalise once, then validate and store the same value: validation ignores
    # letter case, so storing the raw reply could keep a lower-case label that
    # never matches label1/label2 in the metrics.
    predicted = (response or "").strip().upper()

    try:
        verify_output(predicted)
    except Exception as err:
        # Rejected replies are left out of preds and therefore out of the evaluation.
        print(f"invalid response for {row.char_id}: {err}")
        continue

    preds[row.char_id] = predicted
    labels[row.char_id] = row.label


472it [06:18,  1.23s/it]

invalid response


732it [10:06,  2.34it/s]

invalid response


866it [11:45,  1.23it/s]


In [108]:
# Write the predictions to disk; the number of predictions is part of the file name.
preds_out_path = f"{PWD}/data/gpt_preds/{data_type}-{len(preds)}.json"
with open(preds_out_path, mode="w+") as fp:
    json.dump(obj=preds, fp=fp)

## evaluation

### overall metrics

In [109]:
# The prediction count is part of the file name, so it selects which run is loaded.
preds_len = len(preds)
# Uncomment and edit to evaluate the file of an earlier run instead:
# preds_len = 1897

# Read-only access is enough here; "r+" would also require write permission.
with open(f"{PWD}/data/gpt_preds/{data_type}-{preds_len}.json", "r", encoding="utf-8") as fp:
    preds = json.load(fp)

In [110]:
# Record the dataset label of every character that has a prediction.
for char_id, label in zip(dataset["char_id"], dataset["label"]):
    if char_id in preds:
        labels[char_id] = label

In [111]:
# Build both lists from the same id sequence so that position i of each list
# refers to the same character.
evaluated_ids = [char_id for char_id in chars if char_id in preds]
pred_array = [preds[char_id] for char_id in evaluated_ids]
label_array = [labels[char_id] for char_id in evaluated_ids]

In [112]:
# Show how many predictions and labels go into the evaluation.
n_preds, n_labels = len(pred_array), len(label_array)
print(n_preds, n_labels)

864 864


In [113]:
# Macro-averaged scores over the two labels.
macro_kwargs = {"labels": [label1, label2], "average": "macro"}
f1 = f1_score(label_array, pred_array, **macro_kwargs)
prec = precision_score(label_array, pred_array, **macro_kwargs)
recall = recall_score(label_array, pred_array, **macro_kwargs)

print("overall:")
for metric_name, metric_value in (("f1", f1), ("precision", prec), ("recall", recall)):
    print(metric_name, metric_value)


overall:
f1 0.4018303368598689
precision 0.6039663461538461
recall 0.5149476185333189


### label 1 metrics

In [114]:
# Scores with label1 treated as the positive class.
label1_kwargs = {"labels": [label1, label2], "pos_label": label1}
f1 = f1_score(label_array, pred_array, **label1_kwargs)
prec = precision_score(label_array, pred_array, **label1_kwargs)
recall = recall_score(label_array, pred_array, **label1_kwargs)

print(f"{label1}:")
for metric_name, metric_value in (("f1", f1), ("precision", prec), ("recall", recall)):
    print(metric_name, metric_value)

N:
f1 0.09859154929577466
precision 0.65625
recall 0.0532994923857868


### label 2 metrics

In [115]:
# Scores with label2 treated as the positive class.
label2_kwargs = {"labels": [label1, label2], "pos_label": label2}
f1 = f1_score(label_array, pred_array, **label2_kwargs)
prec = precision_score(label_array, pred_array, **label2_kwargs)
recall = recall_score(label_array, pred_array, **label2_kwargs)

print(f"{label2}:")
for metric_name, metric_value in (("f1", f1), ("precision", prec), ("recall", recall)):
    print(metric_name, metric_value)

I:
f1 0.7050691244239632
precision 0.5516826923076923
recall 0.9765957446808511
