In [2]:
import json
import os
import re
from collections import defaultdict

import pandas as pd

from data_processing import pers_labels

## setting label types

In [3]:
# Workspace root; every data path in this notebook is built from it.
PWD = os.environ["WORKSPACE_PATH"]

# Label mode handed to pers_labels.get_dim_votes, and the position that is
# picked out of the votes it returns.
pers_mode = pers_labels.BIG_5
dim_index = 4

# Name of this configuration; it is also the file stem of the dataset written
# at the end of the notebook.
# NOTE(review): the recorded output is 'big 5_4_all_lbls', i.e. the name (and
# therefore the output file name) contains a space.
config_name = "{}_{}_all_lbls".format(pers_mode.lower(), dim_index)

config_name

'big 5_4_all_lbls'

## loading speakers

In [32]:
# The file is only read, so open it read-only ("r+" would additionally require
# write permission on the corpus) and decode it explicitly as UTF-8.
with open(f"{PWD}/data/cornell_movies/speakers.json", "r", encoding="utf-8") as fp:
    fp_parsed = json.load(fp)

# chars_meta: speaker id -> that speaker's "meta" dict.
# chars_meta_rows: the same dicts collected in a flat list.
chars_meta = {}
chars_meta_rows = []
for char, entry in fp_parsed.items():
    meta = entry["meta"]
    # Character names are normalised to lower case, and each record is tagged
    # with its own speaker id.
    meta["character_name"] = meta["character_name"].lower()
    meta["char_id"] = char
    chars_meta[char] = meta
    chars_meta_rows.append(meta)


## loading labels

In [33]:
# Label table supplied by the project's pers_labels helper. The cells below
# read `char_id` from each row and pass the row to pers_labels.get_dim_votes.
pers_df = pers_labels.get_pers_df()

In [34]:
def get_label(row, mode, dim_index):
    """Pick one entry out of the votes computed for a label row.

    Args:
        row: A row of the label table, forwarded to pers_labels.get_dim_votes.
        mode: Label mode, forwarded to pers_labels.get_dim_votes.
        dim_index: Position to select from the returned votes.

    Returns:
        The entry at ``dim_index`` when get_dim_votes returns something truthy;
        otherwise the falsy result itself is returned unchanged.
    """
    votes = pers_labels.get_dim_votes(row, mode)
    if not votes:
        # Nothing usable came back: hand the empty/None value through as-is.
        return votes
    return votes[dim_index]

In [35]:
# Character id -> label for the configured mode and dimension.
# Rows are fetched one position at a time, which is what looping over the
# `.iloc` indexer does implicitly.
char_id_to_lbl = {}
for pos in range(len(pers_df)):
    row = pers_df.iloc[pos]
    char_id_to_lbl[row.char_id] = get_label(row, pers_mode, dim_index)

## joining convos and speakers

In [36]:
# One JSON object per line. The file is only read, so it is opened read-only
# ("r+" would also demand write permission) with an explicit UTF-8 decoding.
with open(f"{PWD}/data/cornell_movies/utterances.jsonl", "r", encoding="utf-8") as fp:
    utt_df = pd.read_json(fp, lines=True)

In [37]:
# conversation id -> {"characters": set of speaker ids seen in it,
#                     "lines": list of {"id", "speaker", "text"} dicts,
#                     "movie": movie id read from the utterance metadata}
convos = defaultdict(lambda: {"characters": set(), "lines": []})

# itertuples() walks the frame once and yields lightweight namedtuples with the
# same attribute access, whereas looping over `utt_df.iloc` materialises a full
# Series for every single utterance.
for row in utt_df.itertuples(index=False):
    convo = convos[row.conversation_id]
    convo["lines"].append({
        "id": row.id,
        "speaker": row.speaker,
        "text": row.text,
    })
    convo["movie"] = row.meta["movie_id"]
    convo["characters"].add(row.speaker)

In [38]:
def _utterance_order_key(utt_id):
    """Sort key that compares the digit runs of an id numerically.

    Plain string comparison puts "L1000" before "L999"; splitting the id into
    text and integer pieces restores the numeric order. Non-string ids are
    returned unchanged and compare as they are.
    """
    if not isinstance(utt_id, str):
        return utt_id
    # With a capturing group, re.split alternates text, digits, text, ... so
    # the odd positions are always the digit runs.
    pieces = re.split(r"(\d+)", utt_id)
    return [int(piece) if pos % 2 else piece for pos, piece in enumerate(pieces)]


def format_convo(lines, speakers_meta=None):
    """Render a conversation as one "name: text" line per utterance.

    Args:
        lines: Iterable of dicts with "id", "speaker" and "text" keys.
        speakers_meta: Optional mapping of speaker id to a dict holding
            "character_name". Defaults to the notebook-level ``chars_meta``.

    Returns:
        The utterances ordered by id (digit runs compared numerically) and
        joined with newlines; an empty string for an empty conversation.

    Raises:
        KeyError: If a speaker id is missing from the metadata mapping.
    """
    meta_by_speaker = chars_meta if speakers_meta is None else speakers_meta
    rendered = []
    for utt in sorted(lines, key=lambda entry: _utterance_order_key(entry["id"])):
        name = meta_by_speaker[utt["speaker"]]["character_name"]
        rendered.append(f"{name}: {utt['text']}")
    return "\n".join(rendered)

In [39]:
def get_line_count(lines, chr):
    """Count the entries of ``lines`` whose "speaker" equals ``chr``.

    Args:
        lines: Iterable of dicts that each carry a "speaker" key.
        chr: Speaker id to look for.

    Returns:
        Number of matching entries (0 when there are none).
    """
    spoken = 0
    for entry in lines:
        if entry["speaker"] == chr:
            spoken += 1
    return spoken

In [40]:
# A character must speak at least this many lines in a conversation for that
# conversation to be kept as one of their scenes.
MIN_LINES_PER_SCENE = 3

# character id -> formatted conversations in which the character qualifies.
char_to_scenes = defaultdict(list)

for conv_info in convos.values():
    lines = conv_info["lines"]
    # Formatted on first use only, so conversations without any qualifying
    # character are never rendered.
    fmtd_convo = None

    # A set of strings iterates in a hash-seed-dependent order; sorting keeps
    # the key order of char_to_scenes (and of everything derived from it)
    # identical from one kernel session to the next.
    for char in sorted(conv_info["characters"], key=str):
        # Skip characters without a label entry or with an empty/None label.
        if not char_id_to_lbl.get(char):
            continue
        if get_line_count(lines, char) < MIN_LINES_PER_SCENE:
            continue
        if fmtd_convo is None:
            fmtd_convo = format_convo(lines)
        char_to_scenes[char].append(fmtd_convo)

In [41]:
# One example per (character, label entry): the prompt text is shared by all
# of a character's labels, so it is assembled once per character rather than
# once per label.
dataset = []
for char, scene_list in char_to_scenes.items():
    char_meta = chars_meta[char]
    scenes = "\n\n".join(scene_list)
    text = f"Please categorize {char_meta['character_name']}.\n\n{scenes}"
    for lbl in char_id_to_lbl[char]:
        dataset.append({"text": text, "label": lbl, "char_id": char, "movie_id": char_meta["movie_idx"]})

len(dataset)

6690

In [42]:
# Write one JSON object per line. The file is only written, so plain "w" is
# enough, and records are streamed out one at a time instead of first joining
# the whole dataset into a single in-memory string. Separators go between
# records only, so the bytes written match the joined form (no trailing newline).
with open(f"{PWD}/data/model_datasets/{config_name}.jsonl", "w", encoding="utf-8") as fp:
    for idx, ex in enumerate(dataset):
        if idx:
            fp.write("\n")
        fp.write(json.dumps(ex))