In [1]:
import os
import json
import requests
import numpy as np

In [None]:
# Loopback address and port of the service this notebook talks to.
# PORT is kept as a string because it is only ever spliced into a URL.
HOST = "http://127.0.0.1"
PORT = "3009"

# Base URL of the API; request paths are appended to it by the helpers below.
TRANSCRIBE_ENDPOINT = "{}:{}/api".format(HOST, PORT)

In [None]:
def embed_voice(input_path, from_time, to_time, timeout=None):
    """Request a voice embedding for one time window of an input file.

    Sends a POST to ``{TRANSCRIBE_ENDPOINT}/pyannote/embed`` and returns the
    ``"embedding"`` field of the JSON response.

    Args:
        input_path: Path forwarded to the service as ``input_path``.
        from_time: Start of the window, forwarded as ``from_time``.
        to_time: End of the window, forwarded as ``to_time``.
        timeout: Optional timeout handed to ``requests.post``. The default
            ``None`` waits indefinitely, which is what the function did
            before this parameter existed.

    Returns:
        Whatever the service stored under ``"embedding"`` in its JSON body.

    Raises:
        Exception: If the service answers with any status other than 200.
            The message carries the status code and the start of the
            response body so the failure can be diagnosed.
    """
    body = {
        "input_path": input_path,
        # No output file is requested; the embedding comes back in the response.
        "output_path": None,
        "from_time": from_time,
        "to_time": to_time,
    }
    r = requests.post(
        f"{TRANSCRIBE_ENDPOINT}/pyannote/embed", json=body, timeout=timeout
    )
    if r.status_code != 200:
        # Keep the historical "Error embedding" prefix, but say what went wrong.
        raise Exception(
            f"Error embedding {input_path} [{from_time}, {to_time}]: "
            f"HTTP {r.status_code}: {r.text[:500]}"
        )

    return r.json()["embedding"]

In [None]:
def average_list(lists):
    """Return the element-wise mean of several equal-length numeric lists.

    Args:
        lists: A sequence of sequences of numbers, all of the same length.

    Returns:
        A list of floats whose i-th entry is the mean of the i-th entries of
        every input list. An empty ``lists`` yields an empty list.

    Raises:
        ValueError: If the input lists do not all have the same length.
            Previously the result was silently truncated to the shortest one.
    """
    count = len(lists)
    if count == 0:
        # Nothing to average; avoids indexing into an empty sequence.
        return []

    width = len(lists[0])
    if any(len(values) != width for values in lists):
        raise ValueError("average_list requires all lists to have the same length")

    # zip(*lists) walks the inputs column by column.
    return [sum(column) / count for column in zip(*lists)]

In [None]:
def _read_speaker_segments(file_path):
    """Group the segments of a diarization file by speaker label.

    Each non-blank line is split on whitespace. Field 3 is the segment start,
    field 4 is added to it to obtain the end, and field 7 is the speaker label.
    NOTE(review): this matches the RTTM column layout; confirm the input format.

    Returns:
        A dict mapping speaker label to a list of ``{"start": float,
        "end": float}`` dicts, in file order.
    """
    segments_by_speaker = {}
    with open(file_path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines, e.g. a trailing newline
            # Parse as numbers: adding the raw strings would concatenate them.
            start = float(fields[3])
            duration = float(fields[4])
            segments_by_speaker.setdefault(fields[7], []).append(
                {"start": start, "end": start + duration}
            )
    return segments_by_speaker


def get_DiorizationObjects(
    file_path, year, government, audio_path=None, max_segments=6
):
    """Build one record per speaker found in a diarization file.

    For every speaker, the first ``max_segments`` segments are embedded with
    ``embed_voice`` and the resulting vectors are averaged with
    ``average_list``.

    Args:
        file_path: Path of the diarization file to read.
        year: Stored unchanged under ``"year"`` in every record.
        government: Stored unchanged under ``"government"`` in every record.
        audio_path: Path handed to ``embed_voice`` as its input file. When
            omitted, the notebook-level global ``PATH`` is used, as in the
            original code.
        max_segments: Number of leading segments per speaker to embed. The
            default of 6 matches the original hard-coded limit.

    Returns:
        A list of ``{"object": {...}, "vector": [...]}`` dicts, one per
        speaker. ``"object"`` holds ``startEnds`` (all of the speaker's
        segments), ``code`` (file name up to its first dot), ``year`` and
        ``government``. A file with no segments yields an empty list.

    Raises:
        NameError: If ``audio_path`` is omitted and no global ``PATH`` exists.
    """
    speakers = _read_speaker_segments(file_path)
    if not speakers:
        return []

    if audio_path is None:
        try:
            # Backward compatibility with the notebook-level PATH global.
            audio_path = PATH
        except NameError:
            raise NameError(
                "get_DiorizationObjects needs the audio file: pass audio_path= "
                "or define a global PATH before calling it"
            ) from None

    # Identical for every speaker, so compute it once.
    code = os.path.basename(file_path).split(".")[0]

    objects = []
    for segments in speakers.values():
        vectors = [
            embed_voice(audio_path, segment["start"], segment["end"])
            for segment in segments[:max_segments]
        ]
        objects.append(
            {
                "object": {
                    "startEnds": segments,
                    "code": code,
                    "year": year,
                    "government": government,
                },
                "vector": average_list(vectors),
            }
        )

    return objects

In [None]:
# Root folder that holds one sub-folder per year.
BASE_PATH = "data/haarlem"

# NOTE(review): get_DiorizationObjects hands a global named PATH to embed_voice;
# no cell shown here defines it - confirm it is set before running this loop.

# Sorted listings make the processing order (and the log) reproducible.
for year in sorted(os.listdir(BASE_PATH)):
    if year == ".DS_Store":
        continue

    diorizations_dir = f"{BASE_PATH}/{year}/diorizations"
    if not os.path.isdir(diorizations_dir):
        continue

    # Announce the year only once it is known that it will be processed.
    print(f"Doing year {year}")

    objects_dir = f"{BASE_PATH}/{year}/DiorizationObjects"
    os.makedirs(objects_dir, exist_ok=True)

    for diorization in sorted(os.listdir(diorizations_dir)):
        if diorization == ".DS_Store":
            continue

        input_path = os.path.abspath(f"{diorizations_dir}/{diorization}")
        output_path = f"{objects_dir}/{diorization}"
        if not os.path.isfile(input_path):
            continue
        # An existing output file marks the input as done, so reruns resume.
        if os.path.isfile(output_path):
            print("Already did", output_path)
            continue

        diorization_bjects = get_DiorizationObjects(input_path, year, "haarlem")

        # Serialise before opening the file: a failure here must not leave a
        # truncated output behind that the check above would skip forever.
        serialized = json.dumps(diorization_bjects)
        with open(output_path, "w") as f:
            f.write(serialized)

        print("did", input_path)