In [9]:
import os
import json
import requests
import random

In [10]:
HOST = "http://127.0.0.1"
PORT = "3009"

TRANSCRIBE_ENDPOINT = f"{HOST}:{PORT}/api"

In [11]:
def embed_voice(input_path, from_time, to_time):
    body = {
        "input_path": input_path,
        "output_path": "",
        "from_time": from_time,
        "to_time": to_time,
    }
    r = requests.post(f"{TRANSCRIBE_ENDPOINT}/pyannote/embed", json=body)
    if r.status_code != 200:
        return None
        # raise Exception("Error embedding")

    return r.json()["embedding"]

In [12]:
def average_list(lists):
    num_lists = len(lists)
    list_length = len(lists[0])
    sum_list = [0] * list_length
    for lst in lists:
        sum_list = [sum(x) for x in zip(sum_list, lst)]
    average_list = [x / num_lists for x in sum_list]

    return average_list

In [13]:
def get_DiorizationObjects(file_path, audio_path, year, government):
    speakers = {}
    objects = []
    with open(file_path, "r") as f:
        # Get all speakers and their speaker times
        for line in f:
            splitted_line = line.split(" ")
            speaker = splitted_line[7]
            if not speakers.get(speaker):
                speakers[speaker] = []
            speakers[speaker].append(
                {
                    "start": float(splitted_line[3]),
                    "end": float(splitted_line[3]) + float(splitted_line[4]),
                }
            )

    for speaker in speakers:
        vectors = []
        for time in random.sample(speakers[speaker], min(60, len(speakers[speaker]))):
            vector = embed_voice(audio_path, time["start"], time["end"])
            if vector is not None:
                vectors.append(vector)

        if len(vectors) == 0:
            print(f"Speaker {speaker} has no vectors!")
            continue
        avg_vector = average_list(vectors)
        code = file_path.split("/")[-1].split(".")[0]

        objects.append(
            {
                "object": {
                    "speaker": speaker,
                    "startEnds": speakers[speaker],
                    "code": code,
                    "year": year,
                    "government": government,
                },
                "vector": avg_vector,
            }
        )

    return objects

In [14]:
gemeente = "barendrecht"
BASE_PATH = f"data/{gemeente}"

for year in os.listdir(BASE_PATH):
    if year.startswith("."):
        continue

    print(f"Doing year {year}")
    if not os.path.isdir(f"{BASE_PATH}/{year}/diorizations"):
        continue

    if not os.path.isdir(f"{BASE_PATH}/{year}/DiorizationObjects"):
        os.mkdir(f"{BASE_PATH}/{year}/DiorizationObjects")

    for diorization in os.listdir(f"{BASE_PATH}/{year}/diorizations"):
        if diorization.startswith("."):
            continue

        input_path = os.path.abspath(f"{BASE_PATH}/{year}/diorizations/{diorization}")
        audio_path = os.path.abspath(
            f"{BASE_PATH}/{year}/audio/{diorization.replace('.rttm', '')}"
        )
        output_path = f"{BASE_PATH}/{year}/DiorizationObjects/{diorization.replace('.rttm', '.json')}"
        if not os.path.isfile(input_path):
            continue
        if not os.path.isfile(audio_path):
            continue
        if os.path.isfile(output_path):
            print("Already did", output_path)
            continue

        print(f"Doing {input_path}")

        diorization_objects = get_DiorizationObjects(
            input_path, audio_path, year, gemeente
        )
        print("did", input_path)
        # print(diorization_objects)
        with open(output_path, "w") as f:
            json.dump(diorization_objects, f)

Doing year 2022
Doing year 2024
Doing year 2023
Doing year 2019
Already did data/barendrecht/2019/DiorizationObjects/597060.wav.json
Already did data/barendrecht/2019/DiorizationObjects/679118.wav.json
Already did data/barendrecht/2019/DiorizationObjects/682696.wav.json
Already did data/barendrecht/2019/DiorizationObjects/616211.wav.json
Already did data/barendrecht/2019/DiorizationObjects/558098.wav.json
Already did data/barendrecht/2019/DiorizationObjects/639325.wav.json
Already did data/barendrecht/2019/DiorizationObjects/688164.wav.json
Already did data/barendrecht/2019/DiorizationObjects/614790.wav.json
Already did data/barendrecht/2019/DiorizationObjects/631364.wav.json
Already did data/barendrecht/2019/DiorizationObjects/619946.wav.json
Already did data/barendrecht/2019/DiorizationObjects/619010.wav.json
Already did data/barendrecht/2019/DiorizationObjects/593129.wav.json
Already did data/barendrecht/2019/DiorizationObjects/593128.wav.json
Doing /Users/personal/Desktop/scriptie/