In [None]:
import json
import requests
import pandas as pd

In [None]:
# Port of the local API server, kept as a string so it can be spliced into the URL.
port = "3012"
# Base URL that the request helpers below (add_weaviate, is_in_weaviate) extend
# with their own route suffix.
API_ENDPOINT = "http://localhost:" + port + "/api/weaviate"

In [None]:
def get_speaker_embedding(government, meeting_type, year, code, speakerID):
    """Look up the stored speaker vector for ``speakerID`` in one video.

    Reads ``data/{government}/{year}/finalObjects/{code}.mp4.json`` (relative
    to the current working directory) and scans its entries for the first one
    whose ``object.speaker`` equals ``speakerID``.

    Args:
        government: Directory name of the government under ``data/``.
        meeting_type: Currently unused. The path layout that included it is
            kept below as a comment; the parameter stays so existing callers
            keep working.
        year: Year directory under the government directory.
        code: Video code, i.e. the file name without the ``.mp4.json`` suffix.
        speakerID: Speaker label to search for.

    Returns:
        The matching entry's ``vector.speaker`` value, or ``None`` (after
        printing an error line) when no entry matches.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
    """
    # path = f"data/{government}/{meeting_type}/{year}/finalObjects/{code}.mp4.json"
    path = f"data/{government}/{year}/finalObjects/{code}.mp4.json"
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding, which differs between machines.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for d in data:
        if d["object"]["speaker"] == speakerID:
            return d["vector"]["speaker"]

    # Deliberately best-effort: report the miss and hand None back so the
    # caller can decide what to do, instead of aborting the whole run.
    print(f"ERROR! Could not find speakerID in video {code}")
    return None

In [None]:
def add_weaviate(government, name, embedding, timeout=30):
    """Insert one named speaker into the ``Speakers`` collection.

    Sends a single-object insert request to ``{API_ENDPOINT}/insert`` and
    prints whether it succeeded.

    Args:
        government: Value stored in the object's ``government`` field.
        name: Value stored in the object's ``name`` field.
        embedding: Value sent as the object's ``vector``.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled server.

    Returns:
        None. The outcome is only reported via ``print``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    r = requests.post(
        f"{API_ENDPOINT}/insert",
        json={
            "collection": "Speakers",
            "objects": [
                {
                    "object": {"government": government, "name": name},
                    "vector": embedding,
                }
            ],
        },
        timeout=timeout,
    )

    # Any status other than 200 is reported as a failed insert.
    if r.status_code != 200:
        print(f"ERROR INSERTING {name}!")
        return
    print(f"Inserted {name} in weaviate")

In [None]:
def is_in_weaviate(embedding):
    """Query ``{API_ENDPOINT}/searchVector`` and print the raw JSON reply.

    This is an unfinished stub: ``embedding`` is accepted but not yet sent
    (the request body is an empty JSON object), and the function returns
    ``None`` on every path.

    Args:
        embedding: Speaker vector to look for; currently unused.

    Returns:
        None.
    """
    response = requests.post(f"{API_ENDPOINT}/searchVector", json={})

    if response.status_code == 200:
        print(response.json())
        # TODO: Return true if found, false otherwise
        return

    # Any non-200 status is reported and the lookup is abandoned.
    print("ERROR LOOKING FOR SPEAKER!")

In [None]:
def handle_file(government, meeting_type, year, path):
    """Insert every named speaker from one annotated sheet into Weaviate.

    Reads the Excel sheet at ``path`` and walks its rows. For each row with a
    non-empty ``Naam`` whose ``sprekerID`` has not been handled yet, the
    speaker's embedding is looked up with ``get_speaker_embedding`` and
    inserted with ``add_weaviate``. Rows whose name is ``inspreker``
    (case-insensitive, surrounding whitespace ignored) are skipped.

    Args:
        government: Passed through to the lookup and the insert.
        meeting_type: Passed through to ``get_speaker_embedding``.
        year: Passed through to ``get_speaker_embedding``.
        path: ``/``-separated path to the sheet. The video code is the file
            name up to its first ``.``.

    Returns:
        None.
    """
    code = path.split("/")[-1].split(".")[0]
    data = pd.read_excel(path)
    # Speaker IDs already processed for this sheet; each one is handled once.
    done = set()
    for index, row in data.iterrows():
        name = row["Naam"]
        speakerID = row["sprekerID"]
        # Only named speakers that have not been handled yet are of interest.
        if pd.isna(name) or speakerID in done:
            continue
        if name.strip().lower() == "inspreker":
            continue
        print(
            f"{path}, Row {index+1}: 'Naam' is not empty and its value is '{name}', '{speakerID}', {code}"
        )
        embedding = get_speaker_embedding(
            government, meeting_type, year, code, speakerID
        )
        # Mark the ID as handled either way: the lookup is keyed on speakerID
        # only, so retrying on a later row would just repeat the same miss.
        done.add(speakerID)
        if embedding is None:
            # get_speaker_embedding already printed the error; never insert a
            # speaker without a vector.
            continue
        add_weaviate(government, name, embedding)

In [None]:
# Annotated speaker sheets for "hoekschewaard", grouped by year.
# Paths are relative to the notebook's working directory, the same `data/`
# root that get_speaker_embedding() reads from, so the notebook is not tied
# to one user's home directory.
annotated_files_2023_hoekschewaard = [
    "data/hoekschewaard/2023/sheets/1068470.wav.rttm.xlsx",
    "data/hoekschewaard/2023/sheets/1068534.wav.rttm.xlsx",
    "data/hoekschewaard/2023/sheets/1068543.wav.rttm.xlsx",
    "data/hoekschewaard/2023/sheets/1109657.wav.rttm.xlsx",
]
annotated_files_2024_hoekschewaard = [
    "data/hoekschewaard/2024/sheets/1178278.wav.rttm.xlsx",
    "data/hoekschewaard/2024/sheets/1178261.wav.rttm.xlsx",
    "data/hoekschewaard/2024/sheets/1192781.wav.rttm.xlsx",
]
# Insert the named speakers of every sheet, one year at a time.
for path in annotated_files_2023_hoekschewaard:
    handle_file("hoekschewaard", "vergadering", "2023", path)
for path in annotated_files_2024_hoekschewaard:
    handle_file("hoekschewaard", "vergadering", "2024", path)


In [None]:
# Annotated speaker sheets for "ridderkerk", grouped by year.
# Paths are relative to the notebook's working directory, the same `data/`
# root that get_speaker_embedding() reads from, so the notebook is not tied
# to one user's home directory.
annotated_files_2023_ridderkerk = [
    "data/ridderkerk/2023/sheets/1068434.wav.rttm.xlsx",
    "data/ridderkerk/2023/sheets/1068445.wav.rttm.xlsx",
]
annotated_files_2024_ridderkerk = [
    "data/ridderkerk/2024/sheets/1147151.wav.rttm.xlsx",
    "data/ridderkerk/2024/sheets/1147158.wav.rttm.xlsx",
    "data/ridderkerk/2024/sheets/1147176.wav.rttm.xlsx",
    "data/ridderkerk/2024/sheets/1147208.wav.rttm.xlsx",
]
# Insert the named speakers of every sheet, one year at a time.
for path in annotated_files_2023_ridderkerk:
    handle_file("ridderkerk", "vergadering", "2023", path)
for path in annotated_files_2024_ridderkerk:
    handle_file("ridderkerk", "vergadering", "2024", path)

In [None]:
# Annotated speaker sheets for "barendrecht", grouped by year.
# Paths are relative to the notebook's working directory, the same `data/`
# root that get_speaker_embedding() reads from, so the notebook is not tied
# to one user's home directory.
annotated_files_2023_barendrecht = [
    "data/barendrecht/2023/sheets/1094927.wav.rttm.xlsx",
    "data/barendrecht/2023/sheets/1108841.wav.rttm.xlsx",
    "data/barendrecht/2023/sheets/1115813.wav.rttm.xlsx",
]
annotated_files_2024_barendrecht = [
    "data/barendrecht/2024/sheets/1195585.wav.rttm.xlsx",
    "data/barendrecht/2024/sheets/1203464.wav.rttm.xlsx",
    "data/barendrecht/2024/sheets/1203469.wav.rttm.xlsx",
    "data/barendrecht/2024/sheets/1223517.wav.rttm.xlsx",
]
# Insert the named speakers of every sheet, one year at a time.
for path in annotated_files_2023_barendrecht:
    handle_file("barendrecht", "vergadering", "2023", path)
for path in annotated_files_2024_barendrecht:
    handle_file("barendrecht", "vergadering", "2024", path)