In [1]:
import os
import json
import requests

In [2]:
HOST = "http://127.0.0.1"
PORT = "3009"

TRANSCRIBE_ENDPOINT = f"{HOST}:{PORT}/api"

In [3]:
def embed_text(text):
    r = requests.post(f"{TRANSCRIBE_ENDPOINT}/embed", json={"texts": [text]})
    if r.status_code != 200:
        raise Exception("Error embedding")
    print(r.content)

In [4]:
def get_objects(file_path):
    with open(file_path, "r") as f:
        whisper_output = json.load(f)
        objects = []
        if whisper_output.get("segments") is None:
            raise Exception("Whisper file has no segments")

        for segment in whisper_output.get("segments"):
            start = segment.get("start")
            end = segment.get("end")
            text = segment.get("text").replace("...", "")
            if len(text) < 10:
                continue
            vector = embed_text(text)

            objects.append(
                {
                    "object": {
                        "text": text,
                        "start": start,
                        "end": end,
                    },
                    "vector": vector,
                }
            )

        return objects

In [5]:
BASE_PATH = "data/haarlem"

for year in os.listdir(BASE_PATH):
    print(f"Doing year {year}")
    if year == ".DS_Store":
        continue

    if not os.path.isdir(f"{BASE_PATH}/{year}/transcripts"):
        continue

    for transcript in os.listdir(f"{BASE_PATH}/{year}/transcripts"):
        if transcript == ".DS_Store":
            continue

        input_path = os.path.abspath(f"{BASE_PATH}/{year}/transcripts/{transcript}")
        if not os.path.isfile(input_path):
            continue

        objects = get_objects(input_path)
        print(objects)
        break
    break
        # r = requests.post(f"{TRANSCRIBE_ENDPOINT}/weaviate/insert")

Doing year 2014
[{'object': {'text': ' Dames en heren, ik zou de vergadering wel openen.', 'start': 0.0, 'end': 4.96}, 'vector': None}, {'object': {'text': ' De heer van de Raad zet een waarschietlijke antrekening erin.', 'start': 4.96, 'end': 8.120000000000001}, 'vector': None}, {'object': {'text': ' Er wordt nog wat Ger Mutselingen in de achterzaal.', 'start': 8.120000000000001, 'end': 10.84}, 'vector': None}, {'object': {'text': ' Ja, de vergadering is geopend.', 'start': 10.84, 'end': 14.52}, 'vector': None}, {'object': {'text': ' En ik kan nu zeggen dat de jubilage met Angers leeft als nooit tevoren.', 'start': 14.52, 'end': 21.84}, 'vector': None}, {'object': {'text': ' Dat verklaart een aantal afwezigen, namelijk mevrouw Ramzoudit, de heer van Leeuwen, de', 'start': 21.84, 'end': 27.76}, 'vector': None}, {'object': {'text': ' heer Smit en de heer Visser.', 'start': 27.76, 'end': 29.44}, 'vector': None}, {'object': {'text': ' Die zijn inmiddels onderweg naar Angers om daar het 50