In [None]:
import os
import json
import requests

In [None]:
# Address of the API server on the loopback interface.
HOST = "http://127.0.0.1"
PORT = "3009"

# Base URL that API routes are appended to (e.g. f"{TRANSCRIBE_ENDPOINT}/embed").
TRANSCRIBE_ENDPOINT = f"{HOST}:{PORT}/api"

In [None]:
def embed_text(text, timeout=60):
    """Embed a single piece of text through the API's ``/embed`` route.

    Args:
        text: The string to embed. It is sent as a one-element list under
            the ``"text"`` key of the JSON request body.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        The value stored under ``"embeddings"`` in the JSON response.

    Raises:
        RuntimeError: If the server responds with a status other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Without a timeout, requests blocks forever when the server stalls.
    r = requests.post(
        f"{TRANSCRIBE_ENDPOINT}/embed",
        json={"text": [text]},
        timeout=timeout,
    )
    if r.status_code != 200:
        # Include the status and the start of the body so failures can be diagnosed.
        raise RuntimeError(
            f"Error embedding (HTTP {r.status_code}): {r.text[:200]}"
        )

    return r.json()["embeddings"]


In [None]:
def get_bm25_context(complete_text, substring, context_chars=1000):
    """Return ``substring`` together with the text that follows it.

    The result starts at the first occurrence of ``substring`` in
    ``complete_text`` and extends up to ``context_chars`` characters past
    the end of that occurrence (fewer if the text ends sooner).

    Args:
        complete_text: The full text to search in. ``None`` is treated as
            "nothing to search".
        substring: The snippet to locate.
        context_chars: How many characters after the snippet to include.

    Returns:
        The snippet plus trailing context, or ``""`` when the snippet does
        not occur in ``complete_text`` or either argument is ``None``.
    """
    if complete_text is None or substring is None:
        return ""

    index = complete_text.find(substring)
    if index == -1:
        return ""

    # Slicing clamps to the end of the string, so no explicit min() is needed.
    return complete_text[index:index + len(substring) + context_chars]

In [None]:
def get_objects(file_path, year, government):
    """Turn one Whisper JSON output file into a list of embedded segment records.

    Each segment whose text is at least 10 characters long becomes one
    record of the form ``{"object": {...}, "vector": ...}``. Shorter
    segments, and segments without text, are skipped.

    Args:
        file_path: Path to a JSON file with a ``"segments"`` list and
            (optionally) a ``"text"`` entry holding the full transcript.
        year: Stored unchanged in every record under ``"year"``.
        government: Stored unchanged in every record under ``"government"``.

    Returns:
        A list of dicts, one per kept segment, in file order.

    Raises:
        ValueError: If the file has no ``"segments"`` entry.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    # The file is closed before the (slow) embedding calls start.
    with open(file_path, "r", encoding="utf-8") as f:
        whisper_output = json.load(f)

    segments = whisper_output.get("segments")
    if segments is None:
        raise ValueError("Whisper file has no segments")

    # A missing transcript simply yields an empty context for every segment.
    complete_text = whisper_output.get("text") or ""

    # File name up to its first dot; basename() also copes with Windows
    # separators, which a plain split("/") does not. Computed once because
    # it is the same for every segment.
    code = os.path.basename(file_path).split(".")[0]

    objects = []
    for segment in segments:
        text = segment.get("text") or ""
        if len(text) < 10:
            continue

        bm_context = get_bm25_context(complete_text, text)
        # Text snippets can be embedded in bulk for a speed up, but at the
        # moment this is not that important.
        vector = embed_text(text.replace("...", ""))[0]

        objects.append(
            {
                "object": {
                    "text": text,
                    "bmContext": bm_context,
                    "start": round(segment.get("start"), 2),
                    "end": round(segment.get("end"), 2),
                    "code": code,
                    "year": year,
                    "government": government,
                },
                "vector": vector,
            }
        )

    return objects

In [None]:
BASE_PATH_INIT = "data/nijmegen"

# Walk <BASE_PATH_INIT>/<category>/<year>/transcripts/* and, for every
# transcript file that has no counterpart yet, write the records produced by
# get_objects() to <BASE_PATH_INIT>/<category>/<year>/objects/<same name>.
for category in os.listdir(BASE_PATH_INIT):
    if category.startswith("."):
        continue
    BASE_PATH = f"{BASE_PATH_INIT}/{category}"
    if not os.path.isdir(BASE_PATH):
        # A stray file next to the category folders would make listdir raise.
        continue

    for year in os.listdir(BASE_PATH):
        transcripts_dir = f"{BASE_PATH}/{year}/transcripts"
        objects_dir = f"{BASE_PATH}/{year}/objects"

        # Also filters out entries such as .DS_Store, which have no
        # transcripts sub-directory.
        if not os.path.isdir(transcripts_dir):
            continue

        print(f"Doing category {category}, year {year}")
        os.makedirs(objects_dir, exist_ok=True)

        for transcript in os.listdir(transcripts_dir):
            if transcript == ".DS_Store":
                continue

            input_path = os.path.abspath(f"{transcripts_dir}/{transcript}")
            output_path = f"{objects_dir}/{transcript}"
            if not os.path.isfile(input_path):
                continue
            if os.path.isfile(output_path):
                print("Already did", output_path)
                continue

            # NOTE(review): the government is hard-coded to "haarlem" while
            # the data lives under "data/nijmegen" -- confirm this is intended.
            objects = get_objects(input_path, year, "haarlem")

            # Write to a temporary file and rename it into place, so an
            # interrupted run never leaves a partial output file that the
            # "Already did" check above would skip on the next run.
            tmp_path = f"{output_path}.tmp"
            with open(tmp_path, "w") as f:
                json.dump(objects, f)
            os.replace(tmp_path, output_path)

            print("did", input_path)