In [None]:
import json
import os
import requests

In [None]:
# Root directory of the input data; the loader functions list its
# sub-directories (one per year) when importing.
BASE_PATH = "data/haarlem"

# Address of the local HTTP API used for all collection/insert requests.
HOST = "http://127.0.0.1"
PORT = "3090"
# Base URL shared by the `createCollection` and `insert` routes.
WEAVIATE_ENDPOINT = f"{HOST}:{PORT}/api/weaviate"

In [None]:
def create_transcript_collection():
    """Request creation of the ``transcripts`` collection.

    POSTs the collection definition (name, ``vector_index_hnsw``,
    ``distance_config`` and the property list) to
    ``{WEAVIATE_ENDPOINT}/createCollection``.

    A non-success HTTP status is printed instead of being silently
    discarded, so a failed creation is visible in the cell output.

    Returns:
        None
    """
    # NOTE(review): add_transcripts() inserts into "transcriptsV2" and sends
    # extra fields (type, speaker, speechNumber) that are not listed here —
    # confirm which collection/schema is the intended one.
    response = requests.post(
        f"{WEAVIATE_ENDPOINT}/createCollection",
        json={
            "name": "transcripts",
            "vector_index_hnsw": True,
            "distance_config": "cosine",
            "properties": [
                {"name": "text", "data_type": "text"},
                {"name": "bmContext", "data_type": "text"},
                {"name": "start", "data_type": "number"},
                {"name": "end", "data_type": "number"},
                {"name": "code", "data_type": "text"},
                {"name": "year", "data_type": "text"},
                {"name": "government", "data_type": "text"},
            ],
        },
    )
    # Report rather than raise: re-running the cell against an existing
    # collection should not abort the whole notebook.
    if not response.ok:
        print(
            f"Creating collection 'transcripts' failed: "
            f"{response.status_code} {response.text}"
        )


def create_diorization_collection():
    """Request creation of the ``diorizations`` collection.

    POSTs the collection definition (name, ``vector_index_hnsw``,
    ``distance_config`` and the property list) to
    ``{WEAVIATE_ENDPOINT}/createCollection``.

    A non-success HTTP status is printed instead of being silently
    discarded, so a failed creation is visible in the cell output.

    Returns:
        None
    """
    response = requests.post(
        f"{WEAVIATE_ENDPOINT}/createCollection",
        json={
            "name": "diorizations",
            "vector_index_hnsw": True,
            "distance_config": "cosine",
            "properties": [
                {"name": "startEnds", "data_type": "objectList"},
                {"name": "code", "data_type": "text"},
                {"name": "year", "data_type": "text"},
                {"name": "government", "data_type": "text"},
            ],
        },
    )
    # Report rather than raise: re-running the cell against an existing
    # collection should not abort the whole notebook.
    if not response.ok:
        print(
            f"Creating collection 'diorizations' failed: "
            f"{response.status_code} {response.text}"
        )


# Create the collections through the API. The transcript-collection call is
# left disabled (commented out); only the diorization collection is created
# when this cell runs.
# create_transcript_collection()
create_diorization_collection()

In [None]:
def get_embedding(text):
    """Placeholder for the text-embedding step.

    Args:
        text: The text that should be embedded (currently ignored).

    Returns:
        None — no embedding is computed yet.
    """
    # TODO: plug in the actual embedding model; until then every caller
    # receives None as the embedding.
    return None

In [None]:
def _load_diorization(path):
    """Read the diorization JSON file at ``path`` and return its parsed content."""
    with open(path, "r") as dior_f:
        return json.load(dior_f)


def _find_speaker_vector(dior_data, speaker):
    """Return the ``vector`` of the first entry whose speaker equals ``speaker``, else None."""
    for dd in dior_data:
        if dd["object"]["speaker"] == speaker:
            return dd["vector"]
    return None


def add_transcripts():
    """Insert the spoken turns of every meeting into the ``transcriptsV2`` collection.

    For each ``{BASE_PATH}/{year}/turnObjects/*.json`` file, every turn with a
    non-empty ``text`` is paired with two vectors: the text embedding from
    ``get_embedding`` and the speaker vector looked up in the matching file
    under ``{BASE_PATH}/{year}/DiorizationObjects``. Turns whose speaker is not
    found there are reported and skipped. One insert request is sent per
    meeting file; a non-success HTTP status is printed.

    Each turn object is read for the keys ``text``, ``start``, ``end``,
    ``code``, ``year``, ``government``, ``type`` and ``speaker``; each
    diorization entry for ``["object"]["speaker"]`` and ``["vector"]``.

    Returns:
        None

    Raises:
        FileNotFoundError: If a meeting has non-empty turns but no matching
            diorization file.
        KeyError: If a turn or diorization entry lacks one of the keys above.
    """
    # NOTE(review): the target collection is "transcriptsV2", while
    # create_transcript_collection() in this notebook creates "transcripts" —
    # confirm the collection exists with the right schema.
    # NOTE(review): get_embedding() as defined in this notebook returns None,
    # so the "text" vector is None until it is implemented.
    for year in os.listdir(BASE_PATH):
        turn_dir = f"{BASE_PATH}/{year}/turnObjects"
        # A directory at turn_dir implies the year entry is a directory too.
        if not os.path.isdir(turn_dir):
            continue
        print("Doing", year)
        for meeting in os.listdir(turn_dir):
            if not meeting.endswith(".json"):
                continue
            with open(f"{turn_dir}/{meeting}", "r") as f:
                objects = json.load(f)

            dior_obj_name = meeting.replace(".mp4", ".wav")
            dior_path = f"{BASE_PATH}/{year}/DiorizationObjects/{dior_obj_name}"
            # Loaded lazily and at most once per meeting (previously the file
            # was re-read and re-parsed for every single turn). Lazy loading
            # keeps meetings without non-empty turns from touching the file.
            dior_data = None

            objects_with_embedding = []
            # Running index of the kept turns, so the previous/next spoken
            # text of a given turn can be retrieved later.
            speak_num = 0
            for o in objects:
                if o["text"] == "":
                    continue
                if dior_data is None:
                    dior_data = _load_diorization(dior_path)

                # Embedding of the current speaker.
                speaker_embedding = _find_speaker_vector(dior_data, o["speaker"])
                if speaker_embedding is None:
                    print(
                        f"DID NOT FIND SPEAKER {o['speaker']} IN DIORIZATION  OBJECTS!"
                    )
                    continue

                objects_with_embedding.append(
                    {
                        "object": {
                            "text": o["text"],
                            "start": o["start"],
                            "end": o["end"],
                            "code": o["code"],
                            "year": o["year"],
                            "government": o["government"],
                            "type": o["type"],
                            "speaker": o["speaker"],  # TODO, name speaker?
                            "speechNumber": speak_num,
                        },
                        "vector": {
                            "text": get_embedding(o["text"]),
                            "speaker": speaker_embedding,
                        },
                    }
                )
                speak_num += 1

            # The transcript file is already closed here; only the HTTP call remains.
            response = requests.post(
                f"{WEAVIATE_ENDPOINT}/insert",
                json={
                    "collection": "transcriptsV2",
                    "objects": objects_with_embedding,
                },
            )
            if not response.ok:
                print(
                    f"Insert of {year}/{meeting} into 'transcriptsV2' failed: "
                    f"{response.status_code} {response.text}"
                )


# Run the transcript import for every year directory found under BASE_PATH.
add_transcripts()

In [None]:
def add_diorizations():
    """Insert every diorization file into the ``diorizations`` collection.

    For each ``{BASE_PATH}/{year}/DiorizationObjects/*.json`` file, the parsed
    JSON content is sent as the ``objects`` of one insert request. Entries of
    ``BASE_PATH`` without a ``DiorizationObjects`` directory are skipped. A
    non-success HTTP status is printed.

    Returns:
        None
    """
    for year in os.listdir(BASE_PATH):
        # Interpolate the loop variable: the path previously contained the
        # literal segment "year", so no real year directory was ever visited.
        dior_dir = f"{BASE_PATH}/{year}/DiorizationObjects"
        # A directory at dior_dir implies the year entry is a directory too.
        if not os.path.isdir(dior_dir):
            continue
        for meeting in os.listdir(dior_dir):
            if not meeting.endswith(".json"):
                continue
            with open(f"{dior_dir}/{meeting}", "r") as f:
                objects = json.load(f)

            # The file is already closed here; only the HTTP call remains.
            response = requests.post(
                f"{WEAVIATE_ENDPOINT}/insert",
                json={
                    "collection": "diorizations",
                    "objects": objects,
                },
            )
            if not response.ok:
                print(
                    f"Insert of {year}/{meeting} into 'diorizations' failed: "
                    f"{response.status_code} {response.text}"
                )