In [None]:
import json
import os
import requests

In [None]:
# Base URLs of the Weaviate and embedding endpoints used by this notebook.
# Every value can be overridden through an environment variable so the
# notebook runs unchanged on another machine; the defaults are the values
# that used to be hard-coded here.
HOST = os.environ.get("TRANSCRIPTS_API_HOST", "http://127.0.0.1")
PORT = os.environ.get("TRANSCRIPTS_API_PORT", "3012")

WEAVIATE_ENDPOINT = f"{HOST}:{PORT}/api/weaviate"
EMBED_ENDPOINT = f"{HOST}:{PORT}/api/embed"

# "gemeente" selects the sub-directory of DATA_ROOT that is processed; the
# directory layout below it is <meeting type>/<year>/<object folders>.
gemeente = os.environ.get("TRANSCRIPTS_GEMEENTE", "hoekschewaard")
DATA_ROOT = os.environ.get("TRANSCRIPTS_DATA_ROOT", "/Volumes/Samsung_T5/data")
BASE_PATH_INIT = f"{DATA_ROOT}/{gemeente}"

In [None]:
def create_transcript_collection():
    """Create the ``TranscriptsV2`` collection via the Weaviate endpoint.

    Posts the collection definition (``vector_index_hnsw`` enabled,
    ``distance_config`` set to cosine, plus the transcript properties) to
    ``{WEAVIATE_ENDPOINT}/createCollection``.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status, so
            a failed creation is reported instead of being silently ignored.
    """
    # NOTE(review): create_transcripts() also writes a "speechNumber" field
    # that is not declared below -- confirm whether the backend creates
    # undeclared properties automatically or whether it must be added here.
    properties = [
        {"name": "text", "data_type": "text"},
        # {"name": "bmContext", "data_type": "text"},
        {"name": "start", "data_type": "number"},
        {"name": "end", "data_type": "number"},
        {"name": "code", "data_type": "text"},
        {"name": "year", "data_type": "text"},
        {"name": "government", "data_type": "text"},
        {"name": "type", "data_type": "text"},
        {"name": "speaker", "data_type": "text"},
    ]
    response = requests.post(
        f"{WEAVIATE_ENDPOINT}/createCollection",
        json={
            "name": "TranscriptsV2",
            "vector_index_hnsw": True,
            "distance_config": "cosine",
            "properties": properties,
        },
    )
    # Surface HTTP-level failures; the original discarded the response.
    response.raise_for_status()


def create_speaker_collection():
    """Create the ``Speakers`` collection via the Weaviate endpoint.

    Posts the collection definition (``vector_index_hnsw`` enabled,
    ``distance_config`` set to cosine, properties ``government`` and
    ``name``) to ``{WEAVIATE_ENDPOINT}/createCollection``.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status, so
            a failed creation is reported instead of being silently ignored.
    """
    definition = {
        "name": "Speakers",
        "vector_index_hnsw": True,
        "distance_config": "cosine",
        "properties": [
            {"name": "government", "data_type": "text"},
            {"name": "name", "data_type": "text"},
        ],
    }
    response = requests.post(f"{WEAVIATE_ENDPOINT}/createCollection", json=definition)
    # Surface HTTP-level failures; the original discarded the response.
    response.raise_for_status()


# create_transcript_collection()
# create_speaker_collection()

In [None]:
def get_embedding(text):
    """Return the embedding for ``text`` from the embedding endpoint.

    Args:
        text: The string to embed. It is sent as a one-element list under the
            ``"text"`` key.

    Returns:
        The first entry of the ``"embeddings"`` list in the JSON response.

    Raises:
        RuntimeError: if the endpoint does not answer with HTTP 200. The
            message includes the status code and the beginning of the
            response body so the failure can be diagnosed.
    """
    response = requests.post(EMBED_ENDPOINT, json={"text": [text]})
    if response.status_code != 200:
        # RuntimeError is still an Exception, so existing handlers keep working.
        raise RuntimeError(
            f"Error embedding: HTTP {response.status_code}: {response.text[:200]}"
        )

    return response.json()["embeddings"][0]

In [None]:
def _load_json(path):
    """Read and parse the JSON file at ``path``."""
    with open(path, "r") as handle:
        return json.load(handle)


def _find_speaker_vector(dior_data, speaker):
    """Return the vector of the first diarization entry for ``speaker``, else None."""
    for entry in dior_data:
        if entry["object"]["speaker"] == speaker:
            return entry["vector"]
    return None


def _embed_meeting(turn_objects, dior_data):
    """Build the objects (properties plus text/speaker vectors) for one meeting."""
    embedded = []
    # Running index of the stored utterances, so the previous/next spoken text
    # can later be retrieved from one specific spoken text.
    speak_num = 0
    for turn in turn_objects:
        if turn["text"] == "":
            continue

        speaker_embedding = _find_speaker_vector(dior_data, turn["speaker"])
        if speaker_embedding is None:
            print(
                f"DID NOT FIND SPEAKER {turn['speaker']} IN DIORIZATION  OBJECTS!"
            )
            continue

        embedded.append(
            {
                "object": {
                    "text": turn["text"],
                    "start": turn["start"],
                    "end": turn["end"],
                    "code": turn["code"],
                    "year": turn["year"],
                    "government": turn["government"],
                    "type": turn["type"],
                    "speaker": turn["speaker"],  # TODO, name speaker?
                    "speechNumber": speak_num,
                },
                "vector": {
                    "text": get_embedding(turn["text"]),
                    "speaker": speaker_embedding,
                },
            }
        )
        speak_num += 1
    return embedded


def create_transcripts():
    """Write a ``finalObjects`` JSON file for every meeting that lacks one.

    Walks ``BASE_PATH_INIT/<meeting type>/<year>/turnObjects/*.json``. For each
    meeting, every object with non-empty text is combined with its text
    embedding (from ``get_embedding``) and the speaker vector found in the
    matching ``DiorizationObjects`` file. The resulting list is written to
    ``finalObjects/<meeting>``.

    Skipped without writing anything:
      * meetings whose ``finalObjects`` file already exists;
      * meetings that have spoken text but no diarization file. The original
        wrote an empty list here, which marked the meeting as done forever.

    Individual objects are skipped when their text is empty or their speaker is
    not present in the diarization data.

    Raises:
        Exception: whatever ``get_embedding`` raises; the meeting being
            processed is then not written and is retried on the next run.
    """
    for meeting_type in sorted(os.listdir(BASE_PATH_INIT)):
        if meeting_type.startswith("."):
            continue
        type_dir = os.path.join(BASE_PATH_INIT, meeting_type)
        if not os.path.isdir(type_dir):
            # A stray file next to the meeting-type folders must not crash listdir.
            continue

        for year in sorted(os.listdir(type_dir)):
            if year.startswith("."):
                continue
            year_dir = os.path.join(type_dir, year)
            turn_dir = os.path.join(year_dir, "turnObjects")
            if not os.path.isdir(turn_dir):
                continue
            final_dir = os.path.join(year_dir, "finalObjects")
            dior_dir = os.path.join(year_dir, "DiorizationObjects")

            print("Doing", gemeente, meeting_type, year)
            for meeting in sorted(os.listdir(turn_dir)):
                # Hidden files are skipped, consistent with the directory checks above.
                if meeting.startswith(".") or not meeting.endswith(".json"):
                    continue

                final_path = os.path.join(final_dir, meeting)
                if os.path.isfile(final_path):
                    print(f"{final_path} already exists")
                    continue

                turn_objects = _load_json(os.path.join(turn_dir, meeting))
                spoken = [turn for turn in turn_objects if turn["text"] != ""]

                dior_path = os.path.join(dior_dir, meeting.replace(".mp4", ".wav"))
                if spoken and not os.path.isfile(dior_path):
                    print(f"{dior_path} does not exist, skipping")
                    # Do not write an output file: the meeting must stay
                    # eligible once its diarization data becomes available.
                    continue

                # The diarization file is parsed once per meeting instead of
                # once per utterance.
                dior_data = _load_json(dior_path) if spoken else []
                embedded = _embed_meeting(spoken, dior_data)

                # Write to a temporary name and rename, so an interrupted run
                # never leaves a truncated file that looks "already done".
                os.makedirs(final_dir, exist_ok=True)
                tmp_path = final_path + ".tmp"
                with open(tmp_path, "w") as out:
                    json.dump(embedded, out)
                os.replace(tmp_path, final_path)


# Build the finalObjects files for every meeting that does not have one yet.
# This calls the embedding endpoint once per object with non-empty text.
create_transcripts()

In [None]:
def add_transcripts():
    """Insert the contents of every ``finalObjects`` file into ``TranscriptsV2``.

    Walks ``BASE_PATH_INIT/<meeting type>/<year>/finalObjects/*.json`` and
    posts each file's object list to ``{WEAVIATE_ENDPOINT}/insert``.

    Hidden files, non-``.json`` files and files holding an empty list are
    skipped. A rejected insert is printed immediately and the run continues
    with the remaining files.

    Raises:
        RuntimeError: after all files were attempted, if at least one insert
            request was answered with an error status. The message lists the
            affected files.
    """
    failed = []
    for meeting_type in sorted(os.listdir(BASE_PATH_INIT)):
        if meeting_type.startswith("."):
            continue
        type_dir = os.path.join(BASE_PATH_INIT, meeting_type)
        if not os.path.isdir(type_dir):
            # A stray file next to the meeting-type folders must not crash listdir.
            continue

        for year in sorted(os.listdir(type_dir)):
            if year.startswith("."):
                continue
            final_dir = os.path.join(type_dir, year, "finalObjects")
            if not os.path.isdir(final_dir):
                continue

            print("Doing", gemeente, meeting_type, year)
            for meeting in sorted(os.listdir(final_dir)):
                # Only real JSON outputs are parsed; anything else in the
                # folder used to crash json.load.
                if meeting.startswith(".") or not meeting.endswith(".json"):
                    continue

                final_path = os.path.join(final_dir, meeting)
                with open(final_path, "r") as fof:
                    objects_with_embedding = json.load(fof)

                if not objects_with_embedding:
                    print(f"{final_path} has no objects, skipping")
                    continue

                response = requests.post(
                    f"{WEAVIATE_ENDPOINT}/insert",
                    json={
                        "collection": "TranscriptsV2",
                        "objects": objects_with_embedding,
                    },
                )
                if not response.ok:
                    print(f"Insert failed for {final_path}: HTTP {response.status_code}")
                    failed.append(final_path)

    if failed:
        raise RuntimeError(
            f"{len(failed)} insert request(s) failed: " + ", ".join(failed)
        )


# Post the contents of every finalObjects file to the TranscriptsV2 collection.
# NOTE(review): nothing in this notebook tracks what was already inserted, so
# re-running this cell presumably inserts the same objects again -- confirm
# how the backend handles repeated inserts.
add_transcripts()