In [1]:
import json
import os

In [2]:
# Identifier of the government body whose meetings are processed. It selects
# the data directory below and is stored under the "government" key of every
# generated turn object.
government = 'hoekschewaard'
# Root directory that is scanned for per-year sub-directories, each expected to
# contain `transcripts/` and `diorizations/` folders.
# NOTE(review): absolute path on an external volume -- the notebook only runs
# on a machine where this volume is mounted.
BASE_PATH = f"/Volumes/Samsung_T5/data/{government}/vergaderingen"

In [3]:
def load_transcript(path):
    """Load a transcript JSON file and return its cleaned segments.

    The file must contain a top-level ``"segments"`` list whose items carry
    ``"start"``, ``"end"`` and ``"text"`` keys; any other keys are ignored.

    Args:
        path: Path of the transcript JSON file.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts in file order. Every
        literal ``"..."`` is removed from the text and surrounding whitespace
        is stripped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``"segments"`` or one of the segment keys is missing.
    """
    # JSON is UTF-8 by specification; decode explicitly so non-ASCII characters
    # are read correctly regardless of the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    return [
        {
            "start": segment["start"],
            "end": segment["end"],
            "text": segment["text"].replace("...", "").strip(),
        }
        for segment in raw["segments"]
    ]


def load_diorizations(path):
    """Parse a diarization file in RTTM layout into a list of segments.

    Each non-empty line is split on whitespace; field 3 is read as the segment
    start, field 4 as its duration and field 7 as the speaker label (0-based).

    Args:
        path: Path of the ``.rttm`` file.

    Returns:
        A list of ``{"start": float, "duration": float, "speaker": str}``
        dicts in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-empty line has fewer than eight fields.
        ValueError: If the start or duration field is not a number.
    """
    segments = []
    with open(path, "r") as f:
        for line in f:
            # split() without an argument tolerates tabs and repeated spaces,
            # which split(" ") would turn into empty or shifted fields.
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no segment and used to raise IndexError.
                continue
            segments.append(
                {
                    "start": float(fields[3]),
                    "duration": float(fields[4]),
                    "speaker": fields[7],
                }
            )

    return segments

In [4]:
def create_speaking_turns(diorizations):
    """Merge consecutive diarization segments into per-speaker turns.

    Segments are processed in order. Consecutive segments of the same speaker
    are merged into one turn that spans from the first segment's start to the
    last segment's end. A segment shorter than one second whose speaker differs
    from the next segment's speaker is treated as a diarization error: its
    speaker label is ignored and its time is absorbed into the turn that is
    currently open (if any).

    Args:
        diorizations: List of ``{"start", "duration", "speaker"}`` dicts, as
            returned by ``load_diorizations``.

    Returns:
        A list of ``{"speaker", "start", "duration"}`` dicts with ``start`` and
        ``duration`` rounded to two decimals. The list is empty when there are
        no segments.
    """
    speaking_turns = []

    # None means "no turn has been opened yet"; a real speaker label is never
    # None, so this cannot collide with data from the file.
    current_speaker = None
    current_speaker_start = 0
    current_speaker_duration = 0

    def close_current_turn():
        # Record the open turn, if there is one.
        if current_speaker is not None:
            speaking_turns.append(
                {
                    "speaker": current_speaker,
                    "start": round(current_speaker_start, 2),
                    "duration": round(current_speaker_duration, 2),
                }
            )

    last_index = len(diorizations) - 1
    for i, dior in enumerate(diorizations):
        segment_end = dior["start"] + dior["duration"]

        # If a speaker speaks for less than a second and does not keep
        # speaking, it is probably a diarization error and we discard it.
        # Every segment that has a successor is checked (i < last_index).
        is_blip = (
            i < last_index
            and dior["duration"] < 1
            and dior["speaker"] != diorizations[i + 1]["speaker"]
        )
        if is_blip:
            # Stretch the open turn over the discarded segment. Without an
            # open turn there is nothing to stretch and the blip is dropped.
            if current_speaker is not None:
                current_speaker_duration = segment_end - current_speaker_start
            continue

        if dior["speaker"] == current_speaker:
            current_speaker_duration = segment_end - current_speaker_start
        else:
            close_current_turn()
            current_speaker = dior["speaker"]
            current_speaker_start = dior["start"]
            current_speaker_duration = dior["duration"]

    # The turn that is still open when the segments run out must be recorded
    # too; otherwise the last speaker of every recording is lost.
    close_current_turn()

    return speaking_turns

In [5]:
def create_speakers_text(
    speaking_turns, transcript, code, year, government, meeting_type
):
    """Attach transcript text to each speaking turn.

    For every turn the transcript is scanned from the beginning. Sentences that
    start before the turn are skipped; the following sentences are collected
    until one ends later than three seconds before the turn's end, or until the
    transcript runs out. Turns for which no text was collected are omitted.

    Args:
        speaking_turns: List of ``{"speaker", "start", "duration"}`` dicts.
        transcript: List of ``{"start", "end", "text"}`` dicts.
        code: Value stored under ``"code"`` in every record.
        year: Value stored under ``"year"`` in every record.
        government: Value stored under ``"government"`` in every record.
        meeting_type: Value stored under ``"type"`` in every record.

    Returns:
        A list of dicts with the keys ``"text"``, ``"start"``, ``"end"``,
        ``"code"``, ``"year"``, ``"government"``, ``"type"`` and ``"speaker"``.
        ``"start"`` and ``"end"`` are rounded to two decimals and the collected
        sentences are joined with single spaces.
    """
    speakers_text = []
    for speaker in speaking_turns:
        speaker_start = round(speaker["start"], 2)
        speaker_end = round(speaker["start"] + speaker["duration"], 2)

        # Text is collected per turn, so nothing can carry over from a turn
        # whose scan reached the end of the transcript.
        pieces = []
        for i, sentence in enumerate(transcript):
            if sentence["start"] < speaker["start"]:
                continue
            # A sentence ending later than 3 seconds before the turn's end
            # marks the point where the speaker is considered done.
            if sentence["end"] > speaker_end - 3:
                # Prevents some sentences from not being caught due to begin
                # and end rounding errors: when text was collected and two more
                # sentences exist, they are added as well.
                # NOTE(review): sentence i itself is not added while i + 1 and
                # i + 2 are -- confirm this is the intended heuristic.
                if pieces and i < len(transcript) - 2:
                    for lookahead in (transcript[i + 1], transcript[i + 2]):
                        if lookahead["text"]:
                            pieces.append(lookahead["text"])
                break
            if sentence["text"]:
                pieces.append(sentence["text"])

        # Emit the turn whether the scan stopped at the cut-off or ran out of
        # sentences; the latter used to drop the text of the final turn.
        if pieces:
            speakers_text.append(
                {
                    # Sentence texts are stripped upstream, so a separator is
                    # needed to keep words from being glued together.
                    "text": " ".join(pieces),
                    "start": speaker_start,
                    "end": speaker_end,
                    "code": code,
                    "year": year,
                    "government": government,
                    "type": meeting_type,
                    "speaker": speaker["speaker"],
                }
            )

    return speakers_text

In [6]:
def insert_no_speaks(speakers_text):
    """Return the turns with explicit silence records filling the gaps.

    The turns are walked in order. Whenever a turn starts later than the
    previous turn ended (or later than 0 for the first turn), a record with
    empty ``"text"`` and ``"speaker"`` covering the gap is inserted before it.
    Every input turn is kept in the output.

    Args:
        speakers_text: List of turn dicts with at least the keys ``"start"``,
            ``"end"``, ``"code"``, ``"year"``, ``"government"`` and ``"type"``.

    Returns:
        A new list holding the original turn dicts interleaved with the
        silence records. Silence records copy ``"code"``, ``"year"``,
        ``"government"`` and ``"type"`` from the turn that follows the gap.
    """
    speakers_text_with_no_speaks = []
    prev_end = 0
    for speaker in speakers_text:
        curr_start = speaker["start"]
        if curr_start > prev_end:
            speakers_text_with_no_speaks.append(
                {
                    "text": "",
                    "start": prev_end,
                    "end": curr_start,
                    # Take the metadata from the turn itself rather than from
                    # notebook-level variables that may be undefined or stale.
                    "code": speaker["code"],
                    "year": speaker["year"],
                    "government": speaker["government"],
                    "type": speaker["type"],
                    "speaker": "",
                }
            )
        # Keep every turn, not only those preceded by a gap.
        speakers_text_with_no_speaks.append(speaker)

        prev_end = speaker["end"]

    return speakers_text_with_no_speaks

In [14]:
# Build one turn-object JSON file per transcript: for every year directory
# under BASE_PATH, combine each transcript with its diarization file and write
# the result to `<year>/turnObjects/<transcript file name>`. Transcripts whose
# output file already exists are skipped, so the cell can be re-run.
meeting_type = "vergadering"
for year in os.listdir(BASE_PATH):
    if year == ".DS_Store":
        continue

    transcripts_dir = f"{BASE_PATH}/{year}/transcripts"
    diorizations_dir = f"{BASE_PATH}/{year}/diorizations"
    # Only years that provide both inputs can be processed.
    if not os.path.isdir(transcripts_dir) or not os.path.isdir(diorizations_dir):
        continue

    save_dir = f"{BASE_PATH}/{year}/turnObjects"
    os.makedirs(save_dir, exist_ok=True)

    for transcript_file in os.listdir(transcripts_dir):
        # Skip hidden files such as .DS_Store.
        if transcript_file.startswith("."):
            continue
        if os.path.isfile(f"{save_dir}/{transcript_file}"):
            print(f"{save_dir}/{transcript_file} already exists")
            continue

        # The meeting code is the file name up to the first dot; the matching
        # diarization file is named `<code>.wav.rttm`.
        code = transcript_file.split(".")[0]
        diorization_path = f"{diorizations_dir}/{code}.wav.rttm"
        if not os.path.isfile(diorization_path):
            print(f"diorization file for {code} not found.")
            continue

        diorizations = load_diorizations(diorization_path)

        print(f"Doing {transcripts_dir}/{transcript_file}")
        try:
            transcript = load_transcript(f"{transcripts_dir}/{transcript_file}")
        except Exception as exc:
            # Keep going with the remaining files, but say why this one was
            # skipped instead of dropping it silently.
            print(f"Skipping {transcript_file}: could not load transcript ({exc!r})")
            continue
        speaking_turns = create_speaking_turns(diorizations)

        speakers_text = create_speakers_text(
            speaking_turns, transcript, code, year, government, meeting_type
        )

        speakers_text_nospeaks = insert_no_speaks(speakers_text)

        with open(f"{save_dir}/{transcript_file}", "w") as f:
            json.dump(speakers_text_nospeaks, f)

/Volumes/Samsung_T5/data/hoekschewaard/vergaderingen/2019/turnObjects/586219.mp4.json already exists
/Volumes/Samsung_T5/data/hoekschewaard/vergaderingen/2019/turnObjects/586232.mp4.json already exists
/Volumes/Samsung_T5/data/hoekschewaard/vergaderingen/2019/turnObjects/586242.mp4.json already exists
/Volumes/Samsung_T5/data/hoekschewaard/vergaderingen/2019/turnObjects/586238.mp4.json already exists
/Volumes/Samsung_T5/data/hoekschewaard/vergaderingen/2019/turnObjects/586215.mp4.json already exists
/Volumes/Samsung_T5/data/hoekschewaard/vergaderingen/2019/turnObjects/657080.mp4.json already exists
/Volumes/Samsung_T5/data/hoekschewaard/vergaderingen/2019/turnObjects/586231.mp4.json already exists
/Volumes/Samsung_T5/data/hoekschewaard/vergaderingen/2019/turnObjects/586224.mp4.json already exists
/Volumes/Samsung_T5/data/hoekschewaard/vergaderingen/2019/turnObjects/586241.mp4.json already exists
/Volumes/Samsung_T5/data/hoekschewaard/vergaderingen/2019/turnObjects/598036.mp4.json alrea