In [153]:
import json
import os

In [154]:
government = 'haarlem'
BASE_PATH = f"data/{government}"

In [155]:
def load_transcript(path):
    with open(path, "r") as f:
        data = json.load(f)

    data = [{"start": l["start"], "end": l["end"], "text": l["text"].replace("...", "").strip()} for l in data["segments"]]

    return data


def load_diorizations(path):
    with open(path, "r") as f:
        lines = f.readlines()
        data = [
            {"start": float(rttm[3]), "duration": float(rttm[4]), "speaker": rttm[7]}
            for rttm in [line.strip().split(" ") for line in lines]
        ]

    return data

In [156]:
def create_speaking_turns(diorizations):
    current_speaker = ""
    current_speaker_start = 0
    current_speaker_duration = 0

    speaking_turns = []

    for i, dior in enumerate(diorizations):
        # If speaker speaks for less than a few seconds, and they do not keep speaking,
        # it is probably a diorization error and we discard it.
        if (
            i < len(diorizations) - 2
            and dior["duration"] < 1
            and dior["speaker"] != diorizations[i + 1]["speaker"]
        ):
            current_speaker_duration = (
                dior["start"] + dior["duration"] - current_speaker_start
            )
            continue
        if dior["speaker"] == current_speaker:
            current_speaker_duration = (
                dior["start"] + dior["duration"] - current_speaker_start
            )
        else:
            # TODO: handle
            if i != 0:
                speaking_turns.append(
                    {
                        "speaker": current_speaker,
                        "start": round(current_speaker_start, 2),
                        "duration": round(current_speaker_duration, 2),
                    }
                )

            current_speaker = dior["speaker"]
            current_speaker_start = dior["start"]
            current_speaker_duration = dior["duration"]

    return speaking_turns

In [157]:
def create_speakers_text(speaking_turns, transcript, code, year, government):
    speakers_text = []
    current_speaker_text = ""
    for speaker in speaking_turns:
        speaker_start = round(speaker["start"], 2)
        speaker_end = round(speaker["start"] + speaker["duration"], 2)
        for i, sentence in enumerate(transcript):
            # If the end of the sentence is further than the speaker's end,
            # speaker is done with talking and his text is added to the list.
            if sentence["start"] < speaker["start"]:
                continue
            if sentence["end"] > speaker_end - 3:
                if current_speaker_text != "":
                    # Prevents some sentences from not being caught due to beginnen
                    # and end rounding errors.
                    if i < len(transcript) - 2:
                        current_speaker_text += (
                            transcript[i + 1]["text"] + transcript[i + 2]["text"]
                        )
                    speakers_text.append(
                        {
                            "text": current_speaker_text,
                            "start": speaker_start,
                            "end": speaker_end,
                            "code": code,
                            "year": year,
                            "government": government,
                        }
                    )
                current_speaker_text = ""
                break
            else:
                current_speaker_text += sentence["text"]

    return speakers_text

In [158]:
for year in os.listdir(BASE_PATH):
    if year == ".DS_Store":
        continue
    if not os.path.isdir(f"{BASE_PATH}/{year}/transcripts") or not os.path.isdir(
        f"{BASE_PATH}/{year}/diorizations"
    ):
        continue

    save_dir = f"{BASE_PATH}/{year}/turnObjects"
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)

    for transcript_file in os.listdir(f"{BASE_PATH}/{year}/transcripts"):
        code = transcript_file.split(".")[0]
        if not os.path.isfile(f"{BASE_PATH}/{year}/diorizations/{code}.wav.rttm"):
            print(f"diorization file for {code} not found.")
            continue

        diorizations = load_diorizations(
            f"{BASE_PATH}/{year}/diorizations/{code}.wav.rttm"
        )

        print(f"Doing {BASE_PATH}/{year}/transcripts/{transcript_file}")
        transcript = load_transcript(
            f"{BASE_PATH}/{year}/transcripts/{transcript_file}"
        )

        speaking_turns = create_speaking_turns(diorizations)

        speakers_text = create_speakers_text(
            speaking_turns, transcript, code, year, government
        )

        with open(f"{save_dir}/{transcript_file}", "w") as f:
            json.dump(speakers_text, f)

Doing data/haarlem/2014/transcripts/125416.mp4.json
Doing data/haarlem/2014/transcripts/103253.mp4.json
Doing data/haarlem/2014/transcripts/101050.mp4.json
Doing data/haarlem/2014/transcripts/101051.mp4.json
diorization file for  not found.
Doing data/haarlem/2014/transcripts/101057.mp4.json
Doing data/haarlem/2014/transcripts/101041.mp4.json
Doing data/haarlem/2014/transcripts/101037.mp4.json
Doing data/haarlem/2014/transcripts/101663.mp4.json
Doing data/haarlem/2014/transcripts/101058.mp4.json
Doing data/haarlem/2014/transcripts/101039.mp4.json
Doing data/haarlem/2014/transcripts/101044.mp4.json
Doing data/haarlem/2014/transcripts/101045.mp4.json
Doing data/haarlem/2014/transcripts/101048.mp4.json
Doing data/haarlem/2014/transcripts/101043.mp4.json
Doing data/haarlem/2014/transcripts/101042.mp4.json
Doing data/haarlem/2014/transcripts/148629.mp4.json
Doing data/haarlem/2022/transcripts/938922.mp4.json
Doing data/haarlem/2022/transcripts/938863.mp4.json
Doing data/haarlem/2022/transcr