In [1]:
import glob 
import json
import pandas as pd
import os
import numpy as np
import random
import librosa
import string

In [2]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# positional lookups such as files[2400] reproducible across machines and runs.
files = sorted(glob.glob(os.path.join("data/voice/SZ_parliament/audio", "*.wav")))

In [3]:
# Spot-check one clip path from the collected list.
sample_clip = files[2400]
print(sample_clip)

data/voice/SZ_parliament/audio/sentence_0428_Part1_246.wav


In [4]:
# Load the per-session sentence tables in session order; the first CSV column
# becomes the row index of each frame.
transcript_0224, transcript_0324, transcript_0428_1, transcript_0428_2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "output/audio_output_0224/sentence_info_0224.csv",
        "output/audio_output_0324/sentence_info_0324.csv",
        "output/audio_output_0428_Part1/sentence_info_0428_Part1.csv",
        "output/audio_output_0428_Part2/sentence_info_0428_Part2.csv",
    )
)

In [5]:
# Tag every row with the label of the session it came from, so the origin is
# still known after the frames are stacked together.
for session_frame, session_label in (
    (transcript_0224, "0224"),
    (transcript_0324, "0324"),
    (transcript_0428_1, "0428_Part1"),
    (transcript_0428_2, "0428_Part2"),
):
    session_frame['session_index'] = np.repeat(session_label, len(session_frame))

In [6]:
# Stack the four per-session frames row-wise; each row keeps its original label.
session_frames = [transcript_0224, transcript_0324, transcript_0428_1, transcript_0428_2]
transcripts = pd.concat(session_frames, axis=0)

In [7]:
# Plain-text preview of the combined transcript table.
print(transcripts)

      truth_length  stt_length      score  stt_confidence  \
73             209         180 -98.478667        0.858000   
75             217         195 -99.257728        0.897702   
76             194         152 -89.375014        0.885997   
78              84          72 -31.559422        0.956565   
81             153         111 -68.379946        0.864266   
...            ...         ...        ...             ...   
1404            79          68 -29.352541        0.898084   
1406           204         174 -97.608686        0.855947   
1409            66          46 -29.536461        0.861031   
1411           193         137 -80.369447        0.887506   
1412           169         135 -65.612923        0.923816   

                                           truth_string  \
73    Im Namen einer Mehrheit der CVP-Fraktion und i...   
75    Glücklicherweise hat dieser Druck bereits genü...   
76    In dieser Stellungnahme fordert die Regierung ...   
78    Natürlich sollen all dies

In [8]:
# Total number of sentences across all sessions.
print(transcripts.shape[0])

2861


In [9]:
# One positional id (0 .. n-1) per transcript row; these positions, not the
# row labels, are what gets shuffled and split below.
n_rows = len(transcripts)
split_indices = pd.DataFrame(range(n_rows))
print(split_indices)

         0
0        0
1        1
2        2
3        3
4        4
...    ...
2856  2856
2857  2857
2858  2858
2859  2859
2860  2860

[2861 rows x 1 columns]


In [10]:
# Seed NumPy's global RNG so the shuffle below is reproducible.
np.random.seed(42)

# Shuffle all positions once, then cut at 80% / 90% to get an 80/10/10 split.
# Explicit .iloc slices replace np.split(DataFrame, ...): np.split goes through
# DataFrame.swapaxes, which pandas has deprecated, while the slices yield the
# same three frames without depending on it.
shuffled_indices = split_indices.sample(frac=1)
train_end = int(.8 * len(split_indices))
validate_end = int(.9 * len(split_indices))

train = shuffled_indices.iloc[:train_end]
validate = shuffled_indices.iloc[train_end:validate_end]
test = shuffled_indices.iloc[validate_end:]

In [11]:
# Report the share of all sentences that landed in each subset
# (train, validate, test — in that order).
for subset in (train, validate, test):
    print(len(subset) / len(transcripts))

0.7997203774903879
0.09996504718629849
0.10031457532331353


In [12]:
# Pull the shuffled positions out of column 0 of each subset as plain lists.
train_indices = train[0].tolist()
val_indices = validate[0].tolist()
test_indices = test[0].tolist()

In [13]:
# --- Building Manifest Files --- #
# Function to build a manifest

def build_manifest(transcript_file, manifest_path, split_indices,
                   audio_dir="data/voice/SZ_parliament/audio"):
    """Write a JSON-lines manifest for the transcript rows selected by position.

    Rows are visited in frame order and numbered 0, 1, 2, ...; a row is written
    only if its number appears in ``split_indices``. Selection is positional
    (not by row label), so frames whose labels repeat are handled correctly.

    Args:
        transcript_file: DataFrame with a ``truth_string`` column (sentence
            text) and a ``session_index`` column (session label). The row
            label is used as the sentence number in the audio file name.
        manifest_path: Destination file; it is created or overwritten.
        split_indices: Iterable of 0-based row positions to include.
        audio_dir: Directory prefix of the audio clips. Defaults to the path
            that was previously hard-coded.

    Each output line is a JSON object with the keys ``audio_filepath``,
    ``text`` (lower-cased, ASCII punctuation removed) and ``duration``
    (as returned by librosa for the audio file).
    """
    # A set gives O(1) membership checks; testing against the original list on
    # every row made the whole pass quadratic in the number of rows.
    selected_positions = set(split_indices)
    # Loop-invariant: build the punctuation-stripping table once.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    with open(manifest_path, 'w') as fout:
        for position, (index, row) in enumerate(transcript_file.iterrows()):
            if position not in selected_positions:
                continue

            sentence = row['truth_string'].lower().translate(strip_punctuation)
            session = row['session_index']
            audio_path = f'{audio_dir}/sentence_{session}_{index}.wav'
            # NOTE(review): newer librosa releases rename this keyword to
            # `path`; confirm against the installed version before upgrading.
            duration = librosa.core.get_duration(filename=audio_path)

            # Write the metadata to the manifest, one JSON object per line
            metadata = {
                "audio_filepath": audio_path,
                "text": sentence,
                "duration": duration
            }
            json.dump(metadata, fout)
            fout.write('\n')


In [14]:
# Write the manifest for the training subset.
build_manifest(
    transcript_file=transcripts,
    manifest_path="data/voice/SZ_parliament/train_manifest.json",
    split_indices=train_indices,
)

In [15]:
# Write the manifest for the validation subset.
build_manifest(
    transcript_file=transcripts,
    manifest_path="data/voice/SZ_parliament/val_manifest.json",
    split_indices=val_indices,
)

In [16]:
# Write the manifest for the held-out test subset.
build_manifest(
    transcript_file=transcripts,
    manifest_path="data/voice/SZ_parliament/test_manifest.json",
    split_indices=test_indices,
)