In [56]:
import os
import pandas as pd
import IPython.display as ipd
from pydub.utils import mediainfo
import re
import time

# Make LibriSpeech Great Again

In [57]:
paths = [
    "/home/morph/Desktop/FINAL/LS/train-other-500/LibriSpeech",
    "/home/morph/Desktop/FINAL/LS/train-clean-360/LibriSpeech"
]

def load_librispeech_data(paths):
    data = []
    for path in paths:
        for subdir, dirs, files in os.walk(path):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(subdir, file)
                    with open(file_path, 'r') as f:
                        for line in f:
                            audio_id, sentence = line.strip().split(maxsplit=1)
                            audio_path = os.path.join(subdir, audio_id + ".flac")
                            data.append([audio_path, sentence])
    return pd.DataFrame(data, columns=["audio_path", "sentence"])

df = load_librispeech_data(paths)
df.head()

Unnamed: 0,audio_path,sentence
0,/home/morph/Desktop/FINAL/LS/train-other-500/L...,CHAPTER TWENTY FIVE A VISIT FROM LOUISE THAT D...
1,/home/morph/Desktop/FINAL/LS/train-other-500/L...,TO STIFLE HER WITH HOUSEHOLD AMMONIA AND LIDDY...
2,/home/morph/Desktop/FINAL/LS/train-other-500/L...,SHE CLUTCHED AT MY SLEEVE WHEN I WENT CLOSE TO...
3,/home/morph/Desktop/FINAL/LS/train-other-500/L...,COMING JUST AFTER THE FIRE THE HOUSEHOLD WAS D...
4,/home/morph/Desktop/FINAL/LS/train-other-500/L...,BUT SHE WAS SO EXCITED I WAS AFRAID SHE WOULD ...


In [58]:
df = df.head(20000)

In [59]:
len(df)

20000

In [60]:
df['sentence'][2]

'SHE CLUTCHED AT MY SLEEVE WHEN I WENT CLOSE TO HER AND REFUSED TO LET GO'

In [61]:
ipd.Audio(df['audio_path'][2])

In [62]:
def get_phonemes(text):
    phonemes_list = []
    text = text.lower()
    for sent in sentences(text, 
                          lang="en-US", 
                          espeak=False, 
                          punctuations=False, 
                          major_breaks=False, 
                          minor_breaks=False):
        for word in sent:
            if word.phonemes: 
                phonemes_list.append(''.join(word.phonemes))  

    return ' '.join(phonemes_list)

start_time = time.time()
total_rows = len(df)
print("Начало обработки...")

df['gruut_phonemes'] = ""

for i in range(total_rows):
    sentence = df.loc[i, 'sentence']
    df.at[i, 'gruut_phonemes'] = get_phonemes(sentence) 

    if (i + 1) % 20 == 0 or (i + 1) == total_rows:
        print(f"Обработано {i + 1} строк из {total_rows}")

print("Обработка завершена.")

Начало обработки...
Обработано 20 строк из 20000
Обработано 40 строк из 20000
Обработано 60 строк из 20000
Обработано 80 строк из 20000
Обработано 100 строк из 20000
Обработано 120 строк из 20000
Обработано 140 строк из 20000
Обработано 160 строк из 20000
Обработано 180 строк из 20000
Обработано 200 строк из 20000
Обработано 220 строк из 20000
Обработано 240 строк из 20000
Обработано 260 строк из 20000
Обработано 280 строк из 20000
Обработано 300 строк из 20000
Обработано 320 строк из 20000
Обработано 340 строк из 20000
Обработано 360 строк из 20000
Обработано 380 строк из 20000
Обработано 400 строк из 20000
Обработано 420 строк из 20000
Обработано 440 строк из 20000
Обработано 460 строк из 20000
Обработано 480 строк из 20000
Обработано 500 строк из 20000
Обработано 520 строк из 20000
Обработано 540 строк из 20000
Обработано 560 строк из 20000
Обработано 580 строк из 20000
Обработано 600 строк из 20000
Обработано 620 строк из 20000
Обработано 640 строк из 20000
Обработано 660 строк из 

In [63]:
df['gruut_phonemes'] = df['gruut_phonemes'].apply(lambda x: re.sub(r'[ˈˌ]', '', x))
df['gruut_phonemes'] = df['gruut_phonemes'].apply(lambda x: x.split())
df.head()

Unnamed: 0,audio_path,sentence,gruut_phonemes
0,/home/morph/Desktop/FINAL/LS/train-other-500/L...,CHAPTER TWENTY FIVE A VISIT FROM LOUISE THAT D...,"[t͡ʃæptɚ, twɛnti, faɪv, ə, vɪzɪt, fɹʌm, luiz, ..."
1,/home/morph/Desktop/FINAL/LS/train-other-500/L...,TO STIFLE HER WITH HOUSEHOLD AMMONIA AND LIDDY...,"[tə, staɪfəl, hɚ, wɪθ, haʊshoʊld, əmoʊnjə, ænd..."
2,/home/morph/Desktop/FINAL/LS/train-other-500/L...,SHE CLUTCHED AT MY SLEEVE WHEN I WENT CLOSE TO...,"[ʃi, klʌt͡ʃt, æt, maɪ, sliv, wɛn, aɪ, wɛnt, kl..."
3,/home/morph/Desktop/FINAL/LS/train-other-500/L...,COMING JUST AFTER THE FIRE THE HOUSEHOLD WAS D...,"[kʌmɪŋ, d͡ʒʌst, æftɚ, ðə, faɪɚ, ðə, haʊshoʊld,..."
4,/home/morph/Desktop/FINAL/LS/train-other-500/L...,BUT SHE WAS SO EXCITED I WAS AFRAID SHE WOULD ...,"[bʌt, ʃi, wəz, soʊ, ɪksaɪtɪd, aɪ, wəz, əfɹeɪd,..."


In [64]:
def get_duration_in_minutes(audio_path):
    audio_info = mediainfo(audio_path)
    duration_seconds = float(audio_info['duration'])
    return duration_seconds / 60 

df['duration'] = df['audio_path'].apply(get_duration_in_minutes)

In [65]:
final_df = df[['audio_path', 'sentence', 'gruut_phonemes', 'duration']].copy()
final_df.columns = ['audio_path', 'sentence', 'gruut_phonemes', 'duration']

final_df.head()

Unnamed: 0,audio_path,sentence,gruut_phonemes,duration
0,/home/morph/Desktop/FINAL/LS/train-other-500/L...,CHAPTER TWENTY FIVE A VISIT FROM LOUISE THAT D...,"[t͡ʃæptɚ, twɛnti, faɪv, ə, vɪzɪt, fɹʌm, luiz, ...",0.236833
1,/home/morph/Desktop/FINAL/LS/train-other-500/L...,TO STIFLE HER WITH HOUSEHOLD AMMONIA AND LIDDY...,"[tə, staɪfəl, hɚ, wɪθ, haʊshoʊld, əmoʊnjə, ænd...",0.249167
2,/home/morph/Desktop/FINAL/LS/train-other-500/L...,SHE CLUTCHED AT MY SLEEVE WHEN I WENT CLOSE TO...,"[ʃi, klʌt͡ʃt, æt, maɪ, sliv, wɛn, aɪ, wɛnt, kl...",0.071583
3,/home/morph/Desktop/FINAL/LS/train-other-500/L...,COMING JUST AFTER THE FIRE THE HOUSEHOLD WAS D...,"[kʌmɪŋ, d͡ʒʌst, æftɚ, ðə, faɪɚ, ðə, haʊshoʊld,...",0.179167
4,/home/morph/Desktop/FINAL/LS/train-other-500/L...,BUT SHE WAS SO EXCITED I WAS AFRAID SHE WOULD ...,"[bʌt, ʃi, wəz, soʊ, ɪksaɪtɪd, aɪ, wəz, əfɹeɪd,...",0.24575


In [66]:
total_duration = df['duration'].sum()

print(f'Общая длительность всего датасета: {total_duration:.2f} минут')

Общая длительность всего датасета: 4035.93 минут


In [67]:
total_duration / 60

np.float64(67.26553506444445)

In [68]:
final_df.to_csv('/home/morph/Desktop/FINAL/LSoutput/ls_dataset.csv', index=False)