Sređivanje TextGridova koje nam je dao MFA 

In [2]:
#Dobili smo od MFA segmentisane recenice u TextGrid formatu, pa sad izvlacimo samo foneme i njihove od-do trenutke (naziv fajla se dodaje kasnije, pri upisu u txt)

def parse_textgrid(file_path):
    """Collect the intervals of the "phones" tier from a TextGrid file.

    The file is scanned line by line. Lines are only interpreted while the
    scanner is inside the tier whose header reads ``name = "phones"``; any
    other ``name = ...`` line (not just ``"words"``) ends that section, so
    additional tiers following the phones tier are not mixed into the result.

    Args:
        file_path: Path of the ``.TextGrid`` file to read.

    Returns:
        A list with one dict per ``text = ...`` line of the phones tier, in
        file order. Each dict holds ``'xmin'`` and ``'xmax'`` (floats parsed
        from the preceding lines of the interval) and ``'text'`` (the label
        with surrounding double quotes removed; may be an empty string).
        The name of the source file is not part of the result.
    """
    phonemes = []
    in_phones_section = False
    # Scratch dict: the tier-level xmin/xmax that precede the first interval
    # land here and are discarded when the first "intervals [" line is seen.
    phoneme = {}

    # errors='ignore' drops undecodable bytes instead of raising.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for raw_line in file:
            line = raw_line.strip()

            if 'name = "phones"' in line:
                in_phones_section = True
            elif line.startswith('name ='):
                # Header of some other tier: stop collecting.
                in_phones_section = False

            if not in_phones_section:
                continue

            if line.startswith('intervals ['):
                phoneme = {}
            elif line.startswith('xmin ='):
                phoneme['xmin'] = float(line.partition('=')[2].strip())
            elif line.startswith('xmax ='):
                phoneme['xmax'] = float(line.partition('=')[2].strip())
            elif line.startswith('text ='):
                # partition keeps everything after the FIRST '=', so a label
                # that itself contains '=' is not cut short.
                phoneme['text'] = line.partition('=')[2].strip().strip('"')
                phonemes.append(phoneme)

    return phonemes

In [3]:
#Sad koristimo ovu gore funkciju da prodjemo kroz sve TextGridove i napravimo 'svaka_fonema_od_do.txt' 
#Svaki red ce imati format: fonema xmin xmax duration filename

import os 

textgrid_directory = 'LJSpeech Dataset/LJSpeech-1.1/out'
output_file_path = 'svaka_fonema_od_do.txt'

# The output is opened once, in write mode. The earlier version reopened it in
# append mode for every TextGrid, so re-running the cell kept adding duplicate
# rows to a file left over from a previous run.
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # sorted() makes the row order (and therefore tie-breaking in later
    # steps that pick the first best match) independent of directory order.
    for filename in sorted(os.listdir(textgrid_directory)):
        if not filename.endswith('.TextGrid'):
            continue

        file_path = os.path.join(textgrid_directory, filename)
        phonemes = parse_textgrid(file_path)
        wav_name = filename.replace('.TextGrid', '.wav')

        for phoneme in phonemes:
            if not phoneme.get('text'):  # skip intervals with an empty label
                continue
            duration = phoneme['xmax'] - phoneme['xmin']
            # Row format (tab separated): label, xmin, xmax, duration, wav file name.
            output_file.write(f"{phoneme['text']}\t{phoneme['xmin']}\t{phoneme['xmax']}\t{duration:.3f}\t{wav_name}\n")

print("Završeno je parsiranje i čuvanje fonema.")


In [None]:
#Sad koristimo fajl 'svaka_fonema_od_do.txt'  da dobijemo fajl 'foneme_prosecna_duzina.txt'

from collections import defaultdict

input_file_path = 'svaka_fonema_od_do.txt'
output_file_path = 'foneme_prosecna_duzina.txt'

# Phoneme label -> list of every duration recorded for it.
phoneme_durations = defaultdict(list)

with open(input_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split('\t')
        # Rows that do not have exactly five tab-separated fields are skipped.
        # (The per-row debug print was removed: it fired once for every row.)
        if len(parts) != 5:
            continue
        phoneme, _xmin, _xmax, leng, _fajl = parts
        phoneme_durations[phoneme].append(float(leng))

# Arithmetic mean of the durations collected for each label.
average_durations = {
    phoneme: sum(durations) / len(durations)
    for phoneme, durations in phoneme_durations.items()
}

# One "label<TAB>average" row per phoneme, average written with six decimals.
with open(output_file_path, 'w', encoding='utf-8') as file:
    for phoneme, avg_duration in average_durations.items():
        file.write(f"{phoneme}\t{avg_duration:.6f}\n")

print("Završeno je izračunavanje i čuvanje prosečne dužine trajanja fonema.")

Cuvanje optimalnog snimka svake foneme

In [None]:
#Pravimo rucno razmak jer ga MFA nije snimio

def create_silent_audio(duration_ms=150):
    """Write a silent clip to ``foneme_wavs/silent.wav``.

    The clip is used as the pause between words when phoneme clips are
    concatenated. The ``foneme_wavs`` directory is created if it is missing.

    Args:
        duration_ms: Length of the silence, passed to
            ``AudioSegment.silent(duration=...)``.
    """
    # Imported locally: this definition appears before the file's first
    # top-level pydub import, so running the cells in order would otherwise
    # fail with a NameError on AudioSegment.
    from pydub import AudioSegment

    output_dir = 'foneme_wavs'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    silent_file_path = os.path.join(output_dir, 'silent.wav')
    AudioSegment.silent(duration=duration_ms).export(silent_file_path, format='wav')
    print(f"Kratak prazan audio fajl je kreiran na putanji: {silent_file_path}")


create_silent_audio()

In [None]:
#Funkcija za brze citanje prosecne duzine foneme

def prosecna_duzina(fonema):
    """Look up the average duration stored for one phoneme label.

    Reads ``foneme_prosecna_duzina.txt`` from the current directory; rows
    are expected as ``label<TAB>average``. Rows with a different number of
    tab-separated fields are ignored.

    Args:
        fonema: Phoneme label to search for (exact string match).

    Returns:
        The average of the first matching row as a float, or ``None`` when no
        row matches.
    """
    with open('foneme_prosecna_duzina.txt', 'r', encoding='utf-8') as table:
        rows = table.readlines()

    for row in rows:
        fields = row.strip().split('\t')
        if len(fields) != 2:
            continue
        label, mean_text = fields
        if label == fonema:
            return float(mean_text)

    return None

In [None]:
#Prolazimo kroz sva ponavljanja neke foneme i trazimo ono cija je duzina najbliza prosecnoj duzini te foneme
#Rezultat isecemo iz fajla u kom je nadjen i ubacujemo ga u folder 'foneme_wavs'

import os
from pydub import AudioSegment
import numpy as np

def optimalna_fonema(fonema):
    """Cut out the occurrence of a phoneme whose length is closest to its average.

    The average comes from ``prosecna_duzina``; the occurrences are read from
    ``svaka_fonema_od_do.txt`` (five tab-separated fields per row: label,
    start, end, duration, wav file name). The first row with the smallest
    absolute difference between its duration and the average wins. That span
    is sliced out of the matching file under
    ``LJSpeech Dataset/LJSpeech-1.1/wavs`` and exported to
    ``foneme_wavs/<fonema>_optimal.wav``.

    Args:
        fonema: Phoneme label to process.

    Returns:
        None. A message is printed for each outcome (no average available,
        no occurrence found, or clip saved).
    """
    target = prosecna_duzina(fonema)
    if target is None:
        print(f"Prosečna dužina za fonemu {fonema} nije pronađena")
        return

    with open('svaka_fonema_od_do.txt', 'r', encoding='utf-8') as table:
        rows = table.readlines()

    best_gap = float('inf')
    best = None  # (start, end, wav name) of the closest occurrence so far

    for row in rows:
        fields = row.strip().split('\t')
        if len(fields) != 5:
            continue

        label, start_text, end_text, length_text, wav_name = fields
        # Numeric fields are converted for every well-formed row, including
        # rows of other phonemes, exactly as before.
        length = float(length_text)
        start = float(start_text)
        end = float(end_text)

        if label != fonema:
            continue

        gap = abs(length - target)
        if gap < best_gap:  # strict comparison: the earliest row wins ties
            best_gap = gap
            best = (start, end, wav_name)

    if best is None:
        print(f"Fonema {fonema} nije pronađena u fajlu")
        return

    start, end, wav_name = best
    audio = AudioSegment.from_wav(os.path.join('LJSpeech Dataset/LJSpeech-1.1/wavs', wav_name))
    # The stored times are multiplied by 1000 to form the slice bounds.
    segment = audio[start * 1000:end * 1000]

    output_dir = 'foneme_wavs'
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, f"{fonema}_optimal.wav")
    segment.export(output_path, format='wav')
    print(f"Optimalni segment foneme {fonema} sačuvan u {output_path}")

In [None]:
#Pozivamo gornju funkciju za svaku fonemu da bismo imali optimalne snimke koje cemo posle da lepimo

def sacuvaj_sve_foneme():
    """Run ``optimalna_fonema`` for every label in the averages file.

    Labels are taken from the first field of each two-field, tab-separated
    row of ``foneme_prosecna_duzina.txt``, in file order. The file is read
    completely and closed before the first clip is processed.
    """
    with open('foneme_prosecna_duzina.txt', 'r', encoding='utf-8') as table:
        rows = [row.strip().split('\t') for row in table.readlines()]

    labels = [fields[0] for fields in rows if len(fields) == 2]

    for label in labels:
        optimalna_fonema(label)

In [None]:
# Only now are the functions defined above actually invoked, producing the per-phoneme clips in the new folder.

sacuvaj_sve_foneme()

Pretvaranje teksta u foneme (G2P)

In [None]:
#Koristimo G2P jer MFA ne mogu da namestim kod sebe, pa su foneme drugacije oznacene

from g2p_en import G2p

def text2phoneme(text):
    """Convert text to a list of phoneme symbols with g2p_en.

    Args:
        text: The sentence to convert.

    Returns:
        Whatever the ``G2p`` converter returns for ``text`` (see the sample
        call below: ARPAbet symbols with stress digits, and ``' '`` between
        words).
    """
    # A G2p object used to be constructed on every call. The instance is now
    # created on first use and kept on the function object, so converting
    # many sentences pays the construction cost only once.
    converter = getattr(text2phoneme, '_g2p', None)
    if converter is None:
        converter = text2phoneme._g2p = G2p()
    return converter(text)


print(text2phoneme("Hello I am Natasa and I love coconuts"))

['HH', 'AH0', 'L', 'OW1', ' ', 'AY1', ' ', 'AE1', 'M', ' ', 'N', 'AA0', 'T', 'AA1', 'S', 'AH0', ' ', 'AH0', 'N', 'D', ' ', 'AY1', ' ', 'L', 'AH1', 'V', ' ', 'K', 'OW1', 'K', 'AH0', 'N', 'AH2', 'T', 'S']


In [None]:
#Mapiramo G2P foneme (dobijene od teksta) na MFA foneme (dobijene od audia)

def _build_phoneme_map():
    """Expand the compact symbol table into the full G2P -> MFA lookup dict.

    Each table row is (ARPAbet symbol, MFA label, has stress variants).
    Symbols flagged True are entered four times: bare and with the stress
    digits 0, 1 and 2, all pointing at the same MFA label. The comma is
    added last and maps to a space, which the concatenation code treats as
    a pause.
    """
    table = (
        ('AA', 'ɑː', True),
        ('AE', 'æ', True),
        ('AH', 'ə', True),
        ('AO', 'ɒ', True),
        ('AW', 'aw', True),
        ('AY', 'aj', True),
        ('B', 'b', False),
        ('CH', 'tʃ', False),
        ('D', 'd', False),
        ('DH', 'ð', False),
        ('EH', 'ɛ', True),
        ('ER', 'ɝ', True),
        ('EY', 'ej', True),
        ('F', 'f', False),
        ('G', 'ɡ', False),
        ('HH', 'h', False),
        ('IH', 'ɪ', True),
        ('IY', 'iː', True),
        ('JH', 'dʒ', False),
        ('K', 'k', False),
        ('L', 'l', False),
        ('M', 'm', False),
        ('N', 'n', False),
        ('NG', 'ŋ', False),
        ('OW', 'ow', True),
        ('OY', 'ɔj', True),
        ('P', 'p', False),
        ('R', 'ɹ', False),
        ('S', 's', False),
        ('SH', 'ʃ', False),
        ('T', 't', False),
        ('TH', 'θ', False),
        ('UH', 'ʊ', True),
        ('UW', 'ʉː', True),
        ('V', 'v', False),
        ('W', 'w', False),
        ('Y', 'j', False),
        ('Z', 'z', False),
        ('ZH', 'ʒ', False),
    )

    mapping = {}
    for symbol, mfa_label, has_stress in table:
        suffixes = ('', '0', '1', '2') if has_stress else ('',)
        for suffix in suffixes:
            mapping[symbol + suffix] = mfa_label
    mapping[','] = ' '
    return mapping


# Lookup from G2P output symbols to the labels used in the MFA alignments.
phoneme_map = _build_phoneme_map()

In [None]:
#Koristimo mapu da prevedemo foneme dobijene od teksta u foneme koje znamo da procitamo

def translate_phoneme(phoneme, phoneme_map):
    """Translate one symbol through ``phoneme_map``.

    Args:
        phoneme: Symbol to translate.
        phoneme_map: Mapping consulted via ``.get``.

    Returns:
        The mapped value, or ``phoneme`` itself when it has no entry.
    """
    translated = phoneme_map.get(phoneme, phoneme)
    return translated

In [None]:
#Direktno pretvaranje teksta u mfa foneme

def text2phoneme_mfa(text):
    """Convert text straight to MFA labels.

    The text is first turned into symbols by ``text2phoneme``; every symbol
    is then passed through ``translate_phoneme`` with the module-level
    ``phoneme_map``.

    Returns:
        A list with one translated entry per symbol, in the same order.
    """
    return [translate_phoneme(symbol, phoneme_map) for symbol in text2phoneme(text)]

Lepljenje fonema i spajanje u snimak (text2speech v1)

In [None]:
#Funkcija koja dobije niz fonema, nalazi njihove optimalne snimke i lepi ih u jedan audio snimak
#Ovde direktno pustamo taj audio snimak da cujemo prvu, jos uvek grubu verziju

from pydub import AudioSegment
import io
from IPython.display import Audio, display

def zalepi(foneme):
    """Concatenate the stored clips for a phoneme sequence and play the result.

    For each entry the clip ``foneme_wavs/<entry>_optimal.wav`` is appended;
    a single space selects ``foneme_wavs/silent.wav`` instead. Entries whose
    clip file does not exist are reported with a message and skipped. The
    combined audio is exported as WAV into memory and shown with
    ``IPython.display.Audio``.

    Args:
        foneme: Iterable of phoneme labels (and ``" "`` for pauses).
    """
    combined = AudioSegment.empty()

    for fonema in foneme:
        if fonema == " ":
            segment_path = "foneme_wavs/silent.wav"
        else:
            segment_path = os.path.join('foneme_wavs', f"{fonema}_optimal.wav")

        if not os.path.exists(segment_path):
            print(f"Segment za fonemu {fonema} nije pronađen na putanji {segment_path}")
            continue

        combined += AudioSegment.from_wav(segment_path)

    wav_buffer = io.BytesIO()
    combined.export(wav_buffer, format='wav')

    # getvalue() returns the whole buffer regardless of the stream position.
    display(Audio(wav_buffer.getvalue(), rate=combined.frame_rate))

In [None]:
#Sve je spremno sad samo spajamo pretvaranje teksta u G2P foneme, pa u MFA foneme pa ih procitamo

def izgovori(text):
    """Speak ``text``: convert it to MFA labels and play the glued clips."""
    mfa_labels = text2phoneme_mfa(text)
    zalepi(mfa_labels)

In [None]:
# Demo call of the reading function; a place to experiment and show people how it does not (yet) work.

izgovori("I can buy myself flowers   Write my name in the sand   Talk to myself for hours   Say things you dont understand")

Priprema outputa za CNN (spektrogrami pravih snimaka)

In [None]:
#Funkcija za pravljenje i cuvanje mel spektrograma od .wav fajla

import librosa
import matplotlib.pyplot as plt
import numpy as np

def create_mel_spectrogram(wav_file, output_file, sr=16000, duration=10, hop_length=512):
    """Render the mel spectrogram of a WAV file to an image file.

    At most ``duration`` seconds are loaded (resampled to ``sr``). Clips that
    produce fewer frames than a full-length clip are right-padded with
    -80 dB so that every spectrogram has the same number of time frames.

    Args:
        wav_file: Path of the audio file to read.
        output_file: Path of the image to write (format chosen by matplotlib
            from the file name).
        sr: Sampling rate passed to ``librosa.load`` and ``melspectrogram``.
        duration: Maximum number of seconds to load; also defines the target
            frame count.
        hop_length: Hop between frames in samples. 512 is the value the
            target-length computation previously hard-coded.
    """
    y, _ = librosa.load(wav_file, sr=sr, duration=duration)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, hop_length=hop_length)
    S_dB = librosa.power_to_db(S, ref=np.max)

    # With librosa's default centered framing, N samples give
    # 1 + N // hop_length frames. The previous target (duration * sr / 512)
    # was one frame short, so padded short clips ended up one column
    # narrower than full-length clips.
    expected_length = 1 + int(duration * sr) // hop_length
    missing = expected_length - S_dB.shape[1]
    if missing > 0:
        # -80 is the lowest value power_to_db produces with its default
        # 80 dB dynamic range relative to the maximum.
        S_dB = np.pad(S_dB, ((0, 0), (0, missing)), mode='constant', constant_values=-80)

    fig = plt.figure(figsize=(10, 4))
    try:
        plt.imshow(S_dB, aspect='auto', origin='lower', cmap='inferno')
        plt.axis('off')
        plt.savefig(output_file, bbox_inches='tight', pad_inches=0)
    finally:
        # Close this figure even if drawing or saving fails, so figures do
        # not pile up during a long batch run.
        plt.close(fig)

In [None]:
#Pravljenje outputa za dataset za CNN od originalnih .wav fajlova iz LJSpeech-a
#Pravimo spektrogram za sve snimke jer bi ovako trebalo da zvuci govor

def process_all_wavs(folder_path, output_folder_path):
    """Create a spectrogram image for every ``.wav`` file in a folder.

    The image is written to ``output_folder_path`` under the same base name
    with a ``.png`` extension. Files whose image already exists are skipped,
    so an interrupted run can be resumed.

    Args:
        folder_path: Directory that is listed for ``.wav`` files.
        output_folder_path: Directory for the images; created if missing.
    """
    # exist_ok replaces the separate exists() check and its race window.
    os.makedirs(output_folder_path, exist_ok=True)

    wav_suffix = '.wav'
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(wav_suffix):
            continue

        wav_file = os.path.join(folder_path, file_name)
        # Swap only the trailing extension. str.replace would also rewrite a
        # '.wav' that happens to occur earlier in the file name.
        png_name = file_name[:-len(wav_suffix)] + '.png'
        output_file = os.path.join(output_folder_path, png_name)

        if not os.path.exists(output_file):
            create_mel_spectrogram(wav_file, output_file)

In [None]:
# Call the functions above on the original LJSpeech recordings to generate the output side of the dataset.

folder_path = 'LJSpeech Dataset/LJSpeech-1.1/wavs'
output_folder_path = 'Spektrogrami/outputs'
process_all_wavs(folder_path, output_folder_path)

Priprema inputa za CNN (moji generisani snimci transkripcija iz LJSpeecha --> njihovi spektrogrami)

In [None]:
#Sada opet lepimo prosledjene foneme ali umesto da ih direktno display-ujemo ih returnujemo da bismo ih posle stavili u folder

def foneme2audio(foneme):
    """Concatenate the stored clips for a phoneme sequence and return WAV bytes.

    Clip selection matches the playback variant: ``" "`` selects
    ``foneme_wavs/silent.wav``, any other entry selects
    ``foneme_wavs/<entry>_optimal.wav``, and entries without a clip file are
    reported and skipped.

    Args:
        foneme: Iterable of phoneme labels (and ``" "`` for pauses).

    Returns:
        The combined audio exported in WAV format, as ``bytes``.
    """
    combined = AudioSegment.empty()

    for fonema in foneme:
        segment_path = (
            "foneme_wavs/silent.wav"
            if fonema == " "
            else os.path.join('foneme_wavs', f"{fonema}_optimal.wav")
        )

        if os.path.exists(segment_path):
            combined += AudioSegment.from_wav(segment_path)
            continue

        print(f"Segment za fonemu {fonema} nije pronađen na putanji {segment_path}")

    wav_buffer = io.BytesIO()
    combined.export(wav_buffer, format='wav')

    # getvalue() does not depend on the stream position, so no rewind is needed.
    return wav_buffer.getvalue()

In [None]:
#Funkcija kojoj prosledimo podatke .wav i adresu na koju da ga sacuva

def save_wav_from_bytes(data, file_path):
    """Write ``data`` to ``file_path`` in binary mode, replacing any existing file.

    Args:
        data: Bytes to store (the caller supplies already-encoded WAV data).
        file_path: Destination path.
    """
    with open(file_path, 'wb') as wav_out:
        wav_out.write(data)

In [None]:
#Uzima txt fajl i generise wav fajl od teksta
#Format txt fajla: prva rec u prvom redu je naziv fajla, a sve posle nje je tekst koji se izgovara

def process_text_file(file_path, output_dir=None):
    """Synthesize the text stored in one transcript file into a WAV file.

    The first whitespace-separated word of the first line names the output
    (``<word>.wav``); the rest of that line plus all following lines, joined
    with spaces, is the text that gets converted with ``text2phoneme_mfa``
    and ``foneme2audio``.

    Args:
        file_path: Path of the transcript file (UTF-8).
        output_dir: Optional directory for the WAV file; created if missing.
            With the default ``None`` the file is written under its bare
            name, i.e. relative to the current working directory, as before.

    Returns:
        None. A file that is empty or has a blank first line is reported and
        skipped instead of raising ``IndexError``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    header_words = lines[0].split() if lines else []
    if not header_words:
        # Nothing to name the output after; keep a batch run going.
        print(f"Fajl {file_path} nema naziv u prvom redu, preskačem ga")
        return

    first_word, *first_line_rest = header_words
    remaining_text = ' '.join(first_line_rest) + ' ' + ' '.join(line.strip() for line in lines[1:])

    foneme = text2phoneme_mfa(remaining_text)
    audio_data = foneme2audio(foneme)

    output_wav_file = f"{first_word}.wav"
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        output_wav_file = os.path.join(output_dir, output_wav_file)

    save_wav_from_bytes(audio_data, output_wav_file)

In [None]:
#Pretvara sve txt iz datog foldera u wav (wav fajlovi se cuvaju u tekucem radnom direktorijumu)

def process_all_text_files(folder_path):
    """Run ``process_text_file`` on every ``.txt`` entry of ``folder_path``.

    Args:
        folder_path: Directory to list; entries with other endings are ignored.
    """
    txt_names = [name for name in os.listdir(folder_path) if name.endswith('.txt')]
    for name in txt_names:
        process_text_file(os.path.join(folder_path, name))

In [None]:
# Call the functions on the LJSpeech sentences to generate a spoken version of each one.

txt_folder_path = 'LJSpeech Dataset/LJSpeech-1.1/txts'
process_all_text_files(txt_folder_path)

In [None]:
import os

# Build the input-side spectrograms from the synthesized recordings.
# These assignments rebind the folder_path / output_folder_path names used by the earlier spectrogram cell.
# NOTE(review): process_text_file saves "<name>.wav" under a bare relative name (current working
# directory), while this cell reads from 'my_wavs' - presumably the files were moved there by hand; confirm.
folder_path = 'my_wavs'
output_folder_path = 'Spektrogrami/inputs'
process_all_wavs(folder_path, output_folder_path)