In [2]:
!pip install tgt -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [37]:
import os
import pandas as pd
import tgt
import IPython.display as ipd
import soundfile as sf
from gruut import sentences

In [41]:
def process_textgrid(textgrid_path):
    try:
        tg = tgt.io.read_textgrid(textgrid_path)
    except Exception as e:
        print(f"Ошибка при чтении TextGrid файла: {textgrid_path}. Ошибка: {e}")
        return None, None

    if "words" not in tg.get_tier_names() or "phones" not in tg.get_tier_names():
        print(f"Слои 'words' или 'phones' отсутствуют в файле: {textgrid_path}")
        return None, None

    words_tier = tg.get_tier_by_name("words")
    phones_tier = tg.get_tier_by_name("phones")

    words = [(interval.text, interval.start_time, interval.end_time) for interval in words_tier]
    phones = [(interval.text, interval.start_time, interval.end_time) for interval in phones_tier]

    words = [word for word in words if word[0]]
    phones = [phone for phone in phones if phone[0] != "sp"]

    if not words or not phones:
        print(f"Отсутствуют валидные слова или фонемы в файле: {textgrid_path}")
        return None, None

    word_phonemes = []
    phone_index = 0

    for word, word_start, word_end in words:
        word_phoneme = []
        while phone_index < len(phones):
            phone, phone_start, phone_end = phones[phone_index]
            if phone_end <= word_end:
                word_phoneme.append(phone)
                phone_index += 1
            else:
                break

        word_phonemes.append(word_phoneme)

    return words, word_phonemes

def get_audio_duration(wav_path):
    try:
        with sf.SoundFile(wav_path) as audio_file:
            frames = audio_file.frames
            samplerate = audio_file.samplerate
            return frames / samplerate / 60 
    except Exception as e:
        print(f"Ошибка при определении длительности WAV файла: {wav_path}. Ошибка: {e}")
        return 0

def create_dataset(base_path):
    data = []
    problematic_files = []

    for speaker_folder in os.listdir(base_path):
        speaker_path = os.path.join(base_path, speaker_folder)
        if not os.path.isdir(speaker_path):
            continue

        textgrid_dir = os.path.join(speaker_path, "textgrid")
        wav_dir = os.path.join(speaker_path, "wav")
        transcript_dir = os.path.join(speaker_path, "transcript")

        if not os.path.exists(textgrid_dir):
            print(f"Папка textgrid отсутствует для спикера: {speaker_folder}")
            continue

        for file in os.listdir(textgrid_dir):
            if file.endswith(".TextGrid"):
                textgrid_path = os.path.join(textgrid_dir, file)
                wav_path = os.path.join(wav_dir, file.replace(".TextGrid", ".wav"))
                transcript_path = os.path.join(transcript_dir, file.replace(".TextGrid", ".txt"))

                if not os.path.exists(wav_path):
                    print(f"WAV файл не найден: {wav_path}")
                    problematic_files.append(textgrid_path)
                    continue

                if not os.path.exists(transcript_path):
                    print(f"Файл транскрипции не найден: {transcript_path}")
                    problematic_files.append(textgrid_path)
                    continue

                with open(transcript_path, "r") as f:
                    sentence = f.read().strip()

                words, word_phonemes = process_textgrid(textgrid_path)
                if words is None or word_phonemes is None:
                    problematic_files.append(textgrid_path)
                    continue

                duration = get_audio_duration(wav_path)

                data.append({
                    "audio_path": wav_path,
                    "sentence": sentence,
                    "word_phonemes": word_phonemes,
                    "duration": duration,
                })

    if problematic_files:
        print(f"Обнаружены проблемные файлы: {len(problematic_files)}")
        for file in problematic_files:
            print(f" - {file}")

    return pd.DataFrame(data)

dataset_path = "/home/morph/Desktop/FINAL/l2arctic_release_v5.0"

df = create_dataset(dataset_path)

In [55]:
df['word_phonemes'] = df['word_phonemes'].apply(lambda x: [[phoneme for phoneme in sublist if phoneme != 'sil'] for sublist in x])

In [57]:
df

Unnamed: 0,audio_path,sentence,word_phonemes,duration
0,/home/morph/Desktop/FINAL/l2arctic_release_v5....,But she swung obediently on her heel into the ...,"[[B, AH1, T], [SH, IY1], [S, W, AH1, NG], [OW0...",0.062968
1,/home/morph/Desktop/FINAL/l2arctic_release_v5....,It resembled tea less than lager beer resemble...,"[[IH1, T], [R, IH0, Z, EH1, M, B, AH0, L, D], ...",0.083311
2,/home/morph/Desktop/FINAL/l2arctic_release_v5....,Close beside him gleamed the white fangs of th...,"[[K, L, OW1, S], [B, IY2, S, AY1, D], [HH, IH1...",0.084729
3,/home/morph/Desktop/FINAL/l2arctic_release_v5....,And Raoul listened again to the tale of the house,"[[AH0, N, D], [R, AA0, UW1, L], [L, IH1, S, AH...",0.051542
4,/home/morph/Desktop/FINAL/l2arctic_release_v5....,My age in years is twenty two,"[[M, AY1], [EY1, JH], [IH1, N], [Y, IH1, R, Z]...",0.039602
...,...,...,...,...
26862,/home/morph/Desktop/FINAL/l2arctic_release_v5....,Oolong was two hundred and fifty miles from th...,"[[UW1, L, AO0, NG], [W, AO1, Z], [T, UW1], [HH...",0.084266
26863,/home/morph/Desktop/FINAL/l2arctic_release_v5....,There was no forecasting this strange girl's p...,"[[DH, EH1, R], [W, AA1, Z], [N, OW1], [F, AO1,...",0.069351
26864,/home/morph/Desktop/FINAL/l2arctic_release_v5....,A little before dawn of the day following the ...,"[[AH0], [L, IH1, T, AH0, L], [B, IH0, F, AO1, ...",0.092997
26865,/home/morph/Desktop/FINAL/l2arctic_release_v5....,The ship should be in within a week or ten days,"[[DH, AH0], [SH, IH1, P], [SH, UH1, D], [B, IY...",0.064200


In [58]:
sum(df['duration']) / 60

27.07375992063486

In [59]:
print(df['sentence'][122])
print(df['word_phonemes'][122])

She saw the answer in his face
[['SH', 'IY1'], ['S', 'AO1'], ['DH', 'AH0'], ['AE1', 'N', 'S', 'ER0'], ['IH1', 'N'], ['HH', 'IH1', 'Z'], ['F', 'EY1', 'S']]


In [60]:
ipd.Audio(df['audio_path'][122])

In [70]:
def collect_unique_phonemes(dataframe):
    unique_phonemes = set()

    for phoneme_list in dataframe['word_phonemes']:
        for phoneme_sublist in phoneme_list:
            unique_phonemes.update(phoneme_sublist)

    return unique_phonemes

unique_phonemes_l2 = collect_unique_phonemes(df)

In [71]:
print(f"Уникальные фонемы: {unique_phonemes_l2}")

Уникальные фонемы: {'Z', 'OW', 'D', 'EH2', 'OW0', 'AE2', 'EY0', 'Y', 'UW1', 'IH1', 'W', 'DH', 'AH2', 'AY0', 'AE0', 'IH', 'AA2', 'IH0', 'B', 'IY0', 'SH', 'EH1', 'V', 'JH', 'M', 'IY2', 'OY0', 'AW0', 'AH1', 'N', 'ER2', 'IY', 'UH1', 'OW2', 'AA0', 'UW2', 'AW2', 'G', 'TH', 'HH', 'P', 'ER1', 'AA1', 'F', 'L', 'EY1', 'OY2', 'EY2', 'OY1', 'AY2', 'EH0', 'spn', 'UH0', 'T', 'CH', 'ER0', 'OW1', 'AO1', 'AY1', 'EY', 'S', 'UW0', 'AO2', 'IY1', 'AW1', 'UH2', 'IH2', 'AH0', 'AO', 'ZH', 'AE1', 'K', 'NG', 'AO0', 'R'}


In [72]:
len(unique_phonemes_l2)

75

In [73]:
for x in unique_phonemes_l2:
    print(x)

Z
OW
D
EH2
OW0
AE2
EY0
Y
UW1
IH1
W
DH
AH2
AY0
AE0
IH
AA2
IH0
B
IY0
SH
EH1
V
JH
M
IY2
OY0
AW0
AH1
N
ER2
IY
UH1
OW2
AA0
UW2
AW2
G
TH
HH
P
ER1
AA1
F
L
EY1
OY2
EY2
OY1
AY2
EH0
spn
UH0
T
CH
ER0
OW1
AO1
AY1
EY
S
UW0
AO2
IY1
AW1
UH2
IH2
AH0
AO
ZH
AE1
K
NG
AO0
R


In [75]:
unique_phonemes = {}

def get_phonemes(sentence):
    phonemes = []
    for sent in sentences(sentence, 
                          lang="en-US", 
                          espeak=False, 
                          punctuations=False, 
                          major_breaks=False, 
                          minor_breaks=False):
        for word in sent:
            if word.phonemes:
                phonemes.extend(word.phonemes) 
    return phonemes

for sentence in df['sentence']:
    phonemes = get_phonemes(sentence)
    for phoneme in phonemes:
        unique_phonemes[phoneme] = None 

print(unique_phonemes.keys())

dict_keys(['b', 'ˈʌ', 't', 'ʃ', 'ˈi', 's', 'w', 'ŋ', 'oʊ', 'd', 'i', 'ə', 'n', 'l', 'ˈɔ', 'h', 'ˈɚ', 'ˈɪ', 'u', 'ð', 'ɹ', 'ɪ', 'z', 'ˈɛ', 'm', 'ˈɑ', 'ɡ', 'ɚ', 'æ', 'p', 'ˈeɪ', 'k', 'ˈoʊ', 'ˈaɪ', 'f', 'ˈæ', 'v', 'ˈʊ', 'ɑ', 'ˈu', 'ˈaʊ', 'd͡ʒ', 'j', 'θ', 'ˈɔɪ', 't͡ʃ', 'ˌɛ', 'ˌɪ', 'ɔ', 'ʌ', 'ʒ', 'ˌʊ', 'ˌɑ', 'ˌæ', 'aʊ', 'ɛ', 'aɪ', 'ˌeɪ', 'ˌi', 'ˌu', 'ˌoʊ', 'ˌaɪ', 'ˌʌ', 'eɪ', 'ˌɔ', 'ʊ', 'ˌɚ', 'ˌɔɪ', 'ɔɪ', 'ˌaʊ'])


In [76]:
cleaned_phonemes = {phoneme.replace('ˌ', '').replace('ˈ', ''): None for phoneme in unique_phonemes}


In [78]:
list(cleaned_phonemes)

['b',
 'ʌ',
 't',
 'ʃ',
 'i',
 's',
 'w',
 'ŋ',
 'oʊ',
 'd',
 'ə',
 'n',
 'l',
 'ɔ',
 'h',
 'ɚ',
 'ɪ',
 'u',
 'ð',
 'ɹ',
 'z',
 'ɛ',
 'm',
 'ɑ',
 'ɡ',
 'æ',
 'p',
 'eɪ',
 'k',
 'aɪ',
 'f',
 'v',
 'ʊ',
 'aʊ',
 'd͡ʒ',
 'j',
 'θ',
 'ɔɪ',
 't͡ʃ',
 'ʒ']

In [None]:
L2_PHONEMES = {
    'Z', 'OW', 'D', 'EH2', 'OW0', 'AE2', 'EY0', 'Y', 'UW1', 'IH1', 'W', 'DH',
    'AH2', 'AY0', 'AE0', 'IH', 'AA2', 'IH0', 'B', 'IY0', 'SH', 'EH1', 'V', 'JH',
    'M', 'IY2', 'OY0', 'AW0', 'AH1', 'N', 'ER2', 'IY', 'UH1', 'OW2', 'AA0',
    'UW2', 'AW2', 'G', 'TH', 'HH', 'P', 'ER1', 'AA1', 'F', 'L', 'EY1', 'OY2',
    'EY2', 'OY1', 'AY2', 'EH0', 'spn', 'UH0', 'T', 'CH', 'ER0', 'OW1', 'AO1',
    'AY1', 'EY', 'S', 'UW0', 'AO2', 'IY1', 'AW1', 'UH2', 'IH2', 'AH0', 'AO',
    'ZH', 'AE1', 'K', 'NG', 'AO0', 'R'
}

GRUUT_NO_ESPEAK = {
    'b', 'ʌ', 't', 'ʃ', 'i', 's', 'w', 'ŋ', 'oʊ', 'd', 'ə', 'n', 'l', 'ɔ', 'h',
    'ɚ', 'ɪ', 'u', 'ð', 'ɹ', 'z', 'ɛ', 'm', 'ɑ', 'ɡ', 'æ', 'p', 'eɪ', 'k', 'aɪ',
    'f', 'v', 'ʊ', 'aʊ', 'd͡ʒ', 'j', 'θ', 'ɔɪ', 't͡ʃ', 'ʒ'
}


FROM_L2_TO_GRUUT = {
    'Z': 'z',
    'OW': 'oʊ',
    'D': 'd',
    'EH2': 'ɛ',
    'OW0': 'oʊ',
    'AE2': 'æ',
    'EY0': 'eɪ',
    'Y': 'j',
    'UW1': 'u',
    'IH1': 'ɪ',
    'W': 'w',
    'DH': 'ð',
    'AH2': 'ʌ',
    'AY0': 'aɪ',
    'AE0': 'æ',
    'IH': 'ɪ',
    'AA2': 'ɑ',
    'IH0': 'ɪ',
    'B': 'b',
    'IY0': 'i',
    'SH': 'ʃ',
    'EH1': 'ɛ',
    'V': 'v',
    'JH': 'd͡ʒ',
    'M': 'm',
    'IY2': 'i',
    'OY0': 'ɔɪ',
    'AW0': 'aʊ',
    'AH1': 'ʌ',
    'N': 'n',
    'ER2': 'ɚ',
    'IY': 'i',
    'UH1': 'ʊ',
    'OW2': 'oʊ',
    'AA0': 'ɑ',
    'UW2': 'u',
    'AW2': 'aʊ',
    'G': 'ɡ',
    'TH': 'θ',
    'HH': 'h',
    'P': 'p',
    'ER1': 'ɚ',
    'AA1': 'ɑ',
    'F': 'f',
    'L': 'l',
    'EY1': 'eɪ',
    'OY2': 'ɔɪ',
    'EY2': 'eɪ',
    'OY1': 'ɔɪ',
    'AY2': 'aɪ',
    'EH0': 'ɛ',
    'spn': 'ə',
    'UH0': 'ʊ',
    'T': 't',
    'CH': 't͡ʃ',
    'ER0': 'ɚ',
    'OW1': 'oʊ',
    'AO1': 'ɔ',
    'AY1': 'aɪ',
    'EY': 'eɪ',
    'S': 's',
    'UW0': 'u',
    'AO2': 'ɔ',
    'IY1': 'i',
    'AW1': 'aʊ',
    'UH2': 'ʊ',
    'IH2': 'ɪ',
    'AH0': 'ʌ',
    'AO': 'ɔ',
    'ZH': 'ʒ',
    'AE1': 'æ',
    'K': 'k',
    'NG': 'ŋ',
    'AO0': 'ɔ',
    'R': 'ɹ'
}


In [80]:
def convert_to_gruut(phoneme_list):
    words = []
    for word_phonemes in phoneme_list:
        word = ''.join(FROM_L2_TO_GRUUT.get(phoneme, phoneme) for phoneme in word_phonemes)
        words.append(word)
    return words

df['gruut_phonemes'] = df['word_phonemes'].apply(convert_to_gruut)

df

Unnamed: 0,audio_path,sentence,word_phonemes,duration,gruut_phonemes
0,/home/morph/Desktop/FINAL/l2arctic_release_v5....,But she swung obediently on her heel into the ...,"[[B, AH1, T], [SH, IY1], [S, W, AH1, NG], [OW0...",0.062968,"[bʌt, ʃi, swʌŋ, oʊbidiʌntli, ɑn, hɚ, hil, ɪntʌ..."
1,/home/morph/Desktop/FINAL/l2arctic_release_v5....,It resembled tea less than lager beer resemble...,"[[IH1, T], [R, IH0, Z, EH1, M, B, AH0, L, D], ...",0.083311,"[ɪt, ɹɪzɛmbʌld, ti, lɛs, ðæn, lɑɡɚ, bɪɹ, ɹɪzɛm..."
2,/home/morph/Desktop/FINAL/l2arctic_release_v5....,Close beside him gleamed the white fangs of th...,"[[K, L, OW1, S], [B, IY2, S, AY1, D], [HH, IH1...",0.084729,"[kloʊs, bisaɪd, hɪm, ɡlimd, ði, waɪt, fæŋz, ʌv..."
3,/home/morph/Desktop/FINAL/l2arctic_release_v5....,And Raoul listened again to the tale of the house,"[[AH0, N, D], [R, AA0, UW1, L], [L, IH1, S, AH...",0.051542,"[ʌnd, ɹɑul, lɪsʌnd, ʌɡeɪn, tʌ, ðʌ, teɪl, ʌv, ð..."
4,/home/morph/Desktop/FINAL/l2arctic_release_v5....,My age in years is twenty two,"[[M, AY1], [EY1, JH], [IH1, N], [Y, IH1, R, Z]...",0.039602,"[maɪ, eɪd͡ʒ, ɪn, jɪɹz, ɪz, twɛni, tu]"
...,...,...,...,...,...
26862,/home/morph/Desktop/FINAL/l2arctic_release_v5....,Oolong was two hundred and fifty miles from th...,"[[UW1, L, AO0, NG], [W, AO1, Z], [T, UW1], [HH...",0.084266,"[ulɔŋ, wɔz, tu, hʌndɹɪd, ænd, fɪfti, maɪlz, fɹ..."
26863,/home/morph/Desktop/FINAL/l2arctic_release_v5....,There was no forecasting this strange girl's p...,"[[DH, EH1, R], [W, AA1, Z], [N, OW1], [F, AO1,...",0.069351,"[ðɛɹ, wɑz, noʊ, fɔɹkæstɪŋ, ðɪs, stɹeɪnd͡ʒ, ɡɚl..."
26864,/home/morph/Desktop/FINAL/l2arctic_release_v5....,A little before dawn of the day following the ...,"[[AH0], [L, IH1, T, AH0, L], [B, IH0, F, AO1, ...",0.092997,"[ʌ, lɪtʌl, bɪfɔɹ, dɔn, ʌv, ðʌ, deɪ, fɑloʊɪŋ, ð..."
26865,/home/morph/Desktop/FINAL/l2arctic_release_v5....,The ship should be in within a week or ten days,"[[DH, AH0], [SH, IH1, P], [SH, UH1, D], [B, IY...",0.064200,"[ðʌ, ʃɪp, ʃʊd, bi, ɪn, wɪθɪn, ʌ, wik, ɔɹ, tɛn,..."


In [84]:
df.to_csv("/home/morph/Desktop/FINAL/ARTICoutput/arctic_final.csv", index=False)
print("Датасет сохранен")

Датасет сохранен


In [86]:
def collect_unique_phonemes(dataframe):
    unique_phonemes = set()

    for phoneme_list in dataframe['gruut_phonemes']:
        for phoneme_sublist in phoneme_list:
            unique_phonemes.update(phoneme_sublist)

    return unique_phonemes

unique_phonemes_l2 = collect_unique_phonemes(df)

for x in unique_phonemes_l2:
    print(x)

æ
ɪ
ɚ
ɛ
ð
ɑ
l
ʃ
i
ɹ
ɡ
h
f
m
ɔ
d
ə
a
ŋ
k
n
s
j
v
o
ʊ
u
ʌ
p
b
z
θ
w
t
͡
e
ʒ
