In [1]:
'''
%%capture
%pip install transformers==4.4.0
%pip install soundfile
%pip install jiwer

%pip install datasets>=1.18.3
%pip install evaluate
'''
import os
import re
import ast
import random
import librosa
import json
import evaluate
import numpy as np
import pandas as pd
from datasets import ClassLabel
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from IPython.display import display, HTML
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
import soundfile as sf

import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import IPython.display as ipd

from jiwer import cer
from jiwer import wer
import evaluate

from transformers import Wav2Vec2ForCTC
from transformers import TrainingArguments
from transformers import Trainer
from scipy.io.wavfile import write

# **Fine-tuning XLSR-Wav2Vec2 for Phoneme Recognition**

## Чтение и подготовка датасета

In [2]:
df = pd.read_csv('/home/morph/Desktop/FINAL/updated_FINAL_DATASET.csv')
df

Unnamed: 0,audio_path,sentence,phonemes,duration,dataset
0,/home/morph/Desktop/FINAL/processed_dataset/1.wav,This has been attributed to helium film flow i...,"['ðɪs', 'hɛz', 'bɛn', 'ɪt͡ʃɪbutɪd', 'tu', 'hɪl...",0.001351,TIMIT
1,/home/morph/Desktop/FINAL/processed_dataset/2.wav,Steve collects rare and novel coins.,"['stiv', 'kəlɛks', 'ɹɛɹ', 'ɪn', 'nɑvəl', 'kɔɪns']",0.000798,TIMIT
2,/home/morph/Desktop/FINAL/processed_dataset/3.wav,Cyclical programs will never compile.,"['sɪklɪkəl', 'pɹoʊɡɹæmz', 'wəl', 'nɛvɚ', 'kəmp...",0.000731,TIMIT
3,/home/morph/Desktop/FINAL/processed_dataset/4.wav,"Receiving no answer, they set the fire.","['ɹisivɪŋ', 'noʊ', 'ænsɚ', 'ðeɪ', 'sɛ', 'ðɪ', ...",0.000706,TIMIT
4,/home/morph/Desktop/FINAL/processed_dataset/5.wav,Don't ask me to carry an oily rag like that.,"['doʊn', 'æsk', 'mɪ', 'tɪ', 'kɛɹi', 'ɪn', 'ɔli...",0.000743,TIMIT
...,...,...,...,...,...
15107,/home/morph/Desktop/FINAL/processed_dataset/15...,The original storyline was developed by David ...,"['dʌ', 'ɔɹɪ͡ʒʌnʌl', 'stɔɹi', 'laɪn', 'wʌz', 'd...",0.002595,CV
15108,/home/morph/Desktop/FINAL/processed_dataset/15...,"All high-speed rail lines, except those in Rus...","['ɔl', 'haɪ', 'spɹid', 'ɹeɪlaɪns', 'ɛksɛpt', '...",0.002945,CV
15109,/home/morph/Desktop/FINAL/processed_dataset/15...,"For your first installation, I highly recommen...","['fɹɚ', 'jɚ', 'fɚst', 'ˌɪnstʌleɪʃʌn', 'aɪ', 'h...",0.002379,CV
15110,/home/morph/Desktop/FINAL/processed_dataset/15...,Pat Burchat is the deputy director.,"['hæt', 'bɚt', 't͡ʃæp', 'ɪz', 'ðʌ', 'dɛpjʌti',...",0.001188,CV


In [3]:
def validate_and_fix_phonemes(row):
    value = row['phonemes']
    if isinstance(value, list) and all(isinstance(item, str) for item in value):
        # Если уже корректный список строк, оставить без изменений
        return value
    try:
        # Попытка преобразовать строку в список с использованием ast.literal_eval
        parsed_value = ast.literal_eval(value)
        if isinstance(parsed_value, list) and all(isinstance(item, str) for item in parsed_value):
            return parsed_value
    except (ValueError, SyntaxError, TypeError):
        pass
    # Если не удалось преобразовать, вернуть None или оставить оригинальное значение
    return None

df['phonemes'] = df.apply(validate_and_fix_phonemes, axis=1)

df

Unnamed: 0,audio_path,sentence,phonemes,duration,dataset
0,/home/morph/Desktop/FINAL/processed_dataset/1.wav,This has been attributed to helium film flow i...,"[ðɪs, hɛz, bɛn, ɪt͡ʃɪbutɪd, tu, hɪliɪm, fɪlm, ...",0.001351,TIMIT
1,/home/morph/Desktop/FINAL/processed_dataset/2.wav,Steve collects rare and novel coins.,"[stiv, kəlɛks, ɹɛɹ, ɪn, nɑvəl, kɔɪns]",0.000798,TIMIT
2,/home/morph/Desktop/FINAL/processed_dataset/3.wav,Cyclical programs will never compile.,"[sɪklɪkəl, pɹoʊɡɹæmz, wəl, nɛvɚ, kəmpɑl]",0.000731,TIMIT
3,/home/morph/Desktop/FINAL/processed_dataset/4.wav,"Receiving no answer, they set the fire.","[ɹisivɪŋ, noʊ, ænsɚ, ðeɪ, sɛ, ðɪ, faɪɚ]",0.000706,TIMIT
4,/home/morph/Desktop/FINAL/processed_dataset/5.wav,Don't ask me to carry an oily rag like that.,"[doʊn, æsk, mɪ, tɪ, kɛɹi, ɪn, ɔli, ɹæɡ, laɪk, ...",0.000743,TIMIT
...,...,...,...,...,...
15107,/home/morph/Desktop/FINAL/processed_dataset/15...,The original storyline was developed by David ...,"[dʌ, ɔɹɪ͡ʒʌnʌl, stɔɹi, laɪn, wʌz, dɛvɛlʌpt, ba...",0.002595,CV
15108,/home/morph/Desktop/FINAL/processed_dataset/15...,"All high-speed rail lines, except those in Rus...","[ɔl, haɪ, spɹid, ɹeɪlaɪns, ɛksɛpt, doʊz, ɪn, ʌ...",0.002945,CV
15109,/home/morph/Desktop/FINAL/processed_dataset/15...,"For your first installation, I highly recommen...","[fɹɚ, jɚ, fɚst, ˌɪnstʌleɪʃʌn, aɪ, haɪli, ɹɛkʌm...",0.002379,CV
15110,/home/morph/Desktop/FINAL/processed_dataset/15...,Pat Burchat is the deputy director.,"[hæt, bɚt, t͡ʃæp, ɪz, ðʌ, dɛpjʌti, dɚɛktɚ]",0.001188,CV


In [4]:
def clean_phoneme_list(phonemes):
    cleaned_phonemes = []
    for phoneme in phonemes:
        # Удаляем пробелы и знаки ударения ˌ и ˈ
        cleaned_phoneme = re.sub(r"[ˌˈ\s]", "", phoneme)
        cleaned_phonemes.append(cleaned_phoneme)
    return cleaned_phonemes

# Применяем функцию к столбцу
df['phonemes'] = df['phonemes'].apply(clean_phoneme_list)

df

Unnamed: 0,audio_path,sentence,phonemes,duration,dataset
0,/home/morph/Desktop/FINAL/processed_dataset/1.wav,This has been attributed to helium film flow i...,"[ðɪs, hɛz, bɛn, ɪt͡ʃɪbutɪd, tu, hɪliɪm, fɪlm, ...",0.001351,TIMIT
1,/home/morph/Desktop/FINAL/processed_dataset/2.wav,Steve collects rare and novel coins.,"[stiv, kəlɛks, ɹɛɹ, ɪn, nɑvəl, kɔɪns]",0.000798,TIMIT
2,/home/morph/Desktop/FINAL/processed_dataset/3.wav,Cyclical programs will never compile.,"[sɪklɪkəl, pɹoʊɡɹæmz, wəl, nɛvɚ, kəmpɑl]",0.000731,TIMIT
3,/home/morph/Desktop/FINAL/processed_dataset/4.wav,"Receiving no answer, they set the fire.","[ɹisivɪŋ, noʊ, ænsɚ, ðeɪ, sɛ, ðɪ, faɪɚ]",0.000706,TIMIT
4,/home/morph/Desktop/FINAL/processed_dataset/5.wav,Don't ask me to carry an oily rag like that.,"[doʊn, æsk, mɪ, tɪ, kɛɹi, ɪn, ɔli, ɹæɡ, laɪk, ...",0.000743,TIMIT
...,...,...,...,...,...
15107,/home/morph/Desktop/FINAL/processed_dataset/15...,The original storyline was developed by David ...,"[dʌ, ɔɹɪ͡ʒʌnʌl, stɔɹi, laɪn, wʌz, dɛvɛlʌpt, ba...",0.002595,CV
15108,/home/morph/Desktop/FINAL/processed_dataset/15...,"All high-speed rail lines, except those in Rus...","[ɔl, haɪ, spɹid, ɹeɪlaɪns, ɛksɛpt, doʊz, ɪn, ʌ...",0.002945,CV
15109,/home/morph/Desktop/FINAL/processed_dataset/15...,"For your first installation, I highly recommen...","[fɹɚ, jɚ, fɚst, ɪnstʌleɪʃʌn, aɪ, haɪli, ɹɛkʌmɛ...",0.002379,CV
15110,/home/morph/Desktop/FINAL/processed_dataset/15...,Pat Burchat is the deputy director.,"[hæt, bɚt, t͡ʃæp, ɪz, ðʌ, dɛpjʌti, dɚɛktɚ]",0.001188,CV


In [5]:
def load_audio(audio_path):
    array, sr = librosa.load(audio_path, sr=16000)
    return array, sr


def prepare_dataset(df):
    data = []
    for idx, row in df.iterrows():
        file_path = row['audio_path']
        text = row['sentence']

        # Use ast.literal_eval to safely evaluate the phonemes string
        phonemes = row['phonemes']
        if isinstance(phonemes, str):
            phonemes = ast.literal_eval(phonemes)

        # Load audio and retrieve the array and sampling rate
        array, sr = load_audio(file_path)

        # Append the structured dictionary for each row
        data.append({
            'file': file_path,
            'audio': {
                'path': file_path,
                'array': array,
                'sampling_rate': sr
            },
            'text': text,
            'phonemes': phonemes,
            'id': idx
        })

    return data

In [6]:
data = prepare_dataset(df)

random.seed(42) 
random.shuffle(data)

train_size = int(len(data) * 0.8)
train_data = data[:train_size]
test_data = data[train_size:]

dataset = DatasetDict({
    'train': Dataset.from_list(train_data),
    'test': Dataset.from_list(test_data)
})

dataset.save_to_disk('/home/morph/Desktop/FINAL/ivanov182/timit_huggingface_dataset')

print(dataset)

Saving the dataset (0/7 shards):   0%|          | 0/12089 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/3023 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text', 'phonemes', 'id'],
        num_rows: 12089
    })
    test: Dataset({
        features: ['file', 'audio', 'text', 'phonemes', 'id'],
        num_rows: 3023
    })
})


In [7]:
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(dataset['train'].remove_columns(['file', 'id']), num_examples=5)

Unnamed: 0,audio,text,phonemes
0,"{'array': [-0.020477294921875, -0.01971435546875, -0.020538330078125, -0.020172119140625, -0.020599365234375, -0.0216064453125, -0.0213623046875, -0.0213623046875, -0.021514892578125, -0.021942138671875, -0.021820068359375, -0.022369384765625, -0.02252197265625, -0.022552490234375, -0.022308349609375, -0.022186279296875, -0.021728515625, -0.021392822265625, -0.021453857421875, -0.0205078125, -0.019866943359375, -0.0198974609375, -0.01904296875, -0.018707275390625, -0.018768310546875, -0.017730712890625, -0.017547607421875, -0.017242431640625, -0.016571044921875, -0.0169677734375, -0.0166015625, -0.016082763671875, -0.016021728515625, -0.015411376953125, -0.015899658203125, -0.015289306640625, -0.015533447265625, -0.015716552734375, -0.015625, -0.014984130859375, -0.0150146484375, -0.0142822265625, -0.013946533203125, -0.013336181640625, -0.013641357421875, -0.013580322265625, -0.01177978515625, -0.012237548828125, -0.011505126953125, -0.011749267578125, -0.01141357421875, -0.01171875, -0.010528564453125, -0.010101318359375, -0.0096435546875, -0.009613037109375, -0.00885009765625, -0.008880615234375, -0.008209228515625, -0.007843017578125, -0.0069580078125, -0.00677490234375, -0.006256103515625, -0.005706787109375, -0.005218505859375, -0.004547119140625, -0.003997802734375, -0.00360107421875, -0.002655029296875, -0.00238037109375, -0.001373291015625, -0.00128173828125, -0.001373291015625, -0.001495361328125, -0.001251220703125, -0.001708984375, -0.002777099609375, -0.00238037109375, -0.002197265625, -0.001983642578125, -0.0018310546875, -0.002685546875, -0.00262451171875, -0.002655029296875, -0.0028076171875, -0.003204345703125, -0.003326416015625, -0.003509521484375, -0.003021240234375, -0.003448486328125, -0.003509521484375, -0.00360107421875, -0.0023193359375, -0.001861572265625, -0.00140380859375, -0.000885009765625, -0.000335693359375, 0.00054931640625, 0.001556396484375, 0.002532958984375, ...], 'path': '/home/morph/Desktop/FINAL/processed_dataset/6510.wav', 'sampling_rate': 16000}",The farmer works the soil and produces grain,"[ðʌ, fɑɹmɚ, wɚks, ðʌ, sɔɪl, ænd, pɹʌdusʌz, ɡɹeɪn]"
1,"{'array': [0.00732421875, 0.02044677734375, 0.016357421875, 0.00860595703125, -0.000335693359375, -0.005859375, -0.004547119140625, -0.001953125, 0.000946044921875, 0.00714111328125, 0.010955810546875, 0.01190185546875, 0.015045166015625, 0.008544921875, -0.007171630859375, -0.012786865234375, -0.002227783203125, -0.000701904296875, -0.0037841796875, 0.000213623046875, 0.0006103515625, 0.001617431640625, -0.0023193359375, -0.010528564453125, -0.010528564453125, -0.01275634765625, -0.012847900390625, -0.00054931640625, 0.004302978515625, 0.00823974609375, 0.0145263671875, 0.01361083984375, 0.009033203125, 0.000213623046875, -9.1552734375e-05, -0.00042724609375, -0.00506591796875, -0.0003662109375, -0.000335693359375, -0.002197265625, -0.001739501953125, -6.103515625e-05, 0.00469970703125, -0.003173828125, -0.0107421875, -0.0081787109375, -0.007781982421875, -0.002838134765625, -0.002349853515625, -0.0157470703125, -0.01617431640625, -0.000885009765625, 0.003631591796875, -0.00933837890625, -0.011627197265625, -0.00286865234375, -0.000244140625, -0.000396728515625, -0.000518798828125, -0.000457763671875, -0.00177001953125, -0.001007080078125, -0.00030517578125, 0.003082275390625, -0.005767822265625, -0.017852783203125, -0.01910400390625, -0.007720947265625, -0.006439208984375, -0.018402099609375, -0.01702880859375, -0.01007080078125, 0.00384521484375, 0.0023193359375, -0.01336669921875, -0.016082763671875, -0.006622314453125, -0.00482177734375, -0.012725830078125, -0.01312255859375, -0.006500244140625, -0.004302978515625, -0.00567626953125, -0.007171630859375, -0.010009765625, -0.006500244140625, -0.00775146484375, -0.013397216796875, -0.01220703125, -0.00665283203125, -0.004241943359375, -0.00537109375, -0.009429931640625, -0.01373291015625, -0.008575439453125, -0.003631591796875, -0.00701904296875, -0.01507568359375, -0.010223388671875, -0.0042724609375, ...], 'path': '/home/morph/Desktop/FINAL/processed_dataset/13064.wav', 'sampling_rate': 16000}",I AM IN GREAT TROUBLE SAID SHE TAKING WHAT WAS INTENDED TO BE A HOPELESSLY MELANCHOLY SURVEY OF A FEW SMALL APPLES LYING UNDER THE TREE YET A CRITICAL EAR MIGHT HAVE NOTICED IN HER VOICE A TENTATIVE TONE,"[aɪ, æm, ɪn, ɡɹeɪt, tɹʌbəl, sɛd, ʃi, teɪkɪŋ, wʌt, wəz, ɪntɛndɪd, tə, bi, ə, hoʊpləsli, mɛləŋkɑli, sɚveɪ, əv, ə, fju, smɔl, æpəlz, laɪɪŋ, ʌndɚ, ðə, tɹi, jɛt, ə, kɹɪtɪkəl, ɪɹ, maɪt, hæv, noʊtɪst, ɪn, hɚ, vɔɪs, ə, tɛntɪtɪv, toʊn]"
2,"{'array': [0.0003662109375, 0.00030517578125, -0.000946044921875, -0.002716064453125, -0.00408935546875, -0.00299072265625, -0.00225830078125, -0.00286865234375, -0.002899169921875, -0.00250244140625, -0.0032958984375, -0.003814697265625, -0.00384521484375, -0.005096435546875, -0.005035400390625, -0.005126953125, -0.0047607421875, -0.00439453125, -0.0050048828125, -0.006744384765625, -0.006134033203125, -0.006072998046875, -0.00634765625, -0.0072021484375, -0.006317138671875, -0.005828857421875, -0.005279541015625, -0.0067138671875, -0.006622314453125, -0.007354736328125, -0.008056640625, -0.0084228515625, -0.00799560546875, -0.00775146484375, -0.00799560546875, -0.0081787109375, -0.00787353515625, -0.00689697265625, -0.007781982421875, -0.007904052734375, -0.00762939453125, -0.007293701171875, -0.0064697265625, -0.0081787109375, -0.008575439453125, -0.00811767578125, -0.008544921875, -0.009368896484375, -0.008880615234375, -0.008056640625, -0.008392333984375, -0.008148193359375, -0.008514404296875, -0.00970458984375, -0.01055908203125, -0.010772705078125, -0.01080322265625, -0.01080322265625, -0.010650634765625, -0.01055908203125, -0.010345458984375, -0.0101318359375, -0.011077880859375, -0.010589599609375, -0.00982666015625, -0.009490966796875, -0.0089111328125, -0.009124755859375, -0.008636474609375, -0.00872802734375, -0.00872802734375, -0.008758544921875, -0.00823974609375, -0.007598876953125, -0.008056640625, -0.006378173828125, -0.006805419921875, -0.00726318359375, -0.0062255859375, -0.004730224609375, -0.00537109375, -0.005889892578125, -0.0052490234375, -0.004791259765625, -0.0037841796875, -0.00372314453125, -0.004119873046875, -0.003692626953125, -0.00299072265625, -0.00250244140625, -0.001739501953125, -0.000732421875, -0.0003662109375, -0.0008544921875, -0.00048828125, 0.000335693359375, 6.103515625e-05, -0.00091552734375, -0.000274658203125, 0.0010986328125, ...], 'path': '/home/morph/Desktop/FINAL/processed_dataset/8009.wav', 'sampling_rate': 16000}",Surely I will excuse you she cried,"[ʃʊɹli, aɪ, wɪl, ɪkskjuz, ju, ʃi, kɹaɪd]"
3,"{'array': [-0.00531005859375, -0.005889892578125, -0.004608154296875, -0.004791259765625, -0.005340576171875, -0.005767822265625, -0.006500244140625, -0.0079345703125, -0.008514404296875, -0.00909423828125, -0.0098876953125, -0.01019287109375, -0.009307861328125, -0.010650634765625, -0.010101318359375, -0.009521484375, -0.009124755859375, -0.008636474609375, -0.007965087890625, -0.008697509765625, -0.007965087890625, -0.007171630859375, -0.0074462890625, -0.00775146484375, -0.00689697265625, -0.007537841796875, -0.00689697265625, -0.006988525390625, -0.00714111328125, -0.0067138671875, -0.00665283203125, -0.00714111328125, -0.005859375, -0.006561279296875, -0.006744384765625, -0.00616455078125, -0.006683349609375, -0.0062255859375, -0.006378173828125, -0.006439208984375, -0.00701904296875, -0.00665283203125, -0.007965087890625, -0.008941650390625, -0.008880615234375, -0.00933837890625, -0.0101318359375, -0.00909423828125, -0.009979248046875, -0.010101318359375, -0.00848388671875, -0.008026123046875, -0.00799560546875, -0.00701904296875, -0.007659912109375, -0.00677490234375, -0.006866455078125, -0.007843017578125, -0.008941650390625, -0.007843017578125, -0.008392333984375, -0.00872802734375, -0.00689697265625, -0.00726318359375, -0.0076904296875, -0.007293701171875, -0.00775146484375, -0.007659912109375, -0.00726318359375, -0.007965087890625, -0.007904052734375, -0.006561279296875, -0.00726318359375, -0.0072021484375, -0.00762939453125, -0.008575439453125, -0.008941650390625, -0.008453369140625, -0.009063720703125, -0.008087158203125, -0.008819580078125, -0.009674072265625, -0.009185791015625, -0.00958251953125, -0.009368896484375, -0.00994873046875, -0.008575439453125, -0.009063720703125, -0.007598876953125, -0.00616455078125, -0.005401611328125, -0.00555419921875, -0.003814697265625, -0.002288818359375, -0.001739501953125, -0.001007080078125, -0.000457763671875, 0.0001220703125, 0.000518798828125, -0.0001220703125, ...], 'path': '/home/morph/Desktop/FINAL/processed_dataset/11257.wav', 'sampling_rate': 16000}",Yea so are all the lesser animals of today clean,"[jeɪ, soʊ, ɑɹ, ɔl, ðʌ, lɛsɚ, ænʌmʌlz, ʌv, tudeɪ, klin]"
4,"{'array': [-0.00543212890625, -0.00537109375, -0.005279541015625, -0.004730224609375, -0.004852294921875, -0.004608154296875, -0.0042724609375, -0.004638671875, -0.00445556640625, -0.005157470703125, -0.004913330078125, -0.00531005859375, -0.0048828125, -0.00445556640625, -0.0040283203125, -0.004180908203125, -0.00396728515625, -0.0045166015625, -0.004364013671875, -0.00567626953125, -0.005523681640625, -0.006134033203125, -0.005401611328125, -0.004547119140625, -0.003173828125, -0.00286865234375, -0.00213623046875, -0.001800537109375, -0.001007080078125, -0.00115966796875, -6.103515625e-05, 0.000946044921875, 0.001129150390625, 0.001495361328125, 0.0030517578125, 0.003570556640625, 0.00390625, 0.0047607421875, 0.0047607421875, 0.004608154296875, 0.004730224609375, 0.004913330078125, 0.005157470703125, 0.0052490234375, 0.00482177734375, 0.00439453125, 0.003448486328125, 0.00335693359375, 0.0020751953125, 0.001556396484375, 9.1552734375e-05, -0.001007080078125, -0.00189208984375, -0.00262451171875, -0.00311279296875, -0.003326416015625, -0.003753662109375, -0.00372314453125, -0.002777099609375, -0.0025634765625, -0.00262451171875, -0.002410888671875, -0.00213623046875, -0.001068115234375, -0.00146484375, -0.001129150390625, -0.001129150390625, -0.000701904296875, -0.000457763671875, 0.000457763671875, 0.000579833984375, 0.000732421875, 0.00115966796875, 0.00146484375, 0.00189208984375, 0.001922607421875, 0.001861572265625, 0.001953125, 0.002716064453125, 0.003662109375, 0.003143310546875, 0.002960205078125, 0.0030517578125, 0.00311279296875, 0.003082275390625, 0.003326416015625, 0.00311279296875, 0.003021240234375, 0.002288818359375, 0.001678466796875, 0.00201416015625, 0.00189208984375, 0.001708984375, 0.002410888671875, 0.002044677734375, 0.002777099609375, 0.0035400390625, 0.004241943359375, 0.0054931640625, 0.00592041015625, ...], 'path': '/home/morph/Desktop/FINAL/processed_dataset/10564.wav', 'sampling_rate': 16000}",He deluged me overwhelmed me with argument,"[hi, dɛljud͡ʒd, mi, oʊvɚwɛlmd, mi, wɪθ, ɑɹɡjʌmʌnt]"


In [8]:
def get_phonetic_code(x):
    """
    Создаем поле phonetic_codes с разбивкой фонем на отдельные символы, включая пробелы.
    
    Пример:
    Входное поле phonemes: ['stiv', 'kəlɛs', 'ɹɛɹ', 'ɪn', 'nɑvl̩', 'kɔɪns']
    Выходное поле phonetic_codes: 
    ['s', 't', 'ɪ', 'v', ' ', 'k', 'ə', 'l', 'ɛ', 's', ' ', 'ɹ', 'ɛ', 'ɹ', ' ', 'ɪ', 'n', ' ', 'n', 'ɑ', 'v', 'l', '̩', ' ', 'k', 'ɔ', 'ɪ', 'n', 's']
    """
    # Преобразуем список фонем в строку с пробелами между ними
    with_space = '|'.join(x['phonemes'])

    # Разбиваем строку на отдельные символы, включая пробелы
    x['phonetic_codes'] = list(with_space)

    return x

dataset = dataset.map(get_phonetic_code)

Map:   0%|          | 0/12089 [00:00<?, ? examples/s]

Map:   0%|          | 0/3023 [00:00<?, ? examples/s]

In [9]:
print(dataset['train']['text'][1])
print(dataset['train']['phonemes'][1])
print(dataset['train']['phonetic_codes'][1])

Due to the lack of a bug bounty program, Johanna decided to sell her exploit on the black market.
['dju', 'tu', 'dʌ', 'læk', 'ɔf', 'ʌ', 'bʌɡ', 'baʊn', 'di', 'pɹoʊɡɹɑm', 't͡ʃoʊhænʌ', 'dɪsaɪdɪd', 'tu', 'sɛl', 'hɚ', 'ɛksplɔɪt', 'ɔn', 'ðʌ', 'blæk', 'mɑɹkɛt']
['d', 'j', 'u', '|', 't', 'u', '|', 'd', 'ʌ', '|', 'l', 'æ', 'k', '|', 'ɔ', 'f', '|', 'ʌ', '|', 'b', 'ʌ', 'ɡ', '|', 'b', 'a', 'ʊ', 'n', '|', 'd', 'i', '|', 'p', 'ɹ', 'o', 'ʊ', 'ɡ', 'ɹ', 'ɑ', 'm', '|', 't', '͡', 'ʃ', 'o', 'ʊ', 'h', 'æ', 'n', 'ʌ', '|', 'd', 'ɪ', 's', 'a', 'ɪ', 'd', 'ɪ', 'd', '|', 't', 'u', '|', 's', 'ɛ', 'l', '|', 'h', 'ɚ', '|', 'ɛ', 'k', 's', 'p', 'l', 'ɔ', 'ɪ', 't', '|', 'ɔ', 'n', '|', 'ð', 'ʌ', '|', 'b', 'l', 'æ', 'k', '|', 'm', 'ɑ', 'ɹ', 'k', 'ɛ', 't']


In [10]:
def convert_to_ipa(x):
    # x['ipa'] = ' '.join([arphabet_to_ipa[code] for code in x['phonetic_codes']])
    x['ipa'] = ''.join([x for x in x['phonetic_codes']])
    return x

dataset = dataset.map(convert_to_ipa)

print(dataset['train']['text'][1])
print(dataset['train']['phonetic_codes'][1])
print(dataset['train']['ipa'][1])

Map:   0%|          | 0/12089 [00:00<?, ? examples/s]

Map:   0%|          | 0/3023 [00:00<?, ? examples/s]

Due to the lack of a bug bounty program, Johanna decided to sell her exploit on the black market.
['d', 'j', 'u', '|', 't', 'u', '|', 'd', 'ʌ', '|', 'l', 'æ', 'k', '|', 'ɔ', 'f', '|', 'ʌ', '|', 'b', 'ʌ', 'ɡ', '|', 'b', 'a', 'ʊ', 'n', '|', 'd', 'i', '|', 'p', 'ɹ', 'o', 'ʊ', 'ɡ', 'ɹ', 'ɑ', 'm', '|', 't', '͡', 'ʃ', 'o', 'ʊ', 'h', 'æ', 'n', 'ʌ', '|', 'd', 'ɪ', 's', 'a', 'ɪ', 'd', 'ɪ', 'd', '|', 't', 'u', '|', 's', 'ɛ', 'l', '|', 'h', 'ɚ', '|', 'ɛ', 'k', 's', 'p', 'l', 'ɔ', 'ɪ', 't', '|', 'ɔ', 'n', '|', 'ð', 'ʌ', '|', 'b', 'l', 'æ', 'k', '|', 'm', 'ɑ', 'ɹ', 'k', 'ɛ', 't']
dju|tu|dʌ|læk|ɔf|ʌ|bʌɡ|baʊn|di|pɹoʊɡɹɑm|t͡ʃoʊhænʌ|dɪsaɪdɪd|tu|sɛl|hɚ|ɛksplɔɪt|ɔn|ðʌ|blæk|mɑɹkɛt


In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text', 'phonemes', 'id', 'phonetic_codes', 'ipa'],
        num_rows: 12089
    })
    test: Dataset({
        features: ['file', 'audio', 'text', 'phonemes', 'id', 'phonetic_codes', 'ipa'],
        num_rows: 3023
    })
})

In [12]:
# оставить file audio text phonetic_codes ipa
dataset = dataset.remove_columns(['phonemes', 'id'])

dataset

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_codes', 'ipa'],
        num_rows: 12089
    })
    test: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_codes', 'ipa'],
        num_rows: 3023
    })
})

In [13]:
show_random_elements(dataset["test"].remove_columns(["file"]), num_examples=2)

Unnamed: 0,audio,text,phonetic_codes,ipa
0,"{'array': [0.00048828125, -0.00018310546875, 0.0, -3.0517578125e-05, -3.0517578125e-05, 3.0517578125e-05, 6.103515625e-05, 0.0, 3.0517578125e-05, 9.1552734375e-05, 3.0517578125e-05, -3.0517578125e-05, 0.0, 0.0, 3.0517578125e-05, 9.1552734375e-05, 0.0, 3.0517578125e-05, 6.103515625e-05, -3.0517578125e-05, 3.0517578125e-05, 6.103515625e-05, 0.0, 0.0, 0.0, 9.1552734375e-05, 3.0517578125e-05, 3.0517578125e-05, 0.0001220703125, 0.0, 3.0517578125e-05, 0.0001220703125, -3.0517578125e-05, -6.103515625e-05, 0.0, -3.0517578125e-05, 0.0, 0.0, 0.0, -3.0517578125e-05, 6.103515625e-05, 3.0517578125e-05, 3.0517578125e-05, 3.0517578125e-05, 0.0, 9.1552734375e-05, 0.0, 3.0517578125e-05, 0.0, 3.0517578125e-05, -6.103515625e-05, -3.0517578125e-05, -3.0517578125e-05, 9.1552734375e-05, 3.0517578125e-05, -3.0517578125e-05, 9.1552734375e-05, -3.0517578125e-05, 6.103515625e-05, 3.0517578125e-05, 9.1552734375e-05, -3.0517578125e-05, 6.103515625e-05, 3.0517578125e-05, 0.0, 6.103515625e-05, 0.0, 0.0, 0.0, 9.1552734375e-05, 9.1552734375e-05, 6.103515625e-05, -6.103515625e-05, 0.0, 0.0, 0.0, -6.103515625e-05, -3.0517578125e-05, 0.0, 3.0517578125e-05, -3.0517578125e-05, 0.0, 9.1552734375e-05, 0.0, -3.0517578125e-05, 0.0, 0.0, -6.103515625e-05, -3.0517578125e-05, -3.0517578125e-05, -3.0517578125e-05, -6.103515625e-05, 0.0, -6.103515625e-05, 3.0517578125e-05, -3.0517578125e-05, 3.0517578125e-05, -3.0517578125e-05, 0.0, 0.0, ...], 'path': '/home/morph/Desktop/FINAL/processed_dataset/3493.wav', 'sampling_rate': 16000}",A connoisseur will enjoy this shellfish dish.,"[ə, |, k, ɑ, n, ə, s, ɚ, |, w, ə, l, |, ɪ, n, d, ͡, ʒ, ɔ, ɪ, |, ð, ɪ, s, |, ʃ, ɛ, l, f, ɪ, ʃ, |, d, ɪ, ʃ]",ə|kɑnəsɚ|wəl|ɪnd͡ʒɔɪ|ðɪs|ʃɛlfɪʃ|dɪʃ
1,"{'array': [0.01788330078125, 0.0179443359375, 0.0184326171875, 0.017822265625, 0.017852783203125, 0.01776123046875, 0.017303466796875, 0.01751708984375, 0.016815185546875, 0.01666259765625, 0.016265869140625, 0.016082763671875, 0.0159912109375, 0.0159912109375, 0.01531982421875, 0.01458740234375, 0.01416015625, 0.013702392578125, 0.013397216796875, 0.012176513671875, 0.0111083984375, 0.0101318359375, 0.00860595703125, 0.00787353515625, 0.007843017578125, 0.006103515625, 0.00518798828125, 0.004852294921875, 0.004547119140625, 0.0025634765625, 0.0018310546875, 0.000762939453125, -0.0001220703125, -0.00079345703125, -0.00054931640625, -0.000518798828125, -0.000701904296875, -0.000244140625, -3.0517578125e-05, -0.000244140625, -6.103515625e-05, 0.001373291015625, 0.00146484375, 0.001068115234375, 0.000640869140625, 0.000244140625, 0.00018310546875, -0.00115966796875, -0.001312255859375, -0.001007080078125, -0.001251220703125, -0.0013427734375, -0.001434326171875, -0.00238037109375, -0.002655029296875, -0.00274658203125, -0.0030517578125, -0.003326416015625, -0.00439453125, -0.00469970703125, -0.004669189453125, -0.005706787109375, -0.00543212890625, -0.00634765625, -0.0062255859375, -0.00677490234375, -0.00823974609375, -0.008575439453125, -0.009796142578125, -0.00994873046875, -0.010467529296875, -0.0108642578125, -0.010498046875, -0.010284423828125, -0.010772705078125, -0.010650634765625, -0.01080322265625, -0.010772705078125, -0.011505126953125, -0.01129150390625, -0.01104736328125, -0.011627197265625, -0.012420654296875, -0.01263427734375, -0.012359619140625, -0.0135498046875, -0.01446533203125, -0.01495361328125, -0.01666259765625, -0.017333984375, -0.017852783203125, -0.018402099609375, -0.0189208984375, -0.019012451171875, -0.0191650390625, -0.019134521484375, -0.01922607421875, -0.019256591796875, -0.019317626953125, -0.019073486328125, ...], 'path': '/home/morph/Desktop/FINAL/processed_dataset/9025.wav', 'sampling_rate': 16000}",He waded into the edge of the water and began scrubbing himself,"[h, i, |, w, e, ɪ, d, ɪ, d, |, ɪ, n, t, u, |, ð, ʌ, |, ɛ, d, ͡, ʒ, |, ʌ, v, |, ð, ʌ, |, w, ɔ, t, ɚ, |, ʌ, n, d, |, b, ɪ, ɡ, æ, n, |, s, k, ɹ, ʌ, b, ɪ, ŋ, |, h, ɪ, m, s, ɛ, l, f]",hi|weɪdɪd|ɪntu|ðʌ|ɛd͡ʒ|ʌv|ðʌ|wɔtɚ|ʌnd|bɪɡæn|skɹʌbɪŋ|hɪmsɛlf


In [14]:
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    # batch["ipa"] = re.sub(chars_to_ignore_regex, '', batch["ipa"]).lower() + " "
    batch["ipa"] = re.sub(chars_to_ignore_regex, '', batch["ipa"]) + " "
    return batch

In [15]:
dataset = dataset.map(remove_special_characters)

Map:   0%|          | 0/12089 [00:00<?, ? examples/s]

Map:   0%|          | 0/3023 [00:00<?, ? examples/s]

In [16]:
show_random_elements(dataset["test"].remove_columns(["file"]), num_examples=2)

Unnamed: 0,audio,text,phonetic_codes,ipa
0,"{'array': [-0.0001220703125, -3.0517578125e-05, -3.0517578125e-05, 0.0, 3.0517578125e-05, 3.0517578125e-05, 0.000152587890625, 0.000244140625, 0.000274658203125, 0.000274658203125, 0.000274658203125, 0.000274658203125, 0.000274658203125, 0.000152587890625, 9.1552734375e-05, 3.0517578125e-05, 3.0517578125e-05, -3.0517578125e-05, 0.0, -0.0001220703125, 6.103515625e-05, 3.0517578125e-05, 0.00018310546875, 0.00018310546875, 0.0001220703125, 0.000244140625, 9.1552734375e-05, -3.0517578125e-05, -0.00018310546875, -3.0517578125e-05, 6.103515625e-05, -3.0517578125e-05, -9.1552734375e-05, -6.103515625e-05, 0.0, 0.0, 6.103515625e-05, 3.0517578125e-05, -3.0517578125e-05, -3.0517578125e-05, -0.000213623046875, -9.1552734375e-05, 6.103515625e-05, 3.0517578125e-05, 0.000274658203125, 0.000244140625, 0.00018310546875, 0.00030517578125, 0.0001220703125, 6.103515625e-05, -6.103515625e-05, 3.0517578125e-05, 6.103515625e-05, 0.000152587890625, 9.1552734375e-05, -6.103515625e-05, -6.103515625e-05, 3.0517578125e-05, -0.0001220703125, -0.00018310546875, -0.000213623046875, -3.0517578125e-05, 9.1552734375e-05, 6.103515625e-05, 9.1552734375e-05, 0.0, -0.0001220703125, -0.0001220703125, -9.1552734375e-05, -0.00018310546875, -6.103515625e-05, 3.0517578125e-05, 3.0517578125e-05, 9.1552734375e-05, 0.000213623046875, 0.000274658203125, 0.000213623046875, 0.000213623046875, 0.00018310546875, 0.00030517578125, 0.000274658203125, 0.000274658203125, 0.000274658203125, 0.000274658203125, 0.000274658203125, 0.000213623046875, 3.0517578125e-05, 3.0517578125e-05, -9.1552734375e-05, -9.1552734375e-05, -6.103515625e-05, -3.0517578125e-05, -9.1552734375e-05, -0.0001220703125, 3.0517578125e-05, 6.103515625e-05, 9.1552734375e-05, 0.000244140625, 0.000244140625, 0.000152587890625, ...], 'path': '/home/morph/Desktop/FINAL/processed_dataset/816.wav', 'sampling_rate': 16000}",They seem darned proud of it.,"[ð, ɪ, |, s, i, m, |, d, ɑ, ɹ, m, |, p, ɹ, a, ʊ, t, |, ʌ, v, |, ɪ]",ðɪ|sim|dɑɹm|pɹaʊt|ʌv|ɪ
1,"{'array': [6.103515625e-05, 9.1552734375e-05, 3.0517578125e-05, 3.0517578125e-05, 0.0, 6.103515625e-05, -3.0517578125e-05, -3.0517578125e-05, 3.0517578125e-05, 0.0, 0.0, 3.0517578125e-05, -9.1552734375e-05, 6.103515625e-05, 0.0, -3.0517578125e-05, -3.0517578125e-05, 0.0001220703125, -3.0517578125e-05, -3.0517578125e-05, 3.0517578125e-05, -0.000152587890625, 6.103515625e-05, 6.103515625e-05, 0.0, -3.0517578125e-05, 3.0517578125e-05, 0.0, -9.1552734375e-05, -3.0517578125e-05, -3.0517578125e-05, 0.0, 3.0517578125e-05, -0.0001220703125, -3.0517578125e-05, -3.0517578125e-05, 0.0, 3.0517578125e-05, 0.0, 0.0, -3.0517578125e-05, -3.0517578125e-05, 3.0517578125e-05, 0.0, 3.0517578125e-05, -3.0517578125e-05, -6.103515625e-05, -3.0517578125e-05, -3.0517578125e-05, 0.0, -3.0517578125e-05, 0.0, 3.0517578125e-05, 0.0, 0.0, 0.0, -6.103515625e-05, 9.1552734375e-05, -3.0517578125e-05, 0.0, 3.0517578125e-05, -3.0517578125e-05, -9.1552734375e-05, -3.0517578125e-05, -3.0517578125e-05, -6.103515625e-05, -3.0517578125e-05, 0.0, 0.0, 0.0, 0.00018310546875, 0.0, 0.0, 6.103515625e-05, -3.0517578125e-05, 9.1552734375e-05, 3.0517578125e-05, 6.103515625e-05, -3.0517578125e-05, -9.1552734375e-05, 0.0, 0.0, 0.0, -6.103515625e-05, -3.0517578125e-05, -3.0517578125e-05, -6.103515625e-05, 3.0517578125e-05, -3.0517578125e-05, -6.103515625e-05, 0.000244140625, 0.0, 0.0, -3.0517578125e-05, 0.0, 0.0, 0.0, -3.0517578125e-05, 0.0, 0.0, ...], 'path': '/home/morph/Desktop/FINAL/processed_dataset/10795.wav', 'sampling_rate': 16000}",I I beg pardon he drawled,"[a, ɪ, |, a, ɪ, |, b, ɛ, ɡ, |, p, ɑ, ɹ, d, ʌ, n, |, h, i, |, d, ɹ, ɔ, l, d]",aɪ|aɪ|bɛɡ|pɑɹdʌn|hi|dɹɔld


## Vocabulary

In [17]:
def extract_all_chars(batch):
  all_text = " ".join(batch["ipa"])
  vocab = list(dict.fromkeys(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [18]:
vocabs = dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=dataset.column_names["train"])

Map:   0%|          | 0/12089 [00:00<?, ? examples/s]

Map:   0%|          | 0/3023 [00:00<?, ? examples/s]

In [19]:
vocab_list = list(
    dict.fromkeys(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
    )

In [20]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict.items()

dict_items([('z', 0), ('ð', 1), ('i', 2), ('n', 3), ('k', 4), ('ɡ', 5), ('ɛ', 6), ('ʌ', 7), ('|', 8), ('d', 9), ('a', 10), ('ʊ', 11), ('͡', 12), ('s', 13), ('ŋ', 14), ('m', 15), ('t', 16), ('w', 17), ('ɑ', 18), ('j', 19), ('θ', 20), ('e', 21), ('ɔ', 22), ('v', 23), ('o', 24), ('æ', 25), ('ɪ', 26), ('f', 27), ('l', 28), ('ə', 29), ('ʃ', 30), ('ʒ', 31), ('ɚ', 32), ('p', 33), ('b', 34), ('u', 35), (' ', 36), ('h', 37), ('ɹ', 38)])

In [21]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [22]:
vocab_dict.items()

dict_items([('z', 0), ('ð', 1), ('i', 2), ('n', 3), ('k', 4), ('ɡ', 5), ('ɛ', 6), ('ʌ', 7), ('|', 36), ('d', 9), ('a', 10), ('ʊ', 11), ('͡', 12), ('s', 13), ('ŋ', 14), ('m', 15), ('t', 16), ('w', 17), ('ɑ', 18), ('j', 19), ('θ', 20), ('e', 21), ('ɔ', 22), ('v', 23), ('o', 24), ('æ', 25), ('ɪ', 26), ('f', 27), ('l', 28), ('ə', 29), ('ʃ', 30), ('ʒ', 31), ('ɚ', 32), ('p', 33), ('b', 34), ('u', 35), ('h', 37), ('ɹ', 38)])

In [23]:
len(vocab_dict)

38

In [24]:
# NOTE: in this case it fixed itself, because the 38th symbol was "|", which we replace by the position of " ", 
# but in other cases, it's better to add +1 to the position of [UNK] and [PAD] symbols
vocab_dict["[UNK]"] = len(vocab_dict)+1
vocab_dict["[PAD]"] = len(vocab_dict)+1
len(vocab_dict)

40

In [25]:
# del vocab_dict["[PAD]"]
# del vocab_dict["[UNK]"]

In [26]:
vocab_dict.items()

dict_items([('z', 0), ('ð', 1), ('i', 2), ('n', 3), ('k', 4), ('ɡ', 5), ('ɛ', 6), ('ʌ', 7), ('|', 36), ('d', 9), ('a', 10), ('ʊ', 11), ('͡', 12), ('s', 13), ('ŋ', 14), ('m', 15), ('t', 16), ('w', 17), ('ɑ', 18), ('j', 19), ('θ', 20), ('e', 21), ('ɔ', 22), ('v', 23), ('o', 24), ('æ', 25), ('ɪ', 26), ('f', 27), ('l', 28), ('ə', 29), ('ʃ', 30), ('ʒ', 31), ('ɚ', 32), ('p', 33), ('b', 34), ('u', 35), ('h', 37), ('ɹ', 38), ('[UNK]', 39), ('[PAD]', 40)])

Если вы перезапускаете ноутбук без обучения, то создайте копию вокаба, далее, при создании Processor, обращайтесь к копии

In [27]:
with open('/home/morph/Desktop/FINAL/ivanov182/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

## Tokenizer! and Feature Extractor

In a final step, we use the json file to instantiate an object of the `Wav2Vec2CTCTokenizer` class.

In [28]:
#from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab_checkpoint.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

Next, we will create the feature extractor.

A Wav2Vec2 feature extractor object requires the following parameters to be instantiated:

- `feature_size`: Speech models take a sequence of feature vectors as an input. While the length of this sequence obviously varies, the feature size should not. In the case of Wav2Vec2, the feature size is 1 because the model was trained on the raw speech signal ${}^2$.
- `sampling_rate`: The sampling rate at which the model is trained on.
- `padding_value`: For batched inference, shorter inputs need to be padded with a specific value
- `do_normalize`: Whether the input should be *zero-mean-unit-variance* normalized or not. Usually, speech models perform better when normalizing the input
- `return_attention_mask`: Whether the model should make use of an `attention_mask` for batched inference. In general, models should **always** make use of the `attention_mask` to mask padded tokens. However, due to a very specific design choice of `Wav2Vec2`'s "base" checkpoint, better results are achieved when using no `attention_mask`. This is **not** recommended for other speech models. For more information, one can take a look at [this](https://github.com/pytorch/fairseq/issues/3227) issue. **Important** If you want to use this notebook to fine-tune [large-lv60](https://huggingface.co/facebook/wav2vec2-large-lv60), this parameter should be set to `True`.

In [29]:
#from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [30]:
#from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

## Preprocess данных

In [31]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = sf.read(batch["file"])
    batch["speech"] = speech_array
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["ipa"]
    return batch

In [32]:
dataset = dataset.map(speech_file_to_array_fn, remove_columns=dataset.column_names["train"],
                      # num_proc=4
                      )

Map:   0%|          | 0/12089 [00:00<?, ? examples/s]

Map:   0%|          | 0/3023 [00:00<?, ? examples/s]

In [33]:
rand_int = random.randint(0, len(dataset["train"]))
ipd.Audio(data=np.asarray(dataset["train"][rand_int]["speech"]), autoplay=True, rate=16000)

In [34]:
rand_int = random.randint(0, len(dataset["train"]))

print("Target text:", dataset["train"][rand_int]["target_text"])
print("Input array shape:", np.asarray(dataset["train"][rand_int]["speech"]).shape)
print("Sampling rate:", dataset["train"][rand_int]["sampling_rate"])

ipd.Audio(data=np.asarray(dataset["train"][rand_int]["speech"]), autoplay=True, rate=16000)

Target text: aɪ|hæv|lɔŋ|noʊtɪd|jɔɹ|θɚst|ʌnkwɛnt͡ʃʌbʌl 
Input array shape: (73600,)
Sampling rate: 16000


In [35]:
def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values

    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

In [36]:
dataset_prepared = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], batch_size=8,  batched=True
                               #,num_proc=4,
                               )

Map:   0%|          | 0/12089 [00:00<?, ? examples/s]



Map:   0%|          | 0/3023 [00:00<?, ? examples/s]

## Train Setup

In [37]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [38]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [39]:
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

In [40]:
result = cer_metric.compute(predictions=['hell'],references=['hello'])
print(result)

0.2


In [41]:
truth = "hello"
hypothesis = "hell"

cer_value = cer(truth, hypothesis)

print(cer_value)

0.2


In [42]:
def compute_metrics(pred):

    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    cerr = cer(label_str, pred_str)

    return {"cer": cerr}

In [43]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
model.freeze_feature_extractor()



In [45]:
training_args = TrainingArguments(
  output_dir="/home/morph/Desktop/FINAL/ivanov182/181224checkpoint",
  group_by_length=True,
  #per_device_train_batch_size=16,
  per_device_train_batch_size=8,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=14, #50
  fp16=True,
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=3e-4,
  warmup_steps=1000,
  save_total_limit=2,
)



In [46]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_prepared["train"],
    eval_dataset=dataset_prepared["test"],
    tokenizer=processor.feature_extractor,
)

  trainer = Trainer(


## Training

In [58]:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [59]:
trainer.train(resume_from_checkpoint=False)



Step,Training Loss,Validation Loss,Cer
500,5.2541,2.240453,0.771866
1000,0.9761,0.45505,0.152634
1500,0.6318,0.371019,0.142552
2000,0.4988,0.3326,0.131276
2500,0.4457,0.316473,0.12462
3000,0.4001,0.29567,0.11982
3500,0.3481,0.280685,0.118479
4000,0.3299,0.276927,0.115901
4500,0.3024,0.266599,0.113692
5000,0.273,0.251626,0.11135


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=10584, training_loss=0.5439145339769751, metrics={'train_runtime': 13130.1196, 'train_samples_per_second': 12.89, 'train_steps_per_second': 0.806, 'total_flos': 2.0915180948213604e+19, 'train_loss': 0.5439145339769751, 'epoch': 14.0})

In [64]:
processor.save_pretrained("/home/morph/Desktop/FINAL/ivanov182/181224checkpoint")

[]

## Evaluate

In [47]:
path = "/home/morph/Desktop/FINAL/ivanov182/181224checkpoint/checkpoint-10584"
finetuned_model = Wav2Vec2ForCTC.from_pretrained(path)
processor = Wav2Vec2Processor.from_pretrained("/home/morph/Desktop/FINAL/ivanov182/181224checkpoint")

In [49]:
def map_to_result(batch):
  finetuned_model.to("cuda")
  input_values = processor(
      batch["speech"],
      sampling_rate=batch["sampling_rate"],
      return_tensors="pt"
  ).input_values.to("cuda")

  with torch.no_grad():
    logits = finetuned_model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]

  return batch

In [50]:
results = dataset["test"].map(map_to_result)



Map:   0%|          | 0/3023 [00:00<?, ? examples/s]

In [52]:
def process_results(example):
    # Удаляем токены [PAD] из pred_str
    if isinstance(example['pred_str'], list):
        example['pred_str'] = [s.replace('[PAD]', '') if isinstance(s, str) else s for s in example['pred_str']]
    elif isinstance(example['pred_str'], str):
        example['pred_str'] = example['pred_str'].replace('[PAD]', '')

    # Заменяем | на пробел в target_text
    if isinstance(example['target_text'], list):
        example['target_text'] = [s.replace('|', ' ') if isinstance(s, str) else s for s in example['target_text']]
    elif isinstance(example['target_text'], str):
        example['target_text'] = example['target_text'].replace('|', ' ')

    return example

results = results.map(process_results)

Map:   0%|          | 0/3023 [00:00<?, ? examples/s]

In [53]:
truth = " ".join([" ".join(target) if isinstance(target, list) else target for target in results["target_text"]])
hypothesis = " ".join([" ".join(pred) if isinstance(pred, list) else pred for pred in results["pred_str"]])

cer(truth, hypothesis)

0.07832534321929363

In [54]:
truth = results["target_text"]
hypothesis = results["pred_str"]

cer(truth, hypothesis)

0.06134798830011186

In [55]:
"".join(results[0]['target_text']), "".join(results[0]['pred_str'])

('ɪts eɪ jæŋki d͡ʒoʊn kɹaɪd ', 'ɪts eɪ jæŋki d͡ʒoʊn kɹaɪd')

In [57]:
def play_audio(data, sample_rate):
    """
    Воспроизводит аудио на основе массива данных и частоты дискретизации.
    
    Параметры:
        data (numpy array): Аудиосигнал (1D-массив)
        sample_rate (int): Частота дискретизации в Гц
    """
    # Нормализация данных, если они не находятся в диапазоне [-1, 1]
    if np.max(np.abs(data)) > 1.0:
        data = data / np.max(np.abs(data))
    
    # Конвертация в 16-битный формат PCM для воспроизведения
    audio_data = (data * 32767).astype(np.int16)
    
    # Сохранение во временный файл
    write("temp_audio.wav", sample_rate, audio_data)
    
    # Воспроизведение аудио
    return ipd.Audio("temp_audio.wav")

data = np.array(results[0]['speech'], dtype=float)
sample_rate = 16000 
play_audio(data, sample_rate)

In [58]:
show_random_elements(results.remove_columns(["speech", "sampling_rate"]))

Unnamed: 0,target_text,pred_str
0,noʊ ɪt ɪz ʌ pælʌs wɛɹɪn ðɛɹ ɑɹ mɛni sɚvʌnts,noʊ ɪt ɪz ʌ pælʌs wɛɹɪn ðɛɹ ɑɹ mɛni sɚvʌnts
1,ðɛɹ wʌz sʌmθɪŋ pʌθɛtɪk ɪn ðʌ ɡɚlz ætʌtud naʊ,ðɛɹ wʌz sʌmsɪmpʌtɛtɪk ɪn ðʌ ɡɚlz ætʌtud naʊ
2,d ju hiɹ mi,dɪ ju hɛɹ mi
3,doʊn æsk mi tɪ kɪɹi ɪn ɔɪli ɹæɡ laɪk ðæt,doʊn æs mɪ tɪ kɛɹi ɪn ɔɪli ɹæɡ laɪk ðæt
4,doʊn æs mi tɪ kɛɹi ɪn ɔɪli ɹæɡ laɪk θæt,doʊn æsk mi tɪ kɛɹi ɪn ɔɪli ɹæɡ laɪk ðæt
5,blʌd wʌz uzɪŋ sloʊli fɹʌm ðʌ wundɪd mænz ɹaɪt bɹɛst,blʌd wʌz uzɪŋ sloʊli fɹʌm ðʌ wundɪd mænz ɹaɪt bɹɛst
6,ðɪ kaʊ wʌndɚd fɹəm ðə fɑɹmlænd əɛn bɪkeɪm lɔst,ðɪ kaʊ wʌnɚd fɹəm ðə fɑɹmlɛnd ɪ æn bikeɪm lɔst
7,hɪz pɚskɹɪpʃɪn hɑ ən koʊld kɑmpɹɛsɪs tɪ ɪŋkɹis hɚ ɪbsɔɹpʃɪn ɪv wɑtɚ,hɪz pɹɪskɹɪpʃɪn hɑt ən koʊld kɑmpɹɛsɪz tɪ ɪŋkɹis hɚ ɪbsɔɹpʃɪn əv wɔtɚ
8,ðʌ bʊt͡ʃɚz ænd mit kʌtɚz ɹɪfjuzd tʌ hændʌl mit dɛstɪnd fɔɹ ʌnfɛɹ ɹɛstɹɑnts,ðʌ bʊt͡ʃɚz ænd mit kʌtɚz pɹʌfjuzd tʌ hændʌl mi dɛstɪnd fɔɹ ʌnfɛɹ ɹɛstɚɑnts
9,ɪt ɪz ɔlsoʊ æn ɪnsɪdiʌs dɪsitfʌl sʌn,ɪt ɪz ɔlsoʊ æn ɪnsɪdiʌs dɪsaɪtfʌl sʌn
