Введение и установка зависимостей

In [24]:
# Импорт необходимых библиотек
import json
import os
import torch
import torch.nn as nn
import wave
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from vosk import Model, KaldiRecognizer
import jiwer
from pydub import AudioSegment


Конфигурация

In [32]:
# Конфигурация
CONFIG = {
    'data_dir': '../data/train/',
    'annotation_dir': '../data/train/annotation/',
    'val_dir': '../data/val/luga/',
    'model_path': "../model/vosk_model",
    'batch_size': 2,
    'num_epochs': 10,
    'learning_rate': 0.001,
    'hidden_dim': 55,
    'output_dim': 22,
    'input_dim': 2  # Изменится в зависимости от векторизации
}


Функция для загрузки аннотаций


In [26]:
def load_annotations(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Пример загрузки
train_annotations = load_annotations(f"{CONFIG['annotation_dir']}/hr_bot_synt.json")


Создание модели и классификатора

In [27]:
class TextClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Создание экземпляра модели
classifier = TextClassifier(CONFIG['input_dim'], CONFIG['hidden_dim'], CONFIG['output_dim'])


Функция подготовки данных

In [28]:
def prepare_data(train_annotations):
    audio_files, texts, labels = [], [], []
    for annotation in train_annotations:
        audio_files.append(annotation['audio_filepath'])
        texts.append(annotation['text'])
        labels.append(annotation['label'])
    return audio_files, texts, labels

audio_files, texts, labels = prepare_data(train_annotations)


Функция для трансформации текста

In [29]:
def tokenize_and_vectorize(texts, input_dim):
    tokenizer = TfidfVectorizer(max_features=input_dim)
    return tokenizer.fit_transform(texts).toarray(), tokenizer

train_vectors, tokenizer = tokenize_and_vectorize(texts, CONFIG['input_dim'])


Определение датасета и загрузчика данных

In [30]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

train_dataset = TextDataset(train_vectors, labels)
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)


Обучение модели

In [33]:
def train_model(model, data_loader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for texts_batch, labels_batch in data_loader:
            optimizer.zero_grad()
            outputs = model(texts_batch.float())
            loss = criterion(outputs, labels_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Эпоха {epoch+1}/{num_epochs}, Потеря: {epoch_loss:.4f}")

# Настройки обучения
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=CONFIG['learning_rate'])

# Запуск обучения
train_model(classifier, train_loader, criterion, optimizer, CONFIG['num_epochs'])


Эпоха 1/10, Потеря: 2123.5666
Эпоха 2/10, Потеря: 2119.0384
Эпоха 3/10, Потеря: 2118.8926
Эпоха 4/10, Потеря: 2118.7522
Эпоха 5/10, Потеря: 2117.7375
Эпоха 6/10, Потеря: 2116.2456
Эпоха 7/10, Потеря: 2117.9703
Эпоха 8/10, Потеря: 2115.6429
Эпоха 9/10, Потеря: 2113.7787
Эпоха 10/10, Потеря: 2114.7357


Функция для обработки аудио с помощью Vosk

In [34]:
def transcribe_audio(audio_file, model_dir):
    wf = wave.open(audio_file, "rb")
    model = Model(model_dir)
    rec = KaldiRecognizer(model, wf.getframerate())

    result_text = ""
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            result_text += result.get("text", "")

    final_result = json.loads(rec.FinalResult())
    result_text += final_result.get("text", "")

    return result_text


Обработка и сохранение результатов

In [37]:
def process_files(file_list, classifier, tokenizer, model_dir):
    results = []
    for audio_file in file_list:
        transcribed_text = transcribe_audio(audio_file, model_dir)
        text_vector = tokenizer.transform([transcribed_text]).toarray()
        text_tensor = torch.tensor(text_vector, dtype=torch.float32)
        with torch.no_grad():
            outputs = classifier(text_tensor)
        _, predicted_class = torch.max(outputs, 1)
        results.append({
            "file_name": audio_file,
            "transcription": transcribed_text,
            "category": predicted_class.item()
        })
    return results

# Передайте просто список путей к аудиофайлам
transcription_results = process_files(audio_files, classifier, tokenizer, CONFIG['model_path'])


FileNotFoundError: [Errno 2] No such file or directory: '6ca54494-76ff-11ee-8f2f-c09bf4619c03_1.wav'

Сохранение результатов в файл

In [None]:
with open("transcriptions.json", "w", encoding="utf-8") as f:
    json.dump(transcription_results, f, ensure_ascii=False, indent=4)

# Вывод результатов
print(json.dumps(transcription_results, indent=4, ensure_ascii=False))
