# Solution

### Installing and importing dependencies

In [1]:
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install matplotlib>=3.3.2

!python -m pip install git+https://github.com/NVIDIA/NeMo.git@1fa961ba03ab5f8c91b278640e29807079373372#egg=nemo_toolkit[all]

!pip install huggingface-hub==0.23.2

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=3ee99453a033c50420f0fe94e4ab1d2b7f44b190b02d1c9fec1e0f79a38082af
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsndfile1 is already the newest version (1.0.31-2ubuntu0.1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
The following additional packages will be installed:
  libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base libsox3 libwavpack1
Suggested packages:
  libsox-fmt-all
The following NEW packages will be installed

In [2]:
from typing import List, Union

import hydra
import soundfile as sf
import torch
import torchaudio
from omegaconf import DictConfig, ListConfig, OmegaConf
from nemo.collections.asr.models import EncDecCTCModel
from nemo.collections.asr.modules.audio_preprocessing import (
    AudioToMelSpectrogramPreprocessor as NeMoAudioToMelSpectrogramPreprocessor,
)
from nemo.collections.asr.parts.preprocessing.features import (
    FilterbankFeaturesTA as NeMoFilterbankFeaturesTA,
)

### Adding functionality for feature extraction

In [3]:
class FilterbankFeaturesTA(NeMoFilterbankFeaturesTA):
    """NeMo torchaudio featurizer whose mel extractor is rebuilt with a chosen mel scale."""

    def __init__(self, mel_scale: str = "htk", wkwargs=None, **kwargs):
        """Initialise the parent featurizer, then replace ``_mel_spec_extractor``.

        Args:
            mel_scale: forwarded to ``torchaudio.transforms.MelSpectrogram``.
            wkwargs: forwarded to ``torchaudio.transforms.MelSpectrogram``.
            **kwargs: parent-constructor options. ``nfilt``, ``window``,
                ``mel_norm`` and ``n_fft`` are required here as well;
                ``highfreq`` and ``lowfreq`` are optional.
        """
        # These two options are removed before the parent constructor sees them.
        for dropped_option in ("window_size", "window_stride"):
            kwargs.pop(dropped_option, None)

        super().__init__(**kwargs)

        # Compute a mel-scaled spectrogram, reusing the sample rate and the
        # window/hop lengths that the parent constructor has just set.
        mel_extractor = torchaudio.transforms.MelSpectrogram(
            sample_rate=self._sample_rate,
            win_length=self.win_length,
            hop_length=self.hop_length,
            n_mels=kwargs["nfilt"],
            window_fn=self.torch_windows[kwargs["window"]],
            mel_scale=mel_scale,
            norm=kwargs["mel_norm"],
            n_fft=kwargs["n_fft"],
            f_max=kwargs.get("highfreq", None),
            f_min=kwargs.get("lowfreq", 0),
            wkwargs=wkwargs,
        )
        self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = mel_extractor


class AudioToMelSpectrogramPreprocessor(NeMoAudioToMelSpectrogramPreprocessor):
    """NeMo preprocessor that swaps in the local ``FilterbankFeaturesTA`` featurizer."""

    def __init__(self, mel_scale: str = "htk", **kwargs):
        """Initialise the parent preprocessor, then replace ``self.featurizer``.

        Args:
            mel_scale: forwarded to ``FilterbankFeaturesTA``.
            **kwargs: parent-constructor options; ``features`` is required.
        """
        super().__init__(**kwargs)

        # The featurizer receives the value of `features` under the name `nfilt`.
        featurizer_kwargs = dict(kwargs)
        featurizer_kwargs["nfilt"] = featurizer_kwargs.pop("features")

        self.featurizer = FilterbankFeaturesTA(mel_scale=mel_scale, **featurizer_kwargs)

### Model for emotion classification

In [4]:
class SpecScaler(torch.nn.Module):
    """Log-compress a spectrogram after clamping it to ``[1e-9, 1e9]``."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``log(clamp(x, 1e-9, 1e9))``.

        The lower bound keeps ``log`` finite for zero-valued bins. The clamp
        is out-of-place, so the caller's tensor is left unmodified.
        """
        return torch.log(x.clamp(1e-9, 1e9))


class GigaAMEmo(torch.nn.Module):
    """GigaAM-Emo pretrained model.

    The config must provide ``id2name`` plus ``feature_extractor``,
    ``encoder`` and ``classification_head`` entries; the last three are
    instantiated with Hydra.
    """

    def __init__(self, conf: "Union[DictConfig, ListConfig]"):
        super().__init__()
        self.id2name = conf.id2name
        self.feature_extractor = hydra.utils.instantiate(conf.feature_extractor)
        self.conformer = hydra.utils.instantiate(conf.encoder)
        self.linear_head = hydra.utils.instantiate(conf.classification_head)

    def forward(self, features, features_length=None):
        """Perform forward pass and return logits.

        Args:
            features: feature tensor; a 2-D input gets a leading batch
                dimension added.
            features_length: optional per-item lengths passed to the encoder
                as ``length``. When omitted, every item is given the full
                size of the last ``features`` dimension.

        Returns:
            The classification head's output for the encoder output averaged
            over its last dimension.
        """
        if features.dim() == 2:
            features = features.unsqueeze(0)
        # Test against None explicitly: truth-testing a tensor raises for
        # batches with more than one length and would also treat a single
        # zero length as "not provided".
        if features_length is None:
            features_length = torch.ones(features.shape[0]) * features.shape[-1]
            features_length = features_length.to(features.device)
        encoded, _ = self.conformer(audio_signal=features, length=features_length)
        # Average over the whole last axis (kernel spans it completely).
        encoded_pooled = torch.nn.functional.avg_pool1d(
            encoded, kernel_size=encoded.shape[-1]
        ).squeeze(-1)

        logits = self.linear_head(encoded_pooled)
        return logits

    def get_probs(self, audio_path: str) -> List[List[float]]:
        """Perform emotion classification from audio and return probabilities.

        Args:
            audio_path: path of an audio file readable by ``soundfile``.

        Returns:
            Nested list of probabilities; the inner lists follow the class
            order of the logits.
        """
        # NOTE(review): the sample rate returned by sf.read is discarded and
        # the signal is used as read (no resampling or channel down-mix) —
        # confirm that inputs already match what the feature extractor expects.
        audio_signal, _ = sf.read(audio_path, dtype="float32")
        model_device = next(self.parameters()).device
        features = self.feature_extractor(
            torch.tensor(audio_signal).float().to(model_device)
        )
        logits = self.forward(features)
        # Normalise over the class axis explicitly; the implicit-dim form is deprecated.
        probs = torch.nn.functional.softmax(logits, dim=-1).detach().tolist()
        return probs

### Downloading weights and config

In [5]:
import locale

locale.getpreferredencoding = lambda: "UTF-8"

# Loading weights and config for Emo-model
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/emo_model_weights.ckpt
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/emo_model_config.yaml

# Loading weights and config for CTC-model
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_weights.ckpt
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_config.yaml

--2024-12-18 18:46:59--  https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/emo_model_weights.ckpt
Resolving n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)... 37.230.193.192
Connecting to n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)|37.230.193.192|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 968409626 (924M) [application/octet-stream]
Saving to: ‘emo_model_weights.ckpt’


2024-12-18 18:48:45 (8.98 MB/s) - ‘emo_model_weights.ckpt’ saved [968409626/968409626]

--2024-12-18 18:48:46--  https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/emo_model_config.yaml
Resolving n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)... 37.230.193.192
Connecting to n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)|37.230.193.192|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 765 [application/octet-stream]
Saving to: ‘emo_model_config.yaml’


2024-12-18 18:48:46 (392 MB

### CTC-model instantiating

In [11]:
# setting weights & config
ctc_model_config = "ctc_model_config.yaml"
ctc_model_weights = "ctc_model_weights.ckpt"
# Use the GPU when one is available; `device` is reused by later cells.
device = "cuda" if torch.cuda.is_available() else "cpu"

# preparing model
# Build the model from its YAML config, load the checkpoint onto the CPU,
# and only then move the model to the target device.
ctc_model = EncDecCTCModel.from_config_file(ctc_model_config)
ctc_ckpt = torch.load(ctc_model_weights, map_location="cpu")
# NOTE(review): strict=False does not raise on missing or unexpected keys, so a
# mismatched checkpoint would load without error — consider inspecting the
# value returned by load_state_dict.
ctc_model.load_state_dict(ctc_ckpt, strict=False)
ctc_model = ctc_model.to(device)
# Switch to evaluation mode; as the last expression of the cell this also
# displays the model structure.
ctc_model.eval()

[NeMo W 2024-12-18 18:57:31 audio_to_text_dataset:697] Could not load dataset as `manifest_filepath` was None. Provided config : {'batch_size': 10, 'trim_silence': False, 'max_duration': 25.0, 'min_duration': 0.1, 'shuffle': True, 'is_tarred': False, 'num_workers': 8, 'pin_memory': True, 'manifest_filepath': None, 'labels': [' ', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'], 'sample_rate': 16000}
[NeMo W 2024-12-18 18:57:31 audio_to_text_dataset:697] Could not load dataset as `manifest_filepath` was None. Provided config : {'batch_size': 20, 'shuffle': False, 'num_workers': 4, 'min_duration': 0.1, 'pin_memory': True, 'manifest_filepath': None, 'labels': [' ', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'], 'sample_rate': 16000}
[NeMo W 2024-12-18 18:57:31 audio_

[NeMo I 2024-12-18 18:57:31 features:305] PADDING: 0


      ctc_ckpt = torch.load(ctc_model_weights, map_location="cpu")
    


EncDecCTCModel(
  (preprocessor): AudioToMelSpectrogramPreprocessor(
    (featurizer): FilterbankFeaturesTA(
      (_mel_spec_extractor): MelSpectrogram(
        (spectrogram): Spectrogram()
        (mel_scale): MelScale()
      )
    )
  )
  (encoder): ConformerEncoder(
    (pre_encode): ConvSubsampling(
      (out): Linear(in_features=12288, out_features=768, bias=True)
      (conv): Sequential(
        (0): Conv2d(1, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (3): ReLU(inplace=True)
      )
    )
    (pos_enc): RelPositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
      (dropout_emb): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-15): 16 x ConformerLayer(
        (norm_feed_forward1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (feed_forward1): ConformerFeedForward(
          (linear1): Linear

### Emo-model instantiating

In [12]:
# setting weights & config
emo_model_config = "emo_model_config.yaml"
emo_model_weights = "emo_model_weights.ckpt"
# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# preparing model
# Build GigaAMEmo from the YAML config, load the checkpoint onto the CPU,
# and only then move the model to the target device.
emo_conf = OmegaConf.load(emo_model_config)
emo_model = GigaAMEmo(emo_conf)
emo_ckpt = torch.load(emo_model_weights, map_location="cpu")
# NOTE(review): strict=False does not raise on missing or unexpected keys, so a
# mismatched checkpoint would load without error — consider inspecting the
# value returned by load_state_dict.
emo_model.load_state_dict(emo_ckpt, strict=False)
emo_model = emo_model.to(device)
# Switch to evaluation mode; as the last expression of the cell this also
# displays the model structure.
emo_model.eval()

      emo_ckpt = torch.load(emo_model_weights, map_location="cpu")
    


GigaAMEmo(
  (feature_extractor): Sequential(
    (0): MelSpectrogram(
      (spectrogram): Spectrogram()
      (mel_scale): MelScale()
    )
    (1): SpecScaler()
  )
  (conformer): ConformerEncoder(
    (pre_encode): ConvSubsampling(
      (out): Linear(in_features=12288, out_features=768, bias=True)
      (conv): Sequential(
        (0): Conv2d(1, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (3): ReLU(inplace=True)
      )
    )
    (pos_enc): RelPositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
      (dropout_emb): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-15): 16 x ConformerLayer(
        (norm_feed_forward1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (feed_forward1): ConformerFeedForward(
          (linear1): Linear(in_features=768, out_features=3072, bias=True)
          (activation):

# Segmentation [optional: for long audio files]

## Installing and importing dependencies

In [61]:
# pyannote.audio provides the `Pipeline` class used for voice activity detection below; the version is pinned.
!python -m pip install pyannote.audio==3.2.0

Collecting pyannote.audio==3.2.0
  Downloading pyannote.audio-3.2.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio==3.2.0)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio==3.2.0)
  Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio==3.2.0)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio==3.2.0)
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.whl.metadata (18 kB)
Collecting semver>=3.0.0 (from pyannote.audio==3.2.0)
  Downloading semver-3.0.2-py3-none-any.whl.metadata (5.0 kB)
Collecting speechbrain>=0.5.14 (from pyannote.audio==3.2.0)
  Downloading speechbrain-1.0.2-py3-none-any.whl.metadata (23 kB)
Collecting tensorboardX>=2.6 (from pyannote.audio==3.2.0)
  Downloading tensorboardX-2.6.2.2-py2.py

In [62]:
from io import BytesIO
from typing import List, Tuple

import numpy as np
from pyannote.audio import Pipeline
from pydub import AudioSegment

## Downloading sample

In [133]:
# Fetch the long recording (saved as long_example.wav) that is segmented and transcribed in this section.
!wget https://github.com/mipt-bio3/hackathon-1/raw/refs/heads/main/samples/long_example.wav

--2024-12-18 23:50:47--  https://github.com/mipt-bio3/hackathon-1/raw/refs/heads/main/samples/long_example.wav
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/mipt-bio3/hackathon-1/refs/heads/main/samples/long_example.wav [following]
--2024-12-18 23:50:47--  https://raw.githubusercontent.com/mipt-bio3/hackathon-1/refs/heads/main/samples/long_example.wav
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2280044 (2.2M) [audio/wav]
Saving to: ‘long_example.wav’


2024-12-18 23:50:47 (30.5 MB/s) - ‘long_example.wav’ saved [2280044/2280044]



## Adding functionality for segmentation

In [92]:
def audiosegment_to_numpy(audiosegment: "AudioSegment") -> np.ndarray:
    """Convert AudioSegment to a float32 numpy array scaled by its sample width.

    Args:
        audiosegment: object exposing ``get_array_of_samples()``, ``channels``
            and ``sample_width`` (bytes per sample).

    Returns:
        A 1-D array for single-channel audio, otherwise an array of shape
        ``(frames, channels)``. Integer samples are divided by the full-scale
        value of their width (32768 for 16-bit audio).
    """
    samples = np.array(audiosegment.get_array_of_samples())
    # One column per channel for any multi-channel layout, not only stereo.
    if audiosegment.channels > 1:
        samples = samples.reshape((-1, audiosegment.channels))

    # Full-scale magnitude of a signed integer that is `sample_width` bytes wide.
    full_scale = float(1 << (8 * audiosegment.sample_width - 1))
    return samples.astype(np.float32, order="C") / full_scale


def segment_audio(
    audio_path: str,
    pipeline: Pipeline,
    max_duration: float = 10,
    min_duration: float = 15,
    new_chunk_threshold: float = 0.2,
) -> Tuple[List[np.ndarray], List[List[float]]]:
    """Split a WAV file into chunks along speech regions found by ``pipeline``.

    Speech regions are accumulated into the current chunk. The chunk is closed
    before a region when either
      * it is longer than ``min_duration`` and the pause before the region
        exceeds ``new_chunk_threshold``, or
      * adding the region would make it longer than ``max_duration``.

    NOTE(review): with the defaults ``min_duration`` (15) is larger than
    ``max_duration`` (10), so the pause rule can only apply to a chunk made of
    a single region longer than 15 s — confirm the defaults are intended.

    Args:
        audio_path: path of the WAV file.
        pipeline: callable voice-activity pipeline; called with a dict holding
            ``uri`` and an in-memory WAV under ``audio``.
        max_duration: see above, in seconds.
        min_duration: see above, in seconds.
        new_chunk_threshold: see above, in seconds.

    Returns:
        ``(segments, boundaries)``: the chunk sample arrays and the matching
        ``[start, end]`` times in seconds.
    """
    # Prepare audio for pyannote vad pipeline
    audio = AudioSegment.from_wav(audio_path)
    audio_bytes = BytesIO()
    audio.export(audio_bytes, format="wav")
    audio_bytes.seek(0)

    # Process audio with pipeline to obtain segments with speech activity
    sad_segments = pipeline({"uri": "filename", "audio": audio_bytes})

    # Audio length and slice positions are in milliseconds; boundaries are in seconds.
    audio_duration = len(audio) / 1000

    segments = []
    boundaries = []
    curr_duration = 0
    curr_start = 0
    curr_end = 0

    def close_chunk() -> None:
        """Cut the current [curr_start, curr_end] span out of the audio and record it."""
        segments.append(
            audiosegment_to_numpy(audio[curr_start * 1000 : curr_end * 1000])
        )
        boundaries.append([curr_start, curr_end])

    # Concat segments from pipeline into chunks for asr according to max/min duration
    for segment in sad_segments.get_timeline().support():
        start = max(0, segment.start)
        end = min(audio_duration, segment.end)

        pause_split = (
            curr_duration > min_duration and start - curr_end > new_chunk_threshold
        )
        overflow_split = curr_duration + (end - curr_end) > max_duration
        if pause_split or overflow_split:
            # Nothing is accumulated yet when the very first region already
            # exceeds max_duration; closing then would emit an empty chunk.
            if curr_duration > 0:
                close_chunk()
            curr_start = start

        curr_end = end
        curr_duration = curr_end - curr_start

    # Emit whatever is still accumulated after the last region.
    if curr_duration != 0:
        close_chunk()

    return segments, boundaries

## Initializing pyannote VAD pipeline and using it for segmentation

IMPORTANT: place your Hugging Face access token below (you must first accept the model's Terms and Conditions on Hugging Face)

In [94]:
import os

# Prefer the HF_TOKEN environment variable so the token does not have to be
# saved inside the notebook; otherwise replace the placeholder below.
HF_TOKEN = os.environ.get("HF_TOKEN", "<YOUR_HF_TOKEN>")

# Initialize pyannote pipeline
pipeline = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection", use_auth_token=HF_TOKEN
)
# Fail with a clear message instead of an AttributeError on `.to` below
# (assumes from_pretrained may return None when access is denied — confirm).
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/voice-activity-detection': check HF_TOKEN and "
        "accept the model's terms on Hugging Face."
    )
pipeline = pipeline.to(torch.device(device))

# Segment audio
segments, boundaries = segment_audio("long_example.wav", pipeline)

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e755bb7223f76666bba4/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.2.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.5.1+cu121. Bad things might happen unless you revert torch to 1.x.


In [95]:
# Transcribing segments
# BATCH_SIZE is passed to transcribe() as `batch_size`.
BATCH_SIZE = 10
transcriptions = ctc_model.transcribe(segments, batch_size=BATCH_SIZE)

Transcribing: 100%|██████████| 1/1 [00:40<00:00, 40.56s/it]


In [96]:
def format_time(seconds):
    """Format a time in seconds as ``MM:SS:CC``, or ``HH:MM:SS:CC`` from one hour on.

    ``CC`` is hundredths of a second (truncated, not rounded).

    Args:
        seconds: non-negative time in seconds.

    Returns:
        The zero-padded, colon-separated string.
    """
    # Work in whole hundredths. Rounding to 6 decimals first absorbs binary
    # floating-point noise (0.29 * 100 == 28.999999999999996) that would
    # otherwise truncate an exact input one hundredth too low.
    total_hundredths = int(round(seconds * 100, 6))
    total_seconds, hundredths = divmod(total_hundredths, 100)
    total_minutes, full_seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    if hours > 0:
        return f"{hours:02}:{minutes:02}:{full_seconds:02}:{hundredths:02}"
    return f"{minutes:02}:{full_seconds:02}:{hundredths:02}"


# Print each chunk's transcription prefixed with its formatted time span
for text, (start_sec, end_sec) in zip(transcriptions, boundaries):
    print(f"[{format_time(start_sec)} - {format_time(end_sec)}]: {text}\n")

[00:00:00 - 00:08:21]: вечерня отошла давно но в кельях тихо и темно уже и сам игумин строгий свои молитвы прекратил

[00:08:62 - 00:16:83]: и кости ветхие склонил перекрестясь на одр убогий кругом и сон и тишина но церкви дверь отворена

[00:17:10 - 00:26:91]: трепещет луч лампады и тускло озаряет он и темную живопись икон и позлощенные оклады и раздается в тишине

[00:27:21 - 00:34:25]: то тяжкий вздох то шепот важный и мрачно дремлет вашине старинный свод глухой и влажный

[00:34:72 - 00:43:97]: стоят за клиросом чернец и грешник неподвижны оба и шепот их как глаз из гроба и грешник бледен

[00:44:29 - 00:53:86]: как мертвец монах несчастный полно перестань ужасна исповедь злодея заплачена тобою дань тому

[00:54:11 - 01:03:51]: кто в злобе пламенее лукаво грешника блюдет и к вечной гибели ведет смирись опомнись время время

[01:04:12 - 01:10:90]: раскаянья покров я разрешу тебя грехов сложи мучительное бремя



# DEMO

### Downloading demo samples

In [126]:
!wget https://github.com/mipt-bio3/hackathon-1/raw/refs/heads/main/samples/audio_1.wav
!wget https://github.com/mipt-bio3/hackathon-1/raw/refs/heads/main/samples/audio_2.wav
!wget https://github.com/mipt-bio3/hackathon-1/raw/refs/heads/main/samples/audio_3.wav
!wget https://github.com/mipt-bio3/hackathon-1/raw/refs/heads/main/samples/audio_4.wav
!wget https://github.com/mipt-bio3/hackathon-1/raw/refs/heads/main/samples/audio_5.wav

audio = ["audio_1.wav", "audio_2.wav", "audio_3.wav", "audio_4.wav", "audio_5.wav"]

--2024-12-18 23:26:01--  https://github.com/mipt-bio3/hackathon-1/raw/refs/heads/main/samples/audio_1.wav
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/mipt-bio3/hackathon-1/refs/heads/main/samples/audio_1.wav [following]
--2024-12-18 23:26:01--  https://raw.githubusercontent.com/mipt-bio3/hackathon-1/refs/heads/main/samples/audio_1.wav
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51024 (50K) [audio/wav]
Saving to: ‘audio_1.wav’


2024-12-18 23:26:01 (4.28 MB/s) - ‘audio_1.wav’ saved [51024/51024]

--2024-12-18 23:26:01--  https://github.com/mipt-bio3/hackathon-1/raw/refs/heads/main/samples

### Transcribing audio


In [127]:
# Transcribe the demo clips listed in `audio`; the results are later zipped with
# the per-clip emotions, so they are expected to keep the order of `audio`.
text_list = ctc_model.transcribe(audio)

      with torch.cuda.amp.autocast(enabled=False):
    
Transcribing: 100%|██████████| 2/2 [00:04<00:00,  2.42s/it]


### Detecting emotions

In [128]:
# Emotion name (as stored in emo_model.id2name) -> Russian word shown in the annotation
emo_ru = {
    "angry": "злобно",
    "sad": "грустно",
    "neutral": "нейтрально",
    "positive": "позитивно",
}
emotion_list = []

for wav in audio:
    with torch.no_grad():
        probs = emo_model.get_probs(wav)[0]
    # Pair every Russian label with its probability, then keep the
    # (label, probability) pair with the highest probability.
    emotions = {
        emo_ru[emo_model.id2name[class_id]]: prob
        for class_id, prob in enumerate(probs)
    }
    best_emotion = max(emotions.items(), key=lambda item: item[1])
    emotion_list.append(best_emotion)

      probs = torch.nn.functional.softmax(logits).detach().tolist()
    


### Annotating transcription

In [131]:
# One line per clip: the transcription followed by its emotion and probability
annotated_text = []
for text, (emotion, prob) in zip(text_list, emotion_list):
    annotated_text.append(f"– {text.strip()} [{emotion} {prob:.2f}]")
print("\n".join(annotated_text))

– нет времени возьмите шприц иглой [злобно 0.94]
– зачем шприцы [злобно 0.66]
– заткнитесь и ищите [злобно 1.00]
– возможен обморок [нейтрально 1.00]
– ей нужно расправить легкое [нейтрально 1.00]
