In [1]:
!pip install spacy

[0mDefaulting to user installation because normal site-packages is not writeable


In [2]:
!python -m spacy download de_core_news_sm

[0mDefaulting to user installation because normal site-packages is not writeable
Collecting de-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [3]:
import os
import glob
import nemo.collections.asr as nemo_asr
import torch
import numpy as np
import ctc_decoders
import torchaudio
import soundfile as sf
import librosa

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



In [4]:
from processing.text_processor import preprocess_transcript_for_alignment, preprocess_transcript_for_sentence_split, split_to_sentences
from transcribing.stt_transcribe import get_stt_transcription, get_stt_probs

In [5]:
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(torch.__version__)
print(device)

1.11.0
cuda


## Read true transcript
Read the text from the complete protocol provided by the parliament of canton Schwyz. After reading the text process everything and split the whole text into its sentences. 

In [6]:
PATH_TO_TRANSCRIPT = 'data/text_processed_2021-02-24.txt'

In [7]:
with open(PATH_TO_TRANSCRIPT, encoding='utf-8') as f:
    transcript = f.read()
    transcript = preprocess_transcript_for_sentence_split(transcript)
    truth_sentences = split_to_sentences(transcript)

In [8]:
#print(truth_sentences)

## Retrieve all audio files to later match with the corresponding text

In [9]:
PATH_TO_AUDIO = '/data/voice/SZ_parliament/raw/parts'

In [10]:
audio_files = glob.glob(os.path.join(PATH_TO_AUDIO, '*.wav'))
#print(audio_files)

For testing use a test file randomly extracted from one of the audio files

In [11]:
# set a single pregenerated file for testing
TEST_FILE = ['test.wav']

## Transcription
Load and instantiate a pre-trained model from NVIDIA. In this case we use CTC decoding as output. 

In [None]:
# load model once
model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="stt_de_quartznet15x5")

Retrieve the transcript of the given audio sequence

In [None]:
stt_output = get_stt_transcription(model, TEST_FILE)

In [None]:
print(stt_output)

For comparison we attach the actual true transcript of the given sequence

In [None]:
true_transcript = "die zur Verfügung stehenden Mittel fair aufgeteilt werden, um sämtliche Unternehmen unterstützen zu können, die jetzt dringend unsere Hilfe brauchen. Zu erwähnen ist auch, dass bei vielen betroffenen Unternehmen die Personalkosten durch die Kurzarbeitsentschädigung oder durch die Corona-Erwerbsersatzentschädigung seit März 2020 gedeckt"

In [None]:
print(true_transcript)

In [None]:
!pip install biopython

## TODO
Implement the start and end point detection for each sentence, then pass it along to the sentence aligner

Get the probabilities for each token in the transcript

In [None]:
def get_stt_probs(model, file):
    logits = model.transcribe(paths2audio_files=file, logprobs=True)[0]
    probs = softmax(logits)
    return probs

def softmax(logits):
    e = np.exp(logits - np.max(logits))
    return e / e.sum(axis=-1).reshape([logits.shape[0], 1])

In [None]:
stt_probs = get_stt_probs(model, TEST_FILE)

In [None]:
print(stt_probs)

## Timestep extraction
Define the duration of a timestep 

In [None]:
# 40ms is duration of a timestep at output of the model
time_stride = 0.04

Get the model's alphabet and replace unknown token with 'space'

In [None]:
labels = list(model.cfg.decoder.vocabulary) + ['blank']
labels[0] = 'space'

In [None]:
print(labels)

In [None]:
try:
    from plotly import graph_objects as go
except ModuleNotFoundError:
    !pip install plotly
    from plotly import graph_objects as go

In [None]:
# plot probability distribution over characters for each timestep
fig_probs = go.Figure(
    go.Heatmap(z=stt_probs.transpose(),
               colorscale=[
                   [0, 'rgb(30,62,62)'],
                   [1, 'rgb(30,255,30)'],
               ],
               y=labels,
               dx=time_stride,
               name='Probs',
               hovertemplate='Time: %{x:.2f} s<br>Character: %{y}<br>Probability: %{z:.2f}<extra></extra>'),
    layout={
        'height': 300,
        'xaxis': {'title': 'Time, s'},
        'yaxis': {'title': 'Characters'},
        'title': 'Character Probabilities',
        'margin': dict(l=0, r=0, t=40, b=0, pad=0),
    }
)
fig_probs.show()

Extract timestamps for 'space' tokens

In [None]:
print(np.argmax(stt_probs[6]))

In [None]:
print(stt_probs[0])

In [None]:
spaces = []

state = ''
idx_state = 0

# set first state to 'space'
if np.argmax(stt_probs[0]) != 128 and np.argmax(stt_probs[0]) != 0:
    state = 'word'

# iterate over all timesteps from 1 to 750
for idx in range(1, stt_probs.shape[0]):
    # set current character index to the most likely token
    current_char_idx = np.argmax(stt_probs[idx])
    
    if state == 'word' and (current_char_idx == 128 or current_char_idx == 0):
        spaces.append([idx_state, idx-1])
        state = ''
    if state == '':
        if current_char_idx != 128 and current_char_idx != 0:
            state = 'word'
            idx_state = idx

# TODO            
# if you finish with space earlier than end of time frame add up to match length of complete audio sequence
if state == 'space':
    spaces.append([idx_state, len(probs)-1])

In [None]:
print(labels[1])

In [None]:
for i in range(0, stt_probs.shape[0]):
    print(np.argmax(stt_probs[i]))

In [None]:
# get timestamps for space symbols
spaces = []

state = ''
idx_state = 0

if np.argmax(stt_probs[0]) == 0:
    state = 'space'

for idx in range(1, stt_probs.shape[0]):
    current_char_idx = np.argmax(stt_probs[idx])
    if state == 'space' and current_char_idx != 0 and current_char_idx != 128:
        spaces.append([idx_state, idx-1])
        state = ''
    if state == '':
        if current_char_idx == 0:
            state = 'space'
            idx_state = idx

if state == 'space':
    spaces.append([idx_state, len(stt_probs)-1])

In [None]:
print(spaces)

In [None]:
print(stt_output)

split audio signal into separate words

In [None]:
from IPython.display import Audio, display

In [None]:
signal, sample_rate = librosa.load(TEST_FILE[0], sr=None)

In [None]:
print(stt_output)

In [None]:
# calibration offset for timestamps: 180 ms
offset = -0.18

# split the transcript into words
words = stt_output.split()

# cut words
pos_prev = 0
for j, spot in enumerate(spaces):
    display(words[j])
    pos_end = offset + (spot[0]+spot[1])/2*time_stride
    display(Audio(signal[int(pos_prev*sample_rate):int(pos_end*sample_rate)],
                 rate=sample_rate))
    pos_prev = pos_end

display(words[j+1])
display(Audio(signal[int(pos_prev*sample_rate):],
        rate=sample_rate))


In [35]:
print(stt_output)

die  zu verfürtistehende mitte fahruftaupferde umeber aul um der nahmehölle zu unterstützte wo jetzt bringends weu sie hilf ruhet zerweine ich bil hihnen aus anders feiß das befehlende betroffenen unternahme personalhörchte tuchulzahrbezenschädigung oder repe duk croner werbsesots entschädigung und das siegt märz zwanzizwehn


split audio signal into separate words

In [36]:
from IPython.display import Audio, display

In [40]:
signal, sample_rate = librosa.load(TEST_FILE[0], sr=None)

In [44]:
print(stt_output)

die  zu verfürtistehende mitte fahruftaupferde umeber aul um der nahmehölle zu unterstützte wo jetzt bringends weu sie hilf ruhet zerweine ich bil hihnen aus anders feiß das befehlende betroffenen unternahme personalhörchte tuchulzahrbezenschädigung oder repe duk croner werbsesots entschädigung und das siegt märz zwanzizwehn


In [45]:
# calibration offset for timestamps: 180 ms
offset = -0.18

# split the transcript into words
words = stt_output.split()

# cut words
pos_prev = 0
for j, spot in enumerate(spaces):
    display(words[j])
    pos_end = offset + (spot[0]+spot[1])/2*time_stride
    display(Audio(signal[int(pos_prev*sample_rate):int(pos_end*sample_rate)],
                 rate=sample_rate))
    pos_prev = pos_end

display(words[j+1])
display(Audio(signal[int(pos_prev*sample_rate):],
        rate=sample_rate))


'die'

'zu'

'verfürtistehende'

'mitte'

'fahruftaupferde'

'umeber'

'aul'

'um'

'der'

'nahmehölle'

'zu'

'unterstützte'

'wo'

'jetzt'

'bringends'

'weu'

'sie'

'hilf'

'ruhet'

'zerweine'

'ich'

'bil'

'hihnen'

'aus'

'anders'

'feiß'

ValueError: zero-size array to reduction operation maximum which has no identity

In [None]:
print(words)

Get the probabilities for each token in the transcript

In [None]:
def get_stt_probs(model, file):
    logits = model.transcribe(paths2audio_files=file, logprobs=True)[0]
    probs = softmax(logits)
    return probs

def softmax(logits):
    e = np.exp(logits - np.max(logits))
    return e / e.sum(axis=-1).reshape([logits.shape[0], 1])

In [None]:
stt_probs = get_stt_probs(model, TEST_FILE)

In [None]:
print(stt_probs)

## Timestep extraction
Define the duration of a timestep 

In [None]:
# 40ms is duration of a timestep at output of the model
time_stride = 0.04

Get the model's alphabet and replace unknown token with 'space'

In [None]:
labels = list(model.cfg.decoder.vocabulary) + ['blank']
labels[0] = 'space'

In [None]:
print(labels[128])

In [None]:
try:
    from plotly import graph_objects as go
except ModuleNotFoundError:
    !pip install plotly
    from plotly import graph_objects as go

In [None]:
# plot probability distribution over characters for each timestep
fig_probs = go.Figure(
    go.Heatmap(z=stt_probs.transpose(),
               colorscale=[
                   [0, 'rgb(30,62,62)'],
                   [1, 'rgb(30,255,30)'],
               ],
               y=labels,
               dx=time_stride,
               name='Probs',
               hovertemplate='Time: %{x:.2f} s<br>Character: %{y}<br>Probability: %{z:.2f}<extra></extra>'),
    layout={
        'height': 300,
        'xaxis': {'title': 'Time, s'},
        'yaxis': {'title': 'Characters'},
        'title': 'Character Probabilities',
        'margin': dict(l=0, r=0, t=40, b=0, pad=0),
    }
)
fig_probs.show()

Extract timestamps for 'space' tokens

In [None]:
print(np.argmax(stt_probs[6]))

In [None]:
print(stt_probs[0])

In [None]:
spaces = []

state = ''
idx_state = 0

# set first state to 'space'
if np.argmax(stt_probs[0]) != 128 and np.argmax(stt_probs[0]) != 0:
    state = 'word'

# iterate over all timesteps from 1 to 750
for idx in range(1, stt_probs.shape[0]):
    # set current character index to the most likely token
    current_char_idx = np.argmax(stt_probs[idx])
    
    if state == 'word' and (current_char_idx == 128 or current_char_idx == 0):
        spaces.append([idx_state, idx-1])
        state = ''
    if state == '':
        if current_char_idx != 128 and current_char_idx != 0:
            state = 'word'
            idx_state = idx

# TODO            
# if you finish with space earlier than end of time frame add up to match length of complete audio sequence
#if state == 'space':
#    spaces.append([idx_state, len(pred)-1])

In [None]:
print(spaces)

In [None]:
print(labels[128])

split audio signal into separate words

In [None]:
from IPython.display import Audio, display

In [None]:
# calibration offset for timestamps: 180 ms
offset = -0.18

# split the transcript into words
words = stt_output.split()
print(words)
# cut words
pos_prev = 0
for j, spot in enumerate(spaces):
    display(words[j], spot[0], spot[1])
    pos_end = offset + (spot[0]+spot[1])/2*time_stride
    pos_prev = pos_end

display(words[j+1])


In [None]:
print(words)

In [None]:
from aligning.sentence_aligner import SentenceAligner

In [None]:
PATH_TO_SENTENCE_ALIGNER = 'aligner_model/sentence_aligner__i4ds_alignment_corpus__amazon_transcribe.pickle'

In [None]:
sentence_aligner = SentenceAligner()

In [None]:
sentence_alignment, _, _ = sentence_aligner.predict_one(truth_sentences, 
                                                        stt_output,
                                                        do_length_ratio_full_transcript_filtering=False,
                                                        do_time_correction=False
                                                        )

In [None]:
print(stt_output)

In [None]:
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113