# Notebook for å kjøre talegjenkjenning med wav2vec

## Import

In [7]:
import torch
import pandas as pd
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
from pyannote.audio import Pipeline
from pathlib import Path

## Segmentering

In [8]:
# Voice activity detection only. This probably gives smaller segments and is
# much faster, but the audio is not divided into speakers.
def run_vad(audiofile,
            threshold=0.5,
            min_speech_duration_ms=250,
            min_silence_duration_ms=100,
            window_size_samples=1536,
            speech_pad_ms=30,
            return_seconds=True,
            outfile=None):
    """Run voice activity detection on an audio file with silero-vad.

    Parameter
    ----------
    audiofile
        the audio file to run VAD on
    threshold, min_speech_duration_ms, min_silence_duration_ms,
    window_size_samples, speech_pad_ms, return_seconds
        passed on unchanged as keyword arguments to the silero-vad
        timestamp function
    outfile=None
        optional path to a csv file that the segment DataFrame is written to

    Return: a DataFrame with the columns 'start', 'end', 'duration' and
    'audio_path' (no 'speaker' column, since VAD does not identify
    speakers). If outfile is given, the DataFrame is also written to that
    csv file. An audio file without detected speech gives an empty
    DataFrame with the same columns.
    """
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model='silero_vad',
                                  force_reload=False)
    # utils is a tuple of helpers; items 0 and 2 are used here as the
    # timestamp function and the audio reader respectively.
    get_ts, read_audio = utils[0], utils[2]
    audio_tns = read_audio(audiofile)
    vad = get_ts(audio_tns,
                 model,
                 threshold=threshold,
                 min_speech_duration_ms=min_speech_duration_ms,
                 min_silence_duration_ms=min_silence_duration_ms,
                 window_size_samples=window_size_samples,
                 speech_pad_ms=speech_pad_ms,
                 return_seconds=return_seconds)

    # Naming the columns explicitly keeps 'start' and 'end' available even
    # when no speech segment was found (an empty list has no columns).
    df = pd.DataFrame(vad, columns=["start", "end"])
    df["duration"] = df["end"] - df["start"]
    df["audio_path"] = audiofile
    if outfile is not None:
        df.to_csv(outfile, index=False)
    return df

In [9]:
# This function divides the audio into speakers in addition to doing voice
# activity detection. Note that the segments sometimes become too long
# (30+ seconds) and are then filtered out by the speech recognition function.
def diarize(audiofile, outfile=None):
    """Identify the individual speakers in an audio file and return
    a DataFrame of segments with start and end times and speaker tags.

    Parameter
    ----------
    audiofile
        the audio file to diarize
    outfile=None
        optional path to a csv file that the diarized DataFrame is written to

    Return: a DataFrame with the columns 'speaker', 'start', 'end',
    'duration' and 'audio_path'. If outfile is given, the DataFrame is also
    written to that csv file. A diarization without any tracks gives an
    empty DataFrame with the same columns.
    """
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
    # It might be possible to get shorter segments by adjusting the
    # pipeline parameters, e.g.:
    # parameters = pipeline.default_parameters()
    # parameters["min_duration_off"] = 0.001
    # parameters["onset"] = 0.9
    # pipeline.instantiate(parameters)

    diarization = pipeline(audiofile)
    segments = [
        {"speaker": speaker, "start": turn.start, "end": turn.end}
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]
    # Naming the columns explicitly keeps them available even when the
    # pipeline yields no tracks (an empty list has no columns).
    df = pd.DataFrame(segments, columns=["speaker", "start", "end"])
    df["duration"] = df["end"] - df["start"]
    df["audio_path"] = audiofile
    if outfile is not None:
        df.to_csv(outfile, index=False)
    return df

In [10]:
# Audio file to be segmented
# audiofile = Path("path/to/audiofile") 
audiofile = Path("20220202_Øyvind_Stokke.wav") 

In [11]:
# VAD only:
segmentation_df = run_vad(audiofile, min_silence_duration_ms=10)
# Diarization (alternative to the VAD call above):
#segmentation_df = diarize(audiofile)

Using cache found in /home/pers/.cache/torch/hub/snakers4_silero-vad_master


## Kjør talegjenkjenning

In [15]:
def wav2vec_transcribe(
    filepath, processor, model, offset, duration, device, limit=30, print_output=False
):
    """Transcribe an audio file or a segment of an audio file with wav2vec.

    Parameter
    ----------
    filepath
        path to the audio file
    processor
        a wav2vec processor, e.g. Wav2Vec2ProcessorWithLM.from_pretrained('NbAiLab/nb-wav2vec2-1b-bokmaal')
    model
        a wav2vec model, e.g. Wav2Vec2ForCTC.from_pretrained('NbAiLab/nb-wav2vec2-1b-bokmaal')
    offset
        where to start transcribing, in seconds from start of file
    duration
        the duration of the audio segment, in seconds from the offset, which should be transcribed.
    device
        the device the process should be run on (cpu or gpu)
    limit=30
        The max amount of seconds accepted for a segment
    print_output=False
        Option to print the transcriptions to terminal

    return: the predicted transcription of the audio segment. Segments
    longer than `limit` are skipped and give the empty string "". If
    anything fails during loading or transcription, the error is printed
    and the placeholder "_" is returned instead of raising.
    """
    try:
        if duration > limit:
            # Too long for the model: skip the segment
            if print_output:
                print("")
            return ""

        # Load only the requested segment, resampled to 16 kHz
        audio, rate = librosa.load(
            filepath, sr=16000, offset=offset, duration=duration
        )
        input_values = processor(
            audio, sampling_rate=rate, return_tensors="pt"
        ).input_values.to(device)
        # Pure inference: disabling autograd avoids keeping the computation
        # graph alive, which otherwise costs memory for every segment.
        with torch.no_grad():
            logits = model(input_values).logits.cpu()
        transcription = processor.batch_decode(logits.numpy()).text
        if print_output:
            print(transcription[0])
        return transcription[0]
    except Exception as e:
        # Deliberate best effort: one bad segment must not abort the
        # transcription of all the others.
        print(e)
        if print_output:
            print("_")
        return "_"


def transcribe_df_w2v(
    df, processor, model, device, audio_dir=None, print_output=False, outfile=None
):
    """Transcribe audio with wav2vec given a DataFrame with segments. A column 'wav2vec' will
    be created with the predicted transcriptions.

    Parameter
    ----------
    df
        a DataFrame with segments; the columns 'audio_path', 'start' and
        'duration' are read for every row
    processor
        a wav2vec processor, e.g. Wav2Vec2ProcessorWithLM.from_pretrained('NbAiLab/nb-wav2vec2-1b-bokmaal')
    model
        a wav2vec model, e.g. Wav2Vec2ForCTC.from_pretrained('NbAiLab/nb-wav2vec2-1b-bokmaal')
    device
        the device the process should be run on (cpu or gpu)
    audio_dir=None
        a directory where the files in the 'audio_path' column in the df are located
        if this column does not contain complete paths
    print_output=False
        Option to print the transcriptions to terminal
    outfile=None
        the path to a csv file that the transcribed DataFrame is stored to

    return: the DataFrame passed in, with the transcriptions added in the
    column 'wav2vec' (the DataFrame is modified in place). If outfile is
    given, the DataFrame is also written to that csv file.
    """

    def resolve(audio_path):
        # Join with pathlib so that both str and Path values work and a
        # missing trailing separator on audio_dir does not matter.
        if audio_dir is None:
            return audio_path
        return str(Path(audio_dir) / audio_path)

    # A plain list (instead of df.apply) also works for an empty DataFrame,
    # where apply(axis=1) would return a DataFrame rather than a Series.
    df["wav2vec"] = [
        wav2vec_transcribe(
            resolve(audio_path),
            processor,
            model,
            start,
            duration,
            device,
            print_output=print_output,
        )
        for audio_path, start, duration in zip(
            df["audio_path"], df["start"], df["duration"]
        )
    ]
    if outfile is not None:
        df.to_csv(outfile, index=False)
    return df

In [14]:
# Point to a Norwegian wav2vec model here, e.g. "NbAiLab/nb-wav2vec2-1b-bokmaal"
model_name = "NbAiLab/nb-wav2vec2-1b-bokmaal"
# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model = model.to(device)

In [16]:
# File path for the output file
# outfile = Path("path/to/output")
outfile = Path("testtranskripsjon.csv")


In [17]:
# Transcribe. If outfile is not None, the transcription is written to the
# output file and a DataFrame with the transcription is returned.
# Note: print_output=True prints the transcriptions, which can fill up
# memory for long audio files.
trans_df = transcribe_df_w2v(
    df=segmentation_df,
    processor=processor,
    model=model,
    device=device,
    print_output=True,
    outfile=outfile,
)

dagens gjest har sin ekspertise inn i vår tids viktigste tema
klimaendringer rettferdighet
ressursforvaltning
hører om det av en stake møtte filosofien
og aldri klarte å bli kvist
observatoriet
en forskningspotkast fra ut norges arktiske universitet
med geirhevnsell ringvold
mannen som lurer på det meste
