# Initial steps

In [1]:
# needed for wav2vec models
# not needed unless a wav2vec model with an LM is selected (the default configuration does not use one)
!pip install kenlm pyctcdecode > /dev/null

[31mERROR: Operation cancelled by user[0m[31m
[0m

In [2]:
import pandas as pd
import numpy as np
import torch
import re
import os

**ASR supported models**:
```
          "openai/whisper-tiny",
          "openai/whisper-base",
          "openai/whisper-small", # ----> 244M
          "openai/whisper-medium", # ---> 769M
          "openai/whisper-large", # ---> 1550M
          "openai/whisper-large-v2",
          "openai/whisper-large-v3",
```
finetuned on Ukrainian:
```
          "Yehor/wav2vec2-xls-r-300m-uk-with-small-lm" # ---> with kenlm and pyctcdecode
          "arampacha/whisper-large-uk-2",
          "robinhad/wav2vec2-xls-r-300m-uk", # ---> no lm
          "Yehor/w2v-bert-2.0-uk", # ---> 600M
          "Yehor/wav2vec2-xls-r-1b-uk-with-lm",
          "arampacha/wav2vec2-xls-r-1b-uk",
          "voidful/wav2vec2-xlsr-multilingual-56",
          "facebook/wav2vec2-base-960h" # ---> English only
```


**Supported classification models**:

zero-shot classification (multilingual):
```
          "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
          "MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33",
          "MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33"
```
tuned for sentiment classification (english):
```
          "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
          "cardiffnlp/twitter-roberta-base-sentiment-latest",
          "siebert/sentiment-roberta-large-english",

```

tuned for emotion recognition
```
          "SamLowe/roberta-base-go_emotions"
```



**Supported LLMs**:
```
          "mistralai/Mistral-7B-Instruct-v0.2"
```

The best result (84% accuracy) was obtained using `openai/whisper-large-v2` in translation mode (to English), followed by the pre-trained sentiment classifier `cardiffnlp/twitter-roberta-base-sentiment-latest`.

In [3]:
# BEST PARAMETERS:

# --- speech recognition ---
asr_model_name = "openai/whisper-large-v2"

# True  -> translate the speech to English (supported by Whisper models only)
# False -> transcribe in the original language
translate = True

# --- text classification ---
# By default this model classifies to 'positive', 'negative' and 'neutral' sentiment.
classification_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Candidate labels have to be set manually for zero-shot-classification models only.
labels = None
# Labels that are considered to be negative.
negative_labels = ["negative"]

# --- LLM (disabled) ---
llm_model_name = ""
prompt = ""

Note: Whisper processes audio in 30-second chunks. Wav2Vec2 models process the whole audio file at once, so they may use much more RAM.

In [4]:
# Path of the audio recording that the cells below play back and pass to inference().
audio_file = 'file.wav'

In [6]:
from IPython.display import Audio, display

# Embed an audio player for the selected file; autoplay=True requests playback as soon as the cell renders.
display(Audio(audio_file, autoplay=True))

# Models' interface

## ASR models

In [None]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from transformers import pipeline
import librosa

In [None]:
# Choose the device and the matching dtype together:
# half precision on the first CUDA GPU, full precision on the CPU.
device, torch_dtype = (
    ("cuda:0", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

print(device, torch_dtype)
#TODO: W&B

In [None]:
# utils
def read_audio(fname, sr=16000):
  """Load an audio file as a mono waveform resampled to ``sr`` Hz.

  Args:
    fname: path of the audio file, forwarded to ``librosa.load``.
    sr: target sampling rate in Hz. The default (16000) matches the
      ``sampling_rate=16000`` used when the waveform is fed to a processor
      elsewhere in this notebook.

  Returns:
    The waveform returned by ``librosa.load``; the sampling rate that
    ``librosa.load`` also returns is discarded.
  """
  speech, _ = librosa.load(fname, sr=sr, mono=True)
  return speech

In [None]:
# CTC speech models, e.g. "Yehor/w2v-bert-2.0-uk"
class Wav2Vec2BERTHuggingFace:
  """Speech-to-text wrapper around a Hugging Face CTC model.

  The model (``AutoModelForCTC``) and its processor (``Wav2Vec2BertProcessor``)
  are loaded by ``init()``, not by the constructor, so creating the object is cheap.
  """

  def __init__(self, name):
    """Remember the checkpoint name; nothing is downloaded or loaded here."""
    self.name = name
    self.processor = None
    self.model = None
    #self.forced_decoder_ids = None

  def init(self):
    """Load the model onto the GPU when available (CPU otherwise) and load its processor."""
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    self.model = AutoModelForCTC.from_pretrained(self.name).to(device)
    # NOTE(review): every checkpoint handled by this class is loaded with
    # Wav2Vec2BertProcessor - confirm that this is appropriate for plain
    # wav2vec2 checkpoints as well.
    self.processor = Wav2Vec2BertProcessor.from_pretrained(self.name)

  def process_audio(self, audio_fname, lang=None, task="transcribe"):
    """Transcribe one audio file and return the decoded text as a string.

    Args:
      audio_fname: path of the audio file, read with ``read_audio``.
      lang: unused by this class; kept so the signature matches the other ASR wrapper.
      task: unused by this class (no translation is performed); kept for the same reason.

    Returns:
      The decoded transcription, or "" when no audio could be read.
    """
    speech = read_audio(audio_fname)

    if speech is None:
      print(f'speech == None for {audio_fname}')
      return ""

    inputs = self.processor(speech, sampling_rate=16000, return_tensors="pt")
    # Follow the model's own device instead of the module-level `device`
    # variable: other cells rebind that global (e.g. to "cpu"), which would
    # put the features on a different device than the model.
    features = inputs.input_features.to(self.model.device)

    with torch.no_grad():
      logits = self.model(features).logits

    # Greedy CTC decoding: most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    predictions = self.processor.batch_decode(predicted_ids)

    # A single file is processed, so hand back its text as a plain string
    # (same type as the empty-input path above).
    return predictions[0] if predictions else ""

In [None]:
class ASRPipelineHuggingFace:
  """Speech-to-text wrapper built on the transformers "automatic-speech-recognition" pipeline."""

  def __init__(self, name):
    """Store the checkpoint name; the pipeline itself is created by ``init()``."""
    self.pipe = None
    self.name = name

  def init(self):
    """Create the pipeline on the first CUDA GPU when available, otherwise on the CPU."""
    target = "cuda:0" if torch.cuda.is_available() else "cpu"
    self.pipe = pipeline("automatic-speech-recognition", model=self.name, device=target)

  def process_audio(self, audio_fname, lang=None, task="transcribe", chunk_lenght=30, stride=None):
    """Run the pipeline on one audio file and return the recognised text.

    Args:
      audio_fname: path of the audio file, read with ``read_audio``.
      lang: forwarded as ``language`` in ``generate_kwargs``.
      task: "transcribe" or "translate", forwarded in ``generate_kwargs``.
      chunk_lenght: chunk length in seconds (``chunk_length_s``).
      stride: stride in seconds (``stride_length_s``); ``chunk_lenght / 6`` when None.

    Returns:
      The "text" entry of the pipeline output, or "" when no audio could be read.
    """
    waveform = read_audio(audio_fname)
    if waveform is None:
      print(f'speech == None for {audio_fname}')
      return ""

    # One sixth of the chunk is the pipeline's own default, see
    # https://github.com/huggingface/transformers/blob/5cd16f01db3b5499d4665e8624801ed30ba87bdd/src/transformers/pipelines/automatic_speech_recognition.py
    overlap = chunk_lenght / 6 if stride is None else stride

    # Note: "arampacha/whisper-large-uk-2" does not support the task parameter;
    # for that model it has to be passed the old way.
    generation_options = {"language": lang, "task" : task}

    result = self.pipe(
      waveform,
      return_timestamps=True,
      chunk_length_s=chunk_lenght,
      stride_length_s=overlap,  # a [left, right] list is accepted as well
      generate_kwargs=generation_options,
    )
    return result["text"]

## Classification models

In [None]:
# Module-level device string: the first CUDA GPU when torch reports one, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Currently disabled: a torch.Generator on that device seeded with 42.
#generator = torch.Generator(device=device).manual_seed(42)

In [None]:
def get_classifier(model_name="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"):
  """Build the transformers pipeline that matches a supported classification model.

  Args:
    model_name: one of the model names listed in the branches below.

  Returns:
    The created pipeline, placed on the first CUDA GPU when available and on
    the CPU otherwise; ``None`` (after printing a message) when ``model_name``
    is not one of the supported names.
  """
  device = "cuda:0" if torch.cuda.is_available() else "cpu"

  # Zero-shot models: candidate labels are supplied at prediction time.
  if model_name in ["MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
                    "MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33",
                    "MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33"]:
    return pipeline("zero-shot-classification", model=model_name, device=device)

  # No explicit task: the pipeline factory resolves it for this model.
  if model_name in ["lxyuan/distilbert-base-multilingual-cased-sentiments-student"]:
    return pipeline(model=model_name, device=device)

  if model_name in ["siebert/sentiment-roberta-large-english",
                    "cardiffnlp/twitter-roberta-base-sentiment-latest"]:
    return pipeline("sentiment-analysis", model=model_name, device=device)

  # top_k=None asks for the scores of all labels instead of only the best one.
  # The device is passed here too, like in every other branch, so this model
  # is not left on the CPU when a GPU is available.
  if model_name in ["SamLowe/roberta-base-go_emotions"]:
    return pipeline(task="text-classification", model=model_name, top_k=None, device=device)

  # Make the "unsupported" outcome explicit instead of falling off the end silently.
  print(f"model {model_name} is not supported")
  return None


In [None]:
def classifier_predict(text, classifier, labels):
  """Classify ``text`` and return the name of the winning label.

  Args:
    text: the text to classify. A list/tuple of strings is joined with spaces
      first, so that the character cut below is applied to text and not to
      list items.
    classifier: a callable pipeline. It is called as
      ``classifier(text, labels, max_length=512, truncation=True)`` when
      ``labels`` is non-empty and as ``classifier(text[:512])`` otherwise.
    labels: candidate labels for zero-shot models; None or empty for models
      with a fixed label set.

  Returns:
    The predicted label. Three output shapes are understood:
      * a dict with 'labels' and 'scores' -> the label with the highest score;
      * several scored labels, either ``[{...}, {...}]`` or ``[[{...}, {...}]]``
        -> the best-scoring label other than 'neutral' (falling back to all
        labels when only 'neutral' is present);
      * a single scored label ``[{...}]`` -> that label.
  """
  if isinstance(text, (list, tuple)):
    text = " ".join(text)

  if labels:
    output = classifier(text, labels, max_length=512, truncation=True)
  else:
    output = classifier(text[:512]) # , generator=generator how to pass random seed???

  # Dict output (labels + scores): pick the label with the highest score.
  if isinstance(output, dict):
    scores = output['scores']
    best = max(range(len(scores)), key=scores.__getitem__)
    return output['labels'][best]

  # All-scores output may arrive wrapped in an extra list: [[{...}, {...}]].
  candidates = output[0] if isinstance(output[0], list) else output

  if len(candidates) > 1:
    non_neutral = [c for c in candidates if c.get('label') != 'neutral']
    return max(non_neutral or candidates, key=lambda c: c['score'])['label']

  return candidates[0]['label']

In [None]:
def output_to_binary(result, conflict_labels):
  """Return 1 when ``result`` is one of ``conflict_labels`` and 0 otherwise."""
  return int(result in conflict_labels)

## LLM models

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# The LLM section is pinned to the CPU; the automatic GPU/CPU choice is left commented out.
# Note that this rebinds the module-level `device` name, which predict_llm reads.
#device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"

def get_llm_model(llm_model_name):
  """Load the causal language model and tokenizer for a supported model name.

  Returns:
    ``(model, tokenizer)`` for the supported model; ``(None, None)``, after
    printing a message, for any other name.
  """
  supported_name = "mistralai/Mistral-7B-Instruct-v0.2"

  if llm_model_name != supported_name:
    print(f"model {llm_model_name} is not supported")
    return None, None

  llm = AutoModelForCausalLM.from_pretrained(supported_name)
  llm_tokenizer = AutoTokenizer.from_pretrained(supported_name)
  return llm, llm_tokenizer

In [None]:
def predict_llm(model, tokenizer, prompt_instruct, text):
  """Ask the LLM about ``text`` and return the decoded generation.

  The instruction and the text are combined into a single user message,
  formatted with the tokenizer's chat template, and both the inputs and the
  model are moved to the module-level ``device`` before sampling up to 1000
  new tokens.

  Returns:
    The first decoded sequence returned by ``tokenizer.batch_decode``.
  """
  user_message = f"""
  {prompt_instruct}
  Text: {text}
  """

  chat = [{"role": "user", "content": user_message}]
  input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(device)
  model.to(device)

  output_ids = model.generate(input_ids, max_new_tokens=1000, do_sample=True)
  return tokenizer.batch_decode(output_ids)[0]

In [None]:
def get_binary_from_llm_output(text):
  """Turn the decoded LLM output into a binary conflict flag.

  The answer is taken to be whatever follows the last '[/INST]' marker (the
  whole text when the marker is absent); its first word decides the result.

  Args:
    text: decoded model output.

  Returns:
    1 when the first word of the answer is "conflict" (case-insensitive),
    0 otherwise, including when the answer contains no word at all.
  """
  # rpartition never raises: without the marker the whole text is the answer.
  answer = text.rpartition('[/INST]')[2]
  # First run of word characters, wherever it starts; this also works when the
  # answer follows the marker without a leading space.
  first_word = re.search(r'\w+', answer)
  if first_word is None:
    return 0
  return 1 if first_word.group(0).lower() == "conflict" else 0

# Inference

## Setup

In [None]:
def inference(audio_file_path, asr_model_name, translate, classification_model_name, labels, negative_labels, llm_model_name='', prompt='', debug=True):
  """Run speech recognition on an audio file and decide whether it contains a conflict.

  Args:
    audio_file_path: path of the audio file to analyse.
    asr_model_name: ASR checkpoint name; names containing "w2v" or "wav2vec"
      use ``Wav2Vec2BERTHuggingFace``, all others ``ASRPipelineHuggingFace``.
    translate: True selects the 'translate' ASR task, False 'transcribe'.
    classification_model_name: classifier to use; '' disables classification.
    labels: candidate labels handed to ``classifier_predict``.
    negative_labels: labels that count as a conflict.
    llm_model_name: LLM to use when no classifier is given; '' disables it.
    prompt: instruction passed to the LLM.
    debug: print intermediate results when True.

  Returns:
    ``(asr_output, binary_label)`` where ``binary_label`` is 1 for a conflict
    and 0 otherwise; ``(None, None)`` when neither a classifier nor an LLM is
    configured, or when the configured model could not be created.
  """
  # Validate the configuration first: without a classifier or an LLM there is
  # nothing to do with the transcript, so don't load and run the ASR model.
  if classification_model_name == '' and llm_model_name == '':
    print("Classification model or LLM name is not defined")
    return None, None

  # === ASR model ===
  if "w2v" in asr_model_name or "wav2vec" in asr_model_name:
    asr_model = Wav2Vec2BERTHuggingFace(asr_model_name)
  else:
    asr_model = ASRPipelineHuggingFace(asr_model_name)

  asr_model.init()

  task = 'translate' if translate else 'transcribe'
  asr_out = asr_model.process_audio(audio_file_path, task=task)
  if debug:
    print(f"Output of ASR model ({task} task):")
    display(asr_out)
  # Drop the ASR model before the next model is loaded.
  del asr_model
  torch.cuda.empty_cache()

  # === classification model ===
  if classification_model_name != '':
    classification_model = get_classifier(classification_model_name)
    if classification_model is None:
      # get_classifier yields None for names it does not know.
      print(f"Classifier for {classification_model_name} could not be created")
      return None, None
    class_label = classifier_predict(asr_out, classification_model, labels)
    if debug:
      print(f"\nclassification label: {class_label}")
    binary_label = output_to_binary(class_label, negative_labels)
    del classification_model
    torch.cuda.empty_cache()

  # === LLM model ===
  else:
    llm_model, llm_tokenizer = get_llm_model(llm_model_name)
    if llm_model is None or llm_tokenizer is None:
      # get_llm_model has already reported the unsupported name.
      return None, None
    output = predict_llm(llm_model, llm_tokenizer, prompt, asr_out)
    if debug:
      print(output)
    binary_label = get_binary_from_llm_output(output)
    del llm_model, llm_tokenizer
    torch.cuda.empty_cache()

  if debug:
    print(f"\nRESULT: {'conflict' if binary_label else 'no conflict present'}\n")

  return asr_out, binary_label

## Inference (best results)

In [None]:
%%time
# Run the pipeline with the configuration variables currently in scope; both return values are discarded.
_, _ = inference(audio_file, asr_model_name, translate, classification_model_name, labels, negative_labels, debug=True)

Runtime: about 1 min on a T4 GPU, about 3 min on CPU.

## Inference (original language)

In [None]:
# Configuration for classifying the transcript in the spoken language (translate=False).
asr_model_name = 'openai/whisper-large-v2'
translate = False
# NOTE(review): get_classifier loads this model through the "sentiment-analysis" pipeline, not the
# zero-shot one, so the labels below are not candidate labels for it. Its predictions are described
# earlier in this notebook as 'positive'/'negative'/'neutral', which would never match "негатив".
# A multilingual zero-shot model (e.g. "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli") may have been
# intended here - confirm.
classification_model_name = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
labels = ["позитив","негатив","нейтральне"]
negative_labels = ["негатив"]
# LLM disabled for this run.
llm_model_name = ''
prompt = ''

In [None]:
%%time
# Same call as before, now picking up the original-language configuration set in the previous cell.
_, _ = inference(audio_file, asr_model_name, translate, classification_model_name, labels, negative_labels, debug=True)

## Inference (LLM)

In [None]:
# LLM usage (English).
# Note: this only worked for me on a CPU runtime with High-RAM (it needs ~30GB RAM or a GPU).

# Speech is translated to English before it is given to the LLM.
asr_model_name = "openai/whisper-large-v2"
translate = True

# No classifier in this run: the LLM makes the decision instead.
classification_model_name = ""
labels = []
negative_labels = []

llm_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
prompt_instruction = (
    "Analyze the following text (transcribed from the call center conversation) "
    "and decide if there is a conflict or miscommunication in this text. "
    "Provide 1 word as an answer (conflict or neutral):"
)

In [None]:
%%time
# LLM run: the model name and the instruction are passed positionally as llm_model_name and prompt.
_, _ = inference(audio_file, asr_model_name, translate, classification_model_name, labels, negative_labels, llm_model_name, prompt_instruction, debug=True)