In [1]:
%%capture
# Install the packages this notebook imports; %%capture keeps pip's output out of the cell.
!pip install transformers
!pip install ffmpeg-python
!pip install librosa
!pip install bnunicodenormalizer
!pip install wordfreq
!pip install symspellpy
# kenlm is installed from the GitHub master archive, together with pyctcdecode.
!pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode

In [2]:
import torch
import torchaudio
from transformers import ( Wav2Vec2CTCTokenizer,
                          Wav2Vec2ForCTC,
                          Wav2Vec2Processor,
                          Wav2Vec2ProcessorWithLM)
import librosa


from wordfreq import (word_frequency,
                      top_n_list,
                      get_frequency_dict,
                      zipf_frequency)

from symspellpy import SymSpell, Verbosity
from itertools import islice

from bnunicodenormalizer import Normalizer 

bnorm=Normalizer()

import warnings
warnings.filterwarnings('ignore')

In [3]:
!wget https://gitlab.com/mushrafi88/dlsprint/-/raw/main/csv_files/symspell.txt?inline=false -O symspell.txt

--2022-09-01 07:07:34--  https://gitlab.com/mushrafi88/dlsprint/-/raw/main/csv_files/symspell.txt?inline=false
Resolving gitlab.com (gitlab.com)... 172.65.251.78, 2606:4700:90:0:f22e:fbec:5bed:a9b9
Connecting to gitlab.com (gitlab.com)|172.65.251.78|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7191245 (6.9M) [application/octet-stream]
Saving to: ‘symspell.txt’


2022-09-01 07:07:37 (4.15 MB/s) - ‘symspell.txt’ saved [7191245/7191245]



In [4]:
# Both SymSpell instances read the same dictionary file. The relative path
# matches the `-O symspell.txt` target used when the file is downloaded.
dictionary_path = 'symspell.txt'

# Instance used by word_segmentation(); no edit distance is indexed.
sym_spell_word_segmentation = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
# Instance used by dictionary_() for spelling lookups.
sym_spell = SymSpell(max_dictionary_edit_distance=5, prefix_length=7)

for _speller in (sym_spell_word_segmentation, sym_spell):
    # load_dictionary signals a missing file by returning False rather than
    # raising, so the result is checked to fail loudly here instead of
    # silently producing empty dictionaries.
    if not _speller.load_dictionary(dictionary_path, 0, 1, separator=","):
        raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")

True

In [5]:
# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [6]:
def word_segmentation(input_term):
    """Segment ``input_term`` with ``sym_spell_word_segmentation``.

    Returns the ``corrected_string`` attribute of the segmentation result.
    """
    return sym_spell_word_segmentation.word_segmentation(input_term).corrected_string

In [29]:
def dictionary_(word):
    """Return the closest dictionary term for ``word``.

    ``word`` is looked up in ``sym_spell`` with ``Verbosity.CLOSEST``, a maximum
    edit distance of 1 and ``include_unknown=True``.

    Returns:
        The term of the first suggestion, or ``word`` unchanged if the lookup
        yields no suggestion at all (so callers never receive ``None``).
    """
    suggestions = sym_spell.lookup(
        word, Verbosity.CLOSEST, max_edit_distance=1, include_unknown=True)
    if not suggestions:
        return word
    # Read the term attribute directly instead of splitting str(suggestion) on
    # commas, which would truncate any term that itself contains a comma.
    return suggestions[0].term

In [8]:
def lookup(sen):
    """Clean up the words of a sentence and return them joined by spaces.

    The sentence is split on whitespace. Any word longer than 16 characters is
    passed through ``word_segmentation`` and replaced by the pieces it returns.
    Afterwards every word whose ``word_frequency`` in the Bengali ('bn')
    'large' word list is 0.0 is replaced by the result of ``dictionary_``.
    """
    tokens = []
    for token in sen.split():
        if len(token) > 16:
            tokens.extend(word_segmentation(token).split())
        else:
            tokens.append(token)

    corrected = [
        dictionary_(token)
        if word_frequency(token, 'bn', wordlist='large', minimum=0.0) == 0.0
        else token
        for token in tokens
    ]
    return ' '.join(corrected)

In [9]:
def punctuation(sen):
    """Append sentence-final punctuation to ``sen``.

    The sentence is stripped of surrounding whitespace. If any of its
    whitespace-separated words is one of the question words below, a '?' is
    appended; otherwise the danda '।' is appended.

    Returns:
        The punctuated sentence, or an empty string when ``sen`` is empty or
        contains only whitespace.
    """
    sen = sen.strip()
    if not sen:
        return sen
    q_words = {"কি", "কই" ,"কয়জন" ,"কে" ,"কিভাবে" ,"কবে" ,"কখন"}
    # Compare whole words: a substring test would also fire on ordinary words
    # that merely contain a question word (e.g. "কে" inside "তাকে").
    if any(token in q_words for token in sen.split()):
        return sen + '?'
    return sen + '।'

In [10]:
# Identifier handed to from_pretrained() for both the model and the processor.
model_path='mushrafi88/wav2vec2_xlsr_300m_bn_6gram_arpa'

In [11]:
# Load the CTC model from `model_path` and move it to `device`.
model = Wav2Vec2ForCTC.from_pretrained(model_path).to(device)
# Load the matching processor (the WithLM variant) from the same `model_path`.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_path)

Downloading config.json:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/404 [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/84.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/151M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

In [30]:
def infer(audio_path):
    """Transcribe audio and return the post-processed sentence.

    ``audio_path`` is handed straight to ``processor`` with
    ``sampling_rate=16_000``; despite its name, this notebook calls the
    function with a waveform array rather than a file path.

    The first decoded transcription is passed through ``lookup`` and then
    ``punctuation``, and that result is returned.
    """
    features = processor(audio_path, sampling_rate=16_000, return_tensors="pt").to(device)
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        logits = model(**features).logits
    decoded = processor.batch_decode(logits.cpu().numpy()).text
    return punctuation(lookup(decoded[0]))

In [13]:
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# HTML/JS injected into the cell output by get_audio(). It starts recording
# from the microphone, stops when the button is pressed, and exposes the
# recording as a base64 data URL through the global promise `data`, which the
# Python side reads with eval_js("data").
#
# `data` is resolved from FileReader.onloadend, i.e. exactly when the encoded
# recording is available, instead of after a fixed delay. A fixed delay can
# resolve too early and hand Python a placeholder value instead of a data URL.
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var recorder, gumStream;
var recordButton = my_btn;

// Promise read from Python; resolveData is called once the recording is encoded.
var resolveData;
var data = new Promise(resolve => { resolveData = resolve; });

var handleSuccess = function(stream) {
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    var reader = new FileReader();
    reader.onloadend = function() {
      resolveData(reader.result.toString());
    };
    reader.readAsDataURL(e.data);
  };
  recorder.start();
};

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);

function toggleRecording() {
  if (recorder && recorder.state == "recording") {
    recorder.stop();
    gumStream.getAudioTracks()[0].stop();
    recordButton.innerText = "Saving the recording... pls wait!"
  }
}

recordButton.onclick = toggleRecording;
</script>
"""

def get_audio():
  """Record audio in the browser and return it as ``(audio, sr)``.

  Displays ``AUDIO_HTML``, waits for the JavaScript value ``data`` (a base64
  data URL of the recording), converts it to WAV by piping it through ffmpeg,
  and parses the result with ``wav_read``.

  Returns:
      ``(audio, sr)``: the samples and sample rate as returned by ``wav_read``
      (note the order is swapped relative to ``wav_read``'s own return).

  Raises:
      RuntimeError: if the browser returned no audio payload, or ffmpeg failed
      to produce WAV output.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # A data URL looks like "<header>,<base64 payload>"; without the comma there
  # is no recording to decode.
  _header, comma, payload = data.partition(',')
  if not comma:
    raise RuntimeError("Recording failed: the browser returned no audio data")
  binary = b64decode(payload)

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)

  # Surface ffmpeg failures with its own stderr instead of letting a truncated
  # or empty buffer fail later inside the WAV parser.
  if process.returncode != 0 or len(output) < 8:
    raise RuntimeError(
      "ffmpeg could not convert the recording to WAV: "
      + (err or b"").decode(errors="replace")
    )

  # Replace bytes 4:8 with the actual size of the RIFF chunk (total length
  # minus the 8-byte header), encoded as 4 little-endian bytes. Presumably
  # ffmpeg cannot fill this in itself when writing to a pipe.
  riff_chunk_size = len(output) - 8
  riff = output[:4] + riff_chunk_size.to_bytes(4, 'little') + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

In [37]:
# Record from the microphone; get_audio() returns the samples and their sample rate.
audio, sr = get_audio()

In [38]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# expose scipy.io.wavfile on its own.
import scipy.io.wavfile

# Save the recording at its current sample rate, then load it back directly at
# the 16 kHz rate that infer() passes to the processor. Loading at the target
# rate avoids resampling twice (librosa's default rate, then 16 kHz). It also
# keeps `audio` and `sr` consistent with each other, so re-running this cell
# without re-recording no longer writes 16 kHz samples under a different rate.
scipy.io.wavfile.write('recording.wav', sr, audio)
audio, sr = librosa.load("recording.wav", sr=16_000)

In [39]:
# Run the full pipeline (infer) on the prepared audio and display the resulting sentence.
sen=infer(audio)
sen

'আমার নাম মাশরাফি উনি সুষম।'