# VerbaScribe: Transform speech into text like a pro!
## Author: Seyi Ayobami Ayeni (saa2250)

In [1]:
!pip install pydub
!apt install ffmpegq
!pip install SpeechRecognition
import os
import glob
import speech_recognition as sr
from pydub import AudioSegment
from google.colab import files

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
[1;31mE: [0mUnable to locate package ffmpegq[0m
Collecting SpeechRecognition
  Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl.metadata (28 kB)
Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.11.0


In [2]:
def upload_audio():
  """Prompts for a file upload in Colab and returns the uploaded file's name.

  Returns:
    The name of the first uploaded file.

  Raises:
    ValueError: If the upload finished without any file (for example the
      dialog was cancelled), instead of failing later with an IndexError.
  """
  uploaded = files.upload()
  if not uploaded:
    raise ValueError("No file was uploaded; please select an mp3 file.")
  return list(uploaded.keys())[0]  # Returns the filename

In [3]:
def convert_to_wav(mp3_file):
  """Converts an mp3 audio file to wav format using ffmpeg.

  The wav file is written next to the input; its name is ``mp3_file`` with the
  last four characters (expected to be ".mp3") replaced by ".wav".

  Args:
    mp3_file: Path of the mp3 file to convert.

  Returns:
    The path of the wav file that was written.

  Raises:
    subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    FileNotFoundError: If the ffmpeg executable cannot be found.
  """
  import subprocess  # local import keeps this cell self-contained

  wav_file = f"{mp3_file[:-4]}.wav"
  # Arguments are passed as a list (no shell), so file names containing spaces,
  # parentheses or other shell metacharacters are handled safely.
  # "-y" overwrites an existing output so re-running the notebook does not stop
  # at ffmpeg's "overwrite?" prompt; check=True surfaces conversion failures.
  subprocess.run(["ffmpeg", "-y", "-i", mp3_file, wav_file], check=True)
  return wav_file

In [4]:
def split_audio(file_path, chunk_length_ms):
  """Splits a wav file into consecutive chunks stored in "audio_chunks/".

  Chunks are written as "audio_chunks/chunk_<index>.wav", numbered from 0 in
  playback order. The last chunk may be shorter than ``chunk_length_ms``.

  Args:
    file_path: Path of the wav file to split.
    chunk_length_ms: Length of each chunk; must be positive. It is used as the
      slice step over the loaded audio (pydub slices by milliseconds).

  Returns:
    The list of chunk file paths that were written, in order.

  Raises:
    ValueError: If ``chunk_length_ms`` is not positive.
  """
  if chunk_length_ms <= 0:
    raise ValueError("chunk_length_ms must be a positive number of milliseconds")

  audio = AudioSegment.from_wav(file_path)
  os.makedirs("audio_chunks", exist_ok=True)

  # Remove chunks left behind by an earlier run: a previous, longer recording
  # would otherwise leave extra chunk_N.wav files that get transcribed again.
  for stale_chunk in glob.glob("audio_chunks/chunk_*.wav"):
    os.remove(stale_chunk)

  chunk_paths = []
  for index, start in enumerate(range(0, len(audio), chunk_length_ms)):
    chunk_path = f"audio_chunks/chunk_{index}.wav"
    audio[start:start + chunk_length_ms].export(chunk_path, format="wav")
    chunk_paths.append(chunk_path)
  return chunk_paths

In [5]:
def transcribe_audio(file_path, language="ha"):
  """Transcribes an audio file using Google Speech Recognition.

  Args:
    file_path: Path of an audio file readable by ``sr.AudioFile``.
    language: Language tag passed to the recognizer. Defaults to "ha"
      (the ISO 639-1 code for Hausa).

  Returns:
    The recognized text. If recognition fails, a human-readable message
    describing the failure is returned instead of raising.
  """
  recognizer = sr.Recognizer()
  with sr.AudioFile(file_path) as source:
    # No adjust_for_ambient_noise() here: on a file source it reads (and
    # discards) the opening of the recording for calibration, so the start of
    # every chunk would be missing from the transcript. The threshold it tunes
    # is only used by listen(), not by record().
    audio_data = recognizer.record(source)

  # The whole file is already in memory, so the network request runs after
  # the file has been closed.
  try:
    return recognizer.recognize_google(audio_data, language=language)
  except sr.UnknownValueError:
    return f"Could not understand audio in {file_path}"
  except sr.RequestError as e:
    return f"Could not request results from Google Speech Recognition service; {e}"

In [6]:
def main():
  """Runs the full pipeline: upload, convert, split, transcribe, save.

  Uploads an mp3 file, converts it to wav, splits the wav into 1-minute
  chunks, transcribes each chunk in order, writes one line per chunk to
  "transcription.txt" and prints the transcript.
  """
  mp3_file = upload_audio()
  convert_to_wav(mp3_file)

  split_audio(f"{mp3_file[:-4]}.wav", 60000)  # Split into 1-minute chunks

  # Sort by the numeric chunk index so chunk_10 comes after chunk_2
  # (a plain string sort would put it right after chunk_1).
  wav_files = sorted(glob.glob("audio_chunks/chunk_*.wav"), key=lambda x: int(x.split('_')[-1].split('.')[0]))

  transcriptions = []
  for wav_file in wav_files:
    print(f"Transcribing {wav_file}...")
    transcriptions.append(transcribe_audio(wav_file))

  # Explicit UTF-8: transcripts can contain non-ASCII letters, and the
  # platform default encoding is not guaranteed to be able to represent them.
  with open("transcription.txt", "w", encoding="utf-8") as file:
    file.writelines(f"{line}\n" for line in transcriptions)

  print("Transcription saved to transcription.txt")
  print("\nPlease find below the transcript:")
  for transcription in transcriptions:
    print(transcription)

In [7]:
import shutil

def clean_up_chunks():
  """Deletes the "audio_chunks" directory and every chunk inside it.

  Safe to call more than once: if the directory does not exist (already
  cleaned up, or nothing was split yet) the call does nothing.
  """
  try:
    shutil.rmtree("audio_chunks")
  except FileNotFoundError:
    # Nothing to remove; other errors (e.g. permissions) still propagate.
    pass

In [8]:
# Run the solution: main() asks for an upload, converts the file to wav,
# splits it into chunks, transcribes them and saves transcription.txt.
main()

Saving 1.mp3 to 1.mp3
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis 

In [9]:
# Remove the chunks (uncomment the call below once the transcript has been saved)
# clean_up_chunks()