## Setup

In [None]:
!pip install SpeechRecognition sentence_transformers pydub num2words



In [None]:
import os
import speech_recognition as sr
from pydub import AudioSegment
from sentence_transformers import SentenceTransformer
from num2words import num2words
import string
import re
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
import shutil

In [None]:
# Mount Google Drive; the audio and text folders read during evaluation
# live under this mount point.
DRIVE_MOUNT_POINT = '/content/MyDrive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/MyDrive; to attempt to forcibly remount, call drive.mount("/content/MyDrive", force_remount=True).


In [None]:
# Sentence-embedding model used to compare the written text with the transcript.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Shared speech recognizer used by every transcription call.
r = sr.Recognizer()

## Functions

In [None]:
def transcribe_audio(audio_path):
  """Transcribe an audio file to Spanish text with ``r.recognize_google``.

  The input is first re-exported as ``temp.wav`` in the current working
  directory (``sr.AudioFile`` is given a WAV file), the whole file is read
  into memory, and the audio is sent to the recognizer with ``es-ES`` as
  the language.

  Args:
    audio_path: Path to an audio file that ``AudioSegment.from_file`` can read.

  Returns:
    The transcription string returned by ``recognize_google``.

  Raises:
    Whatever ``recognize_google`` raises when recognition fails (per the
    SpeechRecognition docs: ``sr.UnknownValueError`` for unintelligible
    speech, ``sr.RequestError`` for API/connection problems). Callers rely
    on this to fall back to chunked transcription.
  """
  temp_wav = "temp.wav"

  try:
    AudioSegment.from_file(audio_path).export(temp_wav, format="wav")

    with sr.AudioFile(temp_wav) as source:
      # record() reads until the end of the file. listen() is meant for
      # live input: it returns after the first pause in speech, so only
      # the opening phrase of the recording would be transcribed.
      audio_data = r.record(source)

    return r.recognize_google(audio_data, language="es-ES")
  finally:
    # Always clean up the scratch file, including when recognition raises
    # (the normal path for audio that is too long for a single request).
    if os.path.exists(temp_wav):
      os.remove(temp_wav)

In [None]:
def convert_numerals(text):
    """Spell out the numerals found in ``text`` as Spanish words.

    Matches either a comma-grouped number (``1,000``) or a plain run of
    digits (``7``, ``2500``) delimited by word boundaries, and replaces each
    match with ``num2words(<value>, lang='es')``. Runs of whitespace in the
    result are collapsed to single spaces and leading/trailing whitespace is
    dropped.

    Args:
        text: The string to process.

    Returns:
        ``text`` with numerals replaced by words and whitespace normalized.
    """
    def replace_numerals(match):
        digits = match.group(0).replace(',', '')
        try:
            return num2words(int(digits), lang='es')
        except (ValueError, OverflowError):
            # Best effort: leave a numeral that cannot be spelled out as-is
            # instead of aborting the whole normalization.
            return match.group(0)

    # First alternative: comma-grouped thousands. Second alternative: any
    # plain digit run, so unseparated numbers of four or more digits (what
    # remains once punctuation has been stripped) are converted as well.
    pattern = r'\b\d{1,3}(?:,\d{3})+\b|\b\d+\b'
    converted_text = re.sub(pattern, replace_numerals, text)

    # Same whitespace normalization the split()/join() round-trip performed.
    return ' '.join(converted_text.split())

In [None]:
def strip_punctuation(text):
    """Return ``text`` without punctuation characters.

    Removes every character in ``string.punctuation`` plus the guillemets
    and inverted marks ``« » ¡ ¿``; all other characters are kept as-is.
    """
    removable = string.punctuation + '«»¡¿'
    # A translation table mapping a code point to None deletes that character.
    deletion_table = {ord(symbol): None for symbol in removable}
    return text.translate(deletion_table)

In [None]:
def normalize_text(text):
    """Clean ``text`` before it is compared with a transcription.

    Deletes tab characters and doubled non-breaking spaces, then passes the
    result through ``strip_punctuation`` and finally ``convert_numerals``.
    """
    # Removal order matches the original sequence: tabs first, then the
    # double non-breaking-space runs.
    for unwanted in ('\t', '\xa0\xa0'):
        text = text.replace(unwanted, '')

    return convert_numerals(strip_punctuation(text))

In [None]:
def embed_sentences(original, transcribed):
  """Score how similar two texts are using sentence embeddings.

  Both strings are lower-cased, encoded together with the global ``model``,
  and the cosine similarity of the two resulting vectors is returned.
  """
  original_vec, transcribed_vec = model.encode(
      [original.lower(), transcribed.lower()]
  )

  # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
  score_matrix = cosine_similarity([original_vec], [transcribed_vec])
  return score_matrix[0][0]

## Evaluation

In [None]:
# Three-letter book codes. Each code is used as the sub-folder name under the
# Audio and Text directories when the evaluation loop runs.
# NOTE(review): this list has no entries for Genesis or John — confirm that
# is intentional.
books = ['1CH', '1CO', '1JN', '1KI', '1PE', '1SA', '1TH', '1TI', '2CH', '2CO',
         '2JN', '2KI', '2PE', '2SA', '2TH', '2TI', '3JN', 'ACT', 'AMO', 'COL',
         'DAN', 'DEU', 'ECC', 'EPH', 'EST', 'EXO', 'EZK', 'EZR', 'GAL', 'HAB',
         'HAG', 'HEB', 'HOS', 'ISA', 'JAS', 'JDG', 'JER', 'JOB', 'JOL', 'JON',
         'JOS', 'JUD', 'LAM', 'LEV', 'LUK', 'MAL', 'MAT', 'MIC', 'MRK', 'NAH',
         'NEH', 'NUM', 'OBA', 'PHM', 'PHP', 'PRO', 'PSA', 'REV', 'ROM', 'RUT',
         'SNG', 'TIT', 'ZEC', 'ZEP']

# Chapters flagged as 'Questionable' by the evaluation loop are collected here.
redo = []

In [None]:
# Settings for the evaluation run.
BASE_DIR = '/content/MyDrive/Shareddrives/Bible Data Team/Audio Bible Experiments/NVI'
CHUNK_DIR = '/content/chunks'          # scratch folder for the chunk fallback
CHUNK_LENGTH_MS = 60 * 1000 * 4        # 4 minutes (pydub lengths are in ms)
SIMILARITY_THRESHOLD = .8              # above this a chapter is 'Good Enough'


def _transcribe_in_chunks(audio_path, label):
  """Transcribe ``audio_path`` piece by piece and join the results.

  The audio is cut into consecutive slices of ``CHUNK_LENGTH_MS``; each slice
  is written to ``CHUNK_DIR`` and transcribed immediately, so the pieces are
  joined in playback order. A slice that cannot be transcribed is reported
  and skipped. ``CHUNK_DIR`` is removed afterwards, even on error.

  Args:
    audio_path: Path of the audio file to split.
    label: Name used as the prefix of the chunk files and in messages.

  Returns:
    The chunk transcriptions joined with single spaces ('' if none succeeded).
  """
  audio = AudioSegment.from_file(audio_path)
  os.makedirs(CHUNK_DIR, exist_ok=True)
  transcribed_texts = []

  try:
    # Stepping through range() yields no empty trailing chunk when the audio
    # length is an exact multiple of the chunk length.
    for i, start_time in enumerate(range(0, len(audio), CHUNK_LENGTH_MS)):
      chunk_path = os.path.join(CHUNK_DIR, f"{label}_{i}.wav")
      chunk = audio[start_time:start_time + CHUNK_LENGTH_MS]
      chunk.export(chunk_path, format="wav")

      try:
        transcribed_texts.append(transcribe_audio(chunk_path))
      except Exception as err:
        # Best effort: one bad chunk should not discard the others.
        print(f'  Could not transcribe chunk {i} of {label}: {err!r}')
  finally:
    shutil.rmtree(CHUNK_DIR, ignore_errors=True)

  return ' '.join(transcribed_texts)


for book in books:
  audio_folder = f'{BASE_DIR}/Audio/{book}'
  og_text = f'{BASE_DIR}/Text/{book}'

  if not os.path.isdir(audio_folder):
    print(f'Skipping {book}: audio folder not found')
    continue

  for filename in os.listdir(audio_folder):
    if not filename.endswith('.wav'):
      continue

    # "mixdown_Track JAS_1 Copy.wav" -> "JAS_1"
    clean_filename = filename.replace(' Copy', '')
    txt_file = clean_filename[:-4].replace('mixdown_Track ', '')
    audio_path = os.path.join(audio_folder, filename)
    original_text_path = os.path.join(og_text, txt_file + ".txt")

    # Only a missing reference text is a reason to skip the recording;
    # other errors should not be hidden by this handler.
    try:
      with open(original_text_path, 'r', encoding='utf-8') as file:
        original_text = file.read()
    except FileNotFoundError:
      print(f'Skipping {txt_file}: no reference text found')
      continue

    original_n_text = normalize_text(original_text)

    print(f'Comparing Audio for {txt_file}')

    try:
      transcribed_text = transcribe_audio(audio_path)
    except Exception as err:
      # Typically the audio is too long for a single request.
      print(f'  Full-file transcription failed ({err!r}); retrying in chunks')
      transcribed_text = _transcribe_in_chunks(audio_path, txt_file)

    # NOTE(review): only the reference text goes through normalize_text; the
    # transcription is compared as returned (lower-cased in embed_sentences).
    # Confirm that this asymmetry is intended.
    similarity = embed_sentences(original_n_text, transcribed_text)

    if similarity > SIMILARITY_THRESHOLD:
      quality = 'Good Enough'
    else:
      quality = 'Questionable'
      redo.append(txt_file)

    print(f'Quality: {quality}')

if len(redo) > 0:
    print(f'The following chapters should be checked and possibly redone: {sorted(redo)}')
else:
    print('All audio is of acceptable quality.')

Comparing Audio for JAS_1
Quality: Good Enough
Comparing Audio for JAS_2
Quality: Good Enough
Comparing Audio for JAS_3
Quality: Good Enough
Comparing Audio for JAS_5
Quality: Good Enough
Comparing Audio for JAS_4
Quality: Good Enough
Comparing Audio for SNG_5
Quality: Good Enough
Comparing Audio for SNG_6
Quality: Good Enough
Comparing Audio for SNG_8
Quality: Good Enough
Comparing Audio for SNG_7
Quality: Good Enough
Comparing Audio for SNG_3
Quality: Good Enough
Comparing Audio for SNG_2
Quality: Good Enough
Comparing Audio for SNG_1
Quality: Good Enough
Comparing Audio for SNG_4
Quality: Good Enough
Comparing Audio for RUT_2
Quality: Good Enough
Comparing Audio for RUT_1
Quality: Good Enough
Comparing Audio for RUT_4
Quality: Good Enough
Comparing Audio for RUT_3
Quality: Good Enough
Comparing Audio for MIC_3
Quality: Good Enough
Comparing Audio for MIC_7
Quality: Good Enough
Comparing Audio for MIC_4
Quality: Good Enough
Comparing Audio for MIC_2
Quality: Good Enough
Comparing Aud