##Setup

In [1]:
!pip install SpeechRecognition sentence_transformers pydub num2words

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.0-py2.py3-none-any.whl (32.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting num2words
  Downloading num2words-0.5.12-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m23.5 MB/s[0m eta [36

In [2]:
import os
import speech_recognition as sr
from pydub import AudioSegment
from sentence_transformers import SentenceTransformer
from num2words import num2words
import string
import re
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
import shutil

In [3]:
drive.mount('/content/MyDrive')

Mounted at /content/MyDrive


In [4]:
r = sr.Recognizer()
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Functions

In [5]:
def transcribe_audio(audio_path):
  audio = AudioSegment.from_file(audio_path)
  audio.export("temp.wav", format="wav")

  with sr.AudioFile("temp.wav") as source:
    text = r.listen(source)

    transcribed_text = r.recognize_google(text, language="zh-ZH")

  os.remove("temp.wav")

  return transcribed_text

In [6]:
def convert_numerals(text):
    def replace_numerals(match):
        num = match.group(0).replace(',', '')
        return num2words(int(num), lang='zh')

    pattern = r'\b\d{1,3}(,\d{3})*\b'
    words = text.split()
    for word in words:
      try:
        converted_text = re.sub(pattern, replace_numerals, text)
      except:
        pass

    clean_text = ' '.join(words)
    return clean_text

In [7]:
def strip_punctuation(text):
    punctuation = string.punctuation + '«»¡¿'
    translator = str.maketrans("", "", punctuation)

    text_stripped = text.translate(translator)

    return text_stripped

In [8]:
def normalize_text(text):
      text = text.replace('\t', '')
      text = text.replace('\xa0\xa0','')
      stripped_text = strip_punctuation(text)
      normalized_text = convert_numerals(stripped_text)

      return normalized_text

In [9]:
def embed_sentences(original, transcribed):
  sentences = []
  sentences.append(original.lower())
  sentences.append(transcribed.lower())

  sentence_embeddings = model.encode(sentences)

  similarity = cosine_similarity([sentence_embeddings[0]], [sentence_embeddings[1]])

  return similarity[0][0]

##Evaluation

In [13]:
books = ['PSA', 'REV', 'ROM', 'RUT',
         'SNG', 'TIT', 'ZEC', 'ZEP']
         #'1CH', '1CO', '1JN', '1KI', '1PE', '1SA', '1TH', '1TI', '2CH', '2CO',
         #'2JN', '2KI', '2PE', '2SA', '2TH', '2TI', '3JN','ACT', 'AMO', 'COL',
         #'DAN', 'DEU', 'ECC', 'EPH', 'EST', 'EXO', 'EZK', 'EZR', 'GAL', 'HAB',
         #'HAG', 'HEB', 'HOS', 'ISA', 'JAS', 'JDG', 'JER', 'JOB', 'JOL', 'JON',
         #'JOS', 'JUD', 'LAM', 'LEV', 'LUK', 'MAL', 'MAT', 'MIC', 'MRK', 'NAM',
         #'NEH', 'NUM', 'OBA', 'PHM', 'PHP', 'PRO',
redo = []

In [14]:
for book in books:
  audio_folder = f'/content/MyDrive/Shareddrives/Bible Data Team/Audio Bible Experiments/Chinese/Audio/{book}'
  og_text = f'/content/MyDrive/Shareddrives/Bible Data Team/Audio Bible Experiments/Chinese/Text/{book}'
  for filename in os.listdir(audio_folder):
        clean_filename = filename.replace(' Copy', '')
        if filename.endswith('.wav'):
            audio_path = os.path.join(audio_folder, filename)
            txt_file = clean_filename[:-4].replace('mixdown_Track ', '')
            original_text_path = os.path.join(og_text, txt_file + ".txt")

            try:
              with open(original_text_path, 'r', encoding='utf-8') as file:
                  original_text = file.read()

              original_n_text = normalize_text(original_text)

              print(f'Comparing Audio for {txt_file}')

              try:
                transcribed_text = transcribe_audio(audio_path)

              # If the audio is too long
              except:
                chunk_output = '/content/chunks'
                os.makedirs(chunk_output, exist_ok = True)
                chunk_length = 60 * 1000 * 4 #4 minutes
                audio = AudioSegment.from_file(audio_path)
                chunk_count = len(audio) // chunk_length + 1
                transcribed_texts = []

                for i in range(chunk_count):
                    start_time = i * chunk_length
                    end_time = (i + 1) * chunk_length

                    chunk = audio[start_time:end_time]
                    chunk_filename = f"{txt_file}_{i}.wav"
                    output_path = os.path.join(chunk_output, chunk_filename)

                    chunk.export(output_path, format="wav")

                for filename in os.listdir(chunk_output):
                    chunk_path = os.path.join(chunk_output, filename)
                    try:
                      transcribed_texts.append(transcribe_audio(chunk_path))
                    except:
                      pass

                transcribed_text = ' '.join(transcribed_texts)
                shutil.rmtree(chunk_output)

              similarity = embed_sentences(original_n_text, transcribed_text)

              if similarity > .8:
                quality = 'Good Enough'
              else:
                quality = 'Questionable'
                redo.append(txt_file)

              print(f'Quality: {quality}')

            except FileNotFoundError:
              pass

if len(redo) > 0:
    print(f'The following chapters should be checked and possibly redone: {sorted(redo)}')
else:
    print('All audio is of acceptable quality.')

Comparing Audio for PSA_45
Quality: Good Enough
Comparing Audio for PSA_46
Quality: Good Enough
Comparing Audio for PSA_47
Quality: Good Enough
Comparing Audio for PSA_42
Quality: Good Enough
Comparing Audio for PSA_48
Quality: Good Enough
Comparing Audio for PSA_33
Quality: Questionable
Comparing Audio for PSA_36
Quality: Good Enough
Comparing Audio for PSA_37
Quality: Good Enough
Comparing Audio for PSA_28
Quality: Questionable
Comparing Audio for PSA_35
Quality: Questionable
Comparing Audio for PSA_39
Quality: Questionable
Comparing Audio for PSA_27
Quality: Questionable
Comparing Audio for PSA_43
Quality: Good Enough
Comparing Audio for PSA_26
Quality: Good Enough
Comparing Audio for PSA_44
Quality: Questionable
Comparing Audio for PSA_41
Quality: Questionable
Comparing Audio for PSA_38
Quality: Questionable
Comparing Audio for PSA_32
Quality: Good Enough
Comparing Audio for PSA_30
Quality: Questionable
Comparing Audio for PSA_34
Quality: Questionable
Comparing Audio for PSA_29
Qua

In [15]:
redo

['PSA_33',
 'PSA_28',
 'PSA_35',
 'PSA_39',
 'PSA_27',
 'PSA_44',
 'PSA_41',
 'PSA_38',
 'PSA_30',
 'PSA_34',
 'PSA_3',
 'PSA_2',
 'PSA_8',
 'PSA_12',
 'PSA_4',
 'PSA_10',
 'PSA_19',
 'PSA_21',
 'PSA_24',
 'PSA_13',
 'PSA_25',
 'PSA_14',
 'PSA_80',
 'PSA_89',
 'PSA_84',
 'PSA_88',
 'PSA_79',
 'PSA_78',
 'PSA_81',
 'PSA_86',
 'PSA_75',
 'PSA_90',
 'PSA_95',
 'PSA_99',
 'PSA_94',
 'PSA_93',
 'PSA_115',
 'PSA_97',
 'PSA_109',
 'PSA_116',
 'PSA_104',
 'PSA_110',
 'PSA_105',
 'PSA_107',
 'PSA_102',
 'PSA_106',
 'PSA_117',
 'PSA_118',
 'PSA_108',
 'PSA_126',
 'PSA_49',
 'PSA_55',
 'PSA_56',
 'PSA_65',
 'PSA_66',
 'PSA_64',
 'PSA_53',
 'PSA_60',
 'PSA_71',
 'PSA_150',
 'PSA_132',
 'PSA_129',
 'PSA_141',
 'PSA_140',
 'PSA_136',
 'PSA_135',
 'PSA_145',
 'PSA_130',
 'PSA_139',
 'PSA_128',
 'PSA_149',
 'REV_20',
 'REV_1',
 'REV_5',
 'REV_22',
 'REV_6',
 'REV_12',
 'REV_14',
 'REV_16',
 'REV_18',
 'REV_7',
 'REV_10',
 'REV_8',
 'ROM_8',
 'ROM_7',
 'ROM_13',
 'ROM_11',
 'ROM_14',
 'ROM_3',
 'ROM_15

In [9]:
chs = [
12,
3,
7,
5,
6,
8,
2,
10,
9,
4,
11,
13,
1,
]
print(sorted(chs))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
