# In this demo, we will:

1. Download whisper.cpp (ggml-whisper), a fast C/C++ implementation of Whisper, and compile it
2. Convert audio files of any format to 16-bit WAV (the only input format the whisper.cpp `main` example currently supports)
3. Convert each transcription to CSV and save it for model training later

## Mount Google Drive (where the example audio files are stored)

In [1]:
# Mount Google Drive at /content/drive so that later cells can read and write
# files under /content/drive/MyDrive. `google.colab` is only available on Colab.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## Download ggml-whisper repo and compile

In [2]:
# Fetch the whisper.cpp sources into ./whisper.cpp (relative to the current directory).
# NOTE(review): the clone is not pinned to a tag or commit, so the later cells
# depend on whatever the default branch provides at run time — consider pinning.
!git clone https://github.com/ggerganov/whisper.cpp.git

Cloning into 'whisper.cpp'...
remote: Enumerating objects: 4091, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 4091 (delta 29), reused 47 (delta 15), pack-reused 4015[K
Receiving objects: 100% (4091/4091), 7.00 MiB | 10.68 MiB/s, done.
Resolving deltas: 100% (2552/2552), done.


In [3]:
# Download the base.en ggml model from inside whisper.cpp/models, then step back
# to the repository root and build. The working directory is left at
# /content/whisper.cpp, which is where later cells invoke ./main.
%cd /content/whisper.cpp/models
!bash download-ggml-model.sh base.en
%cd ..
!make

/content/whisper.cpp/models
Downloading ggml model base.en from 'https://huggingface.co/ggerganov/whisper.cpp' ...
Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
You can now use it like this:

  $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav

/content/whisper.cpp
I whisper.cpp build info: 
I UNAME_S:  Linux
I UNAME_P:  x86_64
I UNAME_M:  x86_64
I CFLAGS:   -I.              -O3 -DNDEBUG -std=c11   -fPIC -pthread -mavx2 -mfma -mf16c -mavx -msse3
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS:  
I CC:       cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
I CXX:      g++ (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0

cc  -I.              -O3 -DNDEBUG -std=c11   -fPIC -pthread -mavx2 -mfma -mf16c -mavx -msse3   -c ggml.c -o ggml.o
g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -c whisper.cpp -o whisper.o
g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/main/main.cpp examples/common.cpp examples/common-ggml.cpp ggml.o whisper

## Convert the MP3 files to WAV and run ggml-whisper on each of them

In [None]:
# Note that main example currently runs only with 16-bit WAV files,
# convert input before running the tool.'
import os

path = "/content/drive/MyDrive/audio_3ai2"
i = 0
print(path)
# Loop through the files in the directory
for file in os.listdir(path):
    if file.endswith(".mp3"):  # Process only MP3 files
        i += 1
        print(file)

        # Convert the MP3 to WAV using FFmpeg
        input_file = os.path.join(path, file)
        output_file = os.path.join(path, f"wav_files/{i}.wav")
        !ffmpeg -i "$input_file" -ar 16000 -ac 1 -c:a pcm_s16le "$output_file"

        # Process the WAV file using the "main" program
        !./main -f "$output_file"



## Redirect and save transcription output and transform to csv

In [None]:
# Transcribe every WAV file with ./main and save its standard output as
# <wav name>.txt in the "transcripts" folder. Run this cell from the
# whisper.cpp checkout so that ./main resolves.
import os
import subprocess

path = "/content/drive/MyDrive/audio_3ai2"
wav_folder = os.path.join(path, "wav_files")

# Create the transcripts folder if it doesn't exist
transcripts_folder = os.path.join(path, "transcripts")
os.makedirs(transcripts_folder, exist_ok=True)

# Iterate through WAV files in the folder (sorted for a reproducible order)
for wav_file in sorted(os.listdir(wav_folder)):
    if not wav_file.endswith('.wav'):
        continue

    wav_file_path = os.path.join(wav_folder, wav_file)
    # Name the transcript after its WAV file so the two always correspond; a
    # running counter over the directory listing would also count non-WAV
    # entries and follow the listing's arbitrary order.
    wav_stem = os.path.splitext(wav_file)[0]
    dest = os.path.join(transcripts_folder, f"{wav_stem}.txt")

    # Pass the arguments as a list (no shell involved) so that paths containing
    # quotes or other shell metacharacters cannot break the command.
    with open(dest, "w") as transcript_out:
        completed = subprocess.run(["./main", "-f", wav_file_path], stdout=transcript_out)

    if completed.returncode != 0:
        print(f"./main exited with status {completed.returncode} for {wav_file_path}")

print("Transcripts generated and saved.")


In [None]:
import os
import csv

def generate_transcription_csv(folder_path):
    """Convert the transcript text files of a folder into CSV files.

    Every ``*.txt`` file in ``<folder_path>/transcripts`` is converted to a CSV
    file with the same base name in ``<folder_path>/csv_files`` (created when
    missing). Each CSV has the header ``time_stamp, transcription``.

    A transcript line is expected to look like ``[<time stamp>] <text>``. The
    text before the first ``]`` (without the leading ``[``) becomes the time
    stamp and the stripped remainder becomes the transcription. Skipped lines:

    * blank lines,
    * lines that do not start with ``[`` or contain no ``]`` (for example stray
      log output), which previously raised ``ValueError``,
    * lines whose transcription is itself fully enclosed in square brackets.

    Args:
        folder_path: Folder containing the ``transcripts`` sub-folder.

    Returns:
        None. One CSV file is written per transcript and its path is printed.
    """
    transcript_folder = os.path.join(folder_path, 'transcripts')
    csv_folder = os.path.join(folder_path, 'csv_files')

    # Create the csv_files folder if it doesn't exist
    os.makedirs(csv_folder, exist_ok=True)

    # Sorted so that files are processed (and reported) in a stable order
    for transcript_file in sorted(os.listdir(transcript_folder)):
        if not transcript_file.endswith('.txt'):
            continue

        transcript_path = os.path.join(transcript_folder, transcript_file)
        csv_file_name = os.path.splitext(transcript_file)[0] + '.csv'
        csv_file_path = os.path.join(csv_folder, csv_file_name)

        # newline='' is what the csv module requires of files it writes to;
        # errors='replace' keeps one undecodable byte from aborting the run.
        with open(transcript_path, 'r', encoding='utf-8', errors='replace') as txt_file, \
                open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(['time_stamp', 'transcription'])  # Writing header

            for line in txt_file:
                line = line.strip()

                # Skip blank lines and anything without a "[time stamp]" prefix
                if not line.startswith('[') or ']' not in line:
                    continue

                # Split at the first ']' only: the text may contain brackets too
                time_stamp, transcription = line.split(']', 1)
                time_stamp = time_stamp[1:]  # drop the leading '['
                transcription = transcription.strip()

                # Skip lines where the transcription is enclosed in square brackets
                if transcription.startswith('[') and transcription.endswith(']'):
                    continue

                csv_writer.writerow([time_stamp, transcription])

        print(f"CSV file has been saved at: {csv_file_path}")


# Drive folder that contains the "transcripts" sub-folder; the CSV files are
# written to its "csv_files" sub-folder.
main_folder_path = "/content/drive/MyDrive/audio_3ai2"

# Convert every transcript in that folder to a CSV file
generate_transcription_csv(main_folder_path)


## Modularize and put everything together

In [22]:
!git clone https://github.com/ggerganov/whisper.cpp.git

%cd /content/whisper.cpp/models
!bash download-ggml-model.sh base.en
%cd ..
!make

!pip install pydub
!apt install ffmpeg

fatal: destination path 'whisper.cpp' already exists and is not an empty directory.
/content/whisper.cpp/models
Downloading ggml model base.en from 'https://huggingface.co/ggerganov/whisper.cpp' ...
Model base.en already exists. Skipping download.
/content/whisper.cpp
I whisper.cpp build info: 
I UNAME_S:  Linux
I UNAME_P:  x86_64
I UNAME_M:  x86_64
I CFLAGS:   -I.              -O3 -DNDEBUG -std=c11   -fPIC -pthread -mavx2 -mfma -mf16c -mavx -msse3
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS:  
I CC:       cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
I CXX:      g++ (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0

make: Nothing to be done for 'default'.
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded

## Add DOC_ID Column

In [25]:
import os
import pandas as pd

def add_ids_to_csv_files(folder_path):
    """Give every CSV file in ``<folder_path>/csv_files`` a ``DOC_ID`` column.

    Each file is loaded with pandas, its rows are numbered 0 .. n-1 in the
    ``DOC_ID`` column (an existing ``DOC_ID`` column is overwritten), and the
    result is written back to the same path without the DataFrame index.

    Args:
        folder_path: Folder containing the ``csv_files`` sub-folder.

    Returns:
        None. A confirmation line is printed for every file rewritten.
    """
    csv_folder = os.path.join(folder_path, 'csv_files')

    for entry in os.listdir(csv_folder):
        # Anything that is not a CSV file is left untouched
        if not entry.endswith('.csv'):
            continue

        csv_file_path = os.path.join(csv_folder, entry)

        # Load the file, number its rows, and overwrite the file in place
        table = pd.read_csv(csv_file_path)
        numbered = table.assign(DOC_ID=range(len(table)))
        numbered.to_csv(csv_file_path, index=False)

        print(f"Updated {csv_file_path} with IDs.")


# Drive folder whose "csv_files" sub-folder holds the CSV files to update
main_folder_path = "/content/drive/MyDrive/audio_3ai2"

# Add (or overwrite) the DOC_ID column of every CSV file in that sub-folder, in place
add_ids_to_csv_files(main_folder_path)


Updated /content/drive/MyDrive/audio_3ai2/csv_files/1.csv with IDs.
Updated /content/drive/MyDrive/audio_3ai2/csv_files/2.csv with IDs.
Updated /content/drive/MyDrive/audio_3ai2/csv_files/3.csv with IDs.
Updated /content/drive/MyDrive/audio_3ai2/csv_files/4.csv with IDs.
Updated /content/drive/MyDrive/audio_3ai2/csv_files/5.csv with IDs.
Updated /content/drive/MyDrive/audio_3ai2/csv_files/6.csv with IDs.
Updated /content/drive/MyDrive/audio_3ai2/csv_files/7.csv with IDs.


In [26]:
!pip3 install thirdai --upgrade
!pip3 install thirdai[neural_db]
!pip3 install langchain --upgrade
!pip3 install openai --upgrade
!pip3 install paper-qa --upgrade

Collecting thirdai
  Downloading thirdai-0.7.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: thirdai
Successfully installed thirdai-0.7.16
Collecting PyTrie (from thirdai[neural_db])
  Downloading PyTrie-0.4.0.tar.gz (95 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting PyMuPDF (from thirdai[neural_db])
  Downloading PyMuPDF-1.22.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain (from thirdai[neural_db])
  Downloading langchain-0.0.252-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [27]:
import os
from getpass import getpass

from thirdai import licensing, neural_db as ndb

# A license key is a credential and must not be stored in the notebook. Read it
# from the THIRDAI_KEY environment variable, or ask for it without echoing it.
# A key that was committed in an earlier version of this notebook should be
# treated as leaked and replaced.
thirdai_key = os.environ.get("THIRDAI_KEY") or getpass("ThirdAI license key: ")

# NOTE(review): deactivate() presumably clears a previously activated license
# before the new key is applied — confirm against the thirdai documentation.
licensing.deactivate()
licensing.activate(thirdai_key)

In [28]:
# Create the NeuralDB instance that later cells insert documents into and search
db = ndb.NeuralDB(user_id="root")

In [32]:
import os
import pandas as pd
# from pandas_gbq import ndb

def process_csv_files(folder_path):
    """Build one ``ndb.CSV`` document per CSV file in ``<folder_path>/csv_files``.

    Every document is configured with ``DOC_ID`` as the id column,
    ``transcription`` as the strong column, and ``time_stamp`` as both the weak
    column and the reference column. The files are only referenced by path
    here; they are not loaded into pandas.

    Args:
        folder_path: Folder containing the ``csv_files`` sub-folder.

    Returns:
        list: The ``ndb.CSV`` documents, ordered by file name.
    """
    csv_folder = os.path.join(folder_path, 'csv_files')
    insertable_docs = []

    # Sorted so that the document order does not depend on the arbitrary order
    # in which os.listdir() reports the directory entries
    for csv_file in sorted(os.listdir(csv_folder)):
        if not csv_file.endswith('.csv'):
            continue

        csv_file_path = os.path.join(csv_folder, csv_file)

        # Create a CSV document
        csv_doc = ndb.CSV(
            path=csv_file_path,
            id_column="DOC_ID",
            strong_columns=["transcription"],
            weak_columns=["time_stamp"],
            reference_columns=["time_stamp"])

        insertable_docs.append(csv_doc)

    return insertable_docs


# Drive folder whose "csv_files" sub-folder holds the CSV files to index
main_folder_path = "/content/drive/MyDrive/audio_3ai2"

# Build one ndb.CSV document for every CSV file in that sub-folder
insertable_docs = process_csv_files(main_folder_path)

# insertable_docs is handed to db.insert in the next cell


In [33]:
# Insert the CSV documents into NeuralDB with training enabled (train=True);
# the per-epoch training log is printed below. The return value of
# db.insert is kept in source_ids.
source_ids = db.insert(insertable_docs, train=True)

loaded data | source 'Documents:
1.csv
2.csv
3.csv
3.csv
5.csv
6.csv
7.csv' | vectors 4922 | batches 3 | time 0s | complete

train | epoch 0 | train_steps 3 | train_hash_precision@5=0.0609102  | train_batches 3 | time 30s

train | epoch 1 | train_steps 6 | train_hash_precision@5=0.067371  | train_batches 3 | time 20s

train | epoch 2 | train_steps 9 | train_hash_precision@5=0.0905729  | train_batches 3 | time 18s

train | epoch 3 | train_steps 12 | train_hash_precision@5=0.15384  | train_batches 3 | time 18s

train | epoch 4 | train_steps 15 | train_hash_precision@5=0.237017  | train_batches 3 | time 18s

train | epoch 5 | train_steps 18 | train_hash_precision@5=0.353149  | train_batches 3 | time 20s

train | epoch 6 | train_steps 21 | train_hash_precision@5=0.505729  | train_batches 3 | time 21s

train | epoch 7 | train_steps 24 | train_hash_precision@5=0.653434  | train_batches 3 | time 19s

train | epoch 8 | train_steps 27 | train_hash_precision@5=0.76282  | train_batches 3 | time 1

In [38]:
# Ask NeuralDB for the 4 best matches to the query; the on_error callback
# prints the error message it is given.
search_results = db.search(
    query="Andrew Ng thoughts about deep learning",
    top_k=4,
    on_error=lambda error_msg: print(f"Error! {error_msg}"))

# For every match print its text, its context with radius=1, its source file
# and its metadata. In the recorded output below, result.text shows the
# time_stamp value (the reference column of the CSV documents), and the
# transcription itself appears only in result.metadata.
for result in search_results:
    print(result.text)
    print(result.context(radius=1))
    print(result.source)
    print(result.metadata)
    print('************')

00:34:33.600 --> 00:34:40.160
00:34:29.440 --> 00:34:33.600 00:34:33.600 --> 00:34:40.160 00:34:40.720 --> 00:34:48.080
/content/drive/MyDrive/audio_3ai2/csv_files/1.csv
{'time_stamp': '00:34:33.600 --> 00:34:40.160', 'transcription': 'So how does one get started in deep learning and where does deep learning.ai fit into that?', 'DOC_ID': 362}
************
00:04:31.840 --> 00:04:33.280
00:04:28.680 --> 00:04:31.840 00:04:31.840 --> 00:04:33.280 00:04:33.280 --> 00:04:36.640
/content/drive/MyDrive/audio_3ai2/csv_files/7.csv
{'time_stamp': '00:04:31.840 --> 00:04:33.280', 'transcription': 'And I thought, this is going to be great.', 'DOC_ID': 87}
************
01:17:28.640 --> 01:17:36.400
01:17:22.480 --> 01:17:28.640 01:17:28.640 --> 01:17:36.400 01:17:36.400 --> 01:17:41.760
/content/drive/MyDrive/audio_3ai2/csv_files/1.csv
{'time_stamp': '01:17:28.640 --> 01:17:36.400', 'transcription': 'And the machine learning person says, no, wait, I did well on the test set. And I think there is a'