In [1]:
import sys
import os
import numpy as np
import warnings
from tqdm.auto import tqdm

# Suppress all warnings. NOTE: this also silences deprecation notices raised
# by the libraries used in this notebook.
warnings.filterwarnings("ignore")

## Absolute path of the project root; replace with your own root project dir.
PROJECT_DIR = "/mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant"

# Make the project's `utils` package importable. The membership guard keeps
# the cell idempotent: re-running it no longer appends duplicate entries.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

from utils.utils import (
    initialize_env_variables,
    flatten_list_of_lists,
    sample_from_list,
    read_json_file,
    save_json_file,
    standardize_array,
)

initialize_env_variables()

from utils.asr import (read_mp3, transcripe_episode)
from utils.chunking import (chunk_large_text,
                            preindex_process_text)
from utils.questions import (extract_questions,
                             group_questions_by_episode)
# from utils.openai import create_openai_client
from utils.query import openai_rephrase 
from utils.multithread import map_progress


## Project-local Hugging Face cache directory (original note: "HF_HOME").
## It is only built as a path here and handed explicitly as `cache_dir` to
## `load_dataset` and the Whisper `from_pretrained` calls; no environment
## variable is set by this line.
cache_dir = os.path.join(PROJECT_DIR, "hf_cache")

from datasets import load_dataset, Dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration

Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env
Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env
Connected to Elasticsearch


In [2]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub using the token read from the
# HF_READING_TOKEN environment variable (None is passed when it is unset).
login(token=os.environ.get("HF_READING_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/mohammed/.cache/huggingface/token
Login successful


In [4]:
# Load the full Lex Fridman podcast dataset from the Hugging Face Hub into the
# project-local cache and keep its 'train' split.
# Verification of the downloaded files is skipped. `verification_mode="no_checks"`
# is the supported replacement for the deprecated `ignore_verifications=True`
# flag, which newer `datasets` releases no longer accept.
full_dataset = load_dataset(
    path='Whispering-GPT/lex-fridman-podcast-transcript-audio',
    cache_dir=cache_dir,
    verification_mode="no_checks",
)['train']

Resolving data files:   0%|          | 0/107 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/69 [00:00<?, ?it/s]

# ASR

In [4]:
# Load the Whisper processor and the generation model from the same
# checkpoint, both cached under `cache_dir`.
whisper_checkpoint = "openai/whisper-small"
whisper_load_kwargs = {"cache_dir": cache_dir}

processor = WhisperProcessor.from_pretrained(whisper_checkpoint, **whisper_load_kwargs)
model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint, **whisper_load_kwargs)

# Clear any forced decoder ids stored on the model config.
model.config.forced_decoder_ids = None

In [None]:
# Transcribe every episode of the dataset and write one UTF-8 text file per
# episode (ep<i>.txt) into the generated-transcriptions directory.
data_dir = os.path.join(
    PROJECT_DIR,
    "data/generated_transcriptions/",
)
# Create the output directory up front so the first write cannot fail after
# an expensive transcription has already been computed.
os.makedirs(data_dir, exist_ok=True)

for i in tqdm(range(len(full_dataset))):
    path = os.path.join(data_dir, f"ep{i}.txt")

    # Transcription is very costly: episodes whose file already exists are
    # skipped so an interrupted run can be resumed (same caching convention as
    # the single-episode cell further down). Delete a file to force a redo.
    if os.path.exists(path):
        continue

    full_transcription = transcripe_episode(
        # Index the row first and the column second: `full_dataset['audio'][i]`
        # materialises (decodes) the whole audio column on every iteration.
        episode=full_dataset[i]['audio'],
        processor=processor,
        model=model,
        skip_special_tokens=True,
        minutes=0.4, ## due to model output constraint
        target_sampling_rate=16_000,
    )

    with open(path, 'w', encoding="utf-8") as file:
        file.write(full_transcription)

  0%|          | 0/345 [00:00<?, ?it/s]

Since transcribing every episode is very costly, we will rely on the transcriptions that are already available.

# Chunking Transcription

In [5]:
# Drop the 'audio' column when it is still attached to the dataset; when it
# has already been removed (e.g. the cell is re-run) there is nothing to do.
if 'audio' in full_dataset.column_names:
    full_dataset = full_dataset.remove_columns(['audio'])

In [14]:
def _chunk_episode(episode):
    """Split one episode into index-ready chunks (max_chunk_size=2000)."""
    return preindex_process_text(
        episode=episode,
        chunking_function=chunk_large_text,
        max_chunk_size=2000,
    )


# NOTE(review): only the first episode (index 0) is wrapped into the dataset
# that gets processed here -- confirm whether that restriction is intentional.
episodes_to_chunk = Dataset.from_list([full_dataset[0]])

chunks_per_episode = map_progress(
    f=_chunk_episode,
    seq=episodes_to_chunk,
    max_workers=4
)

# Flatten the per-episode lists into one flat list of documents.
documents = [
    chunk
    for episode_chunks in chunks_per_episode
    for chunk in episode_chunks
]

  0%|          | 0/1 [00:00<?, ?it/s]

In [46]:
# Persist the chunked documents as JSON. `replace=False` is forwarded to
# save_json_file; presumably it leaves an existing file untouched (the
# recorded run printed "Skipped...").
path = os.path.join(PROJECT_DIR, "data/generated_documents/", "documents.json")
save_json_file(documents, path, replace=False)

Skipped...


# Questions Bank

In [4]:
# Remove the 'audio' column if the dataset still carries it; otherwise keep
# the dataset unchanged.
if 'audio' in full_dataset.column_names:
    full_dataset = full_dataset.remove_columns(['audio'])

In [5]:
def _questions_from_episode(episode):
    """Run extract_questions on a single episode with min_words=15."""
    return extract_questions(
        episode=episode,
        min_words=15,
    )


# One result per episode, computed through map_progress with 4 workers.
questions_by_episode = map_progress(
    f=_questions_from_episode,
    seq=full_dataset,
    max_workers=4
)

# Merge the per-episode results into a single flat list.
questions = flatten_list_of_lists(questions_by_episode)

  0%|          | 0/346 [00:00<?, ?it/s]

In [6]:
# Report the size of the question bank, draw a fixed-seed sample of 500
# questions, then report the size again.
n_before = len(questions)
print("Number of questions before sampling:", n_before)

questions = sample_from_list(questions, sample_size=500, seed=42)

n_after = len(questions)
print("Number of questions after sampling:", n_after)

Number of questions before sampling: 3803
Number of questions after sampling: 500


In [7]:
# Group the sampled questions by episode_id. The result is what the
# rephrasing step iterates over with map_progress (presumably one group of
# questions per episode -- confirm against group_questions_by_episode).
questions_per_episode = group_questions_by_episode(questions)

# Rephrase questions with OpenAI

In [8]:
# Location of the prompt template used for rephrasing.
prompt_template_path = os.path.join(PROJECT_DIR, "prompts/rephrase_questions.txt")


def _rephrase_episode_questions(episode_questions):
    """Send one group of questions to openai_rephrase (model "gpt-4o-mini")."""
    return openai_rephrase(
        episode_questions=episode_questions,
        prompt_template_path=prompt_template_path,
        model="gpt-4o-mini",
    )


# One result per element of questions_per_episode, using 4 workers.
rephrased_by_episode = map_progress(
    f=_rephrase_episode_questions,
    seq=questions_per_episode,
    max_workers=4,
)

# Merge the per-episode results into a single flat list.
rephrased_questions = flatten_list_of_lists(rephrased_by_episode)

  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Display up to 5 rephrased questions as a quick sanity check. No seed is
# passed here (unlike the seed=42 call used for the question bank), so the
# selection is presumably not reproducible across runs.
sample_from_list(rephrased_questions, sample_size=5)

[{'episode_id': 'SFxIazwNP_0',
  'question': 'When I take actions, am I the one doing it or is it just my atoms acting?'}]

In [10]:
# Report how many questions came back from the rephrasing step. The message
# implies that non-self-sufficient questions are dropped during rephrasing
# (presumably inside openai_rephrase -- not visible in this notebook).
print("Number of questions after discarding non-self-sufficient:", len(rephrased_questions))

Number of questions after discarding non-self-sufficient: 1


In [11]:
# Persist the rephrased questions as JSON. `replace=False` is forwarded to
# save_json_file; presumably it leaves an existing file untouched (the
# recorded run printed "Skipped...").
path = os.path.join(PROJECT_DIR, "data/generated_questions/", "questions.json")
save_json_file(rephrased_questions, path, replace=False)

Skipped...


# Newly downloaded episode

In [4]:
# Detach the 'audio' column from the dataset if it is still present; do
# nothing when it was removed earlier.
if 'audio' in full_dataset.column_names:
    full_dataset = full_dataset.remove_columns(['audio'])

In [9]:
# Read the newly downloaded episode from the project bucket; read_mp3 returns
# the audio array together with its sampling rate.
path = os.path.join(PROJECT_DIR, "bucket/351/lex_ai_mrbeast.mp3")
audio, sampling_rate = read_mp3(path)

# Standardize the waveform (original note: mean=0, std=1). The name `audio`
# is rebound so the un-standardized array is not kept around under a second
# name.
audio = standardize_array(audio)

# Quick sanity check of what was loaded.
print("Sampling Rate:",sampling_rate)
print("Audio Array Length:",len(audio))

Sampling Rate: 48000
Audio Array Length: 417074688


In [None]:
# Bundle the waveform and its sampling rate into the dict that is handed to
# transcripe_episode as `episode`.
episode = dict(array=audio, sampling_rate=sampling_rate)

In [6]:
## Transcribe the new episode, or reuse the transcription cached on disk.
## The text is then used to ask ChatGPT to generate questions based on it.
## This will also happen in the pipeline (orchestration).
path = os.path.join(
    PROJECT_DIR,
    "data/generated_transcriptions/",
    "lex_ai_mrbeast.txt"
)
if not os.path.exists(path):
    # Make sure the target directory exists before the costly transcription,
    # so the result cannot be lost to a failing write.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # NOTE: `episode` must still be the audio dict (array + sampling_rate)
    # here; a later cell rebinds the same name to the episode metadata.
    ep_transcription = transcripe_episode(
        episode=episode,
        processor=processor,
        model=model,
        skip_special_tokens=True,
        minutes=0.4, ## due to model output constraint
        target_sampling_rate=16_000,
    )

    with open(path, 'w', encoding="utf-8") as f:
        f.write(ep_transcription)
else:
    # Read with the same encoding the file was written with; without it the
    # platform's locale default is used, which can fail or garble non-ASCII
    # text on systems whose default is not UTF-8.
    with open(path, 'r', encoding="utf-8") as f:
        ep_transcription = f.read()

In [8]:
# Load the self-created metadata of the new episode and attach the
# transcription to it under the 'text' key.
path = os.path.join(PROJECT_DIR, "bucket/351/metadata.json")
episode = read_json_file(path)
episode.update(text=ep_transcription)

In [9]:
# Chunk the new episode with the same settings used for the dataset episodes.
chunking_options = dict(chunking_function=chunk_large_text, max_chunk_size=2000)
document = preindex_process_text(episode=episode, **chunking_options)

We can then reindex the newly created document...