In [119]:
import sys
import os
import pandas as pd
import pickle
import numpy as np
from scipy.signal import resample
from scipy.io.wavfile import write
import warnings
from tqdm.auto import tqdm
import re
import json
import random
from collections import defaultdict

# spaCy pipeline kept at module level; `chunk_large_text` below reads this
# global `nlp` object to split transcripts into sentences.
import spacy
nlp = spacy.load("en_core_web_sm")

# Suppress all warnings
# NOTE(review): this silences every warning category for the whole kernel,
# including deprecation notices from the libraries imported below.
warnings.filterwarnings("ignore")

## replace with root project dir
# NOTE(review): hard-coded absolute path; every output path in this notebook
# is derived from it, so it must be edited per machine.
PROJECT_DIR = "/mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant"
# Make the project's `utils` package importable from this notebook.
sys.path.append(PROJECT_DIR)

from utils.utils import (
    initialize_env_variables,
)

## HF_HOME
# Directory passed as `cache_dir` to the Hugging Face dataset/model loaders below.
cache_dir = os.path.join(PROJECT_DIR, "hf_cache")

from datasets import load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Project helper; per the cell output it loads the variables listed in the
# project's `.env` file (e.g. the tokens read with `os.getenv` later on).
initialize_env_variables()

Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env


In [2]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token stored in the
# HF_READING_TOKEN environment variable.
# NOTE(review): `os.getenv` returns None when the variable is unset — confirm
# how `login` behaves in that case before relying on a non-interactive run.
login(os.getenv("HF_READING_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/mohammed/.cache/huggingface/token
Login successful


In [4]:
# Load the full lex fridman podcast dataset
# Only the 'train' split is kept; downloads are cached under `cache_dir`.
# NOTE(review): `ignore_verifications` was deprecated in newer `datasets`
# releases in favour of `verification_mode` — confirm against the installed version.
full_dataset = load_dataset(
    'Whispering-GPT/lex-fridman-podcast-transcript-audio', 
    cache_dir=cache_dir,
    ignore_verifications=True
)['train']

Resolving data files:   0%|          | 0/107 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/69 [00:00<?, ?it/s]

# ASR

In [8]:
# load model and processor
# Both artifacts come from the "openai/whisper-small" checkpoint and are cached
# under `cache_dir`; they are passed explicitly to the transcription helpers below.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", cache_dir=cache_dir)
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small", cache_dir=cache_dir)
# Clear any decoder ids forced by the checkpoint config.
# NOTE(review): presumably so `generate` is not pinned to a fixed language/task
# prefix — confirm against the transformers Whisper documentation.
model.config.forced_decoder_ids = None

In [4]:
## sample minutes of audio
def sample_audio(
    audio_dict, start_from=None, minutes=None
):
    """Cut a time window out of a decoded audio record.

    Args:
        audio_dict: Mapping with an 'array' entry (the samples) and a
            'sampling_rate' entry (samples per second).
        start_from: Window start in minutes. A falsy value (None or 0)
            starts at the first sample.
        minutes: Window length in minutes. A falsy value extends the window
            to the end of the recording.

    Returns:
        The slice of ``audio_dict['array']`` covering the requested window.
    """
    samples = audio_dict['array']
    rate = audio_dict['sampling_rate']

    # Convert the minute-based offsets into sample indices.
    if start_from:
        first = int(start_from * 60 * rate)
    else:
        first = 0

    if minutes:
        last = first + int(minutes * 60 * rate)
    else:
        last = None

    return samples[first:last]

## Change sampling_rate
def update_sampling_rate(audio_data, original_rate, target_rate):
    """Resample ``audio_data`` from ``original_rate`` to ``target_rate``.

    The output length is the input length scaled by ``target_rate /
    original_rate`` and rounded to the nearest integer; the resampling itself
    is delegated to ``scipy.signal.resample``.
    """
    scaled_length = len(audio_data) * float(target_rate) / original_rate
    return resample(audio_data, round(scaled_length))

## Transcribe audio
def transcripe_audio(
    audio,
    processor,
    model,
    sampling_rate=16_000,
    skip_special_tokens=True,
):
    """Transcribe one audio clip.

    Args:
        audio: Samples handed to ``processor``.
        processor: Callable returning an object with ``input_features``; it
            must also provide ``batch_decode``.
        model: Object whose ``generate`` turns input features into token ids.
        sampling_rate: Sampling rate reported to ``processor``.
        skip_special_tokens: Forwarded to ``processor.batch_decode``.

    Returns:
        Whatever ``processor.batch_decode`` returns for the generated ids.
    """
    # Feature extraction (PyTorch tensors requested via return_tensors="pt").
    encoded = processor(
        audio, sampling_rate=sampling_rate, return_tensors="pt"
    )

    token_ids = model.generate(encoded.input_features)

    decoded = processor.batch_decode(
        token_ids, skip_special_tokens=skip_special_tokens
    )
    return decoded


def merge_transcripts(transcripts):
    """
    Join transcribed chunks into a single string with normalised spacing.

    Each chunk is stripped. Every chunk after the first is prefixed with a
    space when it begins with a word character, and glued on directly
    otherwise (e.g. when it begins with punctuation). Finally, every run of
    whitespace is collapsed to one space.

    Args:
        transcripts (list of str): Transcribed text chunks, in order.

    Returns:
        str: The merged text.
    """
    pieces = []

    for position, raw_chunk in enumerate(transcripts):
        chunk = raw_chunk.strip()

        # Only non-leading chunks that start with a word character get a separator.
        starts_with_word = re.match(r'^\w', chunk) is not None
        if position > 0 and starts_with_word:
            pieces.append(" " + chunk)
        else:
            pieces.append(chunk)

    joined = "".join(pieces)

    # Collapse any remaining whitespace runs.
    return re.sub(r'\s+', ' ', joined).strip()


## Transcribe full episode
def transcripe_episode(
    episode,
    processor,
    model,
    skip_special_tokens=True,
    **sampling_kwargs,
):
    """Transcribe a whole episode by processing it in fixed-length windows.

    Args:
        episode: Decoded audio mapping with 'array' and 'sampling_rate' entries.
        processor: Processor forwarded to ``transcripe_audio``.
        model: Model forwarded to ``transcripe_audio``.
        skip_special_tokens: Forwarded to ``transcripe_audio`` for decoding.
        **sampling_kwargs: Optional ``minutes`` (window length in minutes,
            default 2) and ``target_sampling_rate`` (rate the audio is
            resampled to before transcription, default 16_000).

    Returns:
        str: The per-window transcripts merged by ``merge_transcripts``.
    """
    minutes = sampling_kwargs.get('minutes', 2)
    target_sampling_rate = sampling_kwargs.get('target_sampling_rate', 16_000)

    transcripts_list = []
    episode_length = len(episode['array'])
    orig_sampling_rate = episode['sampling_rate']

    # Calculate the total duration of the episode in seconds
    episode_duration_seconds = episode_length / orig_sampling_rate

    # Window start offsets in minutes (may be fractional). The "+ 1" covers a
    # trailing partial window.
    start_froms = [i * minutes for i in range(int(episode_duration_seconds // (minutes * 60)) + 1)]

    for start_from in tqdm(start_froms):
        audio = sample_audio(episode, start_from=start_from, minutes=minutes)

        # When the duration is an exact multiple of the window length (or the
        # episode is empty) the last window holds no samples; skip it instead
        # of passing an empty array to the resampler and the model.
        if len(audio) == 0:
            continue

        audio = update_sampling_rate(audio, orig_sampling_rate, target_sampling_rate)
        # Pass by keyword so the caller's decoding preference reaches the decoder.
        transcripts_list += transcripe_audio(
            audio,
            processor,
            model,
            sampling_rate=target_sampling_rate,
            skip_special_tokens=skip_special_tokens,
        )

    return merge_transcripts(transcripts_list)


def get_resuming_index(data_dir):
    """Return the index of the next episode to transcribe.

    Scans ``data_dir`` for entries named ``ep<N>`` or ``ep<N>.<ext>`` and
    returns ``max(N) + 1``. Entries that do not follow that pattern (hidden
    files, checkpoint folders, notes, ...) are ignored rather than crashing
    the integer conversion.

    Args:
        data_dir: Directory holding the generated transcription files.

    Returns:
        int: 0 when no matching entry exists, otherwise the highest episode
        index found plus one.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. missing directory).
    """
    episode_indices = []
    for filename in os.listdir(data_dir):
        match = re.match(r'ep(\d+)(?:\.|$)', filename)
        if match:
            episode_indices.append(int(match.group(1)))

    if not episode_indices:
        return 0

    return max(episode_indices) + 1

In [None]:
data_dir = os.path.join(
    PROJECT_DIR,
    "data/generated_transcriptions/",
)
# Make sure the output directory exists before listing it / writing to it.
os.makedirs(data_dir, exist_ok=True)

# Resume after the highest episode index already written to disk.
resuming_index = get_resuming_index(data_dir)

for i in tqdm(range(resuming_index, len(full_dataset))):
    full_transcription = transcripe_episode(
        # Index the row first, then the column: `full_dataset['audio'][i]`
        # would materialise the whole audio column on every iteration.
        episode=full_dataset[i]['audio'],
        processor=processor,
        model=model,
        skip_special_tokens=True,
        minutes=0.4, ## due to model output constraint
        target_sampling_rate=16_000,
    )

    # One text file per episode, named after its dataset index.
    path = os.path.join(data_dir, f"ep{i}.txt")

    with open(path, 'w', encoding="utf-8") as file:
        file.write(full_transcription)

  0%|          | 0/345 [00:00<?, ?it/s]

Because transcribing every episode this way is very costly, the rest of the notebook relies on the transcriptions already present in the dataset.

# Chunking Transcription

In [None]:
# Drop the 'audio' column before the text-only steps below.
# The ValueError is swallowed so the cell can be re-run after the column has
# already been removed.
# NOTE(review): assumes `remove_columns` raises ValueError for a missing column — confirm.
try:
    full_dataset = full_dataset.remove_columns(['audio'])
except ValueError:
    pass

In [6]:
def chunk_large_text(text, max_chunk_size=1000):
    """Split ``text`` into sentence-aligned chunks, keeping questions with their answers.

    Sentences come from the module-level spaCy pipeline ``nlp``. A sentence
    containing '?' starts a new chunk that absorbs the following sentences
    while the size budget allows. Other sentences are accumulated into a
    running chunk under the same budget.

    Chunks are emitted in document order: text accumulated before a question
    is flushed before the question chunk starts, so a chunk never mixes
    sentences from before and after a question. Empty chunks are never emitted.

    Args:
        text: The text to split.
        max_chunk_size: Size budget in characters. The check compares the sum
            of the sentence lengths and does not count the joining spaces. A
            single sentence longer than the budget is kept whole as its own chunk.

    Returns:
        list of str: The chunks, in document order.
    """
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]

    chunks = []
    current_chunk = ""

    i = 0
    while i < len(sentences):
        sentence = sentences[i]
        if '?' in sentence:
            # Flush pending statement text first so chunk order follows the text.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = ""

            # Handle the question and its following answer
            question_chunk = sentence
            i += 1
            # Include following sentences as the answer, ensuring not to exceed the max_chunk_size
            while i < len(sentences) and len(question_chunk) + len(sentences[i]) <= max_chunk_size:
                question_chunk += " " + sentences[i]
                i += 1
            # Add the combined question-answer chunk to the list
            chunks.append(question_chunk.strip())
        else:
            # If the current chunk can accommodate the sentence
            if len(current_chunk) + len(sentence) <= max_chunk_size:
                current_chunk += sentence + " "
            else:
                # Do not emit an empty chunk when an oversized sentence
                # arrives while the buffer is still empty.
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + " "
            i += 1

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks


def preindex_process_text(
    episode, 
    chunking_function,
    **chunking_function_params,
):
    """Turn one episode record into a list of per-chunk documents.

    The episode's 'text' is split with ``chunking_function``. Every resulting
    document carries the episode's remaining fields (everything except
    'audio', 'description', 'segments' and the full 'text') plus the chunk
    under 'text' and its position under 'chunk_id'.

    The input ``episode`` mapping is left untouched.

    Args:
        episode: Mapping describing an episode; must contain a 'text' entry.
        chunking_function: Callable taking the text (plus
            ``chunking_function_params``) and returning an iterable of chunks.
        **chunking_function_params: Extra keyword arguments for ``chunking_function``.

    Returns:
        list of dict: One document per chunk, in chunk order.

    Raises:
        KeyError: If ``episode`` has no 'text' entry.
    """
    dropped_keys = ('audio', 'description', 'segments', 'text')

    # Build the shared metadata on a fresh dict instead of deleting keys from
    # the caller's record.
    metadata = {
        key: value for key, value in episode.items() if key not in dropped_keys
    }

    chunks = chunking_function(
        episode['text'],
        **chunking_function_params,
    )

    documents = []
    for i, chunk in enumerate(chunks):
        episode_doc = metadata.copy()

        episode_doc['text'] = chunk
        episode_doc['chunk_id'] = i

        documents.append(episode_doc)

    return documents

In [7]:
from utils.multithread import map_progress

# Chunk every episode's text into documents (up to 2000 characters per chunk
# as measured by `chunk_large_text`).
# NOTE(review): `map_progress` is a project helper; presumably it applies `f`
# to each item of `seq` using `max_workers` threads and returns the results in
# order — confirm against utils/multithread.
documents = map_progress(
    f=lambda episode:preindex_process_text(
        episode=episode, 
        chunking_function=chunk_large_text,
        max_chunk_size=2000,
    ),
    seq=full_dataset,
    max_workers=4
)

# Flatten the per-episode lists into a single list of documents.
documents = [item for sublist in documents for item in sublist]

  0%|          | 0/346 [00:00<?, ?it/s]

In [17]:
path = os.path.join(
    PROJECT_DIR,
    "data/generated_documents/",
    "documents.json"
)

# Save the documents to a JSON file. An existing file is never overwritten;
# say so explicitly instead of silently keeping a possibly stale file.
if os.path.exists(path):
    print(f"{path} already exists; skipping save (delete it to regenerate)")
else:
    # Create the target directory on first run so `open` does not fail.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding="utf-8") as json_file:
        json.dump(documents, json_file, indent=4)

    print(f"Data successfully saved to {path}")

Data successfully saved to /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/data/generated_documents/documents.json


# Questions Bank

In [39]:
# Same guard as in the chunking section: drop the 'audio' column if it is
# still present, so this section can be run on its own.
# NOTE(review): assumes `remove_columns` raises ValueError for a missing column — confirm.
try:
    full_dataset = full_dataset.remove_columns(['audio'])
except ValueError:
    pass

In [144]:
def extract_questions(
    episode,
    min_words = 11,
):
    """Collect question-like segments from one episode.

    A segment qualifies when its text (ignoring trailing whitespace) ends with
    '?', its first word passes ``str.istitle()``, and it contains at least
    ``min_words`` whitespace-separated words. Empty or whitespace-only
    segments are skipped.

    Args:
        episode: Mapping with an 'id' entry and a 'segments' entry, the latter
            an iterable of mappings that each hold a 'text' string.
        min_words: Minimum number of words a question must contain.

    Returns:
        list of dict: One ``{"episode_id", "question"}`` entry per qualifying
        segment, in segment order. 'question' holds the segment text unchanged.
    """
    questions = []
    episode_id = episode['id']

    for segment in episode['segments']:
        segment_text = segment['text']
        words = segment_text.split()

        # Guard against empty text and tolerate trailing whitespace after '?'.
        if not words or not segment_text.rstrip().endswith('?'):
            continue

        # NOTE(review): `istitle()` rejects first words such as "What's" or
        # "AI"; kept as-is because the intended filter is not stated.
        if not words[0].istitle():
            continue

        if len(words) < min_words:
            continue

        questions.append(
            {
                "episode_id": episode_id,
                "question": segment_text
            }
        )

    return questions


def flatten_list_of_lists(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list, preserving order."""
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat


def sample_from_list(
    orig_list,
    sample_size=None,
    seed=42,
):
    """Return a reproducible random sample (without replacement) of ``orig_list``.

    The items are shuffled with a generator seeded by ``seed`` and the first
    ``sample_size`` of them are returned. The items themselves are returned
    as-is (no conversion through a NumPy array), and the global NumPy random
    state is not touched.

    Args:
        orig_list: Indexable sequence to sample from.
        sample_size: Number of items to return; ``None`` returns every item in
            shuffled order. Values larger than the list return the whole list.
        seed: Seed for the random generator. The same seed and input length
            always give the same selection; ``None`` gives a fresh random order.

    Returns:
        list: The sampled items.
    """
    rng = np.random.default_rng(seed)
    shuffled_positions = rng.permutation(len(orig_list))

    return [orig_list[int(position)] for position in shuffled_positions[:sample_size]]


def group_questions_by_episode(questions):
    """Bucket question records by their 'episode_id'.

    Returns a list of lists: one inner list per distinct episode id, ordered
    by first appearance, each holding that episode's records in input order.
    """
    buckets = {}
    for entry in questions:
        buckets.setdefault(entry['episode_id'], []).append(entry)

    return [bucket for bucket in buckets.values()]

In [148]:
# Extract candidate questions (at least 15 words each) from every episode's segments.
# NOTE(review): `map_progress` is a project helper imported in an earlier
# cell; presumably it returns one result list per episode — confirm.
questions = map_progress(
    f=lambda episode:extract_questions(
        episode=episode,
        min_words=15,
    ),
    seq=full_dataset,
    max_workers=4
)

# Merge the per-episode lists into one flat list of question records.
questions = flatten_list_of_lists(questions)

  0%|          | 0/346 [00:00<?, ?it/s]

In [149]:
# Down-sample the question pool to 500 entries.
# NOTE: this rebinds `questions`, so re-running the cell samples from the
# already-sampled list rather than from the full pool.
print("Number of questions before sampling:", len(questions))
questions = sample_from_list(questions, sample_size=500)
print("Number of questions after sampling:", len(questions))

Number of questions before sampling: 11136
Number of questions after sampling: 500


In [150]:
# Group questions by episode_id
# Result is a list of lists (one inner list per episode); each inner list is
# later sent to the rephrasing step as a single request.
questions_per_episode = group_questions_by_episode(questions)

# Rephrase questions with OpenAI

In [58]:
from utils.openai import create_openai_client
from utils.utils import parse_json_response

# Module-level client; `openai_rephrase` below reads this global directly.
# NOTE(review): `create_openai_client` is a project helper — presumably it
# picks up the API key from the environment initialised earlier; confirm.
openai_client = create_openai_client()

In [137]:
def openai_rephrase(episode_questions, model="gpt-4o-mini"):
    """Ask an OpenAI chat model to filter and rephrase one episode's questions.

    The prompt instructs the model to keep only self-sufficient questions,
    rephrase them, and answer in JSON with `episode_id` and `question` keys.
    The request goes through the module-level `openai_client`.

    Args:
        episode_questions: The questions to submit; their `str()`
            representation is appended to the prompt.
        model: Name of the chat model to call.

    Returns:
        Whatever `parse_json_response` produces from the content of the first
        returned choice.
        NOTE(review): the notebook later flattens the results, so this is
        expected to be a list of question dicts — confirm against `parse_json_response`.
    """
    prompt = """
I have a list of questions extracted from various episodes. 
Each question is associated with an `episode_id`. 
Your task is to check if each question is self-sufficient, meaning it has enough context to stand alone. 
If a question is self-sufficient, rephrase it to convey the same meaning clearly. 
If a question is not self-sufficient and lacks necessary context, 
do not rephrase it nor return it, just skip it, be very strict in this point, the question needs to answerable on its own.

Please return the results in a JSON format, with each question represented as a dictionary containing `episode_id` and `question` keys.

IMPORTANT: Only return the dictionary without the fenced code block marks. Make sure to return double quotations not single quotations.

Here is the list of questions:
    """.strip() + f"\n{str(episode_questions)}"
    
    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    # Hand the text of the first choice to the project's JSON parser (nothing is printed here).
    return parse_json_response(response.choices[0].message.content)

In [138]:
# Send each episode's group of questions to the model in its own request.
# NOTE(review): a request whose reply cannot be parsed may abort or corrupt
# the whole run depending on how `parse_json_response` / `map_progress`
# handle failures — confirm.
rephrased_questions = map_progress(
    f=lambda episode_questions:openai_rephrase(
        episode_questions=episode_questions,
        model="gpt-4o-mini",
    ),
    seq=questions_per_episode,
    max_workers=4,
)

# Merge the per-episode results into one flat list of rephrased questions.
rephrased_questions = flatten_list_of_lists(rephrased_questions)

  0%|          | 0/147 [00:00<?, ?it/s]

In [145]:
# Eyeball a handful of rephrased questions as a quick quality check.
sample_from_list(rephrased_questions, sample_size=5)

[{'episode_id': 'IT__Nrr3PNI',
  'question': 'How do irrational numbers relate to programming?'},
 {'episode_id': 'LRYkH-fAVGE',
  'question': 'How would you define the general problem of computer vision?'},
 {'episode_id': 'iZRbD7q1n-U',
  'question': 'What are the current weaknesses in a specific combat sport?'},
 {'episode_id': 'Pl3x4GINtBQ',
  'question': 'In a sense, are there multiple governments or systems of enforcement?'},
 {'episode_id': 'LRYkH-fAVGE',
  'question': 'In the context of driving, is vision the primary challenge, or do actions and interactions with the environment also play a crucial role?'}]

In [147]:
# Report how many questions were returned by the rephrasing step.
print("Number of questions after discarding non-self-sufficient:", len(rephrased_questions))

Number of questions after discarding non-self-sufficient: 389


In [152]:
path = os.path.join(
    PROJECT_DIR,
    "data/generated_questions/",
    "questions.json"
)

# Save the rephrased questions to a JSON file. An existing file is never
# overwritten; say so explicitly instead of silently keeping a possibly stale file.
if os.path.exists(path):
    print(f"{path} already exists; skipping save (delete it to regenerate)")
else:
    # Create the target directory on first run so `open` does not fail.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding="utf-8") as json_file:
        json.dump(rephrased_questions, json_file, indent=4)

    print(f"Data successfully saved to {path}")

Data successfully saved to /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/data/generated_questions/questions.json
