In [3]:
import sys
import os
import numpy as np
import warnings
from tqdm.auto import tqdm
import pickle

def save_to_pickle(obj, pickle_file_path):
    """
    Serialize ``obj`` with pickle and write it to ``pickle_file_path``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.

    :param obj: The Python object to serialize.
    :param pickle_file_path: Destination path of the pickle file.
    """
    with open(pickle_file_path, 'wb') as out_stream:
        pickle.Pickler(out_stream).dump(obj)

def load_pickle(pickle_file_path):
    """
    Read a pickle file and return the Python object stored in it.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.

    :param pickle_file_path: Path of the pickle file to read.
    :return: The unpickled Python object.
    """
    with open(pickle_file_path, 'rb') as in_stream:
        loaded_obj = pickle.Unpickler(in_stream).load()
    return loaded_obj

# Suppress all warnings
warnings.filterwarnings("ignore")

# Root directory of the project. Set the PODCAST_QA_PROJECT_DIR environment
# variable to point elsewhere without editing the notebook; the hardcoded
# path below is only the fallback.
PROJECT_DIR = os.environ.get(
    "PODCAST_QA_PROJECT_DIR",
    "/mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant",
)

# Make the project's `utils` package importable. The membership check keeps
# re-running this cell from appending the same entry to sys.path again.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

from utils.utils import (
    initialize_env_variables,
    flatten_list_of_lists,
    sample_from_list,
    read_json_file,
    save_json_file
)

# Load the environment variables (the cell output reports a project `.env` file).
# NOTE(review): this runs before the remaining `utils` imports below; presumably
# some of them read these variables at import time. Confirm before reordering.
initialize_env_variables()

from utils.questions import (extract_questions,
                             group_questions_by_episode, 
                             openai_rephrase)
from utils.multithread import map_progress
from datasets import load_dataset, Dataset

from transformers import LEDForConditionalGeneration, LEDTokenizer
import torch

Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env
Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env
Connected to Elasticsearch


# Episode summarization

In [12]:
# Location of the chunked episode documents inside the project tree.
chunked_docs_path = os.path.join(PROJECT_DIR, "data/generated_documents/documents.json")

# Read the JSON file into memory.
chunks_dataset = read_json_file(chunked_docs_path)

In [6]:
# from huggingface_hub import login
# Keep the Hugging Face download cache inside the project tree.
cache_dir = os.path.join(PROJECT_DIR, "hf_cache")

# login(os.getenv("HF_READING_TOKEN"))

# `verification_mode="no_checks"` replaces the deprecated (and, in newer
# `datasets` releases, removed) `ignore_verifications=True`; both skip the
# checksum / split-size verification of the downloaded files.
full_dataset = load_dataset(
    path='Whispering-GPT/lex-fridman-podcast-transcript-audio',
    cache_dir=cache_dir,
    verification_mode="no_checks",
)['train']

Using the latest cached version of the dataset since Whispering-GPT/lex-fridman-podcast-transcript-audio couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/hf_cache/Whispering-GPT___lex-fridman-podcast-transcript-audio/default/0.0.0/86c17541f78440e5ff2ef55740a3a503b2716d3a (last modified on Sat Aug 24 03:36:38 2024).


Loading dataset shards:   0%|          | 0/69 [00:00<?, ?it/s]

In [7]:
# Drop the 'audio' column; the cells below only read the transcript fields.
# Checking `column_names` first keeps the cell safe to re-run after the column
# is gone, without a try/except that could also hide unrelated ValueErrors.
if 'audio' in full_dataset.column_names:
    full_dataset = full_dataset.remove_columns(['audio'])

In [25]:
# Inspect the transcript text of the first episode.
full_dataset[0]["text"]

" The following is a conversation with Jed Buckwald, a professor of history and a philosopher of science at Caltech. Interested especially in the development of scientific concepts and the instruments used to create and explore new effects and ideas in science. To support this podcast, please check out our sponsors in the description. This is the Lex Friedman podcast and here is my conversation with Jed Buckwald. Does science progress via paradigm shifts and revolutions as philosopher Thomas Kuhn said, or does it progress gradually? What do you think? Well, I got into this field because I was Tom Kuhn's research assistant 50 years ago, 52 years ago. He pulled me into it out of physics instead. So I know his work pretty well. And in the years when I was at MIT running an institute, he was then in the philosophy department, used to come over all the time to the talks we held and so on. So what would I say about that? He, of course, developed his ideas a lot over the years. The thing that

In [24]:
# Sanity check: the episode-level "text" should equal its segment texts
# concatenated in order.
first_episode = full_dataset[0]
segment_texts = [segment["text"] for segment in first_episode["segments"]]
first_episode["text"] == "".join(segment_texts)

True

In [23]:
# def extract_outline(episode):
#     """
#     """
#     outline_started = False
#     outline = []
    
#     for line in episode["description"].split("\n"):
#         if "OUTLINE:" in line:
#             outline_started = True
#             continue
#         if outline_started:
#             if line.strip() == "" or line.startswith("CONNECT:"):
#                 break
#             outline.append(line.strip())
            
#     episode["outline"] = outline
#     return episode

# full_dataset = full_dataset.map(extract_outline, batched=False)

# Generating questions

In [2]:
# NOTE(review): from here on `full_dataset` holds the contents of
# documents.json (the same file loaded into `chunks_dataset` earlier), not the
# Hugging Face dataset that carried this name in the previous section.
chunked_docs_path = os.path.join(PROJECT_DIR, "data/generated_documents/documents.json")
full_dataset = read_json_file(chunked_docs_path)

In [3]:
def _questions_from_entry(episode):
    """Call ``extract_questions`` on one entry with min_words=15 and max_words=20."""
    return extract_questions(episode=episode, min_words=15, max_words=20)


# Apply the extraction to every entry of `full_dataset`, then flatten the
# per-entry results into a single list of questions.
questions = flatten_list_of_lists(
    map_progress(
        f=_questions_from_entry,
        seq=full_dataset,
        max_workers=4,
        verbose=False,
    )
)

  0%|          | 0/22232 [00:00<?, ?it/s]

In [4]:
# Report the pool size, then replace `questions` with a fixed-seed sample.
# NOTE: `questions` is rebound here, so the full pool is no longer available
# after this cell.
n_before_sampling = len(questions)
print("Number of questions before sampling:", n_before_sampling)

questions = sample_from_list(questions, sample_size=1000, seed=42)

n_after_sampling = len(questions)
print("Number of questions after sampling:", n_after_sampling)

Number of questions before sampling: 10143
Number of questions after sampling: 1000


In [5]:
# Group questions by episode_id
# NOTE(review): the result is passed as `seq` to the rephrasing step, so it is
# presumably an iterable with one item per episode; confirm in utils.questions.
questions_per_episode = group_questions_by_episode(questions)

# Rephrasing questions & keeping self-sufficient only

In [7]:
prompt_template_path = os.path.join(PROJECT_DIR, "prompts/rephrase_questions.txt")


def _rephrase_episode_questions(episode_questions):
    """Pass one item of ``questions_per_episode`` to ``openai_rephrase`` with gpt-4o-mini."""
    return openai_rephrase(
        episode_questions=episode_questions,
        prompt_template_path=prompt_template_path,
        model="gpt-4o-mini",
    )


# Rephrase every item, then flatten the per-item results into one list.
rephrased_questions = flatten_list_of_lists(
    map_progress(
        f=_rephrase_episode_questions,
        seq=questions_per_episode,
        max_workers=4,
        verbose=False,
    )
)

  0%|          | 0/302 [00:00<?, ?it/s]

In [8]:
# Compare how many questions went into the rephrasing step and how many came out.
n_sampled_questions = len(questions)
n_rephrased_questions = len(rephrased_questions)
print("Number of questions before rephrasing & keeping self-sufficient only:", n_sampled_questions)
print("Number of questions after rephrasing & keeping self-sufficient only:", n_rephrased_questions)

Number of questions before rephrasing & keeping self-sufficient only: 1000
Number of questions after rephrasing & keeping self-sufficient only: 344


In [9]:
# Keys every rephrased question must carry: the retrieval evaluation later
# reads 'episode_id' and 'chunk_id', and 'question' is renamed to 'text'.
_REQUIRED_QUESTION_KEYS = ("question", "episode_id", "chunk_id")

rephrased_questions_reviewed = []
for rephrased_question in rephrased_questions:
    # Best-effort filter: skip entries that are not dicts or that lack a
    # required key, instead of swallowing every exception with a bare except.
    if not isinstance(rephrased_question, dict):
        continue
    if any(key not in rephrased_question for key in _REQUIRED_QUESTION_KEYS):
        continue

    # Build a new dict rather than popping from the original: the items of
    # `rephrased_questions` stay untouched, so re-running this cell gives the
    # same result instead of dropping every item on the second run.
    reviewed_question = {
        key: value
        for key, value in rephrased_question.items()
        if key != "question"
    }
    reviewed_question["text"] = rephrased_question["question"]
    rephrased_questions_reviewed.append(reviewed_question)

In [10]:
# Compare the list sizes before and after the key check.
n_before_review = len(rephrased_questions)
n_after_review = len(rephrased_questions_reviewed)
print("Number of questions before reviewing:", n_before_review)
print("Number of questions after reviewing:", n_after_review)

Number of questions before reviewing: 344
Number of questions after reviewing: 12


In [44]:
# Destination of the reviewed questions inside the project tree.
questions_path = os.path.join(PROJECT_DIR, "data/generated_questions/", "questions.json")

# Persist the reviewed questions; `replace=True` is forwarded to save_json_file.
save_json_file(rephrased_questions_reviewed, questions_path, replace=True)

Data successfully saved to /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/data/generated_questions/questions.json


In [46]:
# Preview the first 10 reviewed questions.
rephrased_questions_reviewed[:10]

[{'episode_id': 'FUS6ceIvUnI',
  'chunk_id': 3,
  'llm_answer': 'Visual learning can have applications in language through visual aids such as images or videos that illustrate word meanings, helping learners connect concepts visually. In other domains, examples include interpreting visual data in science, understanding art and design through visual storytelling, and applying visual analytics in business intelligence.',
  'text': 'What are some examples of visual learning applications in language or other domains?'},
 {'episode_id': 'FUS6ceIvUnI',
  'chunk_id': 9,
  'llm_answer': 'Yes, having a robust function to compare objects can reveal underlying features and relationships, allowing for a deeper understanding of individual objects by highlighting their unique characteristics and how they relate to others.',
  'text': 'Does developing an effective function to compare objects lead to deeper understanding of individual objects?'},
 {'episode_id': 'FUS6ceIvUnI',
  'chunk_id': 62,
  'llm

# Vectorizing questions

In [45]:
from utils.ollama import embed_document
from utils.variables import OLLAMA_CLIENT

In [31]:
# Re-load the saved questions from disk.
# NOTE: this rebinds `rephrased_questions` to the contents of questions.json.
questions_path = os.path.join(PROJECT_DIR, "data/generated_questions/", "questions.json")
rephrased_questions = read_json_file(questions_path)

In [32]:
# Embed every question with embed_document via the Ollama client.
# Use the list re-loaded from questions.json (`rephrased_questions`) rather
# than the in-memory `rephrased_questions_reviewed`, so this section runs on a
# fresh kernel without repeating the question-generation cells.
vectorized_questions = map_progress(
    f=lambda q: embed_document(OLLAMA_CLIENT, q),
    seq=rephrased_questions,
    max_workers=4,
    verbose=False
)

  0%|          | 0/495 [00:00<?, ?it/s]

1/495 items processed so far...
25/495 items processed so far...
49/495 items processed so far...
73/495 items processed so far...
97/495 items processed so far...
121/495 items processed so far...
145/495 items processed so far...
169/495 items processed so far...
193/495 items processed so far...
217/495 items processed so far...
241/495 items processed so far...
265/495 items processed so far...
289/495 items processed so far...
313/495 items processed so far...
337/495 items processed so far...
361/495 items processed so far...
385/495 items processed so far...
409/495 items processed so far...
433/495 items processed so far...
457/495 items processed so far...
481/495 items processed so far...
495/495 items processed.


In [39]:
# Store the embedded questions so later sections can reload them from disk.
pickle_file_path = os.path.join(PROJECT_DIR, "data/generated_document_embeddings/vectorized_questions.pkl")
save_to_pickle(vectorized_questions, pickle_file_path)

# Assessing Retrieval

In [9]:
from utils.query import (
    elastic_search_text, elastic_search_knn, elastic_search_hybrid_rrf
)
from utils.variables import ES_CLIENT
from utils.evaluate import hit_rate, mrr

In [3]:
# Reload the embedded questions written by save_to_pickle.
pickle_file_path = os.path.join(PROJECT_DIR, "data/generated_document_embeddings/vectorized_questions.pkl")
vectorized_questions = load_pickle(pickle_file_path)

In [4]:
# Spot-check the text of the first loaded question.
vectorized_questions[0]["text"]

'What is the key methodology of the clustering that enables its effectiveness?'

In [28]:
# Collects one metrics dict per search method; the cells below append to it.
# Re-run this cell before re-running those cells, otherwise rows are duplicated.
performance = []

### Text

In [5]:
def text_retrieval(question_dict):
    """
    Flag which results of a text search match the question's source chunk.

    :param question_dict: Dict with 'episode_id', 'chunk_id' and 'text' keys.
    :return: List of booleans, one per document returned by
        ``elastic_search_text`` for the question text, in result order; True
        where the document's ('id', 'chunk_id') equals the question's
        ('episode_id', 'chunk_id').
    """
    expected_key = (question_dict['episode_id'], question_dict['chunk_id'])
    retrieved_docs = elastic_search_text(question_dict["text"])
    return [(hit['id'], hit['chunk_id']) == expected_key for hit in retrieved_docs]

In [10]:
# One relevance list per question, produced by the text search.
text_relevance = map_progress(
    seq=vectorized_questions,
    f=text_retrieval,
    verbose=False,
    max_workers=4,
)

  0%|          | 0/495 [00:00<?, ?it/s]

In [29]:
# Record hit rate and MRR of the text search.
text_search_metrics = {
    "Search": "elastic_search_text",
    "HR": hit_rate(text_relevance),
    "MRR": mrr(text_relevance),
}
performance.append(text_search_metrics)

### Vector

In [17]:
def knn_retrieval(question_dict):
    """
    Flag which results of a kNN search match the question's source chunk.

    :param question_dict: Dict with 'episode_id', 'chunk_id' and
        'text_vector' keys.
    :return: List of booleans, one per document returned by
        ``elastic_search_knn`` for the question vector, in result order; True
        where the document's ('id', 'chunk_id') equals the question's
        ('episode_id', 'chunk_id').
    """
    expected_key = (question_dict['episode_id'], question_dict['chunk_id'])
    retrieved_docs = elastic_search_knn(question_dict["text_vector"])
    return [(hit['id'], hit['chunk_id']) == expected_key for hit in retrieved_docs]

In [18]:
# One relevance list per question, produced by the kNN search.
knn_relevance = map_progress(
    seq=vectorized_questions,
    f=knn_retrieval,
    verbose=False,
    max_workers=4,
)

  0%|          | 0/495 [00:00<?, ?it/s]

In [30]:
# Record hit rate and MRR of the kNN search.
knn_search_metrics = {
    "Search": "elastic_search_knn",
    "HR": hit_rate(knn_relevance),
    "MRR": mrr(knn_relevance),
}
performance.append(knn_search_metrics)

### Hybrid (RRF)

In [24]:
def hybrid_retrieval(question_dict):
    """
    Flag which results of a hybrid (RRF) search match the question's source chunk.

    :param question_dict: Dict with 'episode_id', 'chunk_id', 'text' and
        'text_vector' keys.
    :return: List of booleans, one per document returned by
        ``elastic_search_hybrid_rrf`` for the question text and vector, in
        result order; True where the document's ('id', 'chunk_id') equals the
        question's ('episode_id', 'chunk_id').
    """
    expected_key = (question_dict['episode_id'], question_dict['chunk_id'])
    retrieved_docs = elastic_search_hybrid_rrf(
        question_dict["text"],
        question_dict["text_vector"],
    )
    return [(hit['id'], hit['chunk_id']) == expected_key for hit in retrieved_docs]

In [25]:
# One relevance list per question, produced by the hybrid (RRF) search.
hybrid_relevance = map_progress(
    seq=vectorized_questions,
    f=hybrid_retrieval,
    verbose=False,
    max_workers=4,
)

  0%|          | 0/495 [00:00<?, ?it/s]

In [31]:
# Record hit rate and MRR of the hybrid (RRF) search.
hybrid_search_metrics = {
    "Search": "elastic_search_hybrid_rrf",
    "HR": hit_rate(hybrid_relevance),
    "MRR": mrr(hybrid_relevance),
}
performance.append(hybrid_search_metrics)

In [32]:
import pandas as pd

# Tabulate the collected HR / MRR results, one row per appended entry.
pd.DataFrame(performance)

Unnamed: 0,Search,HR,MRR
0,elastic_search_text,0.721212,0.594478
1,elastic_search_knn,0.40404,0.29963
2,elastic_search_hybrid_rrf,0.713131,0.48037


In [33]:
# List the text of every evaluated question.
[question["text"] for question in vectorized_questions]

['What is the key methodology of the clustering that enables its effectiveness?',
 'Is there a distinction between the concepts of learning and reasoning?',
 'How can something be effectively trained at a fast pace?',
 'Does self-supervised learning overfit to ImageNet, or can it be effective in real-world scenarios?',
 'What were some of the successes you experienced, and what challenges did you face that made it a positive experience?',
 'What was your vision for the scale and potential of this project?',
 'Did you ever think that it could become as popular as it ultimately did?',
 'What does it take to successfully build a community?',
 'Do you believe this model can continue to successfully fund businesses like Google?',
 'Is it important to have people in your life who believe in you, or should self-belief be your primary source of motivation?',
 'How can I leverage my skills to make a positive impact on the world?',
 "Could you summarize the main ideas from each of the five secti