In [1]:
import sys
import os
import numpy as np
import warnings
from tqdm.auto import tqdm
import pickle
import matplotlib.pyplot as plt
import pandas as pd


# Suppress all warnings
warnings.filterwarnings("ignore")

## replace with root project dir
PROJECT_DIR = "/mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant"
sys.path.append(PROJECT_DIR)

from utils.utils import (
    initialize_env_variables,
    flatten_list_of_lists,
    sample_from_list,
    read_json_file,
    save_json_file,
    extract_item_by_keys,
    save_to_pickle,
    load_pickle,
    get_json_files_in_dir,
    read_json_file
)

initialize_env_variables()

from utils.questions import (extract_questions,
                             group_questions_by_episode, 
                             openai_process_questions,
                            filter_corrupted_qs,
                            count_question_marks)
from utils.multithread import map_progress
from datasets import load_dataset, Dataset
from utils.ollama import embed_document
from utils.variables import OLLAMA_CLIENT, INDEX_NAME, ES_CLIENT

from transformers import LEDForConditionalGeneration, LEDTokenizer
import torch

from utils.query import (
    elastic_search_text, elastic_search_knn, elastic_search_hybrid_rrf
)
from utils.variables import ES_CLIENT
from utils.query import (
    elastic_search_text, elastic_search_knn, elastic_search_hybrid_rrf
)
from utils.variables import ES_CLIENT
from utils.evaluate import hit_rate, mrr, retrieve_relevance

Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env
Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env
Connected to Elasticsearch


In [2]:
# reading chunks

documents = read_json_file("/mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/data/generated_documents/documents.json")

In [3]:
# Filter long chunks 300 words or more

long_chunks = []
for i in range(len(documents)):
    len_ = len(documents[i]["text"].split())
    if len_ >= 300:
        long_chunks.append(documents[i])

print(len(long_chunks), ":", len(documents))

17892 : 30681


In [4]:
# Sample 250 chunks

np.random.seed(42)
sampled_chunks = []

n_to_sample = 250
samples_indices = np.random.choice(range(len(long_chunks)), n_to_sample)

for ind in samples_indices:
    doc = long_chunks[ind].copy()
    chunk = {}
    chunk['episode_id'] = doc.pop('id')
    chunk['chunk_id'] = doc.pop('chunk_id')
    chunk['text'] = doc.pop('title') + "\n" + doc.pop("text")
    
    sampled_chunks.append(chunk)

In [5]:
# Group 5 chunks together to reduce tokens (repitition of instructions)

def split_into_sublists(lst, sublist_size=5):
    return [lst[i:i + sublist_size] for i in range(0, len(lst), sublist_size)]

sampled_chunks = split_into_sublists(sampled_chunks)

In [9]:
# Now on to llm prompting

prompt_template_path = os.path.join(PROJECT_DIR, "prompts/questions_from_chunks.txt")

generated_questions = map_progress(
    f=lambda chunks:openai_process_questions(
        chunks=chunks,
        prompt_template_path=prompt_template_path,
        model="gpt-4o-mini",
    ),
    seq=sampled_chunks,
    max_workers=4,
    verbose=False
)

generated_questions = flatten_list_of_lists(generated_questions)`

  0%|          | 0/50 [00:00<?, ?it/s]

In [10]:
generated_questions

[[{'episode_id': 'hy2G3PhGm-g',
   'chunk_id': 57,
   'question': 'What concerns does Nicole Perlroth express about the presence of Jeffrey Epstein at MIT?'},
  {'episode_id': 'nre0QT9LN6w',
   'chunk_id': 70,
   'question': "What was the reaction of some customers to Baxter's screen showing a face?"},
  {'episode_id': 'IUHkhB366tE',
   'chunk_id': 23,
   'question': 'What particle is mentioned as the simplest example that demonstrates how bosons behave?'},
  {'episode_id': 'fIPxfzfOTxk',
   'chunk_id': 71,
   'question': 'According to Georges St-Pierre, what does he believe is the opposite of love?'}],
 [{'episode_id': 'Udh22kuLebg',
   'chunk_id': 81,
   'question': 'How has Roger changed over the years according to his own reflection?'},
  {'episode_id': 'lvh3g7eszVQ',
   'chunk_id': 113,
   'question': 'What does Andrew Huberman suggest can enhance endurance while maintaining muscle size?'},
  {'episode_id': 'I51DuprOb0o',
   'chunk_id': 62,
   'question': "What is Dmitry Korkin's 

In [12]:
# Save Generated questions

questions_path = os.path.join(
    PROJECT_DIR,
    "data/generated_questions/",
    "questions.json"
)

save_json_file(generated_questions, questions_path, replace=True)

Data successfully saved to /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/data/generated_questions/questions.json


In [15]:
# Vectorize questions and store to data/*embeddings*

vectorized_questions = map_progress(
    f=lambda document: embed_document(
        OLLAMA_CLIENT, document, vector_key='question_vector'),
    seq=generated_questions,
    max_workers=6,
    verbose=False
)

  0%|          | 0/234 [00:00<?, ?it/s]

In [20]:
# save the embeddings to file system

save_to_pickle(
    vectorized_questions,
    os.path.join(
        PROJECT_DIR,
        "data/generated_embeddings/vectorized_questions.pkl"
    )
)