### Setup paths

In [1]:
import sys
import os

In [2]:
# Get the absolute path of the project root
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

In [3]:
from data_pull_and_prep.audio_from_yt import download_audio

### Step 0: (Optional) Download audio files from YouTube

I'm using podcasts from youtube in this project as an example. This step is not needed if you already have an mp3 file, skip to step 1.

In [5]:
# Usage
video_url = "https://www.youtube.com/watch?v=vcEVgN4eET8"  # Replace with your video URL
video_name = "test3"  # Replace with your video name
output_dir = project_root+"/data/testing/audio_0/"  # Replace with your output directory

download_audio(video_url, video_name, output_dir)

Downloading audio...
Audio downloaded: /Users/rishikeshdhayarkar/rag-audio-indexing/data/testing/audio_0/test3.mp3


### Step 1: Convert mp3 file to text and generate time stamps for each character

In [3]:
import data_pull_and_prep.utils as utils
import data_pull_and_prep.data_preparation as data_prep
import textwrap

Convert audio to text using Open AI whisper

In [5]:
audio_file_path = project_root+"/data/testing/audio_0/test3.mp3"
transcription = data_prep.transcribe(audio_file_path)

  checkpoint = torch.load(fp, map_location=device)


Transcribed output contains an id, piece of converted text, start time and end time in the audio clip for this text. 

In [6]:
print(len(transcription))
print(f"id: {transcription[5][0]}")
print(f"text: {transcription[5][1]}")
print(f"start time: {transcription[5][2]}")
print(f"end time: {transcription[5][3]}")

34
id: 5
text:  what he called new dirt, like they're new dirt, like to make it super like with safety features
start time: 26.76
end time: 31.44


For each such segment(above cell), calculate the time stamp for each character in text by interpolation.

But why do we need character level time stamps?
Character level timestamps provide the flexibility to create textchunks of any size.

In [4]:
transcription_with_char_timestamps = utils.import_pkl_file(project_root+"/data/audio_1/ivanka_trump_transcription_char_timestamps.pkl")

In [5]:
# transcription_with_char_timestamps = data_prep.map_characters_to_timestamps(transcription)

In [6]:
print(f"Total number of characters: {len(transcription_with_char_timestamps)}")
transcription_with_char_timestamps[:5]

Total number of characters: 157283


[(' ', 0.0),
 ('T', 0.06449438202247192),
 ('h', 0.12898876404494383),
 ('e', 0.19348314606741573),
 (' ', 0.25797752808988766)]

Save character level timestamps

In [7]:
# utils.save_as_pickle_file(directory=project_root+"/data/testing/audio_0/",
#                     filename="transcription_with_char_timestamps.pkl",
#                     data=transcription_with_char_timestamps)

Create custom chunks using SentenceSplitter from Llamaindex.

In [8]:
custom_chunking_obj = data_prep.CreateCustomTextChunks(transcription_with_char_timestamps)
text_chunks_with_timestamps = custom_chunking_obj.create_custom_text_chunks()

In [9]:
print(f"Number of text chunks: {len(text_chunks_with_timestamps)}")

Number of text chunks: 42


In [10]:
print(textwrap.fill(str(text_chunks_with_timestamps[0]), width=80))

("The following is a conversation with Ivanka Trump, businesswoman, real estate
developer, and former senior advisor to the President of the United States. I've
gotten to know Ivanka well over the past two years. We've become good friends,
hitting it off right away over our mutual love of reading, especially
philosophical writings from Marcus Aurelius, Joseph Campbell, Alan Watts, Victor
Franklin, and so on. She is a truly kind, compassionate, and thoughtful human
being. In the past, people have attacked her. In my view, to get indirectly at
her dad, Donald Trump, as part of a dirty game of politics and clickbait
journalism. These attacks obscured many projects and efforts, often bipartisan,
that she helped get done, and they obscured the truth of who she is as a human
being. Through all that, she never returned the attacks with anything but
kindness, and always walked through the fire of it all with grace. For this, and
much more, she is an inspiration, and I'm honored to be able to c

### Step2: Create textnodes and add them to a vector store

In [11]:
import basic_rag.rag as rag
from dotenv import load_dotenv

dotenv_path = '.env'
load_dotenv(dotenv_path=dotenv_path)

pinecone_api_key = os.environ["PINECONE_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]

  from tqdm.autonotebook import tqdm


In [12]:
custom_rag_obj = rag.CustomRAG(pinecone_api_key=pinecone_api_key,
              openai_api_key=openai_api_key,
              index_name="ivanka-08-26-via-class-trail1",
              text_chunks_with_timestamps=text_chunks_with_timestamps[:10]
              )

In [13]:
await custom_rag_obj.create_text_nodes_and_add_to_vector_store()

100%|██████████| 5/5 [00:01<00:00,  4.16it/s]
100%|██████████| 10/10 [00:04<00:00,  2.14it/s]
Upserted vectors: 100%|██████████| 10/10 [00:01<00:00,  7.61it/s]


### Step 3: Embedding retrieval from vector store

In [41]:
query_str = "describe the incident with kim kardashian?"

In [42]:
custom_retriever_obj = rag.CustomRetriever(embed_model=custom_rag_obj.embed_model,
                                           vector_store=custom_rag_obj.vector_store)
query_result = custom_retriever_obj.retrieve(query=query_str)

In [43]:
query_result

VectorStoreQueryResult(nodes=[TextNode(id_='ed9da4ce-a9bf-4726-9361-d82bc8198923', embedding=[0.00428304868, -0.0131062642, 0.000118938333, -0.0216520634, 0.00568366656, 0.0106365755, -0.0307188649, -0.0233977605, -0.0186884366, -0.0279582255, 0.0376204588, -0.00344572286, 0.00210092682, 0.00908710063, -0.0101764696, -0.00167972641, 0.0365649201, -0.010399757, -0.0127205867, -0.0328028761, -0.0297715869, -0.0012179286, -0.00186918199, -0.00901267119, -0.0293926746, 0.00428304868, 0.01955452, -0.0220851041, -0.00376881217, -0.0208942406, 0.0101967687, -0.0105215497, -0.0123416763, -0.01545416, -0.00943218, -0.0113267358, -0.00232252199, -0.0262260605, 0.00614377297, 0.0126326252, 0.0183365904, 0.00960810296, -0.0210566316, -0.00547391223, -0.0077203135, 0.00384324114, 0.00422553532, -0.0137896575, -0.00956750568, 0.00896530692, 0.0322615728, 0.0240473226, -0.013228057, -0.0181065369, -0.0238984637, -0.0154000297, -0.00648546964, 0.00644148886, 0.00446235482, -0.00677303597, 0.0264155176

### Step 4: Response Synthesis

In [44]:
import basic_rag.response_synthesizer as response_synthesizer

response_synthesizer_obj = response_synthesizer.HierarchicalSummarizer(llm=custom_rag_obj.llm)
response = response_synthesizer_obj.generate_response_hs(retrieved_nodes=query_result.nodes, query_str=query_str)                                                         

In [45]:
print(textwrap.fill(response, 80))

I'm sorry, but based on the context information provided, there is no incident
involving Kim Kardashian mentioned.


In [None]:
# TODO
# 1. change class name to ingestion
# 2. resolve issues in data_preparation
# 3. run black and flake8 on py files
# 4. Add function docs
# 5. Clean up colab and rerun on whole video