In [53]:
from llama_index.core.node_parser import SentenceSplitter
import textwrap
import pickle
import os
from dotenv import load_dotenv
from pinecone import Pinecone, Index, ServerlessSpec
from llama_index.vector_stores.pinecone import PineconeVectorStore
import pickle
import os
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.core import PromptTemplate

In [2]:
def import_pkl_file(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and is closed again before the
    function returns.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.
    """
    with open(file_path, 'rb') as pkl_handle:
        return pickle.load(pkl_handle)

In [3]:
def get_full_text_from_char_timestamps(char_timestamps):
    """Rebuild the transcript text from per-character timestamp pairs.

    Args:
        char_timestamps: Iterable of ``(char, timestamp)`` pairs. Only the
            first element of each pair is used here.

    Returns:
        The concatenation of every ``char`` in input order (``""`` when the
        input is empty).
    """
    # ``str.join`` assembles the result in one pass instead of re-copying the
    # growing string on every ``+=``.
    return "".join(char for char, _ in char_timestamps)

In [4]:
def get_start_end_idx(all_text_chunks, full_text):
    """Locate each chunk inside ``full_text`` and return its character span.

    Chunks are expected to be listed in the order they occur in
    ``full_text`` (they may overlap). Each chunk is searched for starting
    just after the previous chunk's start, so a chunk whose text also occurs
    earlier in the document is mapped to its own occurrence rather than to
    the first one. If nothing is found from that position, the whole text is
    searched as a fallback.

    Args:
        all_text_chunks: Sequence of chunk strings.
        full_text: The string the chunks were cut from.

    Returns:
        A list of ``(start_idx, end_idx)`` tuples with ``end_idx`` inclusive.
        If a chunk cannot be found at all, a message is printed and
        processing stops, so the list can be shorter than
        ``all_text_chunks``.
    """
    text_chunks_start_end_idx = []
    search_from = 0
    for i, text_chunk in enumerate(all_text_chunks):
        # Prefer the first occurrence at or after the cursor.
        start_idx = full_text.find(text_chunk, search_from)
        if start_idx == -1:
            # Fall back to a whole-text search for out-of-order chunks.
            start_idx = full_text.find(text_chunk)
        if start_idx == -1:
            print(f"Chunk {i} not found in full text")
            break
        end_idx = start_idx + len(text_chunk) - 1
        text_chunks_start_end_idx.append((start_idx, end_idx))
        # Advance by one (not past the chunk end) because chunks may overlap.
        search_from = start_idx + 1
    return text_chunks_start_end_idx

In [5]:
def get_time_stamps(start_end_idx, char_timestamps):
    """Translate character spans into ``(start_time, end_time)`` pairs.

    For every ``(start_idx, end_idx)`` in ``start_end_idx`` the timestamp
    (element ``[1]``) of ``char_timestamps`` at each of the two indices is
    looked up and returned, in the same order as the input spans.
    """
    return [
        (char_timestamps[first][1], char_timestamps[last][1])
        for first, last in start_end_idx
    ]

In [6]:
def combine_text_chunks_with_timestamps(all_text_chunks, all_text_chunk_timestamps):
    """Pair every chunk with the timestamp entry at the same position.

    Returns a list of ``(chunk, timestamps)`` tuples, one per element of
    ``all_text_chunks``. Indexing is driven by ``all_text_chunks``, so a
    shorter ``all_text_chunk_timestamps`` raises ``IndexError``.
    """
    return [
        (all_text_chunks[pos], all_text_chunk_timestamps[pos])
        for pos in range(len(all_text_chunks))
    ]

In [7]:
# Load the transcript's per-character timestamps; the helpers above consume
# this as a sequence of (char, timestamp) pairs.
char_timestamps = import_pkl_file('data/clip_2/ivanka_trump_transcription_char_timestamps.pkl')

In [8]:
text_parser = SentenceSplitter(chunk_size=1024)

# Rebuild the full transcript string, then split it into chunks.
full_text_string = get_full_text_from_char_timestamps(char_timestamps)
text_chunks = text_parser.split_text(full_text_string)

# Map every chunk back to its character span in the transcript.
start_end_idx = get_start_end_idx(text_chunks, full_text_string)
if len(start_end_idx) != len(text_chunks):
    # get_start_end_idx stops at the first chunk it cannot find; fail here
    # with a clear message instead of an IndexError further down.
    raise ValueError(
        f"Located only {len(start_end_idx)} of {len(text_chunks)} chunks in the full text"
    )

# Convert spans to (start_time, end_time) and pair them with the chunk text.
time_stamps = get_time_stamps(start_end_idx, char_timestamps)
text_chunks_with_timestamps = combine_text_chunks_with_timestamps(text_chunks, time_stamps)


In [9]:
# Peek at the first (chunk_text, (start_time, end_time)) pair.
text_chunks_with_timestamps[0]


("The following is a conversation with Ivanka Trump, businesswoman, real estate developer, and former senior advisor to the President of the United States. I've gotten to know Ivanka well over the past two years. We've become good friends, hitting it off right away over our mutual love of reading, especially philosophical writings from Marcus Aurelius, Joseph Campbell, Alan Watts, Victor Franklin, and so on. She is a truly kind, compassionate, and thoughtful human being. In the past, people have attacked her. In my view, to get indirectly at her dad, Donald Trump, as part of a dirty game of politics and clickbait journalism. These attacks obscured many projects and efforts, often bipartisan, that she helped get done, and they obscured the truth of who she is as a human being. Through all that, she never returned the attacks with anything but kindness, and always walked through the fire of it all with grace. For this, and much more, she is an inspiration, and I'm honored to be able to c

In [10]:
# Load environment variables from the local .env file.
dotenv_path = '.env'
load_dotenv(dotenv_path=dotenv_path)

# Subscript access raises KeyError if PINECONE_API_KEY is not set.
api_key = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=api_key)

# Name of the Pinecone index used by the rest of the notebook.
index_name = "llamaindex-rag-ivanka-timestamp"

In [11]:
# Create the index only when it is not already listed, so re-running this
# cell does not try to create it twice.
# dimensions are for text-embedding-ada-002
# NOTE(review): metric is "euclidean", so the scores returned by queries are
# presumably distances (lower = closer) rather than similarities — confirm.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=1536,
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [12]:
# Get a handle to the index and wrap it in the LlamaIndex vector store.
pinecone_index = pc.Index(index_name)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [13]:
# Build one TextNode per chunk. Timestamps are not set here; a later cell
# writes them into each node's metadata.
nodes = [
    TextNode(text=chunk_text)
    for chunk_text, _chunk_times in text_chunks_with_timestamps
]

In [14]:
# LLM shared by the metadata extractors below (and by the response
# synthesis cells further down).
llm = OpenAI(model="gpt-3.5-turbo")

# Metadata extractors run by the ingestion pipeline: one for a title and one
# for the questions a chunk can answer.
extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]

In [15]:
# Run the extractors over the nodes. The bare top-level ``await`` works
# because the notebook kernel already has a running event loop.
# NOTE: ``nodes`` is rebound to the pipeline output, so re-running this cell
# feeds already-processed nodes back through the extractors.
pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

100%|██████████| 5/5 [00:01<00:00,  3.96it/s]
100%|██████████| 42/42 [00:14<00:00,  2.80it/s]


In [16]:
# Show the text of the first node (default metadata mode).
print(nodes[0].get_content())

The following is a conversation with Ivanka Trump, businesswoman, real estate developer, and former senior advisor to the President of the United States. I've gotten to know Ivanka well over the past two years. We've become good friends, hitting it off right away over our mutual love of reading, especially philosophical writings from Marcus Aurelius, Joseph Campbell, Alan Watts, Victor Franklin, and so on. She is a truly kind, compassionate, and thoughtful human being. In the past, people have attacked her. In my view, to get indirectly at her dad, Donald Trump, as part of a dirty game of politics and clickbait journalism. These attacks obscured many projects and efforts, often bipartisan, that she helped get done, and they obscured the truth of who she is as a human being. Through all that, she never returned the attacks with anything but kindness, and always walked through the fire of it all with grace. For this, and much more, she is an inspiration, and I'm honored to be able to cal

In [17]:
# Inspect the metadata keys written by the extractors.
print(nodes[0].metadata.keys())
# start_timestamp / end_timestamp are not in the metadata yet at this point;
# they are added in a later cell.
print(nodes[0].metadata['document_title'])
print(nodes[0].metadata['questions_this_excerpt_can_answer'])

dict_keys(['document_title', 'questions_this_excerpt_can_answer'])
"Architectural Ambitions: Exploring Beauty, Function, and Innovation in New York and Chicago"
1. How did Ivanka Trump's childhood experiences and family background influence her passion for architecture and real estate development?
2. What aspects of architecture and real estate development does Ivanka Trump find most compelling and challenging, based on her personal experiences and reflections?
3. How does Ivanka Trump's perspective on the confidence and ambition of youth influence her approach to tackling large-scale projects in the fields of architecture and real estate development?


In [19]:
embed_model = OpenAIEmbedding()

# Embed each node's text together with its metadata and store the vector on
# the node itself.
for node in nodes:
    embedded_text = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(embedded_text)

In [20]:
# Show the exact text (metadata + excerpt) that was embedded for the first node.
print(nodes[0].get_content(metadata_mode="all"))

[Excerpt from document]
document_title: "Architectural Ambitions: Exploring Beauty, Function, and Innovation in New York and Chicago"
questions_this_excerpt_can_answer: 1. How did Ivanka Trump's childhood experiences and family background influence her passion for architecture and real estate development?
2. What aspects of architecture and real estate development does Ivanka Trump find most compelling and challenging, based on her personal experiences and reflections?
3. How does Ivanka Trump's perspective on the confidence and ambition of youth influence her approach to tackling large-scale projects in the fields of architecture and real estate development?
Excerpt:
-----
The following is a conversation with Ivanka Trump, businesswoman, real estate developer, and former senior advisor to the President of the United States. I've gotten to know Ivanka well over the past two years. We've become good friends, hitting it off right away over our mutual love of reading, especially philosoph

In [23]:
# Sanity check: the next cell pairs nodes and chunks by position, so the two
# counts should match.
print(len(nodes))
print(len(text_chunks_with_timestamps))

42
42


In [30]:
# Attach each chunk's start/end time to the matching node. This runs after
# the embedding cell, so the timestamps are not part of the embedded text.
# Nodes and chunks are paired by position, so refuse to continue if the two
# lists have drifted apart.
if len(nodes) != len(text_chunks_with_timestamps):
    raise ValueError(
        f"{len(nodes)} nodes but {len(text_chunks_with_timestamps)} timestamped chunks"
    )
for node, (_chunk_text, (start_time, end_time)) in zip(nodes, text_chunks_with_timestamps):
    # ``metadata`` is the current attribute; ``extra_info`` is a deprecated
    # alias that returns the same dict.
    node.metadata['start_timestamp'] = float(start_time)
    node.metadata['end_timestamp'] = float(end_time)

In [33]:
# Confirm the timestamps now appear in the first node's metadata.
print(nodes[0].get_content(metadata_mode="all"))

[Excerpt from document]
document_title: "Architectural Ambitions: Exploring Beauty, Function, and Innovation in New York and Chicago"
questions_this_excerpt_can_answer: 1. How did Ivanka Trump's childhood experiences and family background influence her passion for architecture and real estate development?
2. What aspects of architecture and real estate development does Ivanka Trump find most compelling and challenging, based on her personal experiences and reflections?
3. How does Ivanka Trump's perspective on the confidence and ambition of youth influence her approach to tackling large-scale projects in the fields of architecture and real estate development?
start_timestamp: 0.06449438202247192
end_timestamp: 316.72680412371136
Excerpt:
-----
The following is a conversation with Ivanka Trump, businesswoman, real estate developer, and former senior advisor to the President of the United States. I've gotten to know Ivanka well over the past two years. We've become good friends, hitting 

In [34]:
# Upload the embedded nodes to the Pinecone-backed vector store; the cell
# output is the list of node ids returned by ``add``.
vector_store.add(nodes)

Upserted vectors:   0%|          | 0/42 [00:00<?, ?it/s]

['b49e4f0e-0e8d-4d2d-9339-e508c50c9f4e',
 '3acccaf9-a657-4b68-8730-a687b13f844f',
 'c258a541-c9ce-415e-ae69-a5c5a418a413',
 'd06d3e06-0af0-4ba7-8306-a646a23f9b11',
 '42f7c655-3569-462a-b000-04fbfb4add25',
 '3601686b-835b-46eb-a3a9-a4c900391032',
 'bb1c0e1b-fd38-41e6-be56-6862881dfae8',
 '5f89daa6-ff70-4d6f-85dd-276bc98769fa',
 '8ac9a48c-f03d-46d5-9b80-4084cb364b3c',
 '5119277c-f5ee-47de-9c8b-64aacb86be1e',
 '7a5a4138-1761-456b-964e-25e3bf892579',
 '0af8505d-26c3-4694-8754-1322e1b0274c',
 '02d2e322-f41f-4f21-a33f-936ba447f497',
 '11179db6-d40b-4a89-8582-382e6c0e3f2e',
 'caa74fda-f4aa-49fa-a4b0-c813e7a1b233',
 'e13946a9-54fb-4557-bd63-9e907d4a43e4',
 '68eac674-7b5c-4fec-af0a-921166c3b4af',
 '7cfdfe92-4d4f-40fc-a91a-4e6b8d5f8946',
 '32d84438-08c4-4722-91a7-83b3b6ef57ab',
 '6ddb2ddd-b59d-4e13-8964-945e55c77080',
 '11811e96-da50-4cb6-9f74-68959edf16d2',
 '20992b22-4fe7-4434-8f63-f69ca27e532d',
 '278db14b-8660-454e-89ec-7384febfebe7',
 '84451beb-709e-4872-b981-03e08e192a93',
 'b71e2974-0392-

#### Retrieval stage

In [35]:
from llama_index.core.vector_stores import VectorStoreQuery

In [38]:
# The question to answer, and its embedding from the same embedding model
# that was used for the nodes.
query_str = "describe the incident with kim kardashian"
query_embedding = embed_model.get_query_embedding(query_str)

In [39]:
query_mode = "default"

# Ask the vector store for the 3 best matches to the query embedding.
vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding, similarity_top_k=3, mode=query_mode
)

query_result = vector_store.query(vector_store_query)
# Bare expression: displays the raw result (including full embedding vectors).
query_result

VectorStoreQueryResult(nodes=[TextNode(id_='21be2f48-229e-4177-959b-c4237f64246f', embedding=[-0.00130108837, -0.019167, 0.00229185726, -0.018279383, -0.00698999129, 0.0216495562, -0.000573831145, -0.00575218024, -0.0274052043, -0.00599488849, 0.0440480411, 0.00050925347, -0.0083144838, 0.00320547959, -0.0145486128, 0.00259524235, 0.0371967405, -0.00842543598, -0.00338230981, -0.015325279, -0.0291804392, -0.0127179008, 0.00465306, 0.0116430512, -0.0120521877, 0.0283205602, 0.0251722895, -0.0220101513, -0.00877909642, -0.0180297401, 0.0104433801, 0.00687557179, -0.0267949663, -0.0199991427, -0.00830754917, -0.00625493284, -0.023882471, -0.0235912204, 0.0155610517, -0.0078221336, 0.0153807551, -0.00973605923, -0.00616478408, -0.0249087792, 0.00182204391, 0.0173362885, -0.00872362, -0.0123295682, -0.00876522716, 0.012752573, 0.0300680585, 0.0260321703, -0.0260876454, -0.0173501577, -0.0126416208, -0.0141672147, 0.0107068913, 0.0168786105, 0.00999957137, 0.0165180154, 0.028209608, 0.015242

In [42]:
from llama_index.core.schema import NodeWithScore
from typing import Optional

# Wrap every returned node with its score; the score stays None when the
# store returned no similarities.
similarities = query_result.similarities
nodes_with_scores = []
for position, result_node in enumerate(query_result.nodes):
    score: Optional[float] = (
        similarities[position] if similarities is not None else None
    )
    nodes_with_scores.append(NodeWithScore(node=result_node, score=score))

In [43]:
from llama_index.core.response.notebook_utils import display_source_node

# Render each retrieved node (id, score, and up to 1000 characters of text).
for node in nodes_with_scores:
    display_source_node(node, source_length=1000)

**Node ID:** 21be2f48-229e-4177-959b-c4237f64246f<br>**Similarity:** 0.451202154<br>**Text:** so I think she's very cool I hope you have a long conversation yeah she's like okay so there's many things to say about her at first like incredibly great musician songwriters performer yeah also can create an image and have fun with it you know like have fun being herself like over the top it feels that way right like she's really she enjoys after all these years it feels like she's enjoying she like enjoys what she does and you also have the sense that if she didn't she wouldn't do it that's right and just an iconic country musician country music singer yeah um there's a lot we've talked about a lot of musicians what do you enjoy you mentioned a Dal seeing her perform hanging out with her yeah I mean she's extraordinary her voice is unreal um so she is I find her to be so talented and she's so unique in that three-year-olds love her music she's actually the first concert Arabela ever went to and Madison Square Garden when she wished she was around four and nine-year-olds love her ...<br>

**Node ID:** caa74fda-f4aa-49fa-a4b0-c813e7a1b233<br>**Similarity:** 0.482552052<br>**Text:** And to still be a big organization act like a startup is the big challenge. It's super difficult to deconstruct that as well once it's in place, right? It's it's circumventing layers and asking questions, probing questions of people on the ground level is a huge challenge to the authority of the hierarchy. And there's tremendous amount of resistance to it. So it's how do you grow something in the case of a company in terms of a culture that can scale but doesn't lose its connection to to sort of real and meaningful feedback. It's it's not not easy. I've had a lot of conversations with Jim Keller who is this legendary engineer and leader and he he has talked about like you often have to kind of be a little bit of an asshole in the room, not in a mean way, but it's like it's uncomfortable. Yeah. Like a lot of these questions that are uncomfortable, they break the kind of general politeness and civility that people have in communication. When you get a meeting, like nobody wants to be ...<br>

**Node ID:** 04cb7715-e844-4218-93fc-cf4c7c365849<br>**Similarity:** 0.498766065<br>**Text:** advisor you know the campaign I never I never thought about joining it was kind of like get to the end of it and when it started I was like everything in my life was almost firing on all cylinders I two young kids at home during the course of the campaign I ended up I was pregnant with my third so this young family my businesses real estate and and fashion and working alongside my brothers running the Trump Hotel collection and with so many my life was full and busy and and so there was a big part of me that was just wanted to get through just get through it without really thinking forward to what the implications were for me but when my father won he asked Jared and I to join him and in asking that question you know keep in mind he was a total outsider so there was no bench of people as he would have today he had never spent the night in Washington they see you before yeah staying in the White House and so when he asked us to join him he trusted us he trusted in our ability to to e...<br>

In [47]:
# The timestamp metadata is available on nodes returned from the vector store.
print(query_result.nodes[0].metadata['start_timestamp'])

9562.877254901961


### Try a simple prompt

In [54]:
# Question-answering template with two placeholders: {context_str} for the
# retrieved text and {query_str} for the user's question.
qa_prompt = PromptTemplate(
    """\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: \
"""
)

### Hierarchical summarization

In [75]:
import nest_asyncio
import asyncio

# Patch asyncio so nested event-loop use is allowed inside the notebook.
# NOTE(review): ``asyncio`` itself is not referenced by any cell shown in
# this notebook.
nest_asyncio.apply()

In [76]:
def combine_results(
    texts,
    query_str,
    qa_prompt,
    llm,
    cur_prompt_list,
    num_children=10,
):
    """Recursively merge ``texts`` into a single answer string.

    The texts are processed in batches of ``num_children``. Each batch is
    joined with blank lines, formatted into ``qa_prompt`` and sent to
    ``llm.complete``. If more than one batch response results, the responses
    are combined again the same way until a single one remains.

    Args:
        texts: List of strings to combine.
        query_str: The user query inserted into every prompt.
        qa_prompt: Object whose ``format(context_str=..., query_str=...)``
            returns the prompt text.
        llm: Object whose ``complete(prompt)`` returns a response that is
            converted with ``str``.
        cur_prompt_list: List that every formatted prompt is appended to
            (mutated in place, across all recursion levels).
        num_children: Maximum number of texts merged per LLM call.

    Returns:
        The final combined response as a string, or ``""`` when ``texts`` is
        empty.

    Raises:
        ValueError: If ``num_children`` is below 1, or is 1 while more than
            one text has to be combined (the reduction could never finish).
    """
    if not texts:
        # Nothing to combine; without this guard the recursion never ends.
        return ""
    if num_children < 1 or (num_children == 1 and len(texts) > 1):
        raise ValueError(
            "num_children must be at least 2 to combine more than one text"
        )

    new_texts = []
    for idx in range(0, len(texts), num_children):
        text_batch = texts[idx : idx + num_children]
        context_str = "\n\n".join(text_batch)
        fmt_qa_prompt = qa_prompt.format(
            context_str=context_str, query_str=query_str
        )
        combined_response = llm.complete(fmt_qa_prompt)
        new_texts.append(str(combined_response))
        cur_prompt_list.append(fmt_qa_prompt)

    if len(new_texts) == 1:
        return new_texts[0]

    # Pass the prompt log along; the original call omitted this required
    # argument and failed with TypeError on the second level.
    return combine_results(
        new_texts,
        query_str,
        qa_prompt,
        llm,
        cur_prompt_list,
        num_children=num_children,
    )


def generate_response_hs(
    retrieved_nodes, query_str, qa_prompt, llm, num_children=10
):
    """Answer ``query_str`` with a hierarchical summarization strategy.

    Each retrieved node is first answered on its own by formatting its
    content into ``qa_prompt`` and calling ``llm.complete``. The per-node
    answers are then handed to ``combine_results``, which merges them in
    groups of ``num_children`` until one response is left.

    Returns:
        A ``(response_text, prompts)`` tuple, where ``prompts`` lists every
        formatted prompt in the order it was issued.
    """
    prompt_log = []
    leaf_answers = []
    for retrieved in retrieved_nodes:
        leaf_prompt = qa_prompt.format(
            context_str=retrieved.get_content(), query_str=query_str
        )
        leaf_answers.append(llm.complete(leaf_prompt))
        prompt_log.append(leaf_prompt)

    # combine_results appends its own prompts to the same log.
    leaf_texts = list(map(str, leaf_answers))
    final_text = combine_results(
        leaf_texts,
        query_str,
        qa_prompt,
        llm,
        prompt_log,
        num_children=num_children,
    )
    return final_text, prompt_log

In [77]:
# ``retrieved_nodes`` is not assigned by any earlier cell in this notebook,
# so bind it to the nodes returned by the vector-store query; otherwise this
# cell raises NameError on a fresh kernel.
retrieved_nodes = query_result.nodes

response, fmt_prompts = generate_response_hs(
    retrieved_nodes, query_str, qa_prompt, llm
)

In [74]:
# NOTE(review): the saved output of this cell is a coroutine repr for
# ``acombine_results``, which is not defined in this notebook, and the cell's
# execution count predates the generate_response_hs call — the output looks
# stale; re-run to refresh it.
print(str(response))

<coroutine object acombine_results at 0x3350ed380>


In [73]:
# Display every prompt that was sent to the LLM while building the response.
fmt_prompts

["Context information is below.\n---------------------\nso I think she's very cool I hope you have a long conversation yeah she's like okay so there's many things to say about her at first like incredibly great musician songwriters performer yeah also can create an image and have fun with it you know like have fun being herself like over the top it feels that way right like she's really she enjoys after all these years it feels like she's enjoying she like enjoys what she does and you also have the sense that if she didn't she wouldn't do it that's right and just an iconic country musician country music singer yeah um there's a lot we've talked about a lot of musicians what do you enjoy you mentioned a Dal seeing her perform hanging out with her yeah I mean she's extraordinary her voice is unreal um so she is I find her to be so talented and she's so unique in that three-year-olds love her music she's actually the first concert Arabela ever went to and Madison Square Garden when she wi

In [78]:
# Print the final answer wrapped to 100 columns for readability.
print(textwrap.fill(response, width=100))

The incident with Kim Kardashian involved her bringing the case of Alice Johnson to the attention of
Ivanka Trump, which ultimately led to Ivanka working on the case and helping to commute Alice
Johnson's sentence. This resulted in a profound experience for Ivanka and her children, as they
attended a concert with Alice Johnson and had meaningful conversations about her story and the
impact of their work on criminal justice reform.
