In [23]:
# Import libraries

In [24]:
import pymupdf
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
import pandas as pd

In [25]:
def get_pdf_text(files):
    """Extract and concatenate the plain text of every page in the given PDFs.

    Args:
        files: Iterable of PDF locations, each passed to ``pymupdf.open``.

    Returns:
        One string containing every page's text, in file order and then page
        order, with nothing inserted between pages. Returns ``""`` when
        ``files`` is empty.
    """
    page_texts = []
    for file in files:
        # Open as a context manager so the document handle is released even
        # if text extraction raises part-way through the file.
        with pymupdf.open(file) as doc:
            page_texts.extend(page.get_text() for page in doc)
    # Join once at the end instead of growing a string inside the loop.
    return "".join(page_texts)

In [26]:
# Extract the text of video_stories.pdf; the bare name below echoes it as the cell output.
# NOTE(review): this displays the entire extracted text — consider showing a slice instead.
pdf_text = get_pdf_text(["video_stories.pdf"])
pdf_text 

'© 2024 IEEE. This is the author’s version of the article that has been published in the proceedings of IEEE\nVisualization conference. The final version of this record is available at: xx.xxxx/TVCG.201x.xxxxxxx/\nFrom Data to Story: Towards Automatic Animated Data Video Creation\nwith LLM-based Multi-Agent Systems\nLeixian Shen\n*\nThe Hong Kong University\nof Science and Technology,\nHong Kong SAR, China\nHaotian Li\n†\nThe Hong Kong University\nof Science and Technology,\nHong Kong SAR, China\nYun Wang\n‡\nMicrosoft,\nBeijing, China\nHuamin Qu\n§\nThe Hong Kong University\nof Science and Technology,\nHong Kong SAR, China\nABSTRACT\nCreating data stories from raw data is challenging due to humans’\nlimited attention spans and the need for specialized skills. Recent\nadvancements in large language models (LLMs) offer great oppor-\ntunities to develop systems with autonomous agents to streamline\nthe data storytelling workflow. Though multi-agent systems have\nbenefits such as fully re

In [27]:
def get_text_chunks(raw_text, chunk_size=1000, chunk_overlap=200, separator="\n"):
    """Split raw text into overlapping chunks with ``CharacterTextSplitter``.

    Args:
        raw_text: The text to split.
        chunk_size: Target maximum chunk length, measured with ``len``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Length of text shared between consecutive chunks.
            Defaults to 200, the value previously hard-coded here.
        separator: String the splitter breaks the text on. Defaults to a
            newline, the value previously hard-coded here.

    Returns:
        The list of chunks produced by ``CharacterTextSplitter.split_text``.
    """
    character_text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return character_text_splitter.split_text(raw_text)

In [28]:
# Split the extracted PDF text into chunks; the bare name below displays the list.
# NOTE(review): this displays every chunk — consider showing only the first few.
text_chunks = get_text_chunks(pdf_text)
text_chunks

['© 2024 IEEE. This is the author’s version of the article that has been published in the proceedings of IEEE\nVisualization conference. The final version of this record is available at: xx.xxxx/TVCG.201x.xxxxxxx/\nFrom Data to Story: Towards Automatic Animated Data Video Creation\nwith LLM-based Multi-Agent Systems\nLeixian Shen\n*\nThe Hong Kong University\nof Science and Technology,\nHong Kong SAR, China\nHaotian Li\n†\nThe Hong Kong University\nof Science and Technology,\nHong Kong SAR, China\nYun Wang\n‡\nMicrosoft,\nBeijing, China\nHuamin Qu\n§\nThe Hong Kong University\nof Science and Technology,\nHong Kong SAR, China\nABSTRACT\nCreating data stories from raw data is challenging due to humans’\nlimited attention spans and the need for specialized skills. Recent\nadvancements in large language models (LLMs) offer great oppor-\ntunities to develop systems with autonomous agents to streamline\nthe data storytelling workflow. Though multi-agent systems have',
 'advancements in large

In [29]:
def get_vector_store(text_chunks, model_name="BAAI/bge-m3"):
    """Embed text chunks and index them in a FAISS vector store.

    Args:
        text_chunks: Non-empty list of strings to embed and index.
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``"BAAI/bge-m3"``, the model previously hard-coded
            here; pass another name to try a different embedding model.

    Returns:
        The store built by ``FAISS.from_texts`` over ``text_chunks``.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    # An empty chunk list (for example from a PDF with no extractable text)
    # would otherwise surface as an unhelpful error from inside index
    # construction, so fail early with a clear message instead.
    if not text_chunks:
        raise ValueError("text_chunks is empty; nothing to embed")
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(text_chunks, embeddings)


In [30]:
# Embed the chunks and index them; the bare name below shows the store's repr.
vector_store = get_vector_store(text_chunks)
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x13ec19f70>

In [31]:
def get_vectors(vector_store):
    """Return every vector held in the store's index, in index order.

    Args:
        vector_store: Object exposing an ``index`` attribute that provides
            ``ntotal`` and ``reconstruct(i)``.

    Returns:
        A list with one entry per stored vector, each obtained from
        ``index.reconstruct(i)`` for ``i`` in ``0 .. ntotal - 1``.
    """
    index = vector_store.index
    vectors = []
    for position in range(index.ntotal):
        vectors.append(index.reconstruct(position))
    return vectors

In [32]:
# Tabulate the stored embedding vectors (one row per vector) and preview the first rows.
data_frame = pd.DataFrame(get_vectors(vector_store))
data_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.066182,-0.02101,-0.008742,-0.014602,-0.023218,0.00893,-0.051568,0.013058,-0.022419,0.015588,...,0.022173,-0.008046,-0.010913,0.023401,-0.019495,-0.030258,0.038941,-0.001185,-0.031221,0.011723
1,-0.045843,-0.050158,-0.023588,0.006357,-0.034032,0.021442,-0.016376,0.021591,-0.007568,0.009252,...,0.027476,-0.002983,-0.017811,0.013417,-0.023275,-0.035028,0.01376,-0.004402,-0.008546,-0.024951
2,-0.045072,-0.016056,-0.036921,0.017648,-0.045985,0.021246,-0.045944,0.028198,-0.01914,-0.000546,...,0.019373,0.006401,0.004539,-0.002458,-0.016572,-0.015448,0.038689,0.0122,-0.017255,-0.017569
3,-0.028486,0.015219,-0.002378,-0.019613,-0.040951,-0.030104,-0.000281,-0.004606,-0.002633,0.007565,...,0.00124,0.009628,0.010895,0.020558,-0.050434,-0.035305,0.042794,-0.006519,-0.022085,-0.035296
4,-0.050036,-0.013621,-0.00063,-0.01236,-0.035588,-0.002847,-0.016386,0.019581,-0.00944,0.025057,...,0.00172,0.014324,-0.00154,0.018827,-0.008673,-0.029025,0.014733,-0.003233,-0.037107,-0.010664


In [33]:
def get_chain(vectorstore, llm=None):
    """Build a conversational retrieval chain over a vector store.

    Args:
        vectorstore: Store whose ``as_retriever()`` supplies the retriever
            handed to the chain.
        llm: Optional chat model to answer with. When omitted, a default
            ``ChatOpenAI()`` is created, matching the previous behavior.

    Returns:
        The chain returned by ``ConversationalRetrievalChain.from_llm``,
        wired to a fresh ``ConversationBufferMemory``.
    """
    if llm is None:
        llm = ChatOpenAI()

    # A new memory object per chain: history is stored under 'chat_history'
    # and kept as message objects (return_messages=True).
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )

In [34]:
# Build the conversational chain over the vector store; the bare name below shows its repr.
chain = get_chain(vector_store) 
chain

ConversationalRetrievalChain(memory=ConversationBufferMemory(return_messages=True, memory_key='chat_history'), combine_docs_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))]), llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x339b60830>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x39c584ec0>, root_client=<openai.OpenAI object at 0x339b61c10>, root_async_client=<openai.AsyncOpenAI object at 0x339b60800>, openai_api_key=SecretStr('**********'), openai_proxy='')), document_va

In [35]:
# Ask the chain a question, naming the input key explicitly rather than
# passing a bare string for the chain to map onto its single free input.
chain.invoke({"question": "What is the story about?"})

{'question': 'What is the story about?',
 'chat_history': [HumanMessage(content='What is the story about?'),
  AIMessage(content='The story is about leveraging large language models (LLMs) to create autonomous agents that can streamline the data storytelling workflow. These agents, powered by LLMs, help in perceiving environments, making decisions, and taking actions to automate tasks related to data analysis and information communication. The goal is to enhance the efficiency of data analysis and storytelling by utilizing the capabilities of LLMs in various tasks like data analysis, document generation, and visualization creation.')],
 'answer': 'The story is about leveraging large language models (LLMs) to create autonomous agents that can streamline the data storytelling workflow. These agents, powered by LLMs, help in perceiving environments, making decisions, and taking actions to automate tasks related to data analysis and information communication. The goal is to enhance the eff