# Load document

In [1]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../dataset/llamaindex_data/openai.txt")

docs = loader.load()

In [2]:
len(docs)

1

In [3]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("../dataset/llamaindex_data/Mistral AI - Wikipedia.pdf")

docs = loader.load()

In [4]:
docs

[Document(metadata={'source': '../dataset/llamaindex_data/Mistral AI - Wikipedia.pdf', 'page': 0}, page_content='Mistral AI SAS\nCompany type Private\nIndustry Artificial intelligence\nFounded 28 April 2023\nFounders Arthur Mensch\nGuillaume Lample\nTimoth é e Lacroix\nHeadquarters Paris, France\nKey people Arthur Mensch (CEO)\nGuillaume Lample (Chief\nScientist)\nTimoth é e Lacroix (CTO)\nProducts Mistral 7B\nMixtral 8x7B\nMistral Medium\nMistral Large\nMistral Large 2 (123B)\nMixtral 8x22B\nCodestral 22B\nCodestral Mamba (7B)\nMathstral (7B)\nMistral NeMo 12B\nMistral Embed\nNumber of\nemployees\n150 (2025)[1]\nWebsite mistral.ai (https://mistral.a\ni/)\nMistral AI\nMistral AI SAS is a French artificial intelligence\n(AI) startup, headquartered in Paris. It specializes in\nopen-weight large language models (LLMs).[2][3]\nThe company is named after the mistral, a powerful,\ncold wind in southern France.[4]\nMistral AI was established in April 2023 by three\nFrench AI researchers, Arth

In [5]:
from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader("../dataset/llamaindex_data", glob="*", show_progress=True)

docs = loader.load()

  0%|          | 0/2 [00:00<?, ?it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
100%|██████████| 2/2 [00:03<00:00,  1.99s/it]


In [6]:
len(docs)

2

# Chunking

In [7]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
  separator="\n",
  chunk_size=500,
  chunk_overlap=20,
  length_function=len,
  is_separator_regex=False,
)

In [8]:
documents = text_splitter.split_documents(docs)

Created a chunk of size 1528, which is longer than the specified 500
Created a chunk of size 3451, which is longer than the specified 500
Created a chunk of size 2224, which is longer than the specified 500
Created a chunk of size 705, which is longer than the specified 500
Created a chunk of size 3773, which is longer than the specified 500
Created a chunk of size 7769, which is longer than the specified 500
Created a chunk of size 1935, which is longer than the specified 500
Created a chunk of size 602, which is longer than the specified 500
Created a chunk of size 675, which is longer than the specified 500
Created a chunk of size 1719, which is longer than the specified 500
Created a chunk of size 2704, which is longer than the specified 500
Created a chunk of size 809, which is longer than the specified 500
Created a chunk of size 755, which is longer than the specified 500
Created a chunk of size 1820, which is longer than the specified 500
Created a chunk of size 1183, which is 

In [9]:
documents[0]

Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='OpenAI, Inc. is an American artificial intelligence (AI) research organization founded in December 2015 and headquartered in San Francisco, California. It aims to develop "safe and beneficial" artificial general intelligence (AGI), which it defines as "highly autonomous systems that outperform humans at most economically valuable work". As a leading organization in the ongoing AI boom, OpenAI is known for the GPT family of large language models, the DALL-E series of text-to-image models, and a text-to-video model named Sora. Its release of ChatGPT in November 2022 has been credited with catalyzing widespread interest in generative AI. The organization consists of the non-profit OpenAI, Inc., registered in Delaware, and its for-profit subsidiary introduced in 2019, OpenAI Global, LLC. Its stated mission is to ensure that AGI "benefits all of humanity". Microsoft owns roughly 49% of OpenAI\'s equity, hav

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
  separators=["\n\n", "\n", " "],
  chunk_size=500,
  chunk_overlap=20,
  length_function=len,
  is_separator_regex=False,
)


In [11]:
documents = text_splitter.split_documents(docs)

In [12]:
documents[0]

Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='OpenAI, Inc. is an American artificial intelligence (AI) research organization founded in December 2015 and headquartered in San Francisco, California. It aims to develop "safe and beneficial" artificial general intelligence (AGI), which it defines as "highly autonomous systems that outperform humans at most economically valuable work". As a leading organization in the ongoing AI boom, OpenAI is known for the GPT family of large language models, the DALL-E series of text-to-image models, and a')

In [14]:
from langchain_experimental.text_splitter import  SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

text_splitter = SemanticChunker(OpenAIEmbeddings())

documents = text_splitter.split_documents(docs)

In [15]:
documents[0]

Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='OpenAI, Inc. is an American artificial intelligence (AI) research organization founded in December 2015 and headquartered in San Francisco, California. It aims to develop "safe and beneficial" artificial general intelligence (AGI), which it defines as "highly autonomous systems that outperform humans at most economically valuable work". As a leading organization in the ongoing AI boom, OpenAI is known for the GPT family of large language models, the DALL-E series of text-to-image models, and a text-to-video model named Sora. Its release of ChatGPT in November 2022 has been credited with catalyzing widespread interest in generative AI. The organization consists of the non-profit OpenAI, Inc., registered in Delaware, and its for-profit subsidiary introduced in 2019, OpenAI Global, LLC. Its stated mission is to ensure that AGI "benefits all of humanity". Microsoft owns roughly 49% of OpenAI\'s equity, hav

In [16]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
  chunk_size=500,
  chunk_overlap=0,
)

In [17]:
documents = text_splitter.split_documents(docs)

Created a chunk of size 751, which is longer than the specified 500
Created a chunk of size 800, which is longer than the specified 500
Created a chunk of size 1578, which is longer than the specified 500
Created a chunk of size 558, which is longer than the specified 500


In [18]:
documents[0]

Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='OpenAI, Inc. is an American artificial intelligence (AI) research organization founded in December 2015 and headquartered in San Francisco, California. It aims to develop "safe and beneficial" artificial general intelligence (AGI), which it defines as "highly autonomous systems that outperform humans at most economically valuable work". As a leading organization in the ongoing AI boom, OpenAI is known for the GPT family of large language models, the DALL-E series of text-to-image models, and a text-to-video model named Sora. Its release of ChatGPT in November 2022 has been credited with catalyzing widespread interest in generative AI. The organization consists of the non-profit OpenAI, Inc., registered in Delaware, and its for-profit subsidiary introduced in 2019, OpenAI Global, LLC. Its stated mission is to ensure that AGI "benefits all of humanity". Microsoft owns roughly 49% of OpenAI\'s equity, hav

# Embedding

In [19]:
embed_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [21]:
emb = embed_model.embed_query("What is Mistral AI?")
emb[:3]

[-0.010367147624492645, 0.023194052278995514, 0.006391748320311308]

In [22]:
emb = embed_model.embed_documents(["What is Mistral AI?", "Hi", "ML"])

len(emb)

3

# VectorStore

In [24]:
from langchain_community.vectorstores import Chroma

vector_index = Chroma.from_documents(documents, embed_model)

In [25]:
retrieved = vector_index.similarity_search("What is Mistral AI?")
retrieved[0].page_content

'25. 3. 1. 오후 11:31\n\nMistral AI\n\nMistral AI SAS is a French artificial intelligence (AI) startup, headquartered in Paris. It specializes in open-weight large language models (LLMs).[2][3]\n\nNamesake\n\nThe company is named after the mistral, a powerful, cold wind in southern France.[4]\n\nHistory\n\nMistral AI was established in April 2023 by three French AI researchers, Arthur Mensch, Guillaume Lample and Timothée Lacroix.[5]\n\nMensch, an expert in advanced AI systems, is a former employee of Google DeepMind; Lample and Lacroix, meanwhile, are large-scale AI models specialists who had worked for Meta Platforms.[6]\n\nThe trio originally met during their studies at École Polytechnique.[4]\n\nCompany operation\n\nPhilosophy\n\nMistral AI emphasizes openness and innovation in the AI field and positions itself as an alternative to proprietary models.[7]\n\nThe company has gained prominence as an alternative to proprietary AI systems as it aims to focusing on open-source "democratize

In [26]:
from langchain_community.vectorstores import FAISS

vector_index = FAISS.from_documents(documents, embed_model)

In [27]:
retrieved = vector_index.similarity_search("What is Mistral AI?")
retrieved[0].page_content

'25. 3. 1. 오후 11:31\n\nMistral AI\n\nMistral AI SAS is a French artificial intelligence (AI) startup, headquartered in Paris. It specializes in open-weight large language models (LLMs).[2][3]\n\nNamesake\n\nThe company is named after the mistral, a powerful, cold wind in southern France.[4]\n\nHistory\n\nMistral AI was established in April 2023 by three French AI researchers, Arthur Mensch, Guillaume Lample and Timothée Lacroix.[5]\n\nMensch, an expert in advanced AI systems, is a former employee of Google DeepMind; Lample and Lacroix, meanwhile, are large-scale AI models specialists who had worked for Meta Platforms.[6]\n\nThe trio originally met during their studies at École Polytechnique.[4]\n\nCompany operation\n\nPhilosophy\n\nMistral AI emphasizes openness and innovation in the AI field and positions itself as an alternative to proprietary models.[7]\n\nThe company has gained prominence as an alternative to proprietary AI systems as it aims to focusing on open-source "democratize

# Retriever

In [28]:
query = "OpenAI의 sora 모델에 대해 알려줘"

In [31]:
retriever = vector_index.as_retriever(search_type="mmr")
retriever.get_relevant_documents(query)

[Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='===== DALL-E 3 ===== In September 2023, OpenAI announced DALL-E 3, a more powerful model better able to generate images from complex descriptions without manual prompt engineering and render complex details like hands and text. It was released to the public as a ChatGPT Plus feature in October.\n\n=== Text\n\nto\n\nvideo ===\n\n==== Sora ====\n\nSora is a text-to-video model that can generate videos based on short descriptive prompts as well as extend existing videos forwards or backwards in time. It can generate videos with resolution up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. Sora\'s development team named it after the Japanese word for "sky", to signify its "limitless creative potential". Sora\'s technology is an adaptation of the technology behind the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available videos as well as copyrighted

In [32]:
retriever = vector_index.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.2})
retriever.get_relevant_documents(query)

[Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='===== DALL-E 3 ===== In September 2023, OpenAI announced DALL-E 3, a more powerful model better able to generate images from complex descriptions without manual prompt engineering and render complex details like hands and text. It was released to the public as a ChatGPT Plus feature in October.\n\n=== Text\n\nto\n\nvideo ===\n\n==== Sora ====\n\nSora is a text-to-video model that can generate videos based on short descriptive prompts as well as extend existing videos forwards or backwards in time. It can generate videos with resolution up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. Sora\'s development team named it after the Japanese word for "sky", to signify its "limitless creative potential". Sora\'s technology is an adaptation of the technology behind the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available videos as well as copyrighted

In [33]:
retriever = vector_index.as_retriever(search_kwargs={"k": 3})
retriever.get_relevant_documents(query)

[Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='===== DALL-E 3 ===== In September 2023, OpenAI announced DALL-E 3, a more powerful model better able to generate images from complex descriptions without manual prompt engineering and render complex details like hands and text. It was released to the public as a ChatGPT Plus feature in October.\n\n=== Text\n\nto\n\nvideo ===\n\n==== Sora ====\n\nSora is a text-to-video model that can generate videos based on short descriptive prompts as well as extend existing videos forwards or backwards in time. It can generate videos with resolution up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. Sora\'s development team named it after the Japanese word for "sky", to signify its "limitless creative potential". Sora\'s technology is an adaptation of the technology behind the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available videos as well as copyrighted

In [34]:
import logging

logging.basicConfig()

logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [None]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0)

In [36]:
from langchain.retrievers.multi_query import MultiQueryRetriever

retriever_mult = MultiQueryRetriever.from_llm(
  retriever=vector_index.as_retriever(), llm=llm
)

In [37]:
retriever_mult.get_relevant_documents(query)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. OpenAI의 sora 모델은 어떤 특징을 가지고 있나요?', '2. sora 모델은 OpenAI에서 어떻게 활용되고 있나요?', '3. OpenAI의 sora 모델이 다른 모델과 어떻게 다른가요?']


[Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='===== DALL-E 3 ===== In September 2023, OpenAI announced DALL-E 3, a more powerful model better able to generate images from complex descriptions without manual prompt engineering and render complex details like hands and text. It was released to the public as a ChatGPT Plus feature in October.\n\n=== Text\n\nto\n\nvideo ===\n\n==== Sora ====\n\nSora is a text-to-video model that can generate videos based on short descriptive prompts as well as extend existing videos forwards or backwards in time. It can generate videos with resolution up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. Sora\'s development team named it after the Japanese word for "sky", to signify its "limitless creative potential". Sora\'s technology is an adaptation of the technology behind the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available videos as well as copyrighted

In [None]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

store = InMemoryStore()

parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

vector_index = Chroma(collection_name="split_parents", embedding_function=embed_model)

  vector_index = Chroma(collection_name="split_parents", embedding_function=embed_model)


In [39]:
retriever = ParentDocumentRetriever(
  vectorstore = vector_index,
  docstore = store,
  child_splitter = child_splitter,
  parent_splitter = parent_splitter,
)

In [40]:
retriever.get_relevant_documents(query)

[]

In [41]:
retriever.add_documents(docs)

In [42]:
retriever.get_relevant_documents(query)

[Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='=== Text\n\nto\n\nvideo ===\n\n==== Sora ====\n\nSora is a text-to-video model that can generate videos based on short descriptive prompts as well as extend existing videos forwards or backwards in time. It can generate videos with resolution up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. Sora\'s development team named it after the Japanese word for "sky", to signify its "limitless creative potential". Sora\'s technology is an adaptation of the technology behind the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available videos as well as copyrighted videos licensed for that purpose, but did not reveal the number or the exact sources of the videos. OpenAI demonstrated some Sora-created high-definition videos to the public on February 15, 2024, stating that it could generate videos up to one minute long. It also shared a technical report highli

# Generatror

In [43]:
llm = ChatOpenAI(temperature=0, model='gpt-3.5-turbo')

In [44]:
llm.invoke(query)

AIMessage(content='OpenAI의 sora 모델은 자연어 처리 모델 중 하나로, 텍스트 생성 및 이해를 위해 설계된 모델입니다. 이 모델은 GPT-3 모델을 기반으로 하며, 다양한 자연어 처리 작업을 수행할 수 있습니다. sora 모델은 대화형 AI, 텍스트 생성, 요약, 번역, 질문 응답 등 다양한 작업에 사용될 수 있습니다. 이 모델은 다양한 언어 및 주제에 대해 학습되어 있어 다양한 분야에서 활용될 수 있습니다.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 176, 'prompt_tokens': 24, 'total_tokens': 200, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-2e3ac7ca-d1ed-40e0-b5f4-9dfb445028e5-0', usage_metadata={'input_tokens': 24, 'output_tokens': 176, 'total_tokens': 200, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

# Chain

In [45]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("Tell me something")

chain = prompt | llm

In [46]:
chain.invoke({})

AIMessage(content="Did you know that honey never spoils? Archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still perfectly edible. Honey's low water content and acidic pH create an inhospitable environment for bacteria and microorganisms, allowing it to last indefinitely.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 62, 'prompt_tokens': 10, 'total_tokens': 72, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-20edc8c6-81e7-4844-8202-54dc5516584f-0', usage_metadata={'input_tokens': 10, 'output_tokens': 62, 'total_tokens': 72, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reaso

In [47]:
from langchain_core.output_parsers import StrOutputParser

chain = prompt | llm | StrOutputParser()

In [49]:
chain.invoke({})

"Did you know that honey never spoils? Archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still perfectly edible. Honey's low water content and acidic pH create an inhospitable environment for bacteria and microorganisms, allowing it to remain preserved indefinitely."

In [50]:
prompt = ChatPromptTemplate.from_template("Tell me something about {topic}")

chain = prompt | llm | StrOutputParser()

In [51]:
chain.invoke({'topic': 'sora, developed by OpenAI'})

'Sora is a large-scale language model developed by OpenAI that is designed to generate human-like text based on the input it receives. It is trained on a diverse range of internet text data to improve its ability to understand and generate natural language. Sora is capable of generating coherent and contextually relevant responses to a wide variety of prompts, making it a powerful tool for tasks such as text generation, language translation, and content creation.'

In [52]:
prompt_formatted = prompt.invoke({'topic': 'sora, developed by OpenAI'})
property

property

In [53]:
model_output = llm.invoke(prompt_formatted)
model_output

AIMessage(content='Sora is a large-scale language model developed by OpenAI that is designed to generate human-like text based on the input it receives. It is trained on a diverse range of internet text data to improve its ability to understand and generate natural language. Sora is capable of generating coherent and contextually relevant responses to a wide variety of prompts, making it a powerful tool for tasks such as text generation, language translation, and content creation.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 88, 'prompt_tokens': 18, 'total_tokens': 106, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-e6cd8996-2602-4ef2-bf8c-3f8922112407-0', usage_meta

In [54]:
parser = StrOutputParser()
parser.invoke(model_output)

'Sora is a large-scale language model developed by OpenAI that is designed to generate human-like text based on the input it receives. It is trained on a diverse range of internet text data to improve its ability to understand and generate natural language. Sora is capable of generating coherent and contextually relevant responses to a wide variety of prompts, making it a powerful tool for tasks such as text generation, language translation, and content creation.'

In [55]:
(prompt|llm|parser).invoke({'topic': 'sora, developed by OpenAI'})

'Sora is a large-scale language model developed by OpenAI that is designed to generate human-like text based on the input it receives. It is trained on a diverse range of internet text data to improve its ability to understand and generate natural language. Sora is capable of generating coherent and contextually relevant responses to a wide variety of prompts, making it a powerful tool for tasks such as text generation, language translation, and content creation.'

# RAG Chain

In [57]:
retrieved_docs = retriever.invoke("Tell me sonething about sora, developed by OpenAI")
retrieved_docs

[Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='=== Text\n\nto\n\nvideo ===\n\n==== Sora ====\n\nSora is a text-to-video model that can generate videos based on short descriptive prompts as well as extend existing videos forwards or backwards in time. It can generate videos with resolution up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. Sora\'s development team named it after the Japanese word for "sky", to signify its "limitless creative potential". Sora\'s technology is an adaptation of the technology behind the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available videos as well as copyrighted videos licensed for that purpose, but did not reveal the number or the exact sources of the videos. OpenAI demonstrated some Sora-created high-definition videos to the public on February 15, 2024, stating that it could generate videos up to one minute long. It also shared a technical report highli

In [58]:
def merge_docs(retrieved_docs):
  return "\n\n".join([d.page_content for d in retrieved_docs])

In [59]:
(retriever|merge_docs|llm|parser).invoke("Tell me sonething about sora, developed by OpenAI")

'The Whisper model expressed concerns about Altman to the Board in October 2024. OpenAI raised $6.6 billion from investors, potentially valuing the company at $157 billion. The funding attracted returning venture capital firms like Thrive Capital and Khosla Ventures, along with major backer Microsoft and new investors Nvidia and SoftBank. OpenAI\'s CFO, Sarah Friar, informed employees that a tender offer for share buybacks would follow the funding. Thrive Capital invested around $1.2 billion, with the option for an additional $1 billion if revenue goals were met. Apple did not participate in this funding round. The Intercept revealed that OpenAI\'s tools were considered "essential" for AFRICOM\'s mission and included in a contractual agreement between the Department of Defense and Microsoft. In November 2024, OpenAI acquired the domain Chat.com and redirected it to ChatGPT\'s main site. Greg Brockman rejoined OpenAI after a three-month leave from his role as president. He would collabo

In [60]:
from langchain_core.runnables import RunnableParallel

chain_parallel = RunnableParallel({"context": retriever, "llm": llm})
chain_parallel

{
  context: ParentDocumentRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x35d6971d0>, docstore=<langchain_core.stores.InMemoryStore object at 0x35d6ad1d0>, search_kwargs={}, child_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x35e009950>, parent_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x35d2a9890>),
  llm: ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x34915af90>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x357442250>, root_client=<openai.OpenAI object at 0x356e61f10>, root_async_client=<openai.AsyncOpenAI object at 0x356c22c90>, temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'))
}

In [61]:
chain_parallel.invoke("Tell me something about sora, developed by OpenAI")

{'context': [Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='=== Text\n\nto\n\nvideo ===\n\n==== Sora ====\n\nSora is a text-to-video model that can generate videos based on short descriptive prompts as well as extend existing videos forwards or backwards in time. It can generate videos with resolution up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. Sora\'s development team named it after the Japanese word for "sky", to signify its "limitless creative potential". Sora\'s technology is an adaptation of the technology behind the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available videos as well as copyrighted videos licensed for that purpose, but did not reveal the number or the exact sources of the videos. OpenAI demonstrated some Sora-created high-definition videos to the public on February 15, 2024, stating that it could generate videos up to one minute long. It also shared a technical r

In [62]:
from langchain_core.runnables import RunnablePassthrough

chain_parallel = RunnableParallel({"context": retriever, "query": RunnablePassthrough()})

In [63]:
chain_parallel.invoke("Tell me something about sora, developed by OpenAI")

{'context': [Document(metadata={'source': '../dataset/llamaindex_data/openai.txt'}, page_content='=== Text\n\nto\n\nvideo ===\n\n==== Sora ====\n\nSora is a text-to-video model that can generate videos based on short descriptive prompts as well as extend existing videos forwards or backwards in time. It can generate videos with resolution up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. Sora\'s development team named it after the Japanese word for "sky", to signify its "limitless creative potential". Sora\'s technology is an adaptation of the technology behind the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available videos as well as copyrighted videos licensed for that purpose, but did not reveal the number or the exact sources of the videos. OpenAI demonstrated some Sora-created high-definition videos to the public on February 15, 2024, stating that it could generate videos up to one minute long. It also shared a technical r

In [64]:
from langchain_core.prompts.chat import ChatPromptTemplate

template = """
Utilizing the context given below, answer the question.

[context]
{context}

question: {query}
"""

prompt = ChatPromptTemplate.from_template(template)

In [65]:
chain = RunnableParallel({"context": retriever, "query": RunnablePassthrough()}) | prompt | llm | StrOutputParser()

In [66]:
chain.invoke("Tell me something about sora, developed by OpenAI")

'Sora is a text-to-video model developed by OpenAI that can generate videos based on short descriptive prompts and extend existing videos forwards or backwards in time. It can create videos with resolutions up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. The development team named it after the Japanese word for "sky" to signify its "limitless creative potential". Sora\'s technology is based on the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available and copyrighted videos, but did not disclose the exact sources. Sora was demonstrated to the public generating high-definition videos up to one minute long, showcasing its ability to create realistic video from text descriptions. Despite some skepticism, notable entertainment figures like Tyler Perry have shown significant interest in Sora\'s potential to revolutionize storytelling and content creation.'

In [67]:
chain = RunnableParallel(context=retriever, query=RunnablePassthrough()) | prompt | llm | StrOutputParser()
chain.invoke("Tell me something about sora, developed by OpenAI")

'Sora is a text-to-video model developed by OpenAI that can generate videos based on short descriptive prompts and extend existing videos forwards or backwards in time. It can create videos with resolutions up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. The development team named it after the Japanese word for "sky" to signify its "limitless creative potential". Sora\'s technology is based on the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available and copyrighted videos, but did not disclose the exact number or sources of the videos. Sora was demonstrated to the public generating high-definition videos up to one minute long, showcasing its capabilities in video generation from text descriptions. Despite some skepticism, notable entertainment-industry figures like Tyler Perry have shown significant interest in Sora\'s potential to revolutionize storytelling and content creation.'

In [68]:
def foo(anything):
  return 'bar'

chain = RunnableParallel(context=retriever, query=RunnablePassthrough(), asdf=foo) | prompt | llm | StrOutputParser()
chain.invoke("Tell me something about sora, developed by OpenAI")

'Sora is a text-to-video model developed by OpenAI that can generate videos based on short descriptive prompts and extend existing videos forwards or backwards in time. It can create videos with resolutions up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. The technology behind Sora is an adaptation of the technology used in the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available videos as well as copyrighted videos licensed for that purpose. Sora was named after the Japanese word for "sky" to signify its "limitless creative potential". The model was demonstrated to the public on February 15, 2024, showcasing its ability to generate high-definition videos up to one minute long. Despite some skepticism, notable entertainment-industry figures, like actor/filmmaker Tyler Perry, have shown significant interest in Sora\'s potential to revolutionize storytelling and content creation.'

In [70]:
chain = {"context": retriever, "query": RunnablePassthrough()} | prompt | llm | StrOutputParser()
chain.invoke("Tell me something about sora, developed by OpenAI")

'Sora is a text-to-video model developed by OpenAI that can generate videos based on short descriptive prompts and extend existing videos forwards or backwards in time. It can create videos with resolutions up to 1920x1080 or 1080x1920. The maximal length of generated videos is unknown. The development team named it after the Japanese word for "sky" to signify its "limitless creative potential". Sora\'s technology is based on the DALL·E 3 text-to-image model. OpenAI trained the system using publicly-available and copyrighted videos, but did not disclose the exact number or sources of the videos. Sora was demonstrated to the public generating high-definition videos up to one minute long, showcasing its capabilities in video generation from text descriptions. Despite some skepticism, notable entertainment-industry figures like Tyler Perry have shown significant interest in Sora\'s potential to revolutionize storytelling and content creation.'

In [71]:
from langchain import hub

prompt = hub.pull("rlm/rag-prompt")
prompt.messages



[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]

In [73]:
chain = RunnableParallel(context=retriever, question=RunnablePassthrough()) | prompt | llm | StrOutputParser()
chain.invoke("Tell me something about sora, developed by OpenAI")

'Sora is a text-to-video model developed by OpenAI that can generate videos based on short descriptive prompts and extend existing videos. It can create videos with resolutions up to 1920x1080 or 1080x1920 and was named after the Japanese word for "sky" to signify its creative potential. OpenAI trained Sora using publicly-available and licensed videos, demonstrating its capabilities in generating high-definition videos up to one minute long.'

# Putting it all together

In [75]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser