In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableParallel

In [2]:
# Location of the screenplay PDF, relative to the notebook's working directory.
# Forward slashes are used on purpose: they resolve on Windows, macOS and Linux,
# whereas a backslash-separated literal is only a valid path on Windows.
file_path = "files/The-Godfather.pdf"

# Read the PDF into a list of documents; the message below reports its length
# as the number of pages loaded.
loader = PyPDFLoader(file_path)
documents = loader.load()

print(f"Loaded {len(documents)} pages from {file_path}")

Loaded 124 pages from files\The-Godfather.pdf


In [3]:
# Break the loaded documents into overlapping pieces before embedding them.
# Sizes are measured with the splitter's default length function; consecutive
# chunks share up to 200 units so text cut at a boundary appears in both.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)
print(f"Split into {len(chunks)} chunks")

Split into 183 chunks


In [4]:
# Load variables from a local .env file into the process environment before any
# OpenAI client is constructed. `load_dotenv` is imported at the top of the
# notebook but was never invoked, so an API key kept only in .env was not
# available on a fresh kernel. Existing environment variables are not
# overridden, and the call does nothing when no .env file is found.
load_dotenv()

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed every chunk and build the FAISS vector store from the results.
vector_store = FAISS.from_documents(chunks, embeddings)

# Debug aid: inspect `vector_store.index_to_docstore_id` to see the stored ids.

In [5]:
#vector_store.get_by_ids(["1f0c811c-eb8f-4db9-8de9-dab36d6041d0"])

In [6]:
# Persist the FAISS index to disk so it can be reused without re-embedding.
vector_store.save_local(folder_path="vectors/godfather_faiss_index")

In [7]:
# Wrap the vector store as a retriever that performs a similarity search and
# is configured with k=3 results per query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)
# Quick manual check: retriever.invoke("kids of The Godfather")

In [8]:
def format_retriever_results(retrieved_docs) -> str:
    """Merge the text of the retrieved documents into one context string.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content``
            attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty input yields an empty string.
    """
    page_texts = (document.page_content for document in retrieved_docs)
    return "\n\n".join(page_texts)


In [9]:
# Fan the incoming question out into the two inputs the prompt needs:
#   question - the input passed through unchanged
#   context  - the retriever's results joined into a single string
parallel_chain = RunnableParallel(
    question=RunnablePassthrough(),
    context=retriever | RunnableLambda(format_retriever_results),
)

In [27]:
# Chat model that answers the question.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0.4)

# Parser placed after the model so the chain yields a plain string.
parser = StrOutputParser()

# Prompt that restricts the model to the retrieved screenplay excerpts.
# The instruction text previously contained typos ("contect", "don'f") and was
# missing a space between two concatenated sentences ("screenplay.If"); both
# are corrected here so the model receives clean instructions.
prompt = PromptTemplate(
    template=(
        "You are a helpful assistant. "
        "Answer ONLY from the provided screenplay. "
        "If the context is insufficient, just say you don't know."
        "\n\n {context}"
        "\n\nQuestion: {question}"
    ),
    input_variables=["context", "question"],
)

In [30]:
# Full pipeline: build {question, context} -> fill the prompt -> call the
# chat model -> parse the reply.
main_chain = (
    parallel_chain
    | prompt
    | model
    | parser
)

# The cell's final expression is the chain's answer, so it renders as output.
main_chain.invoke(
    "summarize the role of Michael CORLEONE in The Godfather screenplay?"
)

"Michael Corleone is portrayed as a central character who evolves from a reluctant outsider to a decisive leader within the Corleone crime family. Initially, he is distanced from the family's criminal activities, expressing a desire to forge his own path. However, as the story progresses, he becomes increasingly involved in the family's operations, ultimately taking on the role of head of the family. Michael is strategic and calculated, demonstrating a willingness to make tough decisions for the sake of the family's survival and power, including reshaping the family's leadership and dealing with threats. His character embodies the conflict between personal values and familial loyalty, as he navigates the complexities of power, responsibility, and the consequences of a life of crime."