Load the Document


In [1]:
from langchain.document_loaders import PyPDFLoader

# Location of the source PDF, relative to the notebook's working directory.
PDF_PATH = "../Corpus/The Godfather Summary.pdf"

# Read the PDF into LangChain document objects for the downstream cells.
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

Split Documents


In [2]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter configured to break on newlines with chunk_size=1000 and
# chunk_overlap=50 so neighbouring chunks share a little context.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=50,
)

# `texts` holds the chunked documents consumed by the vector store cell.
texts = text_splitter.split_documents(documents)

Create Embeddings


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model used to embed each text chunk.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
  from .autonotebook import tqdm as notebook_tqdm


Vector Store (Chroma)


In [None]:
from langchain.vectorstores import Chroma

# Embed every chunk and index it in a Chroma vector store.
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
)

In [None]:
from dotenv import load_dotenv
import os

# Load variables from the dedicated token file before reading the environment.
load_dotenv(dotenv_path="../.env.token")

# The token is looked up under this exact variable name.
huggingface_token = os.environ.get("YOUR_HUGGINGFACE_API_TOKEN")

# Fail fast when the variable is missing or empty, before any Hub call is made.
if huggingface_token is None or huggingface_token == "":
    raise ValueError("Hugging Face API token not found in .env.token file.")


LLM


In [None]:
from langchain.llms import HuggingFaceHub

# LLM wrapper around the Mistral-7B-Instruct model on the Hugging Face Hub,
# authenticated with the token read from the environment.
# NOTE(review): text-generation endpoints usually take `max_new_tokens`
# rather than `max_length` - confirm this setting has the intended effect.
llm = HuggingFaceHub(
    huggingfacehub_api_token=huggingface_token,
    repo_id="mistralai/Mistral-7B-Instruct-v0.1",
    model_kwargs=dict(temperature=0.25, max_length=512),
)

Prompt Template


In [None]:
from langchain.prompts import PromptTemplate

# Prompt text sent to the LLM. It has two placeholders: {context} and
# {question}. The instruction line tells the model to admit when it
# does not know the answer instead of inventing one.
prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

CONTEXT
**********************************************************************
{context}
**********************************************************************


Question: {question}

Answer:"""


# Wrap the raw template; input_variables must match the placeholders above.
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)


RetrievalQA Chain


In [None]:
from langchain.chains import RetrievalQA

# Retrieval-augmented QA chain: the vector store supplies the retriever and
# the "stuff" chain type is driven by the custom prompt instead of the default.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=PROMPT),
)

In [None]:
# Question put to the RetrievalQA chain about the loaded document.
query = "What are the main conflicts or storylines in the document?"
# Run the chain on the question and print whatever it returns, unmodified.
result = qa.run(query)
print(result)



Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

**********************************************************************
Context
House of Atreus. Coppola cuts from the younger Vito to 
his successor, Michael — and whatever glow Michael got 
when he reached power fades as he sets about consoli-
dating it. The Don’s legacy of hypocrisy and crime eats 
away at Michael’s soul. 
 
These two movies together are not really about the dete-
rioration of the American dream. What they say is that 
for immigrant groups that became the country’s back-
bone — Italians, Jews, Irish, and others — the American 
Dream was limited from the start by the burdens of pov-
erty, unsettled scores, and insular ethnic cultures. As in 
the Old World, they were prey to powerful economic and 
political forces. But here those forces took more various, 
insidious forms. Many Vietnam-era movies told us tha