In [1]:
from langchain_community.document_loaders.parsers import GrobidParser
from langchain_community.document_loaders.generic import GenericLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import ChatOllama
from langchain_chroma import Chroma
from langchain import hub
from langchain_core.prompts import PromptTemplate

from embedder import SPECTEREmbeddings

In [2]:
# Collect the files under ./data/ that match glob "*" with a ".pdf" suffix and
# parse them with GROBID (sentence segmentation disabled), then load them all.
loader = GenericLoader.from_filesystem(
    "./data/",
    glob="*",
    suffixes=[".pdf"],
    parser=GrobidParser(segment_sentences=False),
)
docs = loader.load()

In [3]:
# Peek at the first ten parsed documents to sanity-check the GROBID output.
docs[0:10]

[Document(metadata={'text': 'Language model pre-training has been shown to be effective for improving many natural language processing tasks (Dai and Le, 2015;Peters et al., 2018a;Radford et al., 2018;Howard and Ruder, 2018).These include sentence-level tasks such as natural language inference (Bowman et al., 2015;Williams et al., 2018) and paraphrasing (Dolan and Brockett, 2005), which aim to predict the relationships between sentences by analyzing them holistically, as well as token-level tasks such as named entity recognition and question answering, where models are required to produce fine-grained output at the token level (Tjong Kim Sang and De Meulder, 2003;Rajpurkar et al., 2016).', 'para': '1', 'bboxes': "[[{'page': '1', 'x': '72.00', 'y': '579.72', 'h': '218.27', 'w': '9.46'}, {'page': '1', 'x': '72.00', 'y': '593.27', 'h': '218.27', 'w': '9.46'}, {'page': '1', 'x': '72.00', 'y': '606.82', 'h': '218.27', 'w': '9.46'}, {'page': '1', 'x': '72.00', 'y': '620.37', 'h': '218.27', '

In [4]:
# Character-level chunking of the parsed documents. The chunks are kept for
# experimentation; the index below is built from the GROBID paragraphs in
# `docs`, not from `splits`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# `Chroma.from_documents` is a classmethod that builds and returns a new store.
# Calling it on an instance created with `persist_directory` / `collection_name`
# drops those settings, so the index would land in the default collection and
# not be written to ./chroma_db. Pass both settings to `from_documents` itself.
# NOTE(review): no explicit ids are supplied, so re-running this cell presumably
# appends the same documents to the persisted "bert" collection again — confirm,
# and clear ./chroma_db (or pass stable `ids`) if duplicates matter.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=SPECTEREmbeddings(),
    collection_name="bert",
    persist_directory="./chroma_db",
)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

  state_dict = torch.load(weights_file, map_location="cpu")


In [5]:
# Fetch the "rlm/rag-prompt" template from the LangChain hub. It is not used by
# the cells shown here, which build their own prompt instead.
prompt = hub.pull("rlm/rag-prompt")

# Expose the vector store through the retriever interface with default settings.
retriever = vectorstore.as_retriever()



In [6]:
def format_docs(docs):
    """Join the ``page_content`` of every document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The contents in input order, separated by a blank line ("\\n\\n").
        An empty iterable yields an empty string.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

In [7]:
# Chat model accessed through Ollama, using the "llama3.1" model.
llm = ChatOllama(
    model="llama3.1",
)

In [8]:
# Prompt template with two placeholders: {context} (retrieved text) and
# {question} (the user's query).
custom_prompt = """
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
If a query contains the keyword "this" it means the current set documents that you are working on.
{context}
Question: {question}
Answer:"""

custom_rag_prompt = PromptTemplate.from_template(custom_prompt)

# Retrieval step: look up documents for the query, then flatten them to text.
retrieve_context = retriever | format_docs

# Full pipeline: the input string is used both to retrieve context and, passed
# through unchanged, as the question; the filled prompt goes to the LLM and the
# reply is reduced to a plain string.
rag_chain = (
    {"context": retrieve_context, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

In [9]:
# Run the chain on a sample question; the answer string is the cell's output.
question = "How is the BERT model different from the GPT model?"
rag_chain.invoke(question)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

  state_dict = torch.load(weights_file, map_location="cpu")


"I don't know. The provided text only mentions a comparison between BERT, ELMo, and OpenAI GPT, but it does not provide any information about how the BERT model differs specifically from the GPT model."