In [1]:
# Enable IPython's autoreload extension; with mode 2, edits made to imported
# .py modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

# 1. Load documents and build the FAISS vector store

In [44]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings


def create_knowledgeBase(
    pdf_path="./ischaemic_stroke_review_donnan_2019.pdf",
    db_path='vectorstore/db_faiss',
    embedding_model="sentence-transformers/all-mpnet-base-v2",
    chunk_size=512,
    chunk_overlap=150,
):
    """Index a PDF into a FAISS vector store and save the store to disk.

    The PDF is loaded with PyPDFLoader, split into overlapping chunks with
    RecursiveCharacterTextSplitter, embedded with HuggingFaceEmbeddings and
    written to ``db_path`` via ``FAISS.save_local``.

    All parameters default to the values that were previously hard-coded, so
    calling ``create_knowledgeBase()`` with no arguments behaves as before.

    Args:
        pdf_path: Path of the PDF file to index.
        db_path: Directory the FAISS index is saved to.
        embedding_model: Model name passed to HuggingFaceEmbeddings.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.

    Returns:
        None. The result is the index written to ``db_path``.

    Raises:
        ValueError: If splitting the loaded PDF produces no chunks.
    """
    pages = PyPDFLoader(pdf_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(pages)
    if not chunks:
        # Fail fast with a clear message instead of handing an empty list
        # to FAISS.from_documents.
        raise ValueError(f"No text chunks could be extracted from {pdf_path!r}")

    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)
    vectorstore.save_local(db_path)

In [45]:
# Build the FAISS index from the PDF and save it to disk (see create_knowledgeBase).
create_knowledgeBase()

# 2. Initialize document retriever and LLM

In [50]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.llms import HuggingFacePipeline
import torch

def load_llm():
    """Create the text-generation LLM used by the RAG chain.

    Wraps the Hugging Face "gpt2" model in a HuggingFacePipeline built with
    ``from_model_id`` and the sampling settings below. A progress message is
    printed before and after construction.

    Returns:
        The HuggingFacePipeline instance.
    """
    # Generation settings forwarded to the pipeline as `pipeline_kwargs`.
    generation_settings = {
        "max_new_tokens": 400,
        "top_p": 0.95,
        "do_sample": True,
        "top_k": 50,
        "temperature": 0.2,
        "repetition_penalty": 2.0,
    }

    print("initializing llm....")
    pipeline_llm = HuggingFacePipeline.from_model_id(
        model_id="gpt2",
        task="text-generation",
        pipeline_kwargs=generation_settings,
    )
    print("llm initialized!")

    return pipeline_llm

In [51]:
from transformers import GPT2Tokenizer, GPT2Model
import torch
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate

def load_knowledgeBase(
    db_path='vectorstore/db_faiss',
    embedding_model="sentence-transformers/all-mpnet-base-v2",
):
    """Load a FAISS vector store previously saved to disk.

    Both parameters default to the values that were previously hard-coded, so
    ``load_knowledgeBase()`` with no arguments behaves as before.

    Args:
        db_path: Directory containing the saved FAISS index.
        embedding_model: Model name passed to HuggingFaceEmbeddings. It should
            be the same model the index was built with; otherwise query
            vectors will not be comparable to the stored ones.

    Returns:
        The FAISS vector store returned by ``FAISS.load_local``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    # SECURITY: allow_dangerous_deserialization=True opts in to loading
    # serialized (pickle-based, per the LangChain docs) data from disk.
    # Only point db_path at an index this project created itself; never at
    # a file obtained from an untrusted source.
    return FAISS.load_local(
        db_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )


def load_prompt():
    """Build the chat prompt template used by the RAG chain.

    The template text has two placeholders, ``context`` and ``question``,
    which are filled in when the chain runs.

    Returns:
        The ChatPromptTemplate created from the template text.
    """
    # This literal is runtime text sent to the model; keep it byte-identical.
    template_text = """ You need to answer the question in the sentence as same as in the  pdf content. . 
    Given below is the context and question of the user.
    context = {context}
    question = {question}
    if the answer is not in the pdf answer "i do not know what the hell you are asking about"
        """
    return ChatPromptTemplate.from_template(template_text)

def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The page contents joined with a blank line between consecutive
        documents; an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

# 3. Inference

In [52]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

knowledge_base = load_knowledgeBase()
llm = load_llm()
prompt = load_prompt()

# NOTE: `input` shadows the Python builtin of the same name. The name is kept
# because the following cell passes it to rag_chain.invoke().
input = "What is the ischemic stroke?"

# Retrieve straight from the persisted index. The previous version ran one
# similarity search for `input`, re-embedded those hits into a second FAISS
# index and retrieved from that, so the chain could only ever see the chunks
# matched by that first question, and the embedding model was loaded twice.
retriever = knowledge_base.as_retriever()

# The chain receives the question string, fetches matching chunks and joins
# them into `context`, fills the prompt, runs the LLM and returns plain text.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

initializing llm....
llm initialized!


In [54]:
import pprint

# Run the RAG chain on the question stored in `input`; the chain ends with
# StrOutputParser, and the result is printed as-is.
response = rag_chain.invoke(input)

print(response)

Human:  You need to answer the question in the sentence as same as in the  pdf content. . 
    Given below is the context and question of the user.
    context = blood pressure and diabetes mellitus and is particularly 
common in Asia. Less common overall, but propor -
tionally more prevalent in younger patients, are arte -
rial dissection, vasculitis, patent foramen ovale (PFO) 
with paradoxical embolism (that is, whereby venous 
thrombi enter the systemic and cerebral circulation) and haematological disorders 
(fig.  2; Table  1). The cause of 
ischaemic stroke is important as it can guide therapeutic 
strategies for the prevention of recurrent stroke.

Stroke is a leading cause of death and disability world-
wide and can be broadly classified into ischaemic stroke 
and haemorrhagic stroke, the latter of which includes 
intracerebral haemorrhage and subarachnoid haem-orrhage. Ischaemic stroke is defined as infarction of the brain, spinal cord or retina
1 and represents ~71% 
of all s