<a href="https://colab.research.google.com/github/nosale-arc/csr-gpt-rag/blob/main/CSR_RAG_POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip3 install --upgrade --user langchain_community langchain-openai pypdf chromadb

In [None]:
# Newly installed packages only become importable after the kernel restarts,
# so ask the running kernel to shut down with the restart flag set.
import IPython
import time

app = IPython.Application.instance()
kernel = app.kernel
kernel.do_shutdown(True)

In [None]:
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a <style> block that sets `white-space: pre-wrap` on <pre> elements.

  Registered below as a 'pre_run_cell' callback, so the style is displayed
  again before each cell runs.

  Args:
    info: Execution info that IPython may pass to 'pre_run_cell' callbacks.
      Unused; it is accepted with a default so the callback works whether or
      not the running IPython version supplies the argument.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

# First load the PDF document into memory

In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader

# Notebook settings: source PDF, directory used for the vector store, and the
# API key as found in the process environment (None when the variable is unset)
DOC_PATH = "/content/Nelson_Biology_unit_30d.pdf"
CHROMA_PATH = "nelson_db"
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Read the PDF through PyPDFLoader; the loaded result is kept in `pages`
loader = PyPDFLoader(DOC_PATH)
pages = loader.load()

# Split the document into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded pages into smaller pieces: chunk_size=500 with chunk_overlap=50
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
chunks = text_splitter.split_documents(pages)

# Load the OpenAI key

In [None]:
# Fetch the OpenAI key stored in Colab's userdata under the name 'openai_key'.
# This reassigns OPENAI_API_KEY, replacing the value read from the environment earlier.
from google.colab import userdata
OPENAI_API_KEY = userdata.get('openai_key')

# Set up the vector database and define how to answer the question

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Create the OpenAI embedding model using the key loaded above
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Embed every chunk and store the vectors in a Chroma database kept under CHROMA_PATH
db_chroma = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=CHROMA_PATH,
)

# Change persona to alter the style of the output
# some possibilities: "a pirate", "Shakespeare", "a 5 year old"
# "Steve Jobs" or "an Engineer" tend to produce the most accurate results
def answer_question(query, persona="Steve Jobs", k=10):
  """Answer `query` from the indexed document, phrased as `persona` would.

  Retrieves the `k` closest chunks to the query from `db_chroma`, puts their
  text into the prompt as context, and asks the chat model to answer using
  only that context.

  Args:
    query: The user's question.
    persona: Who the model should answer as (substituted into the prompt).
    k: Number of chunks to retrieve as context. Defaults to 10.

  Returns:
    The object returned by `ChatOpenAI.invoke`, unmodified. The answer text
    is expected on its `.content` attribute (see the LangChain docs).
  """

  # retrieve context - top k most relevant (closest) chunks to the query vector
  # NOTE(review): nothing here selects a distance metric, so the scores use
  # whatever the Chroma collection was created with - confirm before relying on them.
  docs_chroma = db_chroma.similarity_search_with_score(query, k=k)

  # join the text of the retrieved chunks; the scores are not used
  context_text = "\n\n".join(doc.page_content for doc, _score in docs_chroma)

  # prompt template with placeholders for the context, the question and the persona
  PROMPT_TEMPLATE = """
  Answer the question based only on the following context:
  {context}
  Answer the question based on the above context: {question}.
  Provide a detailed answer.
  Don’t justify your answers.
  Don’t give information not mentioned in the CONTEXT INFORMATION.
  Do not say "according to the context" or "mentioned in the context" or similar.
  Answer this question as if you were {persona}
  """

  # Build chat messages instead of a flattened string: ChatPromptTemplate.format
  # returns text carrying a "Human: " role prefix, which would then be sent to
  # the model as part of the prompt itself.
  prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
  messages = prompt_template.format_messages(context=context_text, question=query, persona=persona)

  # call LLM model to generate the answer based on the given context and query
  model = ChatOpenAI(openai_api_key=OPENAI_API_KEY)
  return model.invoke(messages)

In [None]:
# answer_question returns the chat model's response object; print only its
# text, falling back to the object itself if it has no `.content` attribute.
response = answer_question("What is the definition of a trait?")
print(getattr(response, "content", response))