# Initial setup: installation and dependencies

## Setting Google Colab path

In [1]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive (a remount is forced on every run).
drive.mount('/content/drive', force_remount=True)

# Switch the working directory to the project folder so that the relative
# './resources/...' paths used later in the notebook resolve.
base_path = '/content/drive/My Drive/Use-Cases/Cigna-usecase/llm-usecases/usecase3/'
os.chdir(base_path)

Mounted at /content/drive


## Install dependencies and import libraries

In [9]:
# Install the third-party packages used by this notebook (versions are not pinned).
%pip -q install langchain openai tiktoken chromadb regex --quiet

In [18]:
import os
from getpass import getpass

# SECURITY: an OpenAI API key used to be hardcoded in this cell. Treat that key
# as compromised and revoke/rotate it; secrets must never live in the notebook.
# Use the key already present in the environment, otherwise prompt for it
# without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [27]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.chat_models import ChatOpenAI
import re
import pandas as pd

In [5]:
# Relative path (from the working directory) of the PAN-card FAQ knowledge document parsed below.
qna_custom_file = './resources/KnowledgeDocument(pan_card_services).txt'

# Preprocessing (parsing, data cleaning and formatting)

### For this use case, I am considering a single text file that contains FAQs related to India's PAN card. Only one text file is used for ease of explanation; the same approach could be applied to other file formats, but their parsing logic would definitely be different.

In [6]:
# parsing markdown contents

def parse_markdown_contents(filepath: str, bold_question_start_line: int = 213):
    """Parse a markdown FAQ file into parallel lists of questions and answers.

    A line is treated as a question when it starts with ``###``, or when it is
    entirely wrapped in ``**...**`` and its 0-based line number is at least
    ``bold_question_start_line``. The lines that follow a question (plain text
    lines, plus ``*`` / ``-`` bullet lines) are accumulated as its answer.
    Blank lines and ``# `` / ``## `` headings are skipped, and any text that
    appears before the first question is discarded.

    Args:
        filepath: Path of the markdown file to parse.
        bold_question_start_line: 0-based line number from which ``**bold**``
            lines count as questions. Defaults to 213, the value that used to
            be hard-coded in this function.

    Returns:
        A ``(ques_list, ans_list)`` tuple of equal-length lists in which
        ``ans_list[i]`` is the (possibly empty) answer to ``ques_list[i]``.
    """
    bold_question_re = re.compile(r"^(\*\*)(.*\*\*)$")
    # NOTE: `A-z` (rather than `A-Z`) also matches the ASCII characters that
    # sit between 'Z' and 'a' ([, \, ], ^, _, `). It is kept unchanged so that
    # answer lines starting with those characters are still captured.
    text_line_re = re.compile(r"^[a-zA-z0-9]")
    bullet_line_re = re.compile(r"^([*]|[-])")

    ques_list = []
    ans_list = []
    tmp_ans_buffer = ""

    # The context manager guarantees the file handle is closed.
    with open(filepath, 'r', encoding='utf-8') as fp:
        for line_no, line in enumerate(fp):
            line = line.strip()
            if line == '' or line.startswith('# ') or line.startswith('## '):
                continue

            is_bold_question = (
                line_no >= bold_question_start_line
                and bold_question_re.search(line) is not None
            )
            if line.startswith('###') or is_bold_question:
                # Close the previous question even when its answer is empty,
                # so questions and answers can never drift out of alignment.
                if ques_list:
                    ans_list.append(tmp_ans_buffer)
                # Also drops any text collected before the first question.
                tmp_ans_buffer = ''
                ques_list.append(line.replace('###', '').replace('**', ''))
            elif text_line_re.search(line) is not None:
                tmp_ans_buffer += line + '\n'
            elif len(ques_list) > len(ans_list) and bullet_line_re.search(line) is not None:
                # Bullet lines only count while a question is still open.
                tmp_ans_buffer += line + '\n'

    # Flush the answer of the last question; a file without questions yields
    # two empty lists.
    if ques_list:
        ans_list.append(tmp_ans_buffer)

    return ques_list, ans_list

In [11]:

# Parse the knowledge document into parallel question / answer lists.
ques_list, ans_list = parse_markdown_contents(qna_custom_file)

# Report both lengths so a mismatch between questions and answers is visible.
print(f'len ques_list : {len(ques_list)} :: len ans_list : {len(ans_list)}')

# Show the first and the last parsed pair, separated by a dashed rule.
indx = 0
print(f'ques :\n {ques_list[indx]}\n :: ans : \n{ans_list[indx]}')
print('-'*20)
print(f'ques :\n {ques_list[-1]}\n :: ans : \n{ans_list[-1]}')

len ques_list : 46 :: len ans_list : 46
ques :
  What is Pan card?
 :: ans : 
The PAN card is a unique ten-digit alphanumeric identification number that is issued by the Income Tax Department of India to track the tax-related transactions of individuals and entities. The PAN card is mandatory for any financial transaction in India, including opening a bank account, buying or selling property, and filing income tax returns.

--------------------
ques :
 Can I take the delivery of Pan card at Indian address?
 :: ans : 
Yes, you can take the delivery of your PAN card only at an Indian address mentioned in your Aadhaar card. While applying for a new PAN card or requesting corrections, you can provide your Indian address as the delivery address. Make sure to provide accurate and complete address details to ensure successful delivery.



# Creating ChromaDB and store embeddings

In [13]:
# One row per FAQ entry: the question text and its parsed answer.
qa_columns = {'Question': ques_list, 'Answers': ans_list}
df = pd.DataFrame(qa_columns)
df.head()

Unnamed: 0,Question,Answers
0,What is Pan card?,The PAN card is a unique ten-digit alphanumeri...
1,Who needs a Pan card?,All individuals/non-individuals (including for...
2,Types of PAN cards,"In India, two types of PAN cards are available..."
3,Why do NRIs need PAN card?,"NRIS don’t need to have a PAN Card. However, a..."
4,How can NRI apply for a new PAN card,Here are the steps for *PAN CARD* processing.\...


### For exploration purposes, `ChromaDB` is used; it stores the embeddings in a local file directory, from which they can be retrieved quickly.

For real-world implementations, this can be replaced by a more complex client-server, distributed architecture that handles caching and computation on the server side to serve requests faster and hold much more data.

In [19]:
# Load the knowledge document through LangChain's TextLoader.
knowledge_doc_path = os.path.join('resources', 'KnowledgeDocument(pan_card_services).txt')
loader = TextLoader(knowledge_doc_path)

documents = loader.load()

In [20]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

print(f'Lenght of texts : {len(texts)}')

Lenght of texts : 23


In [21]:
# Configuration for embedding and storing the text chunks.
# Supplying a persist_directory makes Chroma store the embeddings on disk.
persist_directory = './resources/docs-db'


# OpenAI embeddings are used here; the plan is to swap them for local
# embeddings in the future.
embedding = OpenAIEmbeddings()

In [22]:
# Embed the text chunks and build the Chroma vector store backed by
# persist_directory.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [23]:
# Persist the vector store to disk.
vectordb.persist()
# Drop the reference to the freshly built store before reloading it from disk.
vectordb = None
# Now we can load the persisted database from disk, and use it as normal.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

## Now that the data is stored in ChromaDB, retrieve it by asking queries

In [24]:
# Expose the vector store through the retriever interface used by the QA chain below.
retriever = vectordb.as_retriever()

In [25]:
# Smoke-test the retriever with a sample question and report how many documents come back.
docs = retriever.get_relevant_documents("How to apply for PAN ?")
print(f'len of docs : {len(docs)}')

len of docs : 4


In [26]:
# Inspect which search type the retriever is configured with.
retriever.search_type

'similarity'

### Lang Chain implementation

In [31]:
# Chat model that generates the answers (gpt-3.5-turbo, temperature 0).
turbo_llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo')

# Retrieval QA chain on top of the Chroma retriever. With
# return_source_documents=True the response also carries the retrieved
# documents, which are used to cite sources.
qa_chain = RetrievalQA.from_chain_type(
    llm=turbo_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)



In [32]:
## Cite sources
def process_llm_response(llm_response):
    """Print a QA-chain response together with the sources it was built from.

    Prints, in order: the raw response mapping, its ``'result'`` entry, a
    ``Sources:`` header, and the ``metadata['source']`` value of every entry
    in ``'source_documents'``.
    """
    print(llm_response)
    print(llm_response['result'])
    print('\n\nSources:')
    retrieved_docs = llm_response["source_documents"]
    for doc in retrieved_docs:
        print(doc.metadata['source'])

In [33]:
# Full example: run one question through the QA chain and print the answer
# together with the sources of the retrieved documents.
query = "What are the documents required to apply for the new pan ?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

{'query': 'What are the documents required to apply for the new pan ?', 'result': "If you have an Aadhaar card, no other document is required to apply for a new PAN card. However, if you don't have an Aadhaar card, the following documents are required:\n\n- Passport (Any Country) / OCI Card\n- Passport Size Photograph\n- Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)", 'source_documents': [Document(page_content='---\n\n# PAN Card Application Process\n\n## New Pan Card\n\n### How can NRI apply for a new PAN card\n\nHere are the steps for *PAN CARD* processing. \n\n- Visit ABC app\n- Navigate to Services > NRI Pan Card > Apply New PAN\n- Select the required form of PAN card and proceed with the payment\n- Our team will get in touch with you to ask for the following documents:\n    - Passport(Any Country) / OCI Card\n    - Passport Size Photograph\n    - Overseas address proof with zip code (Support

In [34]:
# Full example (with turbo_llm).
# NOTE(review): qa_chain is already built on turbo_llm, so this cell repeats
# the previous one with the same query.
query = "What are the documents required to apply for the new pan ?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

{'query': 'What are the documents required to apply for the new pan ?', 'result': "If you have an Aadhaar card, no other document is required to apply for a new PAN card. However, if you don't have an Aadhaar card, the following documents are required:\n\n- Passport (Any Country) / OCI Card\n- Passport Size Photograph\n- Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)", 'source_documents': [Document(page_content='---\n\n# PAN Card Application Process\n\n## New Pan Card\n\n### How can NRI apply for a new PAN card\n\nHere are the steps for *PAN CARD* processing. \n\n- Visit ABC app\n- Navigate to Services > NRI Pan Card > Apply New PAN\n- Select the required form of PAN card and proceed with the payment\n- Our team will get in touch with you to ask for the following documents:\n    - Passport(Any Country) / OCI Card\n    - Passport Size Photograph\n    - Overseas address proof with zip code (Support