In [4]:
#importing langchain modules

from langchain.document_loaders import UnstructuredExcelLoader
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain import OpenAI, VectorDBQA
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.document_loaders import TextLoader
import tiktoken
from langchain.callbacks import get_openai_callback

#importing the os
import sys
sys.path.append('../config.py')
from config import OPENAI_API_KEY

In [5]:
#loading the excel files
loader = UnstructuredExcelLoader(
    "excel/database.xlsx"
)

docs = loader.load()



In [6]:
text_splitter_character = RecursiveCharacterTextSplitter(
    chunk_size=10000, chunk_overlap=1000
)

 
#docs = TextLoader('file.txt').load()
#embedding
all_splits = text_splitter_character.split_documents(docs)
#db = FAISS.from_documents(text_character, OpenAIEmbeddings())
# print(type(all_splits))


In [36]:
tokenizer = tiktoken.get_encoding("cl100k_base")

def tiktoken_len(text):
    token  = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(token)
token_counts = [tiktoken_len(doc.page_content) for doc in all_splits]
print(sum(token_counts))

221639


In [32]:
#Embed and store the texts
#Supplying a persist_directory will stoer the embeddings on disk
persist_directory = "db"

# create the open-source embedding function
vectordb = Chroma.from_documents(documents=all_splits,embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory)

#persist the db to disk
vectordb.persist()
vectordb = None

vectordb = Chroma.from_documents(documents=all_splits,embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory)

In [37]:
retrieval = vectordb.as_retriever(search_kwargs={"k":2})
#docs = retrieval.get_relevant_documents("Who is Rucha Joshi")


In [47]:
with get_openai_callback() as cb:
        qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retrieval,
                return_source_documents=True, verbose=True)
        print(qa_chain("Which professor should  meet to know more about DNA and why"))
total_tokens = cb.total_tokens
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)
print(total_tokens)



[1m> Entering new  chain...[0m

[1m> Finished chain.[0m
{'query': 'Which professor should  meet to know more about DNA and why', 'result': ' Dr. Swagata Halder is a faculty member who has research expertise in molecular cell biology and in vitro protein biochemistry related to DNA damage response. He obtained his DPhil degree from the University of Oxford, UK and pursued his postdoctoral research as a scientist at the Institute for Research in Biomedicine (IRB), Bellinzona, Switzerland. He has published his research in prestigious scientific journals like Nature Genetics, Nature Communications, Molecular Cell, Nucleic Acid Research, Scientific Reports and Chemical Science. Therefore, Dr. Swagata Halder is an excellent resource to learn more about DNA.', 'source_documents': [Document(page_content='faculty\n      \n    \n    \n      Dr. Swagata Halder\n      Dr. Swagata Halder aims to pursue his research in the area of DNA damage response in order to formulate novel therapeutic int

In [46]:
print(total_tokens)

2273


In [39]:
!zip -r db.zip ./db

  adding: db/ (stored 0%)
  adding: db/index/ (stored 0%)
  adding: db/index/index_metadata_23a6e9aa-a2b7-4a98-809a-ad60ef59242c.pkl (deflated 14%)
  adding: db/index/id_to_uuid_23a6e9aa-a2b7-4a98-809a-ad60ef59242c.pkl (deflated 35%)
  adding: db/index/uuid_to_id_23a6e9aa-a2b7-4a98-809a-ad60ef59242c.pkl (deflated 39%)
  adding: db/index/index_23a6e9aa-a2b7-4a98-809a-ad60ef59242c.bin (deflated 17%)
  adding: db/chroma-embeddings.parquet (deflated 26%)
  adding: db/chroma-collections.parquet (deflated 50%)


In [40]:
#To cleanup, you can delete the collection
vectordb.delete_collection()
vectordb.persist()

#delete the directory

In [41]:
!rm -rf db/

##Restarting the runtime

In [42]:
!unzip db.zip

Archive:  db.zip
   creating: db/
   creating: db/index/
  inflating: db/index/index_metadata_23a6e9aa-a2b7-4a98-809a-ad60ef59242c.pkl  
  inflating: db/index/id_to_uuid_23a6e9aa-a2b7-4a98-809a-ad60ef59242c.pkl  
  inflating: db/index/uuid_to_id_23a6e9aa-a2b7-4a98-809a-ad60ef59242c.pkl  
  inflating: db/index/index_23a6e9aa-a2b7-4a98-809a-ad60ef59242c.bin  
  inflating: db/chroma-embeddings.parquet  
  inflating: db/chroma-collections.parquet  


In [2]:
#importing langchain modules

from langchain.document_loaders import UnstructuredExcelLoader
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain import OpenAI, VectorDBQA
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.document_loaders import TextLoader
from langchain.memory import ConversationBufferMemory
from langchain import OpenAI, LLMChain, PromptTemplate
import tiktoken
from langchain.callbacks import get_openai_callback
from langchain.chains.question_answering import load_qa_chain
#importing the os

from config import OPENAI_API_KEY


In [3]:
persist_directory = "db"

# create the open-source embedding function
vectordb2 = Chroma(embedding_function=OpenAIEmbeddings(),
        persist_directory=persist_directory)

In [5]:
template = """You are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

{context}

{chat_history}
Human: {human_input}
Chatbot:"""

In [6]:
prompt = PromptTemplate(
    input_variables=["chat_history", "human_input", "context"], template=template
)
memory = ConversationBufferMemory(memory_key="chat_history", input_key="human_input")

In [7]:
chain = load_qa_chain(
    OpenAI(temperature=0), chain_type="stuff", memory=memory, prompt=prompt,verbose=True
)

In [8]:
query = "Hi"
retrieval = vectordb2.similarity_search(query)
chain({"input_documents": retrieval, "human_input": query}, return_only_outputs=True)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

1. Plaksha University\nAssistant Professor\nMarch 2023 - Present (4 months)\nChandigarh, India\n2. Manning College of Information and Computer Sciences, UMass\nAmherst\nPostdoctoral Researcher\nJanuary 2022 - January 2023 (1 year 1 month)\nAmherst, Massachusetts, United States\n3. Indian Institute of Technology, Bombay\n5 years 7 months\nPostdoctoral Researcher\nAugust 2021 - January 2022 (6 months)\nMumbai, Maharashtra, India\n4. Research Scholar\nJuly 2016 - July 2021 (5 years 1 month)\nMumbai Area, India\n5. Tata Motors\nManager, Engineering Research Center\nSeptember 2012 - July 2016 (3 years 11 months)\nPune\n6. Automotive Embedded Design & Development\nElectronic Control Unit Software Design and Develpment for Powertrain, BCM\

{'output_text': ' Hi there! How can I help you?'}

In [63]:
query = "I am interested to learn about professor on the Quantom mechanics, who can I meet"
chain({"input_documents": retrieval, "human_input": query}, return_only_outputs=True)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

1. Plaksha University\nAssistant Professor\nMarch 2023 - Present (4 months)\nChandigarh, India\n2. Manning College of Information and Computer Sciences, UMass\nAmherst\nPostdoctoral Researcher\nJanuary 2022 - January 2023 (1 year 1 month)\nAmherst, Massachusetts, United States\n3. Indian Institute of Technology, Bombay\n5 years 7 months\nPostdoctoral Researcher\nAugust 2021 - January 2022 (6 months)\nMumbai, Maharashtra, India\n4. Research Scholar\nJuly 2016 - July 2021 (5 years 1 month)\nMumbai Area, India\n5. Tata Motors\nManager, Engineering Research Center\nSeptember 2012 - July 2016 (3 years 11 months)\nPune\n6. Automotive Embedded Design & Development\nElectronic Control Unit Software Design and Develpment for Powertrain, BCM\

{'output_text': ' You can meet Dr. Anupam Sobti, who is an Assistant Professor at Plaksha University. He has a PhD in Computer Science from Indian Institute of Technology, Delhi and specializes in Applied Machine Learning, Embedded Systems/IoT, Low-cost health diagnostics, Smart Agriculture, and Sustainability Sensing for buildings, rivers, forests, etc.'}

In [9]:

query = "He does not have any experience in quantum mechanics"
chain({"input_documents": retrieval, "human_input": query}, return_only_outputs=True)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

1. Plaksha University\nAssistant Professor\nMarch 2023 - Present (4 months)\nChandigarh, India\n2. Manning College of Information and Computer Sciences, UMass\nAmherst\nPostdoctoral Researcher\nJanuary 2022 - January 2023 (1 year 1 month)\nAmherst, Massachusetts, United States\n3. Indian Institute of Technology, Bombay\n5 years 7 months\nPostdoctoral Researcher\nAugust 2021 - January 2022 (6 months)\nMumbai, Maharashtra, India\n4. Research Scholar\nJuly 2016 - July 2021 (5 years 1 month)\nMumbai Area, India\n5. Tata Motors\nManager, Engineering Research Center\nSeptember 2012 - July 2016 (3 years 11 months)\nPune\n6. Automotive Embedded Design & Development\nElectronic Control Unit Software Design and Develpment for Powertrain, BCM\

{'output_text': ' I see. It looks like Dr. Anupam Sobti does not have any experience in quantum mechanics. Is there anything else I can help you with?'}

In [10]:

query = "Go through the docs which I provided and recommend another professor with Quantum mechanics background"
chain({"input_documents": retrieval, "human_input": query})



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

1. Plaksha University\nAssistant Professor\nMarch 2023 - Present (4 months)\nChandigarh, India\n2. Manning College of Information and Computer Sciences, UMass\nAmherst\nPostdoctoral Researcher\nJanuary 2022 - January 2023 (1 year 1 month)\nAmherst, Massachusetts, United States\n3. Indian Institute of Technology, Bombay\n5 years 7 months\nPostdoctoral Researcher\nAugust 2021 - January 2022 (6 months)\nMumbai, Maharashtra, India\n4. Research Scholar\nJuly 2016 - July 2021 (5 years 1 month)\nMumbai Area, India\n5. Tata Motors\nManager, Engineering Research Center\nSeptember 2012 - July 2016 (3 years 11 months)\nPune\n6. Automotive Embedded Design & Development\nElectronic Control Unit Software Design and Develpment for Powertrain, BCM\

{'output_text': ' Sure, I can help you with that. Based on the documents you provided, I recommend Professor Rajesh Kumar from Indian Institute of Technology, Delhi. He has extensive experience in quantum mechanics and has published several papers in the field.'}

{'query': 'Which professor can I meet to learn C++ from and give information about him', 'result': ' You can meet Dr. Srikant Srinivasan to learn C++ from. He is a faculty member at the university.', 'source_documents': [Document(page_content='faculty\n      \n    \n    \n      Dr. Srikant Srinivasan', metadata={'source': 'excel/database.xlsx'}), Document(page_content='faculty\n      \n    \n    \n      Dr. Srikant Srinivasan', metadata={'source': 'excel/database.xlsx'}), Document(page_content='faculty\n      \n    \n    \n      Dr. Srikant Srinivasan', metadata={'source': 'excel/database.xlsx'}), Document(page_content='faculty\n      \n    \n    \n      Dr. Srikant Srinivasan', metadata={'source': 'excel/database.xlsx'})]}