In [1]:
#importing langchain modules

from langchain.document_loaders import UnstructuredExcelLoader
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain import OpenAI, VectorDBQA
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.document_loaders import TextLoader
import tiktoken
from langchain.callbacks import get_openai_callback
#importing the os

from config import OPENAI_API_KEY


In [2]:
#loading the excel files
loader = UnstructuredExcelLoader(
    "excel/database.xlsx"
)

docs = loader.load()



In [3]:
text_splitter_character = RecursiveCharacterTextSplitter(
    chunk_size=10000, chunk_overlap=1000
)

 
#docs = TextLoader('file.txt').load()
#embedding
all_splits = text_splitter_character.split_documents(docs)
#db = FAISS.from_documents(text_character, OpenAIEmbeddings())
# print(type(all_splits))


In [4]:
tokenizer = tiktoken.get_encoding("cl100k_base")

def tiktoken_len(text):
    token  = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(token)
token_counts = [tiktoken_len(doc.page_content) for doc in all_splits]
print(sum(token_counts))

221639


In [5]:
#Embed and store the texts
#Supplying a persist_directory will stoer the embeddings on disk
persist_directory = "db"

# create the open-source embedding function
vectordb = Chroma.from_documents(documents=all_splits,embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory)

#persist the db to disk
vectordb.persist()
vectordb = None

vectordb = Chroma.from_documents(documents=all_splits,embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory)

In [6]:
retrieval = vectordb.as_retriever(search_kwargs={"k":2})
#docs = retrieval.get_relevant_documents("Who is Rucha Joshi")


In [7]:
with get_openai_callback() as cb:
        qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retrieval,
                return_source_documents=True, verbose=True)
        print(qa_chain("Which professor should  meet to know more about DNA and why"))
total_tokens = cb.total_tokens
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)
print(total_tokens)



[1m> Entering new  chain...[0m

[1m> Finished chain.[0m
{'query': 'Which professor should  meet to know more about DNA and why', 'result': ' Dr. Swagata Halder should be met to know more about DNA repair and DNA damage response because he has expertise in molecular cell biology and in vitro protein biochemistry and has conducted research in the area of DNA damage response to formulate novel therapeutic interventions against cancer. He also obtained his DPhil degree from the University of Oxford, UK and pursued his postdoctoral research as a scientist at the Institute for Research in Biomedicine (IRB), Bellinzona, Switzerland.', 'source_documents': [Document(page_content='faculty\n      \n    \n    \n      Dr. Swagata Halder\n      Dr. Swagata Halder aims to pursue his research in the area of DNA damage response in order to formulate novel therapeutic interventions against cancer. His laboratory expertise in molecular cell biology and in vitro protein biochemistry.\\n\\nDr. Halder

In [8]:
print(total_tokens)

1654


In [9]:
!zip -r db.zip ./db

  adding: db/ (stored 0%)
  adding: db/index/ (stored 0%)
  adding: db/index/uuid_to_id_e213ed88-c84b-461f-968b-9407b5821b9f.pkl (deflated 39%)
  adding: db/index/id_to_uuid_e213ed88-c84b-461f-968b-9407b5821b9f.pkl (deflated 35%)
  adding: db/index/index_metadata_e213ed88-c84b-461f-968b-9407b5821b9f.pkl (deflated 14%)
  adding: db/index/index_e213ed88-c84b-461f-968b-9407b5821b9f.bin (deflated 17%)
  adding: db/chroma-embeddings.parquet (deflated 26%)
  adding: db/chroma-collections.parquet (deflated 50%)


In [10]:
#To cleanup, you can delete the collection
vectordb.delete_collection()
vectordb.persist()

#delete the directory

In [11]:
!rm -rf db/

##Restarting the runtime

In [12]:
!unzip db.zip

Archive:  db.zip
   creating: db/
   creating: db/index/
  inflating: db/index/uuid_to_id_e213ed88-c84b-461f-968b-9407b5821b9f.pkl  
  inflating: db/index/id_to_uuid_e213ed88-c84b-461f-968b-9407b5821b9f.pkl  
  inflating: db/index/index_metadata_e213ed88-c84b-461f-968b-9407b5821b9f.pkl  
  inflating: db/index/index_e213ed88-c84b-461f-968b-9407b5821b9f.bin  
  inflating: db/chroma-embeddings.parquet  
  inflating: db/chroma-collections.parquet  


In [13]:
#importing langchain modules

from langchain.document_loaders import UnstructuredExcelLoader
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain import OpenAI, VectorDBQA
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.document_loaders import TextLoader
from langchain.memory import ConversationBufferMemory
from langchain import OpenAI, LLMChain, PromptTemplate
import tiktoken
from langchain.callbacks import get_openai_callback
from langchain.chains.question_answering import load_qa_chain
#importing the os

from config import OPENAI_API_KEY


In [14]:
persist_directory = "db"

# create the open-source embedding function
vectordb2 = Chroma(embedding_function=OpenAIEmbeddings(),
        persist_directory=persist_directory)

In [15]:
template = """You are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

{context}

{chat_history}
Human: {human_input}
Chatbot:"""

In [16]:
prompt = PromptTemplate(
    input_variables=["chat_history", "human_input", "context"], template=template
)
memory = ConversationBufferMemory(memory_key="chat_history", input_key="human_input")

In [17]:
chain = load_qa_chain(
    OpenAI(temperature=0), chain_type="stuff", memory=memory, prompt=prompt,verbose=True
)

In [18]:
query = "Hi"
retrieval = vectordb2.similarity_search(query)
chain({"input_documents": retrieval, "human_input": query}, return_only_outputs=True)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

faculty
      
    
    
      Dr. Shashikant Pawar
      I am a Mechanical Engineer by training and profession. I specialize\nin Fluid Mechanics and Heat Transfer. My research interests lie in\nthe areas including Environmental Flows, Microclimate Studies,\nBio-inspired Energy-efficient and Sustainable Systems, Light\nPropagation through Turbulent Media, and Engineering Education. I\nstrive to contribute to the development of innovative and sustainable\nengineering solutions to address pressing global challenges.
      Fluid Dynamics and Heat Transfer.
      Buoyancy driven (environmental) turbulent flows\nFluid dynamics and heat transfer of Bio-inspired systems\nLight propagation in turbulent media\nEngineering Education
      1. 

{'output_text': ' Hi there! How can I help you?'}

In [19]:
query = "I am interested to learn about professor on the Quantom mechanics, who can I meet"
chain({"input_documents": retrieval, "human_input": query}, return_only_outputs=True)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

faculty
      
    
    
      Dr. Shashikant Pawar
      I am a Mechanical Engineer by training and profession. I specialize\nin Fluid Mechanics and Heat Transfer. My research interests lie in\nthe areas including Environmental Flows, Microclimate Studies,\nBio-inspired Energy-efficient and Sustainable Systems, Light\nPropagation through Turbulent Media, and Engineering Education. I\nstrive to contribute to the development of innovative and sustainable\nengineering solutions to address pressing global challenges.
      Fluid Dynamics and Heat Transfer.
      Buoyancy driven (environmental) turbulent flows\nFluid dynamics and heat transfer of Bio-inspired systems\nLight propagation in turbulent media\nEngineering Education
      1. 

{'output_text': ' You can meet Dr. Srikant Srinivasan, who specializes in Quantum Mechanics. He is a faculty member at Plaksha University.'}

In [20]:

query = "He does not have any experience in quantum mechanics"
chain({"input_documents": retrieval, "human_input": query}, return_only_outputs=True)



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

faculty
      
    
    
      Dr. Shashikant Pawar
      I am a Mechanical Engineer by training and profession. I specialize\nin Fluid Mechanics and Heat Transfer. My research interests lie in\nthe areas including Environmental Flows, Microclimate Studies,\nBio-inspired Energy-efficient and Sustainable Systems, Light\nPropagation through Turbulent Media, and Engineering Education. I\nstrive to contribute to the development of innovative and sustainable\nengineering solutions to address pressing global challenges.
      Fluid Dynamics and Heat Transfer.
      Buoyancy driven (environmental) turbulent flows\nFluid dynamics and heat transfer of Bio-inspired systems\nLight propagation in turbulent media\nEngineering Education
      1. 

{'output_text': ' Dr. Srikant Srinivasan has extensive experience in Quantum Mechanics. He has published papers on the subject and has been teaching courses related to Quantum Mechanics for many years.'}

In [21]:

query = "Go through the docs which I provided and recommend another professor with Quantum mechanics background"
chain({"input_documents": retrieval, "human_input": query})



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer.

faculty
      
    
    
      Dr. Shashikant Pawar
      I am a Mechanical Engineer by training and profession. I specialize\nin Fluid Mechanics and Heat Transfer. My research interests lie in\nthe areas including Environmental Flows, Microclimate Studies,\nBio-inspired Energy-efficient and Sustainable Systems, Light\nPropagation through Turbulent Media, and Engineering Education. I\nstrive to contribute to the development of innovative and sustainable\nengineering solutions to address pressing global challenges.
      Fluid Dynamics and Heat Transfer.
      Buoyancy driven (environmental) turbulent flows\nFluid dynamics and heat transfer of Bio-inspired systems\nLight propagation in turbulent media\nEngineering Education
      1. 

{'input_documents': [Document(page_content='faculty\n      \n    \n    \n      Dr. Shashikant Pawar\n      I am a Mechanical Engineer by training and profession. I specialize\\nin Fluid Mechanics and Heat Transfer. My research interests lie in\\nthe areas including Environmental Flows, Microclimate Studies,\\nBio-inspired Energy-efficient and Sustainable Systems, Light\\nPropagation through Turbulent Media, and Engineering Education. I\\nstrive to contribute to the development of innovative and sustainable\\nengineering solutions to address pressing global challenges.\n      Fluid Dynamics and Heat Transfer.\n      Buoyancy driven (environmental) turbulent flows\\nFluid dynamics and heat transfer of Bio-inspired systems\\nLight propagation in turbulent media\\nEngineering Education\n      1. Indian Institute of Science (IISc)\\nDoctor of Philosophy - PhD, Fluid Mechanics and Heat\\nTransfer · (2008 - 2015)\\n2. National Institute of Technology Hamirpur\\nM.Tech., Computational Fluid Dy

{'query': 'Which professor can I meet to learn C++ from and give information about him', 'result': ' You can meet Dr. Srikant Srinivasan to learn C++ from. He is a faculty member at the university.', 'source_documents': [Document(page_content='faculty\n      \n    \n    \n      Dr. Srikant Srinivasan', metadata={'source': 'excel/database.xlsx'}), Document(page_content='faculty\n      \n    \n    \n      Dr. Srikant Srinivasan', metadata={'source': 'excel/database.xlsx'}), Document(page_content='faculty\n      \n    \n    \n      Dr. Srikant Srinivasan', metadata={'source': 'excel/database.xlsx'}), Document(page_content='faculty\n      \n    \n    \n      Dr. Srikant Srinivasan', metadata={'source': 'excel/database.xlsx'})]}