In [1]:
from langchain.document_loaders import PyPDFLoader
import os
import langchain.embeddings 
from langchain.document_loaders import UnstructuredExcelLoader
from langchain.vectorstores import Chroma
from config import OPENAI_API_KEY
from langchain.retrievers.multi_query import MultiQueryRetriever

# loader = PyPDFLoader("../prof.pdf")
loader = UnstructuredExcelLoader(
    "../excel/faculty.xlsx"
)


data = loader.load()

In [2]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
all_splits = text_splitter.split_documents(data)
print(all_splits)

[Document(page_content='Amit Sheth\n      Prof. Sheth calls himself a ‘tinkerer’; graduated in Electrical & Electronics Engineering from BITS Pilani, did his Masters in Product Design from National Institute of Design, Ahmedabad; and an M.Phil. from Industrial Design Centre, IITB, Mumbai. Prof. Sheth has been a design practitioner for the past three decades as well as a design educator. \\nDuring his 30 years as a design consultant, Prof. Sheth’s firm MIND’s EYE DESIGN Pvt Ltd has provided design services in the areas of Product Design, Signage & Wayfinding Design, Exhibition Design, Corporate Branding, Interiors and Architecture. He was instrumental in setting up the Design Institute for Nirma University and was Professor & Head of the Department. Post that he was Dean of Advancement and a Professor at Anant National University, Ahmedabad, and Adjunct Professor at NRTI (National Rail & Transportation Institute, Vadodara) as well as IIT Gandhinagar.', metadata={'source': '../excel/facu

In [3]:
# Store 
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
vectorstore = Chroma.from_documents(documents=all_splits,embedding=OpenAIEmbeddings())

In [23]:
question = "Who is Anupam Sobti?"
docs = vectorstore.similarity_search(question)
len(docs)
print(docs)

[Document(page_content='Dr. Dhiraj Sinha', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Sunita Chauhan', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Amrik Sen', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Anupam Sobti\n      My research interest lies at the intersection of computer vision,\\nmachine learning and embedded systems. Smart sensors, assistive\\ndevices, wearable health monitoring systems, etc. require robust\\nand latency-sensitive sensing and actuation. Therefore, these\\napplications require innovation across sensing techniques,\\nperception algorithms and energy optimization techniques in\\nsoftware and hardware. My current research explores the use of\\ndifferent modalities like radar and ultrasonic sensors along with\\nvision sensors for an energy-efficient and high-performance system.\\nI like to build systems. It is when systems are built and verified with\\nthe users that real p

In [24]:
a = vectorstore.as_retriever(search_type="mmr")
ans = a.get_relevant_documents("Who is Plaksha Vice Chancellor")

In [25]:
print(ans)

[Document(page_content="1. Plaksha University\\n> Associate Professor of Practice: I am a part of the Humanities Fraternity at India's latest Tech University - Plaksha University\\n> Director of Centre for Thinking, Language and Communication\\n2. Samvada Centre for Research Resources, Director\\n3. Samvada International Research Institute, Research Consultant\\n4. Green Templeton College, Associate Member\\n5. Faith and Fire and Chief Fire Officers Association, UK (CFOA), Hindu Faith Contact\\n6. Shiksha Rath, Consultant\\n7. Touch India Trust, Trustee\\n8. THE ROYAL ASIATIC SOCIETY OF GREAT BRITAIN AND\\nIRELAND, fellow\\n9. Oxford Centre For Mission Studies, Visiting Research Tutor\\n10. Ecole Hoteliere Lavasa, Research Dean\\n11. Mahidol University, Visiting Professor\\n12. TWR, PR and Programme Host\\n13. Perfect Relations: Associate Manager\\n14. Oxford Centre for Hindu Studies, Research Dean", metadata={'source': '../excel/faculty.xlsx'}), Document(page_content="Dr. Pratap is th

In [7]:
from langchain.agents import (
    Tool,
    AgentExecutor,
    LLMSingleActionAgent,
    AgentOutputParser,
)
from langchain.prompts import StringPromptTemplate
from langchain import OpenAI, SerpAPIWrapper, LLMChain
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish
import re

In [8]:
print(docs)

[Document(page_content='Dr. Dhiraj Sinha', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Sunita Chauhan', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Amrik Sen', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Anupam Sobti\n      My research interest lies at the intersection of computer vision,\\nmachine learning and embedded systems. Smart sensors, assistive\\ndevices, wearable health monitoring systems, etc. require robust\\nand latency-sensitive sensing and actuation. Therefore, these\\napplications require innovation across sensing techniques,\\nperception algorithms and energy optimization techniques in\\nsoftware and hardware. My current research explores the use of\\ndifferent modalities like radar and ultrasonic sensors along with\\nvision sensors for an energy-efficient and high-performance system.\\nI like to build systems. It is when systems are built and verified with\\nthe users that real p

In [9]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-4", temperature=0)
from langchain.chains import RetrievalQA
question = "Who is Dr. Amrik Sen"
qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever())
qa_chain({"query": question})

{'query': 'Who is Dr. Amrik Sen',
 'result': 'Dr. Amrik Sen is a scholar who obtained his Masters and Ph.D. from the University of Colorado, Boulder, USA in Applied Mathematics. During his studies, he was also a research scholar at the National Center for Atmospheric Research in Boulder, Colorado. Currently, he is the principal investigator of two research grants sponsored by the Government of India. He is leading a multi-university research project involving PGIMER, Chandigarh and NIFTEM, Sonepat as the project coordinator. He has significant industrial and research experience in developing mathematical models with several engineering and scientific applications. He has taught undergraduate and postgraduate courses in mathematics in India and America over a span of ten years. Dr. Sen is also a 5k runner and a marathon bicyclist who has participated in several 100 mile cross country rides as a member of the School of Applied Sciences and Engineering team.'}

In [10]:
question = "Who is Dr. Rucha Joshi"
qa_chain({"query": question})

{'query': 'Who is Dr. Rucha Joshi',
 'result': "The text doesn't provide information about who Dr. Rucha Joshi is."}

In [11]:
# Build prompt
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
template = """You are owned by Plaksha University. You help with connecting user who have a particular interest in a topic to a Professor who have experience in such topic. You also can answer questions regarding a professor. When you are asked about a professor, ensure you provide a well detailed answer to it.
Ensure you go through the below context before answering a question, if you do not know the answer just tell the user that you do not have any information about that.

You can only get the answer from the context:-
{context}

Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context","question", ],template=template,)
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# Run chain
question = "Who is Dr. Rucha Joshi"
from langchain.chains import RetrievalQA
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

qa_chain = RetrievalQA.from_chain_type(llm,
                                       retriever=vectorstore.as_retriever(),
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=llm,
)
chat = ConversationalRetrievalChain.from_llm(llm=ChatOpenAI(temperature=0, model="gpt-4"),chain_type="stuff",retriever=retriever_from_llm,memory=memory)
result = qa_chain({"query": question})
result1 = retriever_from_llm.get_relevant_documents(query=question)

In [12]:
prof_search = ""
for i in result1:
    prof_search+=str(i)

In [13]:
from langchain.vectorstores import FAISS
from langchain.chains.router import MultiRetrievalQAChain
from langchain.llms import OpenAI

In [14]:
prof_retriever = FAISS.from_texts(prof_search, OpenAIEmbeddings()).as_retriever()

In [15]:
retriever_infos = [
    {
        "name": "question_about_professor", 
        "description": "Good for answeing every question", 
        "retriever": vectorstore.as_retriever()
    },
    {
        "name": "question_about_professor", 
        "description": "Good for answering questions about Plaksha Professor if the first retreiver does not have the answer",
        "retriever": prof_retriever
    },
]

In [16]:
chain = MultiRetrievalQAChain.from_retrievers(OpenAI(), retriever_infos, verbose=True)

In [17]:
print(chain.run("Who is Dr. Rucha Joshi"))



[1m> Entering new MultiRetrievalQAChain chain...[0m




question_about_professor: {'query': 'Who is Dr. Rucha Joshi'}
[1m> Finished chain.[0m
 I don't know.


In [18]:
from langchain.agents.agent_toolkits import create_retriever_tool

In [19]:
tool =(
    vectorstore.as_retriever(), 
    "\search_for_professor",
    "Searches and return about a plaksha professor."
)
tools = [tool]

In [20]:
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(temperature = 0)

In [26]:
# agent_executor = create_conversational_retrieval_agent(llm, tools, verbose=True)

ValidationError: 1 validation error for OpenAIFunctionsAgent
tools -> 0
  value is not a valid dict (type=type_error.dict)

In [22]:
# result = agent_executor({"input": "Who is Rudra Pratap"})

NameError: name 'agent_executor' is not defined

In [27]:
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain import OpenAI

In [None]:
# tools = [
#     Tool(
#         name="Search",
#         func=retriever_from_llm.get_relevant_documents.search,
#         description="useful for when you need to ask with search",
#     ),
#     Tool(
#         name="Lookup",
#         func=retriever_from_llm.get_relevant_documents.lookup,
#         description="useful for when you need to ask with lookup",
#     ),
# ]


In [28]:
llm = OpenAI(temperature=0, model_name="text-davinci-002")
react = initialize_agent(tools, llm, agent=AgentType.REACT_DOCSTORE, verbose=True)

AttributeError: 'tuple' object has no attribute 'is_single_input'

In [None]:
question = "Who is the Vice Chancellor"
react.run(question)

In [None]:
print(chat)

In [None]:
query = "Who is Rudra Pratap"
result = chat({"question": query})

In [None]:
print(result)

In [None]:
result = qa_chain({"query": "What was the previous question which I asked you"})

In [None]:
print(result)
print(result1)

In [29]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_from_llm, retriever_from_llm], weights=[0.5, 0.5])

In [30]:
docs = ensemble_retriever.get_relevant_documents("Vice Chancellor")

In [31]:
print(docs)

[Document(page_content='Dr. Srikant Srinivasan', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Vishal Garg', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Prashanth Suresh Kumar', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='1. Dean of Advancement and Professor, Anant National University, Ahmedabad\\n2. Founder Head and Professor of the Department of Design, Nirma University, Ahmedabad\\n3. Adjunct Professor, IIT Gandhinagar\\n4. Adjunct Professor, National Rail & Transportation Institute, Vadodara\\n5. Professor of Practice, Plaksha University\\n6. Vice President Design, Value Labs\\n7. Founder and Managing Director, Minds Eye Design Private Limited\\n8. Visiting Faculty, National Institute of Design\\n9. Associate Professor, Mudra Institute of Communications\n      \n      \n      \n      \n      \n      \n      \n    \n    \n      Dr. Aditya Malik', metadata={'source': '../excel/faculty.xlsx'}), Documen

In [34]:
print(result)
print(result1)

{'query': 'Who is Dr. Rucha Joshi', 'result': "I'm sorry, but the context provided does not contain any specific information about Dr. Rucha Joshi."}
[Document(page_content='Dr. Rucha Joshi', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Sunita Chauhan', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Rudra Pratap', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Chaitanya Lekshmi Indira', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='she was involved in a National Science Foundation funded grant at Purdue, which focused on studying the professional formation of engineers and promoting diversity and inclusion.\\n\\nDr. Joshi\'s passion for educational entrepreneurship led her to participate in projects that aimed to make engineering accessible to underrepresented high school students both in India and the United States. Her efforts in this area have been recognized, and she has recei

In [35]:
from typing import List
from langchain import LLMChain
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser


# Output parser will split the LLM result into a list of queries
class LineList(BaseModel):
    # "lines" is the key (attribute name) of the parsed output
    lines: List[str] = Field(description="Lines of text")


class LineListOutputParser(PydanticOutputParser):
    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        lines = text.strip().split("\n")
        return LineList(lines=lines)


output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions seperated by newlines.
    Original question: {question}""",
)
llm = ChatOpenAI(temperature=0)

# Chain
llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

# Other inputs
question = "Who is Manoj Kanan"

In [36]:
retriever = MultiQueryRetriever(
    retriever=vectorstore.as_retriever(), llm_chain=llm_chain, parser_key="lines"
)  # "lines" is the key (attribute name) of the parsed output

# Results
unique_docs = retriever.get_relevant_documents(
    query="Who is Manoj Kannan"
)
len(unique_docs)


4

In [37]:
print(unique_docs)

[Document(page_content='Dr. Manoj Kannan', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Nandini Kannan', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content='Dr. Sandeep Manjanna', metadata={'source': '../excel/faculty.xlsx'}), Document(page_content="Dr. Manoj Kannan\n      Dr. Kannan joined Plaksha University after spending close to 15 years teaching various courses in biological sciences and introductory courses in computer programming at both the Dubai and Pilani campuses of BITS Pilani. He has also been providing academic counseling, mentoring and student support - activities which are close to his heart. For his PhD, he worked at the National Institutes of Health, Maryland in the area of epigenetic control of mammalian transposons. His current research focuses on biology education research, with a special emphasis on Integrated Course Design using Fink's Taxonomy of Significant Learning. He has also worked on devising and implementing 

In [38]:
# Helper function for printing docs

def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [39]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter

embeddings = OpenAIEmbeddings()
embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)

compressed_docs = compression_retriever.get_relevant_documents("Who is Plaksha Vice Chancellor")
pretty_print_docs(compressed_docs)

Document 1:

1. Plaksha University\n> Associate Professor of Practice: I am a part of the Humanities Fraternity at India's latest Tech University - Plaksha University\n> Director of Centre for Thinking, Language and Communication\n2. Samvada Centre for Research Resources, Director\n3. Samvada International Research Institute, Research Consultant\n4. Green Templeton College, Associate Member\n5. Faith and Fire and Chief Fire Officers Association, UK (CFOA), Hindu Faith Contact\n6. Shiksha Rath, Consultant\n7. Touch India Trust, Trustee\n8. THE ROYAL ASIATIC SOCIETY OF GREAT BRITAIN AND\nIRELAND, fellow\n9. Oxford Centre For Mission Studies, Visiting Research Tutor\n10. Ecole Hoteliere Lavasa, Research Dean\n11. Mahidol University, Visiting Professor\n12. TWR, PR and Programme Host\n13. Perfect Relations: Associate Manager\n14. Oxford Centre for Hindu Studies, Research Dean
----------------------------------------------------------------------------------------------------
Document 2:

1

In [40]:
pretty_print_docs(compressed_docs)

Document 1:

1. Plaksha University\n> Associate Professor of Practice: I am a part of the Humanities Fraternity at India's latest Tech University - Plaksha University\n> Director of Centre for Thinking, Language and Communication\n2. Samvada Centre for Research Resources, Director\n3. Samvada International Research Institute, Research Consultant\n4. Green Templeton College, Associate Member\n5. Faith and Fire and Chief Fire Officers Association, UK (CFOA), Hindu Faith Contact\n6. Shiksha Rath, Consultant\n7. Touch India Trust, Trustee\n8. THE ROYAL ASIATIC SOCIETY OF GREAT BRITAIN AND\nIRELAND, fellow\n9. Oxford Centre For Mission Studies, Visiting Research Tutor\n10. Ecole Hoteliere Lavasa, Research Dean\n11. Mahidol University, Visiting Professor\n12. TWR, PR and Programme Host\n13. Perfect Relations: Associate Manager\n14. Oxford Centre for Hindu Studies, Research Dean
----------------------------------------------------------------------------------------------------
Document 2:

1

In [41]:
result = qa_chain({"query": question})
result["result"]

"Dr. Manoj Kannan is a professor at Plaksha University. He has spent nearly 15 years teaching various courses in biological sciences and introductory courses in computer programming at both the Dubai and Pilani campuses of BITS Pilani. In addition to teaching, Dr. Kannan has been providing academic counseling, mentoring, and student support. He completed his PhD at the National Institutes of Health, Maryland, focusing on the epigenetic control of mammalian transposons. His current research is centered on biology education research, particularly Integrated Course Design using Fink's Taxonomy of Significant Learning. He has also worked on devising and implementing novel assessment strategies that foster effective learning. Dr. Kannan has been a resource person for faculty development programs held for universities and schools, a role he continues to play as part of the initiatives of the Collaboratory for Innovation in Education at Plaksha."