In [12]:
%pip install PyPDF2



In [13]:
%pip install langchain



In [14]:
%pip install langchain_community



In [15]:
%pip install sentence-transformers



In [16]:
%pip install faiss-cpu



In [17]:
!pip install pyngrok



In [18]:
!pip install -q streamlit

In [19]:
%%writefile app.py
# importing dependencies
import os

import numpy as np
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import faiss
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub

# extracting text from pdf
def get_pdf_text(docs):
    """Concatenate the extracted text of every page of every PDF in ``docs``.

    Args:
        docs: Iterable of PDF sources that ``PdfReader`` can open (the app
            passes the files returned by the Streamlit uploader).

    Returns:
        One string holding all page texts back to back, in document and page
        order. An empty ``docs`` yields ``""``.
    """
    page_texts = []
    for pdf in docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # A page may produce no text at all; coerce a falsy result
            # (None or "") to "" so the concatenation can never fail.
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string inside the loop.
    return "".join(page_texts)

# converting text to chunks
def get_chunks(raw_text):
    """Split ``raw_text`` into overlapping chunks on newline boundaries.

    The splitter is configured with ``chunk_size=1000`` and
    ``chunk_overlap=150``, both measured in characters via ``len``.

    Returns:
        The list of text chunks produced by ``CharacterTextSplitter``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 150,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(raw_text)

# using the all-MiniLM-L6-v2 embeddings model and FAISS to build the vectorstore
def get_vectorstore(chunks):
    """Embed ``chunks`` and index them in a FAISS vector store.

    Embeddings come from ``sentence-transformers/all-MiniLM-L6-v2`` running
    on the CPU.

    Returns:
        The FAISS vector store built from ``chunks``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )
    return faiss.FAISS.from_texts(texts=chunks, embedding=embedder)

# generating conversation chain
def get_conversationchain(vectorstore):
    """Build a conversational retrieval chain over ``vectorstore``.

    The chain uses Mistral-7B-Instruct-v0.2 through the Hugging Face Hub and
    a ``ConversationBufferMemory`` to carry earlier turns of the chat.

    Args:
        vectorstore: Vector store whose ``as_retriever()`` supplies the
            retriever for the chain.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    # Take the API token from the environment so the secret never has to be
    # written into the source; the placeholder is only a last-resort fallback.
    huggingfacehub_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN",
                                              'YOUR_TOKEN_HERE')
    llm = HuggingFaceHub(repo_id='mistralai/Mistral-7B-Instruct-v0.2',
                         huggingfacehub_api_token=huggingfacehub_api_token,
                         model_kwargs={
                             'temperature': 0.3,
                             'max_new_tokens': 5000
                         })
    # Conversation buffer memory holds past exchanges under 'chat_history';
    # 'answer' is the chain output that gets stored.
    memory = ConversationBufferMemory(memory_key='chat_history',
                                      return_messages=True,
                                      output_key='answer')
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory)
    return conversation_chain

# Find the most similar text chunk
def find_most_similar_text(query, chunks):
    """Return the chunk whose embedding is most cosine-similar to ``query``.

    Args:
        query: The user's question.
        chunks: Sequence of text chunks to search.

    Returns:
        The best-matching chunk (the first one on ties), or ``""`` when
        ``chunks`` is empty.
    """
    # Nothing to search: bail out before loading the embedding model, and
    # avoid the ValueError that max() raises on an empty list.
    if not chunks:
        return ""

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
                                       model_kwargs={'device': 'cpu'})
    query_embedding = embeddings.embed_query(query)
    chunk_embeddings = embeddings.embed_documents(chunks)

    # Calculate cosine similarity of the query against every chunk
    similarities = [cosine_similarity(query_embedding, chunk_embedding)
                    for chunk_embedding in chunk_embeddings]
    most_similar_index = similarities.index(max(similarities))
    return chunks[most_similar_index]

# cosine similarity calculation
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        vec1: First vector (any sequence NumPy can treat as a 1-D array).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The similarity as a float. If either vector has zero norm the
        similarity is undefined and ``0.0`` is returned instead of NaN.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard the division: a zero vector would otherwise yield NaN plus a
    # runtime warning, which then poisons max()-based ranking in callers.
    if norm_product == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / norm_product)

# generating response from user queries and displaying them accordingly
def handle_question(question, document_chunks, vectorstore):
    """Answer ``question`` from the most relevant chunk and render the result.

    The chunk most similar to the question is placed in a
    ``Question/Context/Answer`` prompt for ``st.session_state.llm``. The
    question and answer are appended to the session history and shown on the
    page together with the retrieved context.

    Args:
        question: The user's question.
        document_chunks: Text chunks of the processed documents.
        vectorstore: Accepted for interface compatibility; not used here.
    """
    # Without processed documents there is nothing to retrieve from.
    if not document_chunks:
        st.warning("Please upload and process your documents before asking a question.")
        return

    try:
        # Find the most relevant document chunk. This runs inside the try so
        # embedding failures are reported in the UI like generation failures.
        relevant_text = find_most_similar_text(question, document_chunks)

        # Generate answer based on the retrieved text and query
        input_text = f"Question: {question}\nContext: {relevant_text}\nAnswer:"
        response = st.session_state.llm(input_text, max_length=1024)

        # Process response
        if response.startswith(input_text):
            # The prompt was echoed back: drop exactly the prompt, so an
            # "Answer:" inside the context cannot be mistaken for the marker.
            answer = response[len(input_text):].strip()
        elif "Answer:" in response:
            # Split once only, so an answer that itself contains "Answer:"
            # is not cut short.
            answer = response.split("Answer:", 1)[1].strip()
        else:
            answer = response

        # Update session state with the new question and answer
        st.session_state.past_questions.append(question)
        st.session_state.answers.append(answer)

        # Display results
        st.subheader("Answer:")
        st.write(answer)

        # Display retrieved context in a collapsible section
        with st.expander("Most Relevant Context"):
            st.write(relevant_text)
    except Exception as e:
        st.error(f"Failed to generate response: {e}")

def main():
    """Render the Streamlit page: document upload, chat history and Q&A box.

    Session state is initialised on first run. The sidebar handles PDF upload
    and processing and lists earlier questions and answers. The main area
    takes a question and passes it to ``handle_question``.
    """
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "document_chunks" not in st.session_state:
        st.session_state.document_chunks = []
    if "vectorstore" not in st.session_state:
        st.session_state.vectorstore = None
    if "llm" not in st.session_state:
        # Prefer the token from the environment so the secret is not stored
        # in the source; the placeholder is only a last-resort fallback.
        st.session_state.llm = HuggingFaceHub(
            repo_id='mistralai/Mistral-7B-Instruct-v0.2',
            huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN",
                                                    'YOUR_TOKEN_HERE'))
    if "past_questions" not in st.session_state:
        st.session_state.past_questions = []
    if "answers" not in st.session_state:
        st.session_state.answers = []

    def _clear_chat():
        # Runs as a button callback, i.e. before the script reruns, so the
        # history rendered on that rerun is already empty. The question box
        # is reset too; otherwise its old text would be answered again and
        # immediately re-added to the history.
        st.session_state.past_questions = []
        st.session_state.answers = []
        st.session_state.question_input = ""

    st.header("Chat with multiple PDFs :books:")

    with st.sidebar:
        st.subheader("Your documents")
        docs = st.file_uploader("Upload your PDF here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            if not docs:
                # Building a vector store from zero texts fails, so stop early.
                st.warning("Please upload at least one PDF before clicking 'Process'.")
            else:
                with st.spinner("Processing"):
                    raw_text = get_pdf_text(docs)
                    text_chunks = get_chunks(raw_text)
                    if not text_chunks:
                        st.error("No extractable text was found in the uploaded PDFs.")
                    else:
                        st.session_state.document_chunks = text_chunks
                        vectorstore = get_vectorstore(text_chunks)
                        st.session_state.vectorstore = vectorstore
                        st.session_state.conversation = get_conversationchain(vectorstore)

        st.subheader("Previous Questions")
        for i, (question, answer) in enumerate(zip(st.session_state.past_questions, st.session_state.answers)):
            st.write(f"Q{i+1}: {question}")
            st.write(f"A{i+1}: {answer}")

        st.button("Clear Chat", on_click=_clear_chat)

    question = st.text_input("Ask a question about your documents:", key="question_input")
    if question:
        handle_question(question, st.session_state.document_chunks, st.session_state.vectorstore)

# Script entry point: build the page only when this file is executed directly
# rather than imported as a module.
if __name__ == '__main__':
    main()

Overwriting app.py


In [20]:
!wget -q -O - ipv4.icanhazip.com

34.72.207.220


In [None]:
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.72.207.220:8501[0m
[0m
your url is: https://puny-rockets-march.loca.lt

>> from langchain.embeddings import HuggingFaceEmbeddings

with new imports of:

>> from langchain_community.embeddings import HuggingFaceEmbeddings
You can use the langchain cli to **automatically** upgrade many imports. Please see documentation here <https://python.langchain.com/v0.2/docs/versions/v0_2/>
  warn_deprecated(

`from langchain_community.llms import HuggingFaceHub`.

To install langchain-community run `pip install -U langchain-community`.
  warn_deprecated(

>> from langchain.embeddings import HuggingFaceEmbeddings

with new imports of:

>> from langchain_community.embeddings import HuggingF