<a href="https://colab.research.google.com/github/pankajtandon/Gist/blob/main/gist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook will show you a way to build your app iteratively in Colab.

To run this notebook, either click the "Open In Colab" badge above, or navigate to
https://colab.research.google.com
and use File | Open notebook to open it.


To prevent your API keys from being committed to source control, do the following:
- Create a directory in the root of your Google Drive and call it `colab_content`.
- Create a file in that directory called `api-keys.txt` and in that file add contents like:
```
OPENAI_API_KEY=<your key>
NGROK_AUTH_TOKEN=<your key>
```

For OPENAI_API_KEY, you will need to create an account at https://platform.openai.com. Usage is billed, but it usually costs only pennies for moderate usage, and you can monitor it at https://platform.openai.com/account/usage
The NGROK_AUTH_TOKEN is free and can be obtained from https://ngrok.com/


Then run the cells in this notebook in order, from top to bottom; the comment at the top of each cell explains what it does.



In [112]:
# First mount a directory in Google Drive. This will help keep your API Keys out of source control.
# Later cells read api-keys.txt from, and write gist.py to, /content/drive/MyDrive/colab_content,
# so this mount must succeed before they are run.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# This will need to be done everytime your VM disconnects.

!pip install pyngrok
!pip install streamlit
!pip install openai
!pip install langchain
!pip install tiktoken
!pip install sentence_transformers
!pip install tiktoken
!pip install PyPDF2
!pip install faiss-cpu
!pip install ipdb


In [None]:
# For debugging
# ipdb is imported into the notebook kernel only. The script written by the
# %%writefile cell is launched with `!streamlit run`, i.e. as a separate process,
# so it does not see this import.
import ipdb
# Uncomment to drop into the debugger automatically when a cell raises:
# %pdb on
# Uncomment to list the kernel's environment variables (beware: this prints API keys once they are loaded):
# %env

In [127]:
# This writes the code to the VM on which this notebook runs.

%%writefile /content/drive/MyDrive/colab_content/gist.py


# from scipy import spatial
# import ast  # for converting embeddings saved as strings back to arrays
# import openai  # for calling the OpenAI API
# import pandas as pd  # for storing text and embeddings data
# import tiktoken  # for counting tokens
import time
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback

# Model name constant. It is not referenced anywhere in the visible script: the LLM
# further down is created with the library's default model.
GPT_MODEL = "gpt-3.5-turbo"
# Browser-tab settings. page_icon has to be an emoji or an emoji shortcode wrapped in
# colons; the bare word "smiley" is neither, so the icon would not render as intended.
PAGE_CONFIG = {"page_title": "Hello baby!", "page_icon": ":smiley:", "layout": "centered"}
# set_page_config is kept as the first Streamlit call in the script.
st.set_page_config(**PAGE_CONFIG)
st.title("Welcome to our world of baby delights!")
st.subheader("We are head over heels!")


# ---- User input

st.write("The supplied PDF file (below) will be chunked and vectorized.")
# The chunk size starts at one step (25) instead of 0: a chunk size of 0 cannot hold any
# text and is smaller than the default overlap, which the text splitter cannot work with.
# The selectable values stay on the same 25-character grid as before.
chunk_size = st.slider(
    'What should be the chunk size in characters? (if not sure, accept the default)',
    min_value=25,
    max_value=5000,
    value=500,
    step=25,
)
# Number of characters shared between neighbouring chunks.
chunk_overlap = st.slider(
    'What should be the chunk overlap in characters? (if not sure, accept the default)',
    min_value=0,
    max_value=500,
    value=100,
    step=10,
)

# The code below only checks whether the chosen option starts with "HuggingFace";
# anything else is treated as the OpenAI engine.
embeddings_option = st.selectbox(
    label='Which Embeddings engine to use?',
    options=['HuggingFaceEmbeddings - Free but slow', 'OpenAIEmbeddings - Fast but costs'],
)

# When ticked, the chunks returned by the similarity search are shown on the page.
debug = st.checkbox("Would you like to see debug info?")
# None until the user uploads a file.
pdf = st.file_uploader("Upload your PDF", type="PDF")
# Set once a PDF has been uploaded and the question box is rendered.
question = None
# -------

# Main flow: once a PDF is uploaded, extract its text, split it into chunks and, when the
# user has typed a question, embed the chunks, look up the most similar ones and hand
# them to the LLM together with the question. Each stage is timed and reported.
if pdf is not None:
    # User supplied a pdf doc
    use_hugging_face = embeddings_option.startswith("HuggingFace")
    if use_hugging_face:
        st.write("Using HuggingFaceEmbeddings")
    else:
        st.write("Using OpenAIEmbeddings")

    st.write("Using chunk_size", chunk_size, "and chunk overlap of", chunk_overlap)

    # Reject slider combinations the splitter cannot work with, instead of letting it
    # raise an uncaught exception in the middle of the page.
    if chunk_size <= 0 or chunk_overlap > chunk_size:
        st.error("The chunk size must be greater than 0 and the chunk overlap must not be larger than the chunk size.")
        st.stop()

    pdf_reader = PdfReader(pdf)
    # `or ""` guards against a page for which no text comes back, so the join cannot fail.
    content = "".join((page.extract_text() or "") for page in pdf_reader.pages)

    total_execution_seconds = 0
    # Chunk out the file
    st.write("Going to split the file into chunks")
    start_time_for_chunking = time.time()
    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = text_splitter.split_text(content)
    diff = time.time() - start_time_for_chunking
    total_execution_seconds += diff
    st.write("Split file into ", len(chunks), " chunks", " in ", diff, " seconds")

    # Without any chunks there is nothing to index or search (e.g. a scanned PDF that
    # contains no extractable text), so stop here with an explanation.
    if not chunks:
        st.warning("No text could be extracted from this PDF, so there is nothing to search.")
        st.stop()

    # Ask the question
    question = st.text_input("Ask me something about the PDF that you just uploaded:")
    if len(question) > 0:
        # The embeddings engine is only built once a question exists. Streamlit reruns
        # this script on every widget interaction, so building it earlier would repeat
        # that work on each slider move. It is built before the timer starts so that
        # the reported vectorization time covers vectorization only.
        if use_hugging_face:
            embeddings = HuggingFaceEmbeddings()
        else:
            embeddings = OpenAIEmbeddings()

        # These are the vectorized chunks:
        start_time_for_vectorization = time.time()
        st.write("Going to start vectorizing the chunks")
        knowledge_base = FAISS.from_texts(chunks, embeddings)
        diff = time.time() - start_time_for_vectorization
        total_execution_seconds += diff
        st.write("Vectorization took %s seconds" % diff)

        # Docs are the chunks whose vectors are most similar to the question's vector.
        start_time_for_similarity_search = time.time()
        docs = knowledge_base.similarity_search(question)
        diff = time.time() - start_time_for_similarity_search
        total_execution_seconds += diff
        st.write("Similarity search took %s seconds" % diff)

        # An empty result is falsy, so this also covers "no documents found"; the former
        # `is not None` check could never reach the else branch.
        if docs:
            if debug:
                st.write("These are the related chunks:")
                for doc in docs:
                    st.write(doc)

            # Forward the related chunks to the LLM with the query as a prompt
            llm = OpenAI()
            st.write("Asking LLM using model", llm.model_name, "...")
            start_time_for_llm_question = time.time()
            chain = load_qa_chain(llm, chain_type="stuff")
            # The callback object collects usage information for calls made inside the block.
            with get_openai_callback() as cb:
                response = chain.run(question=question, input_documents=docs)
                st.write("Cost of query:")
                st.write(cb)

            diff = time.time() - start_time_for_llm_question
            total_execution_seconds += diff
            st.write("LLM response took %s seconds" % diff)
            st.write(response)
            st.write("Total execution: %s seconds" % total_execution_seconds)
        else:
            st.write("No match on the chunks!")


# EMBEDDING_MODEL = "text-embedding-ada-002"



Overwriting /content/drive/MyDrive/colab_content/gist.py


In [108]:
# Set up the tunnel to allow access to the running Streamlit instance.

from pyngrok import ngrok
import os

API_KEYS_FILE = '/content/drive/MyDrive/colab_content/api-keys.txt'

# Load the KEY=value pairs from the keys file into this process's environment.
# Processes started from the notebook afterwards (such as the Streamlit app) inherit them.
with open(API_KEYS_FILE, 'r') as f:
    api_key_list = f.readlines()
for kv in api_key_list:
    entry = kv.strip()
    # Blank lines and '#' comments are skipped instead of crashing the unpacking below.
    if not entry or entry.startswith('#'):
        continue
    # Split on the first '=' only, so values that themselves contain '=' stay intact.
    k, separator, v = entry.partition('=')
    if not separator or not k.strip():
        # Report the problem by line number only, so no secret is echoed into the output.
        print('Ignoring malformed line', api_key_list.index(kv) + 1, 'in', API_KEYS_FILE)
        continue
    os.environ[k.strip()] = v.strip()

# Fail with a clear message when the token is missing, rather than with an
# AttributeError from calling .strip() on None.
ngrok_token = os.environ.get('NGROK_AUTH_TOKEN', '').strip()
if not ngrok_token:
    raise RuntimeError('NGROK_AUTH_TOKEN was not found in ' + API_KEYS_FILE)

# Register the token through pyngrok itself, so it is not interpolated into a shell command.
ngrok.set_auth_token(ngrok_token)
public_url = ngrok.connect(addr='8501') # This is the default Streamlit port
print('This is the URL that can be used to access the Streamlit app', public_url)

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml




This is the URL that can be used to access the Streamlit app NgrokTunnel: "https://48c3-34-86-3-59.ngrok-free.app" -> "http://localhost:8501"


In [109]:
# Start the streamlit app and leave it running and then access the running app at the URL above.
# The script path is the file written by the %%writefile cell; the tunnel cell forwards to
# port 8501, which it notes is Streamlit's default port.

!streamlit run /content/drive/MyDrive/colab_content/gist.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.86.3.59:8501[0m
[0m
> [0;32m/content/drive/MyDrive/colab_content/gist.py[0m(73)[0;36m<module>[0;34m()[0m
[0;32m     72 [0;31m    [0;31m#Ask the question[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 73 [0;31m    [0mquestion[0m [0;34m=[0m [0mst[0m[0;34m.[0m[0mtext_input[0m[0;34m([0m[0;34m"Ask me something about the PDF that you just uploaded:"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     74 [0;31m    [0mprint[0m[0;34m([0m[0;34m"Q"[0m[0;34m,[0m [0mquestion[0m[0;34m,[0m [0;34m"question is None"[0m[0;34m,[0m [0;34m([0m[0mquestion[0m [0;32mis[0m [0;32mNone[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[?2004h[?25l[0m[?7l[0m[J[0;38;5;28mipdb> [6D[6C[?7