In [1]:
%%bash
# app.py imports `google.generativeai`, which is shipped by the `google-generativeai`
# package. `google-genai` (module `google.genai`) is a different SDK; it is kept here
# so anything already relying on it keeps working.
pip install -U google-genai google-generativeai
pip install pypdf cohere faiss-cpu
pip install streamlit



In [2]:
import os
from google.colab import userdata

# Copy each Colab secret into the process environment under the same name;
# app.py later reads these with os.environ.get.
for secret_name in ('COHERE_API_KEY', 'GOOGLE_API_KEY'):
    os.environ[secret_name] = userdata.get(secret_name)

print("API keys set as environment variables.")

API keys set as environment variables.


In [3]:
%%writefile app.py
import html
import os
import tempfile

import cohere
import faiss
import google.generativeai as genai
import numpy as np
import streamlit as st
import streamlit.components.v1 as components
from pypdf import PdfReader

# API credentials come from the process environment; either may be None when unset.
COHERE_API_KEY = os.getenv('COHERE_API_KEY')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

# Configure the Gemini SDK only when a key is present; otherwise report it in the UI.
if not GOOGLE_API_KEY:
    st.error("Google API key not found. Please set it as an environment variable (GOOGLE_API_KEY).")
else:
    genai.configure(api_key=GOOGLE_API_KEY)

def load_pdf(file_path):
    """Return the text of every page of a PDF, with a newline after each page.

    Args:
        file_path: Passed straight to ``pypdf.PdfReader``.

    Returns:
        str: Concatenated page texts. Pages whose extracted text is empty or
        ``None`` contribute nothing.
    """
    extracted_pages = (page.extract_text() for page in PdfReader(file_path).pages)
    return "".join(f"{page_text}\n" for page_text in extracted_pages if page_text)

def get_embeddings(texts, api_key, model="embed-english-v3.0", input_type="search_document"):
    """
    Embed a batch of texts with Cohere.

    Args:
        texts (List[str]): Document chunks or queries to embed.
        api_key (str): Cohere API key; when falsy an error is shown in the
            Streamlit UI and no request is made.
        model (str): Name of the Cohere embedding model.
        input_type (str): Either "search_document" or "search_query".

    Returns:
        List[List[float]]: One vector per input text, or an empty list when
        no API key was supplied.
    """
    if api_key:
        client = cohere.Client(api_key)
        embed_response = client.embed(texts=texts, model=model, input_type=input_type)
        return embed_response.embeddings

    st.error("Cohere API key not found. Please set it as an environment variable (COHERE_API_KEY).")
    return []

def create_faiss_index(embeddings):
    """
    Create a FAISS inner-product index over L2-normalised embeddings.

    The vectors are copied, normalised to unit length and added to an
    ``IndexFlatIP``, so the inner-product scores returned by a search are
    cosine similarities when the query is unit length as well.

    Args:
        embeddings (List[List[float]]): Non-empty list of equal-length dense vectors.

    Returns:
        faiss.IndexFlatIP: FAISS index containing the normalised embeddings.

    Raises:
        ValueError: If ``embeddings`` is empty or is not a 2-D collection of vectors.
    """
    # np.array always copies, so the in-place normalisation below never
    # mutates an array owned by the caller.
    vectors = np.array(embeddings, dtype="float32")
    if vectors.ndim != 2 or vectors.shape[0] == 0 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of vectors; "
            f"got array with shape {vectors.shape}"
        )

    faiss.normalize_L2(vectors)
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    return index

def save_faiss_index(index, path):
    """
    Save a FAISS index to the specified file path.

    Args:
        index: The FAISS index to write.
        path: Destination file path, passed to ``faiss.write_index``.
    """
    faiss.write_index(index, path)

def load_faiss_index(path):
    """
    Read a previously saved FAISS index from disk.

    Args:
        path: Location of the serialized index.

    Returns:
        The index returned by ``faiss.read_index``.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    if os.path.exists(path):
        return faiss.read_index(path)
    raise FileNotFoundError(f"No FAISS index found at {path}")

def retrieve_top_k(index, query_embedding, k=5):
    """
    Retrieve the top-k most similar chunks using FAISS.

    Args:
        index (faiss.Index): The FAISS index to search.
        query_embedding (List[float]): The embedding of the query.
        k (int): Maximum number of similar chunks to retrieve.

    Returns:
        numpy.ndarray: Indices of the closest chunks in the original chunk
        list, best match first. At most ``min(k, index.ntotal)`` entries are
        returned, and the result is empty when the index holds no vectors or
        ``k`` is not positive.
    """
    # FAISS pads its result with -1 labels when asked for more neighbours than
    # it stores. A -1 used as a Python list index would silently select the
    # last chunk, so never request more than the index contains.
    k = min(int(k), int(index.ntotal))
    if k <= 0:
        return np.empty(0, dtype="int64")

    query_vector = np.array([query_embedding]).astype("float32")
    _, indices = index.search(query_vector, k)

    # Defensive: drop any remaining "no result" placeholders.
    hits = indices[0]
    return hits[hits >= 0]

def generate_answer(prompt, model="gemini-2.5-flash", temperature=0.3):
    """
    Generate a response from a Gemini model given a prompt.

    Args:
        prompt (str): The context + question input.
        model (str): Model name to use (default is "gemini-2.5-flash").
        temperature (float): Sampling temperature for creativity.

    Returns:
        str: The model's response text. Returns an empty string when no Google
        API key is configured, or a string of the form ``"Error: <details>"``
        when the API call fails; in both cases the problem is also reported in
        the Streamlit UI.
    """
    if not GOOGLE_API_KEY:
        st.error("Google API key not configured. Please ensure GOOGLE_API_KEY is set in environment variables.")
        return ""

    generation_config = {
        "temperature": temperature,
        "max_output_tokens": 2048,
    }

    try:
        response = genai.GenerativeModel(model).generate_content(
            contents=prompt,
            generation_config=generation_config,
        )
        # Accessing .text is kept inside the try block so that a failure to
        # extract text from the response is reported like any other error.
        return response.text
    except Exception as e:
        # A single handler is used on purpose: the previous
        # `except genai.types.core.ClientError` clause named an attribute that
        # google.generativeai does not provide, so evaluating it raised
        # AttributeError while handling the real error and hid it.
        st.error(f"An unexpected error occurred while calling Google Gemini API: {e}")
        return f"Error: {e}"

def get_rag_context(query, pdf_path, k=5):
    """
    Retrieves relevant context from a PDF document based on a query.

    The PDF text is split on ". " into chunks, the chunks and the query are
    embedded with Cohere, and the chunks most similar to the query are joined
    into a single string.

    Args:
        query (str): The user's query.
        pdf_path (str): The path to the PDF document.
        k (int): The maximum number of similar chunks to retrieve.

    Returns:
        str: A space-joined string of the most relevant chunks, or an empty
        string when the PDF yields no text or embeddings are unavailable
        (for example when the Cohere API key is missing).
    """
    raw_text = load_pdf(pdf_path)

    # Skip whitespace-only fragments: they carry no information and would
    # waste embedding calls and retrieval slots.
    chunks = [chunk for chunk in raw_text.split(". ") if chunk.strip()]
    if not chunks:
        return ""

    chunk_embeddings = get_embeddings(
        chunks, COHERE_API_KEY, input_type="search_document")
    # get_embeddings returns [] (after reporting the problem) when no key is set.
    if len(chunk_embeddings) == 0:
        return ""

    index = create_faiss_index(chunk_embeddings)

    query_embeddings = get_embeddings(
        [query], COHERE_API_KEY, input_type="search_query")
    if len(query_embeddings) == 0:
        return ""

    # Never ask for more neighbours than there are chunks, and ignore any
    # out-of-range label (FAISS uses -1 for "no result"), which would
    # otherwise index the list from the end.
    top_indices = retrieve_top_k(index, query_embeddings[0], k=min(k, len(chunks)))
    return " ".join(chunks[i] for i in top_indices if 0 <= i < len(chunks))

def run_rag(query, pdf_path, k=5):
    """Answer ``query`` from the PDF at ``pdf_path``.

    Retrieves up to ``k`` relevant chunks via ``get_rag_context``, embeds them
    in a context-only prompt and returns whatever ``generate_answer`` produces.
    """
    context = get_rag_context(query, pdf_path, k=k)
    return generate_answer(
        f"""Answer the following question based ONLY on the provided context. If the information is not available in the context, state that you cannot find the answer in the provided document.\n\nContext: {context}\n\nQuestion: {query}"""
    )

def generate_flowchart(pdf_path):
    """
    Generates Mermaid JS flowchart code representing the entire PDF content.

    Args:
        pdf_path (str): The path to the PDF document.

    Returns:
        str: The generated Mermaid JS flowchart code. On failure, a
        human-readable message that contains "Cannot generate a flowchart"
        (missing API key, no extractable text, model refusal, empty response,
        or an API error reported by ``generate_answer``).
    """
    if not GOOGLE_API_KEY:
        st.error("Google API key not found. Please set it as an environment variable.")
        # The wording includes "Cannot generate a flowchart" because the UI
        # uses that phrase to tell failures apart from Mermaid code.
        return "Cannot generate a flowchart without Google API key."

    context = load_pdf(pdf_path)
    if not context.strip():
        # No text was extracted, so there is nothing to summarise; skip the API call.
        return "Cannot generate a flowchart from the provided context."

    full_prompt = f"""Generate a high-level Mermaid JS flowchart (using 'graph TD' format) summarizing the core process or methodology described in the document. Ensure the flowchart is concise and captures the main stages. Output ONLY the raw Mermaid JS code. Do NOT include any additional text or explanations. If the context does not describe a clear process suitable for a flowchart, output 'Cannot generate a flowchart from the provided context.'. When creating labels for nodes or edges, if a label contains spaces, special characters, or anything other than simple words, enclose the label in double quotes. For example, use 'A[\"Start Process\"]' instead of 'A[Start Process]'. Also, avoid using any unicode characters in the labels.\n\nContext: {context}\n\n"""

    flowchart_code = generate_answer(full_prompt).strip()

    # generate_answer signals failure with "" or "Error: ..."; neither is
    # Mermaid code, so turn it into a failure message instead of returning it.
    if not flowchart_code:
        return "Cannot generate a flowchart: the model returned an empty response."
    if flowchart_code.startswith("Error:"):
        return f"Cannot generate a flowchart: {flowchart_code}"

    # Strip a surrounding Markdown code fence if the model added one.
    if flowchart_code.startswith('```mermaid') and flowchart_code.endswith('```'):
        flowchart_code = flowchart_code[len('```mermaid'):-len('```')].strip()
    elif flowchart_code.startswith('```') and flowchart_code.endswith('```'):
        flowchart_code = flowchart_code[len('```'):-len('```')].strip()

    if "Cannot generate a flowchart from the provided context." in flowchart_code:
        return "Cannot generate a flowchart from the provided context."

    return flowchart_code


# ---- Streamlit page: banner, upload widget, Q&A section, flowchart section ----
components.html(
    '<marquee behavior="scroll" direction="left" scrollamount="5"><b style="color: white; font-size: 20px;">Developed by: Harshit K, Rounak, Harshit D, Priyanshu, Prabhanshu and Akshat</b></marquee>',
    height=30
)
st.title("PDF Q&A and Flowchart Generator")
st.write("Upload a PDF, ask questions, or generate a flowchart.")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:

    # The helpers take a file path, so the upload is written to a temporary
    # file. Streamlit re-runs this script on every interaction, so the file is
    # removed in the `finally` below to avoid leaving one behind per rerun.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        pdf_path = tmp_file.name

    try:
        st.success(f"PDF uploaded: {uploaded_file.name}")

        st.header("Question Answering (RAG)")
        query = st.text_input("Ask a question about the PDF:")
        if st.button("Get Answer"):
            if query:
                with st.spinner("Generating answer..."):
                    answer = run_rag(query, pdf_path)
                    st.write("**Answer:**", answer)
            else:
                st.warning("Please enter a question.")

        st.header("Flowchart Generator")
        if st.button("Generate Flowchart"):
            with st.spinner("Generating flowchart..."):
                flowchart_code = generate_flowchart(pdf_path)

                # Failure strings come back from generate_flowchart /
                # generate_answer; none of them is Mermaid code.
                generation_failed = (
                    not flowchart_code
                    or "Cannot generate a flowchart" in flowchart_code
                    or flowchart_code.startswith(("Cannot generate flowchart", "Error:"))
                )

                if not generation_failed:
                    st.success("Flowchart generated!")
                    st.subheader("Raw Mermaid Code :")
                    st.code(flowchart_code, language='mermaid')
                    # The code is model output derived from an uploaded document;
                    # escape it so it is embedded as text rather than parsed as
                    # HTML inside the component.
                    safe_flowchart_code = html.escape(flowchart_code, quote=False)
                    mermaid_html = f"""
                    <pre class="mermaid">
                    {safe_flowchart_code}
                    </pre>
                    <script type="module">
                    import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.esm.min.mjs';
                    mermaid.initialize({{ startOnLoad: true }});
                    </script>
                    """
                    components.html(mermaid_html, height=1500)
                else:
                    st.error(flowchart_code or "The model returned an empty response.")
    finally:
        # Best-effort cleanup of this run's temporary PDF.
        try:
            os.remove(pdf_path)
        except OSError:
            pass

else:
    st.info("Please upload a PDF file to get started.")

Overwriting app.py


In [4]:
import os
from google.colab import userdata

# Re-export the Colab secrets into the process environment under the same names.
for secret_name in ('COHERE_API_KEY', 'GOOGLE_API_KEY'):
    os.environ[secret_name] = userdata.get(secret_name)

print("API keys re-set as environment variables. You can now run the Streamlit app.")

API keys re-set as environment variables. You can now run the Streamlit app.


In [5]:
import os

# Kill whatever is currently listening on port 8501 (the port the Streamlit app
# is started on below); print a note when nothing is listening.
!kill $(lsof -t -i:8501) 2>/dev/null || echo "No process on port 8501 to kill."

# Download the latest cloudflared linux-amd64 release binary to
# /usr/local/bin/cloudflared and mark it executable.
print("Installing cloudflared...")
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O /usr/local/bin/cloudflared
!chmod +x /usr/local/bin/cloudflared
# NOTE(review): this message is printed unconditionally; the exit status of
# wget/chmod is not checked, so it does not prove the download succeeded.
print("cloudflared installed.")

No process on port 8501 to kill.
Installing cloudflared...
cloudflared installed.


In [6]:
import os

# Start the Streamlit app in the background on port 8501, discarding its output.
# NOTE(review): CORS and XSRF protection are switched off here, presumably so the
# app works behind the tunnel; confirm this is acceptable for the deployment.
!streamlit run app.py --server.port 8501 --server.enableCORS false --server.enableXsrfProtection false > /dev/null 2>&1 &

# Open a Cloudflare tunnel to the local app in the background; the tunnel log
# is also written to cloudflared.log.
!/usr/local/bin/cloudflared tunnel --url http://localhost:8501 --logfile cloudflared.log &

[90m2026-01-20T15:17:01Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2026-01-20T15:17:01Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2026-01-20T15:17:05Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2026-01-20T15:17:05Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2026