# 📘 RAG-Based QA from Predefined Google Drive PDFs (FAISS + Groq + Streamlit)


Cell 1: Install Dependencies

In [16]:
!pip install requests PyPDF2 streamlit groq langchain langchain-community faiss-cpu sentence-transformers


Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 k

Cell 2: RAG App Code

In [19]:
%%writefile app.py

# Full Application Code
import os
import re
from io import BytesIO

import requests
import streamlit as st
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from PyPDF2 import PdfReader

# ✅ Set your Groq API Key here
# The key is read from the environment rather than written into the source, so
# it never ends up in the notebook, in app.py, or in version control. Export
# GROQ_API_KEY in the environment of the process that launches Streamlit.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "").strip()
if not GROQ_API_KEY:
    # Fail with a readable message in the UI instead of an exception from the
    # Groq client constructor.
    st.error("GROQ_API_KEY is not set. Export it in the environment before starting the app.")
    st.stop()

# Initialize Groq client
client = Groq(api_key=GROQ_API_KEY)

# 📚 Predefined Google Drive PDF Links
# Share links of the PDFs to index; the app iterates over this list, so add
# one entry per document.
drive_links = [
    "https://drive.google.com/file/d/1JPf0XvDhn8QoDOlZDrxCOpu4WzKFESNz/view?usp=sharing",
]

# 📥 Download PDF from Google Drive
def _extract_drive_file_id(drive_link):
    """Return the file id embedded in a Google Drive link.

    Recognises the ``.../d/<id>...`` path form and the ``...?id=<id>`` query
    form. Raises ValueError when neither is present.
    """
    match = re.search(r"(?:/d/|[?&]id=)([A-Za-z0-9_-]+)", drive_link)
    if match is None:
        raise ValueError(f"no Google Drive file id found in {drive_link!r}")
    return match.group(1)


def download_pdf_from_drive(drive_link, timeout=30):
    """Download a PDF from Google Drive into an in-memory stream.

    Args:
        drive_link: Google Drive link that contains the file id.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A BytesIO holding the downloaded bytes.

    Raises:
        Exception: with the message ``Failed to download PDF: ...`` for any
            failure (unparseable link, HTTP error, timeout, non-PDF response).
            The underlying error is chained as ``__cause__``.
    """
    try:
        file_id = _extract_drive_file_id(drive_link)
        url = f"https://drive.google.com/uc?id={file_id}&export=download"
        # Without a timeout a stalled connection would hang the app forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        content = response.content
        # A 200 response is not necessarily the file: Drive may answer with an
        # HTML page (e.g. sign-in or confirmation) instead. Check for the PDF
        # header so the failure is reported here, not deep inside the parser.
        if b"%PDF" not in content[:1024]:
            raise ValueError("response is not a PDF (is the file shared publicly?)")
        return BytesIO(content)
    except Exception as e:
        raise Exception(f"Failed to download PDF: {e}") from e

# 📖 Extract text from PDF
def extract_text_from_pdf(pdf_stream):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_stream: Object accepted by ``PdfReader`` (the app passes a BytesIO).

    Returns:
        The page texts joined with newlines. Pages that yield no text
        contribute an empty segment, so the result may be empty.
    """
    reader = PdfReader(pdf_stream)
    # `or ""` guards against a falsy extraction result (e.g. a page with no
    # text layer), which would otherwise break string concatenation.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    # Joining with a newline keeps the last word of one page from fusing with
    # the first word of the next, and avoids repeated string concatenation.
    return "\n".join(page_texts)

# ✂️ Chunk text
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split ``text`` into pieces with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
    return pieces

# 🔗 Create embeddings and store in FAISS
def create_embeddings_and_store(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed text chunks and index them in a FAISS vector store.

    Args:
        chunks: Iterable of text chunks to embed.
        model_name: Embedding model passed to ``HuggingFaceEmbeddings``.

    Returns:
        The vector store returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Materialise once so tuples and generators are accepted, and so emptiness
    # can be checked before handing the texts to FAISS.
    chunks = list(chunks)
    if not chunks:
        raise ValueError("No text chunks to embed.")
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vector_db = FAISS.from_texts(chunks, embedding=embeddings)
    return vector_db

# 💬 Query the FAISS DB with Groq

def query_vector_db(query, vector_db, k=3, model="llama3-8b-8192"):
    """Answer ``query`` using the most similar chunks as context.

    Args:
        query: The user's question.
        vector_db: Store exposing ``similarity_search(query, k=...)`` whose
            results carry a ``page_content`` attribute.
        k: Number of chunks to retrieve as context.
        model: Chat model id sent to the Groq client.

    Returns:
        The message content of the first completion choice.
    """
    # NOTE(review): hosted model ids get retired over time; confirm the default
    # is still served before relying on it, or pass `model=` explicitly.
    docs = vector_db.similarity_search(query, k=k)
    context = "\n".join(doc.page_content for doc in docs)
    completion = client.chat.completions.create(
        messages=[
            # Retrieved chunks go in the system message; the question stays
            # verbatim in the user message.
            {"role": "system", "content": f"Use the following context:\n{context}"},
            {"role": "user", "content": query},
        ],
        model=model,
    )
    return completion.choices[0].message.content

# 🚀 Streamlit App
# Streamlit re-executes this whole script on every interaction (including each
# submitted question). The two cached helpers below keep that from
# re-downloading the PDFs and rebuilding the embeddings on every rerun.

@st.cache_data(show_spinner=False)
def load_chunks(link):
    """Download one PDF, extract its text and split it into chunks.

    Results are cached per link. A failed download raises, and exceptions are
    not cached, so a failing link is retried on the next rerun.
    """
    pdf_stream = download_pdf_from_drive(link)
    text = extract_text_from_pdf(pdf_stream)
    return chunk_text(text)


@st.cache_resource(show_spinner=False)
def build_vector_db(chunks):
    """Build the FAISS store once per distinct tuple of chunks and reuse it."""
    return create_embeddings_and_store(list(chunks))


st.title("📄 RAG App: Google Drive PDF QA")
all_chunks = []

for link in drive_links:
    try:
        st.write(f"🔗 Processing: {link}")
        chunks = load_chunks(link)
        all_chunks.extend(chunks)
        st.success(f"✅ Extracted {len(chunks)} chunks.")
    except Exception as e:
        # One bad link must not prevent the remaining PDFs from being indexed.
        st.error(str(e))

if all_chunks:
    # A tuple is passed so the cache key is built from the chunk contents.
    vector_db = build_vector_db(tuple(all_chunks))
    st.success("🧠 Embeddings stored in FAISS.")

    user_query = st.text_input("🔍 Enter your question:")
    if user_query:
        try:
            answer = query_vector_db(user_query, vector_db)
        except Exception as e:
            # Show API failures (bad key, retired model, network) as a message
            # rather than a raw traceback.
            st.error(f"Failed to get an answer: {e}")
        else:
            st.markdown("### 💬 Answer")
            st.write(answer)
else:
    st.warning("No text could be extracted from the configured PDFs, so there is nothing to search.")


Overwriting app.py


Cell 3: Run Streamlit App in Colab

In [None]:
 # Print this machine's public IPv4 address.
 # NOTE(review): presumably needed as the access password on the localtunnel page - confirm.
 !wget -q -O - ipv4.icanhazip.com

# Start the Streamlit app in the background (`&`) and expose port 8501 through localtunnel.
!streamlit run app.py & npx localtunnel --port 8501


34.82.92.215

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.82.92.215:8501[0m
[0m
[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0Kyour url is: https://breezy-otter-40.loca.lt
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
2025-07-17 06:42:27.453045: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752734547.484471   14310 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
