In [3]:
# Install every runtime dependency in a single resolver pass.
# `%pip` installs into the environment of the running kernel, and quoting
# "fastapi[all]" keeps the shell from treating the square brackets as a glob.
%pip install -q "fastapi[all]" chromadb sentence-transformers uvicorn pyngrok



In [4]:
import os
import uuid

import chromadb
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from pyngrok import ngrok
from sentence_transformers import SentenceTransformer

In [5]:
# Notebook-wide service objects shared by the cells below.

# Sentence-transformer model used to turn text into embeddings.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Chroma client from which the document collection is created.
client = chromadb.Client()

# FastAPI application on which the HTTP endpoints are registered.
app = FastAPI()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
class Document(BaseModel):
    """Request body accepted by the `ingest_document` endpoint.

    NOTE(review): a later cell runs `from docx import Document`, which rebinds
    the global name `Document` in the notebook namespace.
    """

    # Title of the document; stored as metadata alongside the content.
    title: str
    # Text of the document; this is the part that is embedded and stored.
    content: str

class Query(BaseModel):
    """Request body accepted by the `query_documents` endpoint that takes a `Query`."""

    # Free-text search string; it is embedded and used to query the collection.
    query_text: str

In [7]:
def get_embedding(text: str):
    """Embed `text` with the shared sentence-transformer model.

    Returns the output of `model.encode(text)` converted with `.tolist()`.
    """
    vector = model.encode(text)
    return vector.tolist()

In [8]:
# Fetch the "documents" collection, creating it on first use. Unlike
# `create_collection`, this does not fail when the cell is re-run against a
# client that already holds a collection with this name.
collection = client.get_or_create_collection(name="documents")

@app.post("/ingest_document/")
async def ingest_document(document: Document):
    """Embed a document and store it in the Chroma collection.

    Args:
        document: Request body carrying the document `title` and `content`.

    Returns:
        A dict with a confirmation `message` and the generated record `id`.
    """
    embedding = get_embedding(document.content)
    # `collection.add` needs an id for every record; generate one here so
    # API callers do not have to supply it.
    doc_id = str(uuid.uuid4())
    collection.add(
        ids=[doc_id],
        documents=[document.content],
        metadatas=[{"title": document.title}],
        embeddings=[embedding],
    )
    return {"message": "Document ingested successfully!", "id": doc_id}

In [7]:
@app.post("/query/")
async def query_documents(query: Query):
    """Search the Chroma collection with the embedded query text.

    Returns a dict whose `results` entry is the `documents` field of the
    collection's query response.
    """
    # Embed the query text and request up to 3 results for it.
    matches = collection.query(
        query_embeddings=[get_embedding(query.query_text)],
        n_results=3,
    )
    return {"results": matches["documents"]}

In [10]:
import os

from pyngrok import ngrok

# Read the ngrok auth token from the environment instead of embedding it in
# the notebook, where it is exposed to everyone the notebook is shared with.
# NOTE(review): any token that was ever committed in this cell should be
# treated as leaked and revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN")
if not ngrok_auth_token:
    raise RuntimeError(
        "Set the NGROK_AUTH_TOKEN environment variable before opening a tunnel."
    )
ngrok.set_auth_token(ngrok_auth_token)

# Open a public tunnel to the local server port.
public_url = ngrok.connect(8000)
print(f"Public URL: {public_url}")

Public URL: NgrokTunnel: "https://4f94-34-168-146-21.ngrok-free.app" -> "http://localhost:8000"


In [11]:
# Running Uvicorn server
# NOTE(review): `main:app` asks uvicorn to import a module named `main`; the
# recorded output of this cell reports 'Could not import module "main"', so a
# main.py has to exist in the working directory before this command can work.
!uvicorn main:app --host 0.0.0.0 --port 8000 --reload

[32mINFO[0m:     Will watch for changes in these directories: ['/content']
[32mINFO[0m:     Uvicorn running on [1mhttp://0.0.0.0:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started reloader process [[36m[1m24134[0m] using [36m[1mWatchFiles[0m
[31mERROR[0m:    Error loading ASGI app. Could not import module "main".
[32mINFO[0m:     Stopping reloader process [[36m[1m24134[0m]


In [12]:
# Same server command, launched in the background through nohup; per the
# recorded output, its log is appended to nohup.out.
!nohup uvicorn main:app --host 0.0.0.0 --port 8000 --reload &

nohup: appending output to 'nohup.out'


In [13]:
# main.py
from fastapi import FastAPI

# Minimal application exposing a single root route.
# NOTE(review): executing this cell rebinds `app` in the notebook namespace;
# nothing here writes a main.py file to disk.
app = FastAPI()


@app.get("/")
def read_root():
    """Return a fixed greeting payload."""
    greeting = {"message": "Hello, World!"}
    return greeting

In [14]:
!pip install pyngrok



In [15]:
from pyngrok import ngrok

# Expose the FastAPI server to the public
# NOTE(review): an earlier cell already calls `ngrok.connect(8000)`; this call
# connects to the same port again and rebinds `public_url` (the two recorded
# outputs show different public URLs).
public_url = ngrok.connect(8000)

# Print the public URL
print(f"FastAPI server is live at: {public_url}")

FastAPI server is live at: NgrokTunnel: "https://3c25-34-168-146-21.ngrok-free.app" -> "http://localhost:8000"


In [16]:
!pip install chromadb



In [17]:
!pip install sentence-transformers



In [19]:
# Recursively delete the ./chroma_db directory (no error if it is absent).
# NOTE(review): nothing visible in this notebook writes to ./chroma_db —
# confirm this cleanup is still needed.
!rm -rf ./chroma_db

In [21]:
# Sanity check: report the name of the `collection` object created earlier.
print(f"Chroma client initialized with collection: {collection.name}")

Chroma client initialized with collection: documents


In [22]:
!pip install python-docx PyPDF2

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, PyPDF2
Successfully installed PyPDF2-3.0.1 python-docx-1.1.2


Creating sample data

In [45]:
# Plain-text fixture: three sentences separated by newlines, no trailing newline.
sample_txt = "\n".join(
    [
        "This is a sample text document.",
        "It contains information related to a test case.",
        "I'm using it to check document ingestion functionality.",
    ]
)

# Persist the fixture so it can be read back from disk by later cells.
with open("sample_document.txt", "w") as f:
    f.write(sample_txt)

In [46]:
# Document
# Build a two-paragraph DOCX fixture and save it as sample_document.docx.
# NOTE(review): this import rebinds the global name `Document`, which an
# earlier cell defined as a pydantic request model; code executed after this
# cell that looks up the global `Document` gets the one imported from `docx`.
from docx import Document
doc = Document()
doc.add_paragraph("This is a sample DOCX document.")
doc.add_paragraph("It contains text for testing purposes.")
doc.save("sample_document.docx")

In [26]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=8759c8c5544bdb8a6b747c98210e134bd739974eb1bbad7f550e8147a7ff0afd
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [31]:
!pip install PyPDF2



In [47]:
# PDF
# Build a one-page PDF fixture holding a single centred line of text and
# write it to sample_document.pdf.
from fpdf import FPDF
pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)
pdf.cell(200, 10, txt="This is a sample PDF document.", ln=True, align="C")
# As the last expression of the cell, the return value of `output` is what the
# notebook displays (the recorded output is an empty string).
pdf.output("sample_document.pdf")

''

In [48]:
from PyPDF2 import PdfReader

def read_and_add_to_chroma(file_path, collection):
    """Extract the text of a .txt, .docx or .pdf file and add it to a collection.

    The file's base name is used as the record id and the path is stored as
    `source` metadata.

    Args:
        file_path: Path of the file to ingest (str or path-like). The
            extension, matched case-insensitively, selects the reader.
        collection: Object exposing a Chroma-style
            `add(documents=..., metadatas=..., ids=...)` method.

    Returns:
        None. Unsupported file types are reported with a message and skipped.
    """
    # Accept path-like objects as well as plain strings.
    file_path = os.fspath(file_path)
    lowered_path = file_path.lower()

    # Read the content of the document based on file type.
    if lowered_path.endswith(".txt"):
        with open(file_path, "r") as f:
            text = f.read()
    elif lowered_path.endswith(".docx"):
        # Imported locally under an alias: the notebook also defines a pydantic
        # model named `Document`, so the global name is ambiguous.
        from docx import Document as DocxDocument

        text = "\n".join(para.text for para in DocxDocument(file_path).paragraphs)
    elif lowered_path.endswith(".pdf"):
        with open(file_path, "rb") as f:
            # Defensive: treat a page whose extract_text() gives no value as
            # empty text instead of failing on `str + None`.
            text = "".join(page.extract_text() or "" for page in PdfReader(f).pages)
    else:
        print(f"Unsupported file type: {file_path}")
        return

    # The base name of the file doubles as the record id.
    doc_id = os.path.basename(file_path)

    collection.add(
        documents=[text],
        metadatas=[{"source": file_path}],
        ids=[doc_id],
    )

    print(f"Document '{file_path}' added with ID: {doc_id}")

In [49]:
# Ingesting the sample files, one call per fixture, in txt -> docx -> pdf order
for sample_path in (
    "sample_document.txt",
    "sample_document.docx",
    "sample_document.pdf",
):
    read_and_add_to_chroma(sample_path, collection)



Document 'sample_document.txt' added with ID: sample_document.txt
Document 'sample_document.docx' added with ID: sample_document.docx




Document 'sample_document.pdf' added with ID: sample_document.pdf


In [None]:
def query_chroma(query, collection, n_results=3):
    """Run a text query against a Chroma collection.

    Args:
        query: Query text; passed to the collection as a single-element
            `query_texts` list.
        collection: Object exposing a Chroma-style
            `query(query_texts=..., n_results=...)` method.
        n_results: Number of results to request. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        Whatever `collection.query` returns, unmodified.
    """
    return collection.query(query_texts=[query], n_results=n_results)

# Taking an example query
query = "What is the main content of the sample document?"
results = query_chroma(query, collection)
# Print each element of the `documents` field of the response on its own line.
for result in results['documents']:
    print(result)

In [35]:
# query endpoint for FastAPI server
from fastapi import FastAPI
from pydantic import BaseModel
# NOTE(review): this rebinds `app` to a fresh FastAPI instance and redefines
# `query_documents`; routes registered on an earlier `app` object are not
# attached to this one.
app = FastAPI()


class QueryRequest(BaseModel):
    """Pydantic model to accept query data: the text to search for."""

    query: str


@app.post("/query/")
async def query_documents(request: QueryRequest):
    """Run the request's query text through `query_chroma` and return the `documents` field."""
    matches = query_chroma(request.query, collection)
    return {"results": matches["documents"]}