In [1]:
!pip install requests faiss-cpu transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [2]:
from google.colab import files
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
import requests
from hashlib import sha256

In [3]:

# Document processing functions
def read_and_chunk_files(uploaded_files, chunk_size=512):
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    documents = []

    for filename in uploaded_files.keys():
        with open(filename, 'r') as file:
            text = file.read()
            tokens = tokenizer.encode(text, add_special_tokens=False)

            # Create overlapping chunks
            for i in range(0, len(tokens), chunk_size//2):
                chunk = tokens[i:i+chunk_size]
                documents.append(tokenizer.decode(chunk))

    return deduplicate_documents(documents)

In [4]:

def deduplicate_documents(documents):
    seen = set()
    unique = []
    for doc in documents:
        digest = sha256(doc.encode()).hexdigest()
        if digest not in seen:
            seen.add(digest)
            unique.append(doc)
    return unique

In [5]:
# Vector database setup
def create_faiss_index(documents):
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    vectors = []
    for doc in documents:
        inputs = tokenizer(doc, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        vec = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        vectors.append(vec)

    dimension = vectors[0].shape[0]
    index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(np.array(vectors))
    index.add(np.array(vectors))
    return index, model, tokenizer

In [6]:
# Query processing
def retrieve_context(question, index, model, tokenizer, documents, top_k=3):
    inputs = tokenizer(question, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    question_vector = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    faiss.normalize_L2(question_vector.reshape(1, -1))
    distances, indices = index.search(question_vector.reshape(1, -1), top_k)
    return "\n".join([documents[i] for i in indices[0]])

In [7]:
# LLM interaction
def generate_answer(question, context, ngrok_url, model_name):
    prompt = f"""Context: {context}

    Question: {question}

    Answer clearly and concisely using only the provided context. If unsure, say 'I don't know'."""

    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.3,
            "top_p": 0.9
        }
    }

    try:
        response = requests.post(f"{ngrok_url}/api/generate", json=payload, timeout=30)
        return response.json()["response"]
    except Exception as e:
        return f"Error: {str(e)}"

In [8]:
# Main execution flow
if __name__ == "__main__":
    # 1. Upload and process documents
    uploaded = files.upload()
    documents = read_and_chunk_files(uploaded)

    # 2. Create vector store
    index, model, tokenizer = create_faiss_index(documents)

    # 3. User interaction
    ngrok_url = "https://047b-2601-8c-4901-d2c0-1530-3a83-6136-7674.ngrok-free.app/"  # Replace with actual URL
    model_name = "deepseek-r1:1.5b"

    while True:
        question = input("\nEnter your question (or 'quit' to exit): ")
        if question.lower() == 'quit':
            break

        # Retrieve context
        context = retrieve_context(question, index, model, tokenizer, documents)

        # Generate answer
        answer = generate_answer(question, context, ngrok_url, model_name)
        print(f"\nAnswer: {answer}")

Saving sample.txt to sample.txt


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]


Enter your question (or 'quit' to exit): whats the content of this file?

Answer: <think>
Alright, let's break down how I approached answering the user's question about the content of the provided AWS health file.

First, I read through the entire context to understand what was mentioned. The context had three identical paragraphs, each discussing aws health and its new features related to Kubernetes, RDS certificates, and support for other open-source software. Each paragraph repeated similar information, which might indicate redundancy or a copy-paste error.

I noticed that all the content was about providing visibility into planned lifecycle events, introducing new features, and emphasizing that aws health is the authoritative source of information. There were no specific details about what the file contains beyond its purpose and the features it covers.

Since the user asked for the content of the file, I considered whether the context provided any specific information about the f