<a href="https://colab.research.google.com/github/nhareesha/MLAI/blob/main/LLM/finetuned/RAG/RAG_mobile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install faiss-cpu


Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [7]:
import faiss
print(faiss.__version__)

1.10.0


In [8]:
import sqlite3
print(sqlite3.version)

2.6.0


In [9]:
import faiss
import sqlite3
import numpy as np
from sentence_transformers import SentenceTransformer

In [10]:
# Load embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
def create_vector_store():
    # Initialize FAISS index
    dimension = 384  # Embedding size for MiniLM
    index = faiss.IndexFlatL2(dimension)
    return index


In [12]:
def save_embeddings_to_db(index, documents):
    # Convert documents to embeddings
    embeddings = np.array(embedder.encode(documents), dtype=np.float32)
    index.add(embeddings)

    # Store in SQLite for persistence
    conn = sqlite3.connect("vector_store.db")
    c = conn.cursor()
    c.execute("CREATE TABLE IF NOT EXISTS docs (id INTEGER PRIMARY KEY, text TEXT)")
    for i, doc in enumerate(documents):
        c.execute("INSERT INTO docs (id, text) VALUES (?, ?)", (i, doc))
    conn.commit()
    conn.close()


In [13]:
def load_faiss_index():
    # Load FAISS index from SQLite stored embeddings
    conn = sqlite3.connect("vector_store.db")
    c = conn.cursor()
    c.execute("SELECT COUNT(*) FROM docs")
    num_docs = c.fetchone()[0]
    conn.close()

    if num_docs > 0:
        return create_vector_store()  # Recreate index
    return None


In [14]:
# Example documents
documents = ["Mobile AI is improving every day.", "RAG models are great for retrieval.", "FAISS helps with fast search."]
index = create_vector_store()
save_embeddings_to_db(index, documents)

3. Implement Retrieval & Ranking

In [15]:
def retrieve_and_rank(query, index, k=2):
    query_vec = np.array(embedder.encode([query]), dtype=np.float32)
    _, indices = index.search(query_vec, k)

    # Fetch retrieved documents
    conn = sqlite3.connect("vector_store.db")
    c = conn.cursor()
    retrieved_docs = []
    for idx in indices[0]:
        c.execute("SELECT text FROM docs WHERE id=?", (int(idx),))
        retrieved_docs.append(c.fetchone()[0])
    conn.close()
    return retrieved_docs

In [25]:
# Error handling for NoneType object - when index does not exist in SQLLite DB
def retrieve_and_rank(query, index, k=2):
    query_vec = np.array(embedder.encode([query]), dtype=np.float32)
    _, indices = index.search(query_vec, k)

    conn = sqlite3.connect("vector_store.db")
    c = conn.cursor()
    retrieved_docs = []

    for idx in indices[0]:
        c.execute("SELECT text FROM docs WHERE id=?", (int(idx),))
        result = c.fetchone()
        if result:  # Ensure result is not None
            retrieved_docs.append(result[0])
        else:
            print(f"Warning: Index {idx} not found in SQLite database.")

    conn.close()
    return retrieved_docs


4. Load a Compact LLM and Generate Responses

In [26]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2").half().to("cpu")

def generate_response(query):
    retrieved_docs = retrieve_and_rank(query, index)
    context = "\n".join(retrieved_docs)
    input_text = f"Context: {context}\nUser Query: {query}\nResponse:"
    inputs = tokenizer(input_text, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
query='What is fiass'
resp = generate_response(query)
print(resp)

##What if DB does not have relavent data
```
Why Does FAISS Return -1?

conn = sqlite3.connect("vector_store.db")
c = conn.cursor()
c.execute("SELECT COUNT(*) FROM docs")
print("Number of documents in DB:", c.fetchone()[0])
conn.close()

It prints 0, so SQLite database is empty and I need to repopulate it.
```


5. Deploy as a Mobile-Accessible API with Flask

To execute
- 1. Start the API Server - python script.py
This will start the server on http://0.0.0.0:5000

- 2. Send a Test Query
curl -X POST "http://localhost:5000/rag" -H "Content-Type: application/json" -d '{"query": "What is RAG?"}'

Or use Python:
`
```
import requests

response = requests.post("http://localhost:5000/rag", json={"query": "What is RAG?"})
print(response.json())
```






#Running the App on Google Colab
Since we are running the Flask server on Google Colab, we will need to expose it using a tool like **ngrok** to make it accessible from my local machine or mobile device.

1. Install and Run the Flask App in Colab
```
!pip install faiss-cpu transformers torch sentence-transformers flask sqlite3 flask-ngrok
```

2. Modify the Flask Code for ngrok
Update the ```app.run()``` line at the end of the Flask app:

```
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)

if __name__ == "__main__":
    index = load_faiss_index()
    if index is None:
        index = create_vector_store()
        save_embeddings_to_db(index, documents)
    app.run()
    ```
3. Start the Server
Run the entire script, and we should see an output like:

```
* Running on http://xxxxx.ngrok.io
```

4. Test from my Mobile Device
Use Postman or a browser to send a POST request:

```
curl -X POST "http://xxxxx.ngrok.io/rag" -H "Content-Type: application/json" -d '{"query": "What is RAG?"}'
```

This should return a generated response from my mobile-optimized RAG model.



In [18]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/rag", methods=["POST"])
def rag_api():
    data = request.json
    query = data.get("query", "")
    response = generate_response(query)
    return jsonify({"response": response})

if __name__ == "__main__":
    index = load_faiss_index()
    if index is None:
        index = create_vector_store()
        save_embeddings_to_db(index, documents)
    app.run(host="0.0.0.0", port=5000)
    # app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [20]:
!pip install 'flask_ngrok'

Collecting flask_ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask_ngrok
Successfully installed flask_ngrok-0.0.25


In [21]:
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)

if __name__ == "__main__":
    index = load_faiss_index()
    if index is None:
        index = create_vector_store()
        save_embeddings_to_db(index, documents)
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
Exception in thread Thread-11:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 493, in _make_reque

In [22]:
import requests

try:
    response = requests.get('http://localhost:4040/api/tunnels')
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    print(response.json())
except requests.exceptions.ConnectionError as e:
    print(f"Connection error: {e}")
except requests.exceptions.RequestException as e:
    print(f"Request exception: {e}")


Connection error: HTTPConnectionPool(host='localhost', port=4040): Max retries exceeded with url: /api/tunnels (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7cd545d854d0>: Failed to establish a new connection: [Errno 111] Connection refused'))
