From a79fcc0e97037850eb144888739b020ba8a36a58 Mon Sep 17 00:00:00 2001
From: pavanmantha
Date: Sun, 10 Nov 2024 21:01:29 +0530
Subject: [PATCH 1/2] -added api and docker capability of semantic cache and
 semantic router

---
 .../templates/qdrant/semantic_cache/.env      |  7 ++-
 .../qdrant/semantic_cache/api_server.py       | 23 ++++++---
 .../templates/qdrant/semantic_cache/client.py | 17 +++++++
 .../templates/qdrant/semantic_cache/readme.md | 14 +++++-
 .../qdrant/semantic_cache/semantic_cache.py   | 10 ++--
 .../templates/qdrant/semantic_routing/.env    |  5 +-
 .../qdrant/semantic_routing/api_server.py     | 50 ++++++++++++++++---
 .../qdrant/semantic_routing/client.py         | 17 +++++++
 .../qdrant/semantic_routing/readme.md         | 12 ++++-
 9 files changed, 134 insertions(+), 21 deletions(-)
 create mode 100644 bootstraprag/templates/qdrant/semantic_cache/client.py
 create mode 100644 bootstraprag/templates/qdrant/semantic_routing/client.py

diff --git a/bootstraprag/templates/qdrant/semantic_cache/.env b/bootstraprag/templates/qdrant/semantic_cache/.env
index 8aafba2..13c8a64 100644
--- a/bootstraprag/templates/qdrant/semantic_cache/.env
+++ b/bootstraprag/templates/qdrant/semantic_cache/.env
@@ -2,4 +2,9 @@
 QDRANT_URL='http://localhost:6333'
 QDRANT_API_KEY='th3s3cr3tk3y'
 OLLAMA_MODEL='llama3.2:latest'
-OLLAMA_BASE_URL='http://localhost:11434'
\ No newline at end of file
+OLLAMA_BASE_URL='http://localhost:11434'
+
+model_name_or_path='all-MiniLM-L6-v2'
+
+LIT_SERVER_PORT=8000
+LIT_SERVER_WORKERS_PER_DEVICE=4
\ No newline at end of file

diff --git a/bootstraprag/templates/qdrant/semantic_cache/api_server.py b/bootstraprag/templates/qdrant/semantic_cache/api_server.py
index e69bad6..f8001d4 100644
--- a/bootstraprag/templates/qdrant/semantic_cache/api_server.py
+++ b/bootstraprag/templates/qdrant/semantic_cache/api_server.py
@@ -1,19 +1,30 @@
 from abc import ABC
+from semantic_cache import SemanticCache, compute_response
 import litserve as ls
+from dotenv import load_dotenv, find_dotenv
+import os
 
 
 class SemanticCacheAPI(ls.LitAPI, ABC):
     def __init__(self):
-        pass
+        load_dotenv(find_dotenv())
+        self.semantic_cache: SemanticCache = None
 
     def setup(self, device):
-        pass
+        self.semantic_cache = SemanticCache()
 
     def decode_request(self, request, **kwargs):
-        pass
+        return request['question']
 
-    def predict(self, x, **kwargs):
-        pass
+    def predict(self, query, **kwargs):
+        return self.semantic_cache.get_response(query=query, compute_response_func=compute_response)
 
     def encode_response(self, output, **kwargs):
-        pass
+        return {"response": output}
+
+
+if __name__ == '__main__':
+    api = SemanticCacheAPI()
+    server = ls.LitServer(lit_api=api, api_path='/api/v1/chat-completion',
+                          workers_per_device=int(os.environ.get('LIT_SERVER_WORKERS_PER_DEVICE', 4)))
+    server.run(port=int(os.environ.get('LIT_SERVER_PORT', 8000)))
\ No newline at end of file

diff --git a/bootstraprag/templates/qdrant/semantic_cache/client.py b/bootstraprag/templates/qdrant/semantic_cache/client.py
new file mode 100644
index 0000000..e396db2
--- /dev/null
+++ b/bootstraprag/templates/qdrant/semantic_cache/client.py
@@ -0,0 +1,17 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import requests
+
+response = requests.post("http://127.0.0.1:8000/api/v1/chat-completion", json={"question": "what is the capital of India?"})
+print(f"Status: {response.status_code}\nResponse:\n {response.text}")

diff --git a/bootstraprag/templates/qdrant/semantic_cache/readme.md b/bootstraprag/templates/qdrant/semantic_cache/readme.md
index 3e7afa1..194cef3 100644
--- a/bootstraprag/templates/qdrant/semantic_cache/readme.md
+++ b/bootstraprag/templates/qdrant/semantic_cache/readme.md
@@ -1,4 +1,16 @@
 ## Qdrant Semantic Cache
+Semantic Cache is a superfast caching layer that matches queries by contextual meaning, so an LLM can serve the same response to semantically similar questions without recomputing it.
+
+### How to run
 - `pip install -r requirements.txt`
-- `python semantic_cache.py`
\ No newline at end of file
+- `python semantic_cache.py`
+
+### Expose Semantic Cache as API
+- `python api_server.py`
+```text
+API: http://localhost:8000/api/v1/chat-completion
+Method: POST
+payload: {
+  "question": "what is the capital of India?"
+}
+```
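With `api_server.py` from this patch running locally, a quick smoke test of the new endpoint could look like the sketch below. The URL and payload mirror the readme above; the timing comparison is illustrative and assumes the paraphrased second query lands within the cache's similarity threshold.

```python
import time

import requests

URL = "http://127.0.0.1:8000/api/v1/chat-completion"

# Two semantically similar questions: the first should be computed,
# the second should be served from the Qdrant-backed cache.
for question in ("What is the capital of India?",
                 "Tell me the capital city of India"):
    start = time.perf_counter()
    r = requests.post(URL, json={"question": question})
    elapsed = time.perf_counter() - start
    print(f"{elapsed:.2f}s {r.status_code} {r.json()['response']}")
```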
diff --git a/bootstraprag/templates/qdrant/semantic_cache/semantic_cache.py b/bootstraprag/templates/qdrant/semantic_cache/semantic_cache.py
index 8da1a85..07f083d 100644
--- a/bootstraprag/templates/qdrant/semantic_cache/semantic_cache.py
+++ b/bootstraprag/templates/qdrant/semantic_cache/semantic_cache.py
@@ -13,7 +13,7 @@ def __init__(self, threshold=0.35):
         # load the data from env
         load_dotenv(find_dotenv())
-        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
+        self.encoder = SentenceTransformer(model_name_or_path=os.environ.get('model_name_or_path'))
         self.cache_client = QdrantClient(url=os.environ.get('QDRANT_URL'), api_key=os.environ.get('QDRANT_API_KEY'))
         self.cache_collection_name = "cache"
         self.threshold = threshold
@@ -78,7 +78,7 @@ def compute_response(query: str):
     return f"Computed response for: {query} is {assistant_message}"
 
 
-semantic_cache = SemanticCache(threshold=0.8)
-query = "What is the capital of France?"
-response = semantic_cache.get_response(query, compute_response)
-print(response)
+# semantic_cache = SemanticCache(threshold=0.8)
+# query = "What is the capital of France?"
+# response = semantic_cache.get_response(query, compute_response)
+# print(response)
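For intuition about the `threshold` parameter above, the cache-hit decision boils down to a cosine-similarity test like the minimal sketch below. The model name comes from the `.env` file and the 0.8 cutoff mirrors the commented-out demo; the actual lookup in `semantic_cache.py` runs through Qdrant rather than computing similarity in-process.

```python
from sentence_transformers import SentenceTransformer, util

encoder = SentenceTransformer("all-MiniLM-L6-v2")

# Embedding of a previously cached question vs. an incoming paraphrase.
cached = encoder.encode("What is the capital of France?")
incoming = encoder.encode("Tell me France's capital city")

score = util.cos_sim(cached, incoming).item()  # cosine similarity
print(f"similarity={score:.2f}, cache hit={score >= 0.8}")
```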
diff --git a/bootstraprag/templates/qdrant/semantic_routing/.env b/bootstraprag/templates/qdrant/semantic_routing/.env
index 4dcbfc2..7c876d0 100644
--- a/bootstraprag/templates/qdrant/semantic_routing/.env
+++ b/bootstraprag/templates/qdrant/semantic_routing/.env
@@ -1,3 +1,6 @@
 encoder_model='sentence-transformers/all-MiniLM-L6-v2'
 qdrant_api_key='th3s3cr3tk3y'
-qdrant_url='http://localhost:6333/'
\ No newline at end of file
+qdrant_url='http://localhost:6333/'
+
+LIT_SERVER_PORT=8000
+LIT_SERVER_WORKERS_PER_DEVICE=4
\ No newline at end of file

diff --git a/bootstraprag/templates/qdrant/semantic_routing/api_server.py b/bootstraprag/templates/qdrant/semantic_routing/api_server.py
index a3529f3..74a1776 100644
--- a/bootstraprag/templates/qdrant/semantic_routing/api_server.py
+++ b/bootstraprag/templates/qdrant/semantic_routing/api_server.py
@@ -1,19 +1,57 @@
 from abc import ABC
+
+from semantic_router import Route
+
+from semantic_routing_core import SemanticRouter
 import litserve as ls
+import os
 
 
 class SemanticRoutingAPI(ls.LitAPI, ABC):
     def __init__(self):
-        pass
+        self.semantic_routing_core = None
+        # Define routes
+        politics = Route(
+            name="politics",
+            utterances=[
+                "isn't politics the best thing ever",
+                "why don't you tell me about your political opinions",
+                "don't you just love the president",
+                "they're going to destroy this country!",
+                "they will save the country!",
+            ],
+        )
+
+        chitchat = Route(
+            name="chitchat",
+            utterances=[
+                "how's the weather today?",
+                "how are things going?",
+                "lovely weather today",
+                "the weather is horrendous",
+                "let's go to the chippy",
+            ],
+        )
+
+        self.routes = [politics, chitchat]
 
     def setup(self, device):
-        pass
+        self.semantic_routing_core = SemanticRouter()
+        # Set up routes
+        self.semantic_routing_core.setup_routes(self.routes)
 
     def decode_request(self, request, **kwargs):
-        pass
+        return request['question']
 
-    def predict(self, x, **kwargs):
-        pass
+    def predict(self, query, **kwargs):
+        return self.semantic_routing_core.route_query(query=query)
 
     def encode_response(self, output, **kwargs):
-        pass
+        return {'response': output}
+
+
+if __name__ == '__main__':
+    api = SemanticRoutingAPI()
+    server = ls.LitServer(lit_api=api, api_path='/api/v1/chat-completion',
+                          workers_per_device=int(os.environ.get('LIT_SERVER_WORKERS_PER_DEVICE', 4)))
+    server.run(port=int(os.environ.get('LIT_SERVER_PORT', 8000)))

diff --git a/bootstraprag/templates/qdrant/semantic_routing/client.py b/bootstraprag/templates/qdrant/semantic_routing/client.py
new file mode 100644
index 0000000..e396db2
--- /dev/null
+++ b/bootstraprag/templates/qdrant/semantic_routing/client.py
@@ -0,0 +1,17 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import requests
+
+response = requests.post("http://127.0.0.1:8000/api/v1/chat-completion", json={"question": "what is the weather today?"})
+print(f"Status: {response.status_code}\nResponse:\n {response.text}")
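With the routing server running, its behaviour can be probed with utterances near each route, as in the sketch below. What exactly comes back in `response` depends on `SemanticRouter.route_query` in `semantic_routing_core.py`, which this patch does not show.

```python
import requests

URL = "http://127.0.0.1:8000/api/v1/chat-completion"

for question in ("how's the weather today?",           # near 'chitchat'
                 "don't you just love the president",  # near 'politics'
                 "what is the capital of India?"):     # matches no route
    r = requests.post(URL, json={"question": question})
    print(f"{question!r} -> {r.json()['response']}")
```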
diff --git a/bootstraprag/templates/qdrant/semantic_routing/readme.md b/bootstraprag/templates/qdrant/semantic_routing/readme.md
index b19049a..dcadcda 100644
--- a/bootstraprag/templates/qdrant/semantic_routing/readme.md
+++ b/bootstraprag/templates/qdrant/semantic_routing/readme.md
@@ -3,4 +3,14 @@
 Semantic Router is a superfast decision-making layer for your LLMs and agents. Rather than waiting for slow LLM generations to make tool-use decisions, it uses semantic vector space to make those decisions, routing requests by their semantic meaning.
 ### How to execute code
 1. `pip install -r requirements.txt`
-2. `python main.py`
\ No newline at end of file
+2. `python main.py`
+
+### Expose Semantic Router as API
+- `python api_server.py`
+```text
+API: http://localhost:8000/api/v1/chat-completion
+Method: POST
+payload: {
+  "question": "what is the weather today?"
+}
+```
\ No newline at end of file

From 7b2033a4851c533fd84716b117c9e22beeb9ff05 Mon Sep 17 00:00:00 2001
From: pavanmantha
Date: Sun, 10 Nov 2024 21:06:48 +0530
Subject: [PATCH 2/2] -enhanced cli options and -incremented setup

---
 bootstraprag/cli.py | 18 +++++++++++++++---
 setup.py            |  2 +-
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/bootstraprag/cli.py b/bootstraprag/cli.py
index 57f9893..ba5afe6 100644
--- a/bootstraprag/cli.py
+++ b/bootstraprag/cli.py
@@ -50,11 +50,23 @@ def create(project_name, framework, template, observability):
         ]
     elif framework == 'standalone-qdrant':
         framework = 'qdrant'
-        template_choices = ['simple-search', 'multimodal-search', 'hybrid-search', 'hybrid-search-advanced',
-                            'retrieval-quality', 'semantic-cache']
+        template_choices = [
+            'simple-search',
+            'multimodal-search',
+            'hybrid-search',
+            'hybrid-search-advanced',
+            'retrieval-quality',
+            'semantic-cache',
+            'semantic-routing'
+        ]
     elif framework == 'standalone-evaluations':
         framework = 'evaluations'
-        template_choices = ['deep-evals', 'mlflow-evals', 'phoenix-evals', 'ragas-evals']
+        template_choices = [
+            'deep-evals',
+            'mlflow-evals',
+            'phoenix-evals',
+            'ragas-evals'
+        ]
     # Use InquirerPy to select template with arrow keys
     template = inquirer.select(
         message="Which template would you like to use?",

diff --git a/setup.py b/setup.py
index dc1e2ff..ee4c299 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 setup(
     name='bootstrap-rag',
-    version='0.0.13',
+    version='0.0.14',
     long_description=long_description,
     long_description_content_type="text/markdown",
     packages=find_packages(),
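For reference, the selection flow these reformatted lists feed into is the InquirerPy arrow-key prompt already used in `bootstraprag/cli.py`. In isolation it behaves like this minimal sketch, with choices mirroring the `standalone-qdrant` branch above:

```python
from InquirerPy import inquirer

template_choices = [
    'simple-search',
    'multimodal-search',
    'hybrid-search',
    'hybrid-search-advanced',
    'retrieval-quality',
    'semantic-cache',
    'semantic-routing'
]

# Arrow-key selection, as in the create() command of bootstraprag/cli.py.
template = inquirer.select(
    message="Which template would you like to use?",
    choices=template_choices,
).execute()
print(f"Selected template: {template}")
```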