From 05f1d3eb700b415f2e8f03ee80bac7832e63b99d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?=
Date: Thu, 6 Mar 2025 10:03:20 +0100
Subject: [PATCH] Add tutorial for deploying DeepSeek-R1-Distill-Llama3.1-8B

---
 .../deepseek_vllmconfig.yaml                  |   5 +
 ...eploy_deepseek_r1_distill_llama31-8b.ipynb | 434 ++++++++++++++++++
 2 files changed, 439 insertions(+)
 create mode 100644 integrations/vllm/deepseek-r1-distill-llama31-8b/deepseek_vllmconfig.yaml
 create mode 100644 integrations/vllm/deepseek-r1-distill-llama31-8b/deploy_deepseek_r1_distill_llama31-8b.ipynb

diff --git a/integrations/vllm/deepseek-r1-distill-llama31-8b/deepseek_vllmconfig.yaml b/integrations/vllm/deepseek-r1-distill-llama31-8b/deepseek_vllmconfig.yaml
new file mode 100644
index 00000000..adb96d9b
--- /dev/null
+++ b/integrations/vllm/deepseek-r1-distill-llama31-8b/deepseek_vllmconfig.yaml
@@ -0,0 +1,5 @@
+dtype: "half"
+max_model_len: 2048
+gpu_memory_utilization: 0.96
+enable_reasoning: true
+reasoning_parser: "deepseek_r1"
\ No newline at end of file
diff --git a/integrations/vllm/deepseek-r1-distill-llama31-8b/deploy_deepseek_r1_distill_llama31-8b.ipynb b/integrations/vllm/deepseek-r1-distill-llama31-8b/deploy_deepseek_r1_distill_llama31-8b.ipynb
new file mode 100644
index 00000000..507d32b9
--- /dev/null
+++ b/integrations/vllm/deepseek-r1-distill-llama31-8b/deploy_deepseek_r1_distill_llama31-8b.ipynb
@@ -0,0 +1,434 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ed30e8ad",
+   "metadata": {},
+   "source": [
+    "# A Guide for DeepSeek-R1 distilled Llama3.1-8B on Hopsworks\n",
+    "\n",
+    "For details about this Large Language Model (LLM), visit the model page in the HuggingFace repository ➡️ [link](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a01e6751",
+   "metadata": {},
+   "source": [
+    "## 1️⃣ Download DeepSeek-R1 distilled Llama3.1-8B using the huggingface_hub library\n",
+    "\n",
+    "First, we download the DeepSeek-R1 distilled Llama3.1-8B model files (e.g., weights, configuration files) directly from the HuggingFace repository.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a47db2b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install huggingface_hub --quiet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47f3a195",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Place your HuggingFace token in the HF_TOKEN environment variable\n",
+    "\n",
+    "import os\n",
+    "os.environ[\"HF_TOKEN\"] = \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2a2e8cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import snapshot_download\n",
+    "\n",
+    "deepseekr1_local_dir = snapshot_download(\"deepseek-ai/DeepSeek-R1-Distill-Llama-8B\", ignore_patterns=\"original/*\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca6a0afd",
+   "metadata": {},
+   "source": [
+    "## 2️⃣ Register DeepSeek-R1 distilled Llama3.1-8B into the Hopsworks Model Registry\n",
+    "\n",
+    "Once the model files are downloaded from the HuggingFace repository, we can register the model files in the Hopsworks Model Registry."
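+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b7d1a2c3",
+   "metadata": {},
+   "source": [
+    "Before registering the model, it can help to sanity-check the download. The next cell is a minimal, optional sketch that simply lists the files fetched by `snapshot_download`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8e2b3d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Optional sanity check: list the downloaded model files (weights, tokenizer, configs)\n",
+    "for file_name in sorted(os.listdir(deepseekr1_local_dir)):\n",
+    "    print(file_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9f3c4e5",
+   "metadata": {},
+   "source": [
+    "First, connect to Hopsworks and get a reference to the Model Registry."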
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddf42277", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "mr = project.get_model_registry()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec858cc7", + "metadata": {}, + "outputs": [], + "source": [ + "# The following instantiates a Hopsworks LLM model, not yet saved in the Model Registry\n", + "\n", + "deepseekr1 = mr.llm.create_model(\n", + " name=\"deepseekr1_instruct\",\n", + " description=\"DeepSeek-R1 distilled Llama3.1-8B model (via HF)\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83b4f4e2", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Register the distilled model pointing to the local model files\n", + "\n", + "deepseekr1.save(deepseekr1_local_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "0ccd72b4", + "metadata": {}, + "source": [ + "## 3️⃣ Deploy DeepSeek-R1 distilled Llama3.1-8B\n", + "\n", + "After registering the LLM model into the Model Registry, we can create a deployment that serves it using the vLLM engine." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be612b55", + "metadata": {}, + "outputs": [], + "source": [ + "# Get a reference to the distilled model if not obtained yet\n", + "\n", + "deepseekr1 = mr.get_model(\"deepseekr1_instruct\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2356dbb5", + "metadata": {}, + "outputs": [], + "source": [ + "# Upload vllm engine config file for the deployments\n", + "\n", + "ds_api = project.get_dataset_api()\n", + "\n", + "path_to_config_file = f\"/Projects/{project.name}/\" + ds_api.upload(\"deepseek_vllmconfig.yaml\", \"Resources\", overwrite=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9826f0d9", + "metadata": {}, + "source": [ + "### 🟨 Using vLLM OpenAI server\n", + "\n", + "Create a model deployment by providing a configuration file with the arguments for the vLLM engine." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "077b328c", + "metadata": {}, + "outputs": [], + "source": [ + "deepseekr1_depl = deepseekr1.deploy(\n", + " name=\"deepseekr1\",\n", + " description=\"Deepseek-R1 distilled Llama3.1-8B from HuggingFace\",\n", + " config_file=path_to_config_file,\n", + " resources={\"num_instances\": 1, \"requests\": {\"cores\": 2, \"memory\": 1024*16, \"gpus\": 1}},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "366284d7", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47e32e67", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve one of the deployments created above\n", + "\n", + "ms = project.get_model_serving()\n", + "deepseekr1_depl = ms.get_deployment(\"deepseekr1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3cc9df8", + "metadata": {}, + "outputs": [], + "source": [ + "deepseekr1_depl.start(await_running=60*15) # wait for 15 minutes maximum" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc616b17", + "metadata": {}, + "outputs": [], + "source": [ + "# deepseekr1.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3725f580", + "metadata": {}, + "outputs": [], + "source": [ + "deepseekr1_depl.get_state()" + ] + }, + { + "cell_type": "markdown", + "id": "49f478b0", + "metadata": {}, + "source": [ + "## 4️⃣ Prompting DeepSeek-R1 distilled Llama3.1 8B-Instruct\n", + "\n", + "Once the deployment is up and running, we can start sending user prompts to the LLM. You can either use an OpenAI API-compatible client (e.g., openai library) or any other http client." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d50339a4", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Get the istio endpoint from the deployment page in the Hopsworks UI.\n", + "istio_endpoint = \"\" # with format \"http://\"\n", + " \n", + "# Resolve base uri. 
+  {
+   "cell_type": "markdown",
+   "id": "cedb83ac",
+   "metadata": {},
+   "source": [
+    "### 🟨 Using httpx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7320b408",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import httpx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f4f6ff2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#\n",
+    "# Chat Completion for a user message\n",
+    "#\n",
+    "\n",
+    "# Round 1\n",
+    "user_message = \"9.11 and 9.8, which is greater?\"\n",
+    "completion_request = {\n",
+    "    \"model\": deepseekr1_depl.name,\n",
+    "    \"messages\": [\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": user_message\n",
+    "        }\n",
+    "    ]\n",
+    "}\n",
+    "\n",
+    "response = httpx.post(chat_completions_url, headers=headers, json=completion_request, timeout=45.0)\n",
+    "print(response)\n",
+    "content = response.json()[\"choices\"][0][\"message\"][\"content\"]\n",
+    "\n",
+    "print(\"Reasoning content: \", response.json()[\"choices\"][0][\"message\"][\"reasoning_content\"])\n",
+    "print(\"Content: \", content)\n",
+    "\n",
+    "# Round 2\n",
+    "completion_request[\"messages\"].append({\"role\": \"assistant\", \"content\": content})\n",
+    "completion_request[\"messages\"].append({\n",
+    "    \"role\": \"user\",\n",
+    "    \"content\": \"How many Rs are there in the word 'strawberry'?\",\n",
+    "})\n",
+    "\n",
+    "response = httpx.post(chat_completions_url, headers=headers, json=completion_request, timeout=45.0)\n",
+    "content = response.json()[\"choices\"][0][\"message\"][\"content\"]\n",
+    "\n",
+    "print(\"Reasoning content: \", response.json()[\"choices\"][0][\"message\"][\"reasoning_content\"])\n",
+    "print(\"Content: \", content)"
+   ]
+  },
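+  {
+   "cell_type": "markdown",
+   "id": "b3c4d5e6",
+   "metadata": {},
+   "source": [
+    "The `completions_url` resolved earlier targets the plain (non-chat) completions endpoint. The next cell is a small sketch of using it; the prompt and `max_tokens` values are only illustrative."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c4d5e6f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plain completion using the /v1/completions endpoint\n",
+    "completion_request = {\n",
+    "    \"model\": deepseekr1_depl.name,\n",
+    "    \"prompt\": \"The capital of France is\",\n",
+    "    \"max_tokens\": 32,\n",
+    "}\n",
+    "\n",
+    "response = httpx.post(completions_url, headers=headers, json=completion_request, timeout=45.0)\n",
+    "print(response.json()[\"choices\"][0][\"text\"])"
+   ]
+  },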
+  {
+   "cell_type": "markdown",
+   "id": "9bdc4d24",
+   "metadata": {},
+   "source": [
+    "### 🟨 Using OpenAI client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "759b6f03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install openai --quiet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8418bfe2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f1c261b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = OpenAI(\n",
+    "    base_url=openai_v1_uri,\n",
+    "    api_key=\"X\",  # placeholder; authentication is handled via the ApiKey header\n",
+    "    default_headers=headers\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e15c22a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#\n",
+    "# Chat Completion for a user message\n",
+    "#\n",
+    "\n",
+    "# Round 1\n",
+    "messages = [{\"role\": \"user\", \"content\": \"9.11 and 9.8, which is greater?\"}]\n",
+    "\n",
+    "response = client.chat.completions.create(model=deepseekr1_depl.name, messages=messages)\n",
+    "content = response.choices[0].message.content\n",
+    "\n",
+    "print(\"reasoning_content for Round 1:\", response.choices[0].message.reasoning_content)\n",
+    "print(\"content for Round 1:\", content)\n",
+    "\n",
+    "# Round 2\n",
+    "messages.append({\"role\": \"assistant\", \"content\": content})\n",
+    "messages.append({\n",
+    "    \"role\": \"user\",\n",
+    "    \"content\": \"How many Rs are there in the word 'strawberry'?\",\n",
+    "})\n",
+    "response = client.chat.completions.create(model=deepseekr1_depl.name, messages=messages)\n",
+    "content = response.choices[0].message.content\n",
+    "\n",
+    "print(\"reasoning_content for Round 2:\", response.choices[0].message.reasoning_content)\n",
+    "print(\"content for Round 2:\", content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fbea21f7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}