From 05f1d3eb700b415f2e8f03ee80bac7832e63b99d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?=
Date: Thu, 6 Mar 2025 10:03:20 +0100
Subject: [PATCH] Add tutorial for deploying DeepSeek-R1-Distill-Llama3.1-8B

---
 .../deepseek_vllmconfig.yaml                  |   5 +
 ...eploy_deepseek_r1_distill_llama31-8b.ipynb | 434 ++++++++++++++++++
 2 files changed, 439 insertions(+)
 create mode 100644 integrations/vllm/deepseek-r1-distill-llama31-8b/deepseek_vllmconfig.yaml
 create mode 100644 integrations/vllm/deepseek-r1-distill-llama31-8b/deploy_deepseek_r1_distill_llama31-8b.ipynb

diff --git a/integrations/vllm/deepseek-r1-distill-llama31-8b/deepseek_vllmconfig.yaml b/integrations/vllm/deepseek-r1-distill-llama31-8b/deepseek_vllmconfig.yaml
new file mode 100644
index 00000000..adb96d9b
--- /dev/null
+++ b/integrations/vllm/deepseek-r1-distill-llama31-8b/deepseek_vllmconfig.yaml
@@ -0,0 +1,5 @@
+dtype: "half"
+max_model_len: 2048
+gpu_memory_utilization: 0.96
+enable_reasoning: true
+reasoning_parser: "deepseek_r1"
\ No newline at end of file
diff --git a/integrations/vllm/deepseek-r1-distill-llama31-8b/deploy_deepseek_r1_distill_llama31-8b.ipynb b/integrations/vllm/deepseek-r1-distill-llama31-8b/deploy_deepseek_r1_distill_llama31-8b.ipynb
new file mode 100644
index 00000000..507d32b9
--- /dev/null
+++ b/integrations/vllm/deepseek-r1-distill-llama31-8b/deploy_deepseek_r1_distill_llama31-8b.ipynb
@@ -0,0 +1,434 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ed30e8ad",
+   "metadata": {},
+   "source": [
+    "# A Guide for DeepSeek-R1 distilled Llama3.1-8B on Hopsworks\n",
+    "\n",
+    "For details about this Large Language Model (LLM), visit the model page in the HuggingFace repository ➡️ [link](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a01e6751",
+   "metadata": {},
+   "source": [
+    "## 1️⃣ Download DeepSeek-R1 distilled Llama3.1-8B using the huggingface_hub library\n",
+    "\n",
+    "First, we download the DeepSeek-R1 distilled Llama3.1-8B model files (e.g., weights, configuration files) directly from the HuggingFace repository.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a47db2b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install huggingface_hub --quiet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47f3a195",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Place your HuggingFace token in the HF_TOKEN environment variable\n",
+    "\n",
+    "import os\n",
+    "os.environ[\"HF_TOKEN\"] = \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2a2e8cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import snapshot_download\n",
+    "\n",
+    "deepseekr1_local_dir = snapshot_download(\"deepseek-ai/DeepSeek-R1-Distill-Llama-8B\", ignore_patterns=\"original/*\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca6a0afd",
+   "metadata": {},
+   "source": [
+    "## 2️⃣ Register DeepSeek-R1 distilled Llama3.1-8B into the Hopsworks Model Registry\n",
+    "\n",
+    "Once the model files are downloaded from the HuggingFace repository, we can register the model files in the Hopsworks Model Registry."
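+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b7d1a2c3",
+   "metadata": {},
+   "source": [
+    "Before registering the model, it can help to sanity-check the download. The next cell is a minimal, optional sketch that simply lists the files fetched by `snapshot_download`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8e2b3d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Optional sanity check: list the downloaded model files (weights, tokenizer, configs)\n",
+    "for file_name in sorted(os.listdir(deepseekr1_local_dir)):\n",
+    "    print(file_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9f3c4e5",
+   "metadata": {},
+   "source": [
+    "First, connect to Hopsworks and get a reference to the Model Registry."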
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddf42277", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "mr = project.get_model_registry()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec858cc7", + "metadata": {}, + "outputs": [], + "source": [ + "# The following instantiates a Hopsworks LLM model, not yet saved in the Model Registry\n", + "\n", + "deepseekr1 = mr.llm.create_model(\n", + " name=\"deepseekr1_instruct\",\n", + " description=\"DeepSeek-R1 distilled Llama3.1-8B model (via HF)\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83b4f4e2", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Register the distilled model pointing to the local model files\n", + "\n", + "deepseekr1.save(deepseekr1_local_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "0ccd72b4", + "metadata": {}, + "source": [ + "## 3️⃣ Deploy DeepSeek-R1 distilled Llama3.1-8B\n", + "\n", + "After registering the LLM model into the Model Registry, we can create a deployment that serves it using the vLLM engine." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be612b55", + "metadata": {}, + "outputs": [], + "source": [ + "# Get a reference to the distilled model if not obtained yet\n", + "\n", + "deepseekr1 = mr.get_model(\"deepseekr1_instruct\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2356dbb5", + "metadata": {}, + "outputs": [], + "source": [ + "# Upload vllm engine config file for the deployments\n", + "\n", + "ds_api = project.get_dataset_api()\n", + "\n", + "path_to_config_file = f\"/Projects/{project.name}/\" + ds_api.upload(\"deepseek_vllmconfig.yaml\", \"Resources\", overwrite=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9826f0d9", + "metadata": {}, + "source": [ + "### 🟨 Using vLLM OpenAI server\n", + "\n", + "Create a model deployment by providing a configuration file with the arguments for the vLLM engine." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "077b328c", + "metadata": {}, + "outputs": [], + "source": [ + "deepseekr1_depl = deepseekr1.deploy(\n", + " name=\"deepseekr1\",\n", + " description=\"Deepseek-R1 distilled Llama3.1-8B from HuggingFace\",\n", + " config_file=path_to_config_file,\n", + " resources={\"num_instances\": 1, \"requests\": {\"cores\": 2, \"memory\": 1024*16, \"gpus\": 1}},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "366284d7", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47e32e67", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve one of the deployments created above\n", + "\n", + "ms = project.get_model_serving()\n", + "deepseekr1_depl = ms.get_deployment(\"deepseekr1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3cc9df8", + "metadata": {}, + "outputs": [], + "source": [ + "deepseekr1_depl.start(await_running=60*15) # wait for 15 minutes maximum" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc616b17", + "metadata": {}, + "outputs": [], + "source": [ + "# deepseekr1.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3725f580", + "metadata": {}, + "outputs": [], + "source": [ + "deepseekr1_depl.get_state()" + ] + }, + { + "cell_type": "markdown", + "id": "49f478b0", + "metadata": {}, + "source": [ + "## 4️⃣ Prompting DeepSeek-R1 distilled Llama3.1 8B-Instruct\n", + "\n", + "Once the deployment is up and running, we can start sending user prompts to the LLM. You can either use an OpenAI API-compatible client (e.g., openai library) or any other http client." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d50339a4", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Get the istio endpoint from the deployment page in the Hopsworks UI.\n", + "istio_endpoint = \"\" # with format \"http://\"\n", + " \n", + "# Resolve base uri. 
+  {
+   "cell_type": "markdown",
+   "id": "cedb83ac",
+   "metadata": {},
+   "source": [
+    "### 🟨 Using httpx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7320b408",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import httpx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f4f6ff2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#\n",
+    "# Chat Completion for a user message\n",
+    "#\n",
+    "\n",
+    "# Round 1\n",
+    "user_message = \"9.11 and 9.8, which is greater?\"\n",
+    "completion_request = {\n",
+    "    \"model\": deepseekr1_depl.name,\n",
+    "    \"messages\": [\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": user_message\n",
+    "        }\n",
+    "    ]\n",
+    "}\n",
+    "\n",
+    "response = httpx.post(chat_completions_url, headers=headers, json=completion_request, timeout=45.0)\n",
+    "print(response)\n",
+    "content = response.json()[\"choices\"][0][\"message\"][\"content\"]\n",
+    "\n",
+    "print(\"Reasoning content: \", response.json()[\"choices\"][0][\"message\"][\"reasoning_content\"])\n",
+    "print(\"Content: \", content)\n",
+    "\n",
+    "# Round 2\n",
+    "completion_request[\"messages\"].append({\"role\": \"assistant\", \"content\": content})\n",
+    "completion_request[\"messages\"].append({\n",
+    "    \"role\": \"user\",\n",
+    "    \"content\": \"How many Rs are there in the word 'strawberry'?\",\n",
+    "})\n",
+    "\n",
+    "response = httpx.post(chat_completions_url, headers=headers, json=completion_request, timeout=45.0)\n",
+    "content = response.json()[\"choices\"][0][\"message\"][\"content\"]\n",
+    "\n",
+    "print(\"Reasoning content: \", response.json()[\"choices\"][0][\"message\"][\"reasoning_content\"])\n",
+    "print(\"Content: \", content)"
+   ]
+  },
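+  {
+   "cell_type": "markdown",
+   "id": "b3c4d5e6",
+   "metadata": {},
+   "source": [
+    "The `completions_url` resolved earlier targets the plain (non-chat) completions endpoint. The next cell is a small sketch of using it; the prompt and `max_tokens` values are only illustrative."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c4d5e6f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plain completion using the /v1/completions endpoint\n",
+    "completion_request = {\n",
+    "    \"model\": deepseekr1_depl.name,\n",
+    "    \"prompt\": \"The capital of France is\",\n",
+    "    \"max_tokens\": 32,\n",
+    "}\n",
+    "\n",
+    "response = httpx.post(completions_url, headers=headers, json=completion_request, timeout=45.0)\n",
+    "print(response.json()[\"choices\"][0][\"text\"])"
+   ]
+  },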
+  {
+   "cell_type": "markdown",
+   "id": "9bdc4d24",
+   "metadata": {},
+   "source": [
+    "### 🟨 Using OpenAI client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "759b6f03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install openai --quiet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8418bfe2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f1c261b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = OpenAI(\n",
+    "    base_url=openai_v1_uri,\n",
+    "    api_key=\"X\",  # placeholder; authentication is handled via the ApiKey header\n",
+    "    default_headers=headers\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e15c22a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#\n",
+    "# Chat Completion for a user message\n",
+    "#\n",
+    "\n",
+    "# Round 1\n",
+    "messages = [{\"role\": \"user\", \"content\": \"9.11 and 9.8, which is greater?\"}]\n",
+    "\n",
+    "response = client.chat.completions.create(model=deepseekr1_depl.name, messages=messages)\n",
+    "content = response.choices[0].message.content\n",
+    "\n",
+    "print(\"reasoning_content for Round 1:\", response.choices[0].message.reasoning_content)\n",
+    "print(\"content for Round 1:\", content)\n",
+    "\n",
+    "# Round 2\n",
+    "messages.append({\"role\": \"assistant\", \"content\": content})\n",
+    "messages.append({\n",
+    "    \"role\": \"user\",\n",
+    "    \"content\": \"How many Rs are there in the word 'strawberry'?\",\n",
+    "})\n",
+    "response = client.chat.completions.create(model=deepseekr1_depl.name, messages=messages)\n",
+    "content = response.choices[0].message.content\n",
+    "\n",
+    "print(\"reasoning_content for Round 2:\", response.choices[0].message.reasoning_content)\n",
+    "print(\"content for Round 2:\", content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fbea21f7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}