From 0dd9cb2cab8010e62a37f8dabfc633cf5a46c2ad Mon Sep 17 00:00:00 2001 From: chrisu-inigra Date: Fri, 9 May 2025 10:25:11 +0200 Subject: [PATCH 1/5] Docker containerization for ReproducibleVLLM Validator --- containerization_job/Dockerfile | 23 +++ containerization_job/app.py | 53 +++++++ containerization_job/download_model.py | 26 ++++ containerization_job/requirements.txt | 8 + containerization_job/schema.py | 24 +++ containerization_job/vllm_llm.py | 200 +++++++++++++++++++++++++ 6 files changed, 334 insertions(+) create mode 100644 containerization_job/Dockerfile create mode 100644 containerization_job/app.py create mode 100644 containerization_job/download_model.py create mode 100644 containerization_job/requirements.txt create mode 100644 containerization_job/schema.py create mode 100644 containerization_job/vllm_llm.py diff --git a/containerization_job/Dockerfile b/containerization_job/Dockerfile new file mode 100644 index 000000000..30d7b2d2d --- /dev/null +++ b/containerization_job/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.10-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y \ + git build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY download_model.py . + +ARG LLM_MODEL=mrfakename/mistral-small-3.1-24b-instruct-2503-hf +ENV MODEL_PATH=./downloaded_model + +RUN python download_model.py --model-name "$LLM_MODEL" --model-path "$MODEL_PATH" + +COPY . . + +EXPOSE 8000 + +CMD ["python", "app.py"] diff --git a/containerization_job/app.py b/containerization_job/app.py new file mode 100644 index 000000000..97d7f44ad --- /dev/null +++ b/containerization_job/app.py @@ -0,0 +1,53 @@ +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse +import uvicorn +from vllm_llm import ReproducibleVLLM +from schema import ChatRequest, LogitsRequest +from loguru import logger +import os + +MODEL_PATH = os.getenv('MODEL_PATH') + +class ReproducibleVllmApp: + def __init__(self): + self.llm = ReproducibleVLLM(model_id=MODEL_PATH) + self.app = FastAPI() + self.app.post("/generate")(self.generate) + self.app.post("/generate_logits")(self.generate_logits) + + async def generate(self, request: ChatRequest): + try: + result = await self.llm.generate( + messages=[m.dict() for m in request.messages], + sampling_params=request.sampling_parameters.dict(), + seed=request.seed, + continue_last_message=request.continue_last_message + ) + return { + "result": result + } + except Exception as e: + return JSONResponse(status_code=500, content={"error": str(e)}) + + async def generate_logits(self, request: LogitsRequest): + try: + logits, prompt = await self.llm.generate_logits( + messages=[m.dict() for m in request.messages], + top_logprobs=request.top_logprobs, + sampling_params=request.sampling_parameters.dict(), + seed=request.seed, + continue_last_message=request.continue_last_message + ) + return { + "logits": logits, + "prompt": prompt + } + except Exception as e: + return JSONResponse(status_code=500, content={"error": str(e)}) + + def run(self): + uvicorn.run(self.app, host="0.0.0.0", port=8000) + +if __name__ == "__main__": + server = ReproducibleVllmApp() + server.run() diff --git a/containerization_job/download_model.py b/containerization_job/download_model.py new file mode 100644 index 000000000..9449ceb95 --- /dev/null +++ b/containerization_job/download_model.py @@ -0,0 +1,26 @@ +import argparse +from huggingface_hub import snapshot_download + +if __name__ == "__main__": + parser = 
argparse.ArgumentParser(description="Download model files") + parser.add_argument( + "--model-name", + type=str, + help="Model name to use", + ) + parser.add_argument( + "--model-path", + type=str, + help="Path to save the model files", + ) + + args = parser.parse_args() + + print(f"Downloading Model {args.model_name}, files downloaded to {args.model_path}") + + snapshot_download( + repo_id=args.model_name, + local_dir=args.model_path + ) + + print(f"Model files downloaded to {args.model_path}") \ No newline at end of file diff --git a/containerization_job/requirements.txt b/containerization_job/requirements.txt new file mode 100644 index 000000000..548a6f0f9 --- /dev/null +++ b/containerization_job/requirements.txt @@ -0,0 +1,8 @@ +fastapi==0.115.0 +uvicorn==0.23.2 +pydantic==2.9.0 +vllm==0.8.3 +torch==2.6.0 +numpy==1.26.4 +loguru==0.7.2 +huggingface-hub==0.30.0 \ No newline at end of file diff --git a/containerization_job/schema.py b/containerization_job/schema.py new file mode 100644 index 000000000..d65bcedb8 --- /dev/null +++ b/containerization_job/schema.py @@ -0,0 +1,24 @@ +from pydantic import BaseModel, Field +from typing import List, Optional, Literal + +class ChatMessage(BaseModel): + content: str + role: Literal["user", "assistant", "system"] + +class SamplingParameters(BaseModel): + temperature: Optional[float] = 1.0 + top_p: Optional[float] = 1.0 + max_tokens: Optional[int] = 512 + presence_penalty: Optional[float] = 0.0 + frequency_penalty: Optional[float] = 0.0 + top_k: Optional[int] = -1 + logprobs: Optional[int] = None + +class ChatRequest(BaseModel): + messages: List[ChatMessage] + seed: Optional[int] + sampling_parameters: Optional[SamplingParameters] = SamplingParameters() + continue_last_message: Optional[bool] = False + +class LogitsRequest(ChatRequest): + top_logprobs: Optional[int] = 10 \ No newline at end of file diff --git a/containerization_job/vllm_llm.py b/containerization_job/vllm_llm.py new file mode 100644 index 000000000..083cc21eb --- /dev/null +++ b/containerization_job/vllm_llm.py @@ -0,0 +1,200 @@ +import gc +import random + +import numpy as np +import torch +from loguru import logger +from vllm import LLM, SamplingParams +from vllm.distributed import destroy_model_parallel + + +class ReproducibleVLLM: + def __init__( + self, + model_id: str = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ", + device: str = "cuda:0", + sampling_params: dict[str, str | float | int | bool] | None = None, + ): + """Deterministic VLLM model.""" + self._device = device + self.model_id = model_id + self.sampling_params = {} if sampling_params else sampling_params + + # VLLM specific initialization + # gpu_memory_utilization = 0.9 # Default high utilization since VLLM is memory efficient + self.model = LLM( + model=model_id, + # tensor_parallel_size=1, # Single GPU by default + # dtype="float16", + trust_remote_code=True, + gpu_memory_utilization=0.9, + max_model_len=8192, + ) + + # Store tokenizer from VLLM for consistency + self.tokenizer = self.model.get_tokenizer() + + @classmethod + async def get_max_tokens( + cls, + sampling_params: dict[str, str | float | int | bool], + default_value: int = 512, + ) -> int: + # Process max tokens with backward compatibility. 
+ max_tokens = sampling_params.get("max_tokens") + if max_tokens is None: + max_tokens = sampling_params.get("max_new_tokens") + if max_tokens is None: + max_tokens = sampling_params.get("max_completion_tokens", default_value) + return max_tokens + + @classmethod + async def prepare_sampling_params( + cls, sampling_params: dict[str, str | float | int | bool] | None = None + ) -> SamplingParams: + sampling_params = sampling_params or {} + max_tokens = await cls.get_max_tokens(sampling_params) + + params = SamplingParams( + temperature=float(sampling_params.get("temperature", 1.0)), + top_p=float(sampling_params.get("top_p", 1.0)), + max_tokens=int(max_tokens), + presence_penalty=float(sampling_params.get("presence_penalty", 0.0)), + frequency_penalty=float(sampling_params.get("frequency_penalty", 0.0)), + top_k=int(sampling_params.get("top_k", -1)), + logprobs=sampling_params.get("logprobs", None), + ) + return params + + async def generate( + self, + messages: list[str] | list[dict[str, str]], + sampling_params: dict[str, str | float | int | bool] | None = None, + seed: int | None = None, + continue_last_message: bool = False, + ) -> str: + """Generate text with optimized performance using VLLM.""" + self.set_random_seeds(seed) + + # Convert chat messages to prompt string using tokenizer's chat template + if isinstance(messages, list) and isinstance(messages[0], dict): + try: + # Extract any trailing whitespace before applying template + trailing_space = "" + if continue_last_message and messages[-1]["content"]: + content = messages[-1]["content"] + stripped = content.rstrip() + if len(content) > len(stripped): + trailing_space = content[len(stripped) :] + + # Try using the tokenizer's chat template + prompt = self.tokenizer.apply_chat_template( + conversation=messages, + tokenize=False, + add_generation_prompt=not continue_last_message, + continue_final_message=continue_last_message, + ) + + # Append back just the trailing whitespace if it was stripped + if trailing_space: + prompt += trailing_space + except (AttributeError, NotImplementedError): + raise ValueError(f"Chat template not supported for model {self.model_id}") + else: + prompt = messages[0] if isinstance(messages, list) else messages + + # Convert sampling parameters to vLLM format. + params = sampling_params if sampling_params else self.sampling_params + vllm_params = await self.prepare_sampling_params(params) + outputs = self.model.generate(prompt, vllm_params) + + if not outputs: + return "" + + # Return just the generated text without the prompt + result = outputs[0].outputs[0].text + return result + + async def generate_logits( + self, + messages: list[str] | list[dict[str, str]], + top_logprobs: int = 10, + sampling_params: dict[str, str | float | int | bool] | None = None, + seed: int | None = None, + continue_last_message: bool = False, + ) -> dict[str, float]: + """Generate logits for the next token prediction. + + Args: + messages: Input messages or text. + top_logprobs: Number of top logits to return (default: 10). + sampling_params: Generation parameters. + seed: Random seed for reproducibility. + continue_last_message: Whether to continue the last message in chat format. + + Returns: + Dictionary mapping tokens to their log probabilities. 
+ """ + self.set_random_seeds(seed) + params = sampling_params if sampling_params else self.sampling_params + params = params.copy() + params["max_tokens"] = 1 + params["logprobs"] = top_logprobs + vllm_params = await self.prepare_sampling_params(params) + + prompt = self.tokenizer.apply_chat_template( + conversation=messages, + tokenize=False, + add_generation_prompt=not continue_last_message, + continue_final_message=continue_last_message, + ) + + outputs = self.model.generate(prompt, vllm_params) + + if not outputs or not outputs[0].outputs[0].logprobs: + return {} + + logprobs = outputs[0].outputs[0].logprobs[0] + token_logprobs = {self.tokenizer.decode([token]): logprob.logprob for token, logprob in logprobs.items()} + sorted_token_logprobs = dict(sorted(token_logprobs.items(), key=lambda item: item[1], reverse=True)) + return sorted_token_logprobs, prompt + + def set_random_seeds(self, seed: int | None = 42): + """Set random seeds for reproducibility across all relevant libraries.""" + if seed is not None: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + def unload_model(self): + try: + destroy_model_parallel() + if hasattr(self.model, "llm_engine") and hasattr(self.model.llm_engine, "driver_worker"): + del self.model.llm_engine.driver_worker + if hasattr(self.model, "model"): + del self.model + if hasattr(self.model, "tokenizer"): + del self.tokenizer + + gc.collect() + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + logger.info("Successfully deleted the LLM pipeline and freed GPU memory") + except BaseException as e: + logger.error(f"An error occurred during model unloading: {e}") + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def __del__(self): + self.unload_model() + + @staticmethod + def format_messages(messages: list[str] | list[dict[str, str]]) -> list[dict[str, str | list[dict[str, str]]]]: + return messages From 423fd30d2eb0484716beffaf03559dff9d010aac Mon Sep 17 00:00:00 2001 From: chrisu-inigra Date: Mon, 12 May 2025 12:36:43 +0200 Subject: [PATCH 2/5] Use build-context --- containerization_job/Dockerfile | 1 + containerization_job/vllm_llm.py | 200 ------------------------------- 2 files changed, 1 insertion(+), 200 deletions(-) delete mode 100644 containerization_job/vllm_llm.py diff --git a/containerization_job/Dockerfile b/containerization_job/Dockerfile index 30d7b2d2d..6c1984d1f 100644 --- a/containerization_job/Dockerfile +++ b/containerization_job/Dockerfile @@ -17,6 +17,7 @@ ENV MODEL_PATH=./downloaded_model RUN python download_model.py --model-name "$LLM_MODEL" --model-path "$MODEL_PATH" COPY . . +COPY --from=external_context /vllm_llm.py . 
EXPOSE 8000 diff --git a/containerization_job/vllm_llm.py b/containerization_job/vllm_llm.py deleted file mode 100644 index 083cc21eb..000000000 --- a/containerization_job/vllm_llm.py +++ /dev/null @@ -1,200 +0,0 @@ -import gc -import random - -import numpy as np -import torch -from loguru import logger -from vllm import LLM, SamplingParams -from vllm.distributed import destroy_model_parallel - - -class ReproducibleVLLM: - def __init__( - self, - model_id: str = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ", - device: str = "cuda:0", - sampling_params: dict[str, str | float | int | bool] | None = None, - ): - """Deterministic VLLM model.""" - self._device = device - self.model_id = model_id - self.sampling_params = {} if sampling_params else sampling_params - - # VLLM specific initialization - # gpu_memory_utilization = 0.9 # Default high utilization since VLLM is memory efficient - self.model = LLM( - model=model_id, - # tensor_parallel_size=1, # Single GPU by default - # dtype="float16", - trust_remote_code=True, - gpu_memory_utilization=0.9, - max_model_len=8192, - ) - - # Store tokenizer from VLLM for consistency - self.tokenizer = self.model.get_tokenizer() - - @classmethod - async def get_max_tokens( - cls, - sampling_params: dict[str, str | float | int | bool], - default_value: int = 512, - ) -> int: - # Process max tokens with backward compatibility. - max_tokens = sampling_params.get("max_tokens") - if max_tokens is None: - max_tokens = sampling_params.get("max_new_tokens") - if max_tokens is None: - max_tokens = sampling_params.get("max_completion_tokens", default_value) - return max_tokens - - @classmethod - async def prepare_sampling_params( - cls, sampling_params: dict[str, str | float | int | bool] | None = None - ) -> SamplingParams: - sampling_params = sampling_params or {} - max_tokens = await cls.get_max_tokens(sampling_params) - - params = SamplingParams( - temperature=float(sampling_params.get("temperature", 1.0)), - top_p=float(sampling_params.get("top_p", 1.0)), - max_tokens=int(max_tokens), - presence_penalty=float(sampling_params.get("presence_penalty", 0.0)), - frequency_penalty=float(sampling_params.get("frequency_penalty", 0.0)), - top_k=int(sampling_params.get("top_k", -1)), - logprobs=sampling_params.get("logprobs", None), - ) - return params - - async def generate( - self, - messages: list[str] | list[dict[str, str]], - sampling_params: dict[str, str | float | int | bool] | None = None, - seed: int | None = None, - continue_last_message: bool = False, - ) -> str: - """Generate text with optimized performance using VLLM.""" - self.set_random_seeds(seed) - - # Convert chat messages to prompt string using tokenizer's chat template - if isinstance(messages, list) and isinstance(messages[0], dict): - try: - # Extract any trailing whitespace before applying template - trailing_space = "" - if continue_last_message and messages[-1]["content"]: - content = messages[-1]["content"] - stripped = content.rstrip() - if len(content) > len(stripped): - trailing_space = content[len(stripped) :] - - # Try using the tokenizer's chat template - prompt = self.tokenizer.apply_chat_template( - conversation=messages, - tokenize=False, - add_generation_prompt=not continue_last_message, - continue_final_message=continue_last_message, - ) - - # Append back just the trailing whitespace if it was stripped - if trailing_space: - prompt += trailing_space - except (AttributeError, NotImplementedError): - raise ValueError(f"Chat template not supported for model {self.model_id}") - else: - 
prompt = messages[0] if isinstance(messages, list) else messages - - # Convert sampling parameters to vLLM format. - params = sampling_params if sampling_params else self.sampling_params - vllm_params = await self.prepare_sampling_params(params) - outputs = self.model.generate(prompt, vllm_params) - - if not outputs: - return "" - - # Return just the generated text without the prompt - result = outputs[0].outputs[0].text - return result - - async def generate_logits( - self, - messages: list[str] | list[dict[str, str]], - top_logprobs: int = 10, - sampling_params: dict[str, str | float | int | bool] | None = None, - seed: int | None = None, - continue_last_message: bool = False, - ) -> dict[str, float]: - """Generate logits for the next token prediction. - - Args: - messages: Input messages or text. - top_logprobs: Number of top logits to return (default: 10). - sampling_params: Generation parameters. - seed: Random seed for reproducibility. - continue_last_message: Whether to continue the last message in chat format. - - Returns: - Dictionary mapping tokens to their log probabilities. - """ - self.set_random_seeds(seed) - params = sampling_params if sampling_params else self.sampling_params - params = params.copy() - params["max_tokens"] = 1 - params["logprobs"] = top_logprobs - vllm_params = await self.prepare_sampling_params(params) - - prompt = self.tokenizer.apply_chat_template( - conversation=messages, - tokenize=False, - add_generation_prompt=not continue_last_message, - continue_final_message=continue_last_message, - ) - - outputs = self.model.generate(prompt, vllm_params) - - if not outputs or not outputs[0].outputs[0].logprobs: - return {} - - logprobs = outputs[0].outputs[0].logprobs[0] - token_logprobs = {self.tokenizer.decode([token]): logprob.logprob for token, logprob in logprobs.items()} - sorted_token_logprobs = dict(sorted(token_logprobs.items(), key=lambda item: item[1], reverse=True)) - return sorted_token_logprobs, prompt - - def set_random_seeds(self, seed: int | None = 42): - """Set random seeds for reproducibility across all relevant libraries.""" - if seed is not None: - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - - def unload_model(self): - try: - destroy_model_parallel() - if hasattr(self.model, "llm_engine") and hasattr(self.model.llm_engine, "driver_worker"): - del self.model.llm_engine.driver_worker - if hasattr(self.model, "model"): - del self.model - if hasattr(self.model, "tokenizer"): - del self.tokenizer - - gc.collect() - if torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - logger.info("Successfully deleted the LLM pipeline and freed GPU memory") - except BaseException as e: - logger.error(f"An error occurred during model unloading: {e}") - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - def __del__(self): - self.unload_model() - - @staticmethod - def format_messages(messages: list[str] | list[dict[str, str]]) -> list[dict[str, str | list[dict[str, str]]]]: - return messages From 09a73c955a561480272eb94563818fac4fd1463c Mon Sep 17 00:00:00 2001 From: chrisu-inigra Date: Tue, 13 May 2025 11:57:30 +0200 Subject: [PATCH 3/5] Container building script for sn1-validator-api --- {containerization_job => containerized_job}/Dockerfile | 2 +- {containerization_job => 
containerized_job}/app.py | 0 containerized_job/build.sh | 10 ++++++++++ .../download_model.py | 0 .../requirements.txt | 0 {containerization_job => containerized_job}/schema.py | 0 6 files changed, 11 insertions(+), 1 deletion(-) rename {containerization_job => containerized_job}/Dockerfile (87%) rename {containerization_job => containerized_job}/app.py (100%) create mode 100755 containerized_job/build.sh rename {containerization_job => containerized_job}/download_model.py (100%) rename {containerization_job => containerized_job}/requirements.txt (100%) rename {containerization_job => containerized_job}/schema.py (100%) diff --git a/containerization_job/Dockerfile b/containerized_job/Dockerfile similarity index 87% rename from containerization_job/Dockerfile rename to containerized_job/Dockerfile index 6c1984d1f..b2e754d41 100644 --- a/containerization_job/Dockerfile +++ b/containerized_job/Dockerfile @@ -11,7 +11,7 @@ RUN pip install --no-cache-dir -r requirements.txt COPY download_model.py . -ARG LLM_MODEL=mrfakename/mistral-small-3.1-24b-instruct-2503-hf +ARG LLM_MODEL ENV MODEL_PATH=./downloaded_model RUN python download_model.py --model-name "$LLM_MODEL" --model-path "$MODEL_PATH" diff --git a/containerization_job/app.py b/containerized_job/app.py similarity index 100% rename from containerization_job/app.py rename to containerized_job/app.py diff --git a/containerized_job/build.sh b/containerized_job/build.sh new file mode 100755 index 000000000..230583607 --- /dev/null +++ b/containerized_job/build.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +IMAGE_NAME="sn1-validator-api" +MODEL_NAME="mrfakename/mistral-small-3.1-24b-instruct-2503-hf" + +DOCKER_BUILDKIT=1 docker build \ + --build-arg LLM_MODEL="$MODEL_NAME" \ + -t "$IMAGE_NAME" \ + --build-context external_context=../prompting/llms \ + . 
\ No newline at end of file diff --git a/containerization_job/download_model.py b/containerized_job/download_model.py similarity index 100% rename from containerization_job/download_model.py rename to containerized_job/download_model.py diff --git a/containerization_job/requirements.txt b/containerized_job/requirements.txt similarity index 100% rename from containerization_job/requirements.txt rename to containerized_job/requirements.txt diff --git a/containerization_job/schema.py b/containerized_job/schema.py similarity index 100% rename from containerization_job/schema.py rename to containerized_job/schema.py From 797968d9fd3a45b9e62f0163ad656fe74206c59e Mon Sep 17 00:00:00 2001 From: chrisu-inigra Date: Tue, 27 May 2025 14:11:46 +0200 Subject: [PATCH 4/5] Fixes for formatters and linters --- containerized_job/app.py | 26 ++++++++++++-------------- containerized_job/download_model.py | 8 +++----- containerized_job/schema.py | 9 +++++++-- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/containerized_job/app.py b/containerized_job/app.py index 97d7f44ad..89e52fda3 100644 --- a/containerized_job/app.py +++ b/containerized_job/app.py @@ -1,12 +1,14 @@ +import os + +import uvicorn from fastapi import FastAPI, Request from fastapi.responses import JSONResponse -import uvicorn -from vllm_llm import ReproducibleVLLM -from schema import ChatRequest, LogitsRequest from loguru import logger -import os +from schema import ChatRequest, LogitsRequest +from vllm_llm import ReproducibleVLLM + +MODEL_PATH = os.getenv("MODEL_PATH") -MODEL_PATH = os.getenv('MODEL_PATH') class ReproducibleVllmApp: def __init__(self): @@ -21,11 +23,9 @@ async def generate(self, request: ChatRequest): messages=[m.dict() for m in request.messages], sampling_params=request.sampling_parameters.dict(), seed=request.seed, - continue_last_message=request.continue_last_message + continue_last_message=request.continue_last_message, ) - return { - "result": result - } + return {"result": result} except Exception as e: return JSONResponse(status_code=500, content={"error": str(e)}) @@ -36,18 +36,16 @@ async def generate_logits(self, request: LogitsRequest): top_logprobs=request.top_logprobs, sampling_params=request.sampling_parameters.dict(), seed=request.seed, - continue_last_message=request.continue_last_message + continue_last_message=request.continue_last_message, ) - return { - "logits": logits, - "prompt": prompt - } + return {"logits": logits, "prompt": prompt} except Exception as e: return JSONResponse(status_code=500, content={"error": str(e)}) def run(self): uvicorn.run(self.app, host="0.0.0.0", port=8000) + if __name__ == "__main__": server = ReproducibleVllmApp() server.run() diff --git a/containerized_job/download_model.py b/containerized_job/download_model.py index 9449ceb95..31a436fd6 100644 --- a/containerized_job/download_model.py +++ b/containerized_job/download_model.py @@ -1,4 +1,5 @@ import argparse + from huggingface_hub import snapshot_download if __name__ == "__main__": @@ -18,9 +19,6 @@ print(f"Downloading Model {args.model_name}, files downloaded to {args.model_path}") - snapshot_download( - repo_id=args.model_name, - local_dir=args.model_path - ) + snapshot_download(repo_id=args.model_name, local_dir=args.model_path) - print(f"Model files downloaded to {args.model_path}") \ No newline at end of file + print(f"Model files downloaded to {args.model_path}") diff --git a/containerized_job/schema.py b/containerized_job/schema.py index d65bcedb8..06281ad42 100644 --- a/containerized_job/schema.py 
+++ b/containerized_job/schema.py @@ -1,10 +1,13 @@ +from typing import List, Literal, Optional + from pydantic import BaseModel, Field -from typing import List, Optional, Literal + class ChatMessage(BaseModel): content: str role: Literal["user", "assistant", "system"] + class SamplingParameters(BaseModel): temperature: Optional[float] = 1.0 top_p: Optional[float] = 1.0 @@ -14,11 +17,13 @@ class SamplingParameters(BaseModel): top_k: Optional[int] = -1 logprobs: Optional[int] = None + class ChatRequest(BaseModel): messages: List[ChatMessage] seed: Optional[int] sampling_parameters: Optional[SamplingParameters] = SamplingParameters() continue_last_message: Optional[bool] = False + class LogitsRequest(ChatRequest): - top_logprobs: Optional[int] = 10 \ No newline at end of file + top_logprobs: Optional[int] = 10 From 60a8f59ba8a75b9bbef15fbbef9b1ba00db94f41 Mon Sep 17 00:00:00 2001 From: chrisu-inigra Date: Wed, 28 May 2025 10:42:16 +0200 Subject: [PATCH 5/5] Fixes from pre-commit hooks --- containerized_job/app.py | 3 +-- containerized_job/build.sh | 2 +- containerized_job/requirements.txt | 2 +- containerized_job/schema.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/containerized_job/app.py b/containerized_job/app.py index 89e52fda3..e93c75e46 100644 --- a/containerized_job/app.py +++ b/containerized_job/app.py @@ -1,9 +1,8 @@ import os import uvicorn -from fastapi import FastAPI, Request +from fastapi import FastAPI from fastapi.responses import JSONResponse -from loguru import logger from schema import ChatRequest, LogitsRequest from vllm_llm import ReproducibleVLLM diff --git a/containerized_job/build.sh b/containerized_job/build.sh index 230583607..f85949397 100755 --- a/containerized_job/build.sh +++ b/containerized_job/build.sh @@ -7,4 +7,4 @@ DOCKER_BUILDKIT=1 docker build \ --build-arg LLM_MODEL="$MODEL_NAME" \ -t "$IMAGE_NAME" \ --build-context external_context=../prompting/llms \ - . \ No newline at end of file + . diff --git a/containerized_job/requirements.txt b/containerized_job/requirements.txt index 548a6f0f9..56862af5c 100644 --- a/containerized_job/requirements.txt +++ b/containerized_job/requirements.txt @@ -5,4 +5,4 @@ vllm==0.8.3 torch==2.6.0 numpy==1.26.4 loguru==0.7.2 -huggingface-hub==0.30.0 \ No newline at end of file +huggingface-hub==0.30.0 diff --git a/containerized_job/schema.py b/containerized_job/schema.py index 06281ad42..96354aa34 100644 --- a/containerized_job/schema.py +++ b/containerized_job/schema.py @@ -1,6 +1,6 @@ from typing import List, Literal, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel class ChatMessage(BaseModel):
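
The series leaves the API surface implicit in app.py and schema.py, so a brief client sketch may help readers exercise the container once it is built. The sketch below is illustrative only: the endpoint paths, JSON fields, and response shapes are taken from app.py and schema.py in patch 1, while the use of the requests library, the localhost:8000 address, the example prompt, and the docker run flags mentioned in the comments are assumptions that do not appear in the patches.

# Minimal client sketch for the containerized validator API -- illustrative only.
# Assumes the image built by build.sh is running with port 8000 reachable on
# localhost (e.g. via something like `docker run --gpus all -p 8000:8000
# sn1-validator-api`); the requests library, the address, and the example
# prompt are assumptions, not part of the patches.
import requests

BASE_URL = "http://localhost:8000"  # app.py serves uvicorn on 0.0.0.0:8000

payload = {
    "messages": [{"role": "user", "content": "Name three prime numbers."}],
    "seed": 42,  # forwarded to ReproducibleVLLM.set_random_seeds for reproducibility
    "sampling_parameters": {"temperature": 0.0, "max_tokens": 64},
    "continue_last_message": False,
}

# /generate answers with {"result": "<generated text>"} on success, or an
# {"error": ...} body with status 500 if generation raises.
resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json()["result"])

# /generate_logits accepts the same fields plus top_logprobs (default 10) and
# answers with {"logits": {token: logprob, ...}, "prompt": "<rendered chat prompt>"}.
resp = requests.post(f"{BASE_URL}/generate_logits", json={**payload, "top_logprobs": 5}, timeout=300)
resp.raise_for_status()
body = resp.json()
print(body["prompt"])
print(body["logits"])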
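
Two usage notes that are easy to miss when reading the series: first, patch 2 stops vendoring vllm_llm.py and copies it from an external build context instead, and patch 3's build.sh points that context at ../prompting/llms, so the image is always built against the repository's current ReproducibleVLLM implementation rather than a copy that could drift. Second, the Dockerfile downloads the model weights at build time via download_model.py, so the resulting image embeds whatever model is selected through the LLM_MODEL build argument; running it presumably still requires GPU access and a published port (for example --gpus all -p 8000:8000 on the sn1-validator-api image), although the patches themselves end at the build step.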