
Commit

First commit
lucataco committed Aug 30, 2023
0 parents commit 1414a59
Showing 6 changed files with 134 additions and 0 deletions.
17 changes: 17 additions & 0 deletions .dockerignore
@@ -0,0 +1,17 @@
# The .dockerignore file excludes files from the container build process.
#
# https://docs.docker.com/engine/reference/builder/#dockerignore-file

# Exclude Git files
.git
.github
.gitignore

# Exclude Python cache files
__pycache__
.mypy_cache
.pytest_cache
.ruff_cache

# Exclude Python virtual environment
/venv
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
__pycache__
.cog
cache/
13 changes: 13 additions & 0 deletions README.md
@@ -0,0 +1,13 @@
# TheBloke/WizardCoder-Python-34B-V1.0-GPTQ Cog model

This is an implementation of [TheBloke/WizardCoder-Python-34B-V1.0-GPTQ](https://huggingface.co/TheBloke/WizardCoder-Python-34B-V1.0-GPTQ) as a Cog model. [Cog packages machine learning models as standard containers.](https://github.com/replicate/cog)

First, download the pre-trained weights:

cog run script/download-weights

Then run the git clone command found at the end of the script/download-weights file to fetch the quantized weights into the local cache/ directory, as shown below.
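For reference, the command at the end of that script is:

git clone --branch gptq-4bit-32g-actorder_True https://huggingface.co/TheBloke/WizardCoder-Python-34B-V1.0-GPTQ cache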

Then, you can run predictions:

cog predict -i prompt="Tell me about AI"
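
The other inputs defined in predict.py (system_prompt, temperature, max_new_tokens, top_p, repetition_penalty) can be passed the same way; the values below are only illustrative:

cog predict -i prompt="Write a quicksort function in Python" -i temperature=0.2 -i max_new_tokens=256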
15 changes: 15 additions & 0 deletions cog.yaml
@@ -0,0 +1,15 @@
build:
  # set to true if your model requires a GPU
  gpu: true
  cuda: "11.8"
  python_version: "3.10"
  python_packages:
    - "torch==2.0.1"
    - "transformers==4.31.0"

  run:
    - "wget https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.0/auto_gptq-0.4.0+cu118-cp310-cp310-linux_x86_64.whl"
    - "pip install auto_gptq-0.4.0+cu118-cp310-cp310-linux_x86_64.whl"

# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
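
To verify that the image builds and that the AutoGPTQ wheel installed by the run steps imports cleanly, a quick check (not part of the committed files; the image tag is arbitrary) is:

cog build -t wizardcoder-python-34b-gptq
cog run python -c "import torch, auto_gptq; print(torch.__version__, torch.cuda.is_available())"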
54 changes: 54 additions & 0 deletions predict.py
@@ -0,0 +1,54 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

from cog import BasePredictor, Input
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

MODEL_NAME = "TheBloke/WizardCoder-Python-34B-V1.0-GPTQ"
MODEL_CACHE = "cache"
use_triton = False

class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_CACHE,
            use_fast=True
        )
        self.model = AutoGPTQForCausalLM.from_quantized(
            MODEL_CACHE,
            use_safetensors=True,
            trust_remote_code=False,
            device="cuda:0",
            use_triton=use_triton,
            quantize_config=None,
            inject_fused_attention=False
        )

    def predict(
        self,
        prompt: str = Input(description="Your prompt", default="Tell me about AI"),
        system_prompt: str = Input(description="System prompt that helps guide system behavior", default="Below is an instruction that describes a task. Write a response that appropriately completes the request."),
        temperature: float = Input(description="Randomness of outputs, 0 is deterministic, greater than 1 is random", ge=0, le=5, default=0.7),
        max_new_tokens: int = Input(description="Number of new tokens", ge=1, le=4096, default=512),
        top_p: float = Input(description="When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens", ge=0.01, le=1, default=0.95),
        repetition_penalty: float = Input(description="Penalty for repeated words in generated text; 1 is no penalty, values greater than 1 discourage repetition, less than 1 encourage it", ge=0, le=5, default=1.15),
    ) -> str:
        """Run a single prediction on the model"""
        prompt_template = f'''{system_prompt}
### Instruction:
{prompt}
### Response:
'''
        input_ids = self.tokenizer(prompt_template, return_tensors='pt').input_ids.to("cuda")
        outputs = self.model.generate(
            inputs=input_ids,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        output = self.tokenizer.decode(outputs[0])
        # keep only the text generated after the "### Response:" marker
        parts = output.split("### Response:", 1)
        response = parts[1]

        return response
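
For local debugging outside the Cog container, the Predictor class can also be exercised directly. A minimal sketch, assuming the quantized weights have already been cloned into ./cache and a CUDA GPU is available (not part of this commit):

# hypothetical local smoke test -- assumes weights in ./cache and a CUDA GPU
from predict import Predictor

predictor = Predictor()
predictor.setup()
print(predictor.predict(
    prompt="Write a function that reverses a string in Python",
    system_prompt="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
    temperature=0.7,
    max_new_tokens=128,
    top_p=0.95,
    repetition_penalty=1.15,
))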

32 changes: 32 additions & 0 deletions script/download-weights
@@ -0,0 +1,32 @@
#!/usr/bin/env python

import os
import sys
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# append project directory to path so predict.py can be imported
sys.path.append('.')

from predict import MODEL_NAME, MODEL_CACHE
use_triton = True

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True
)
tokenizer.save_pretrained(MODEL_CACHE)

# model = AutoGPTQForCausalLM.from_quantized(
#     MODEL_NAME,
#     model_basename=MODEL_BASENAME,
#     use_safetensors=True,
#     trust_remote_code=True,
#     device="cuda:0",
#     use_triton=use_triton,
#     quantize_config=None,
# )
# model.save_quantized(MODEL_CACHE, use_safetensors=True)

# Manually run:
# os.system("git clone --branch gptq-4bit-32g-actorder_True https://huggingface.co/TheBloke/WizardCoder-Python-34B-V1.0-GPTQ cache")
