
Commit

First commit
lucataco committed Aug 30, 2023
0 parents commit 1414a59
Showing 6 changed files with 134 additions and 0 deletions.
17 changes: 17 additions & 0 deletions .dockerignore
@@ -0,0 +1,17 @@
# The .dockerignore file excludes files from the container build process.
#
# https://docs.docker.com/engine/reference/builder/#dockerignore-file

# Exclude Git files
.git
.github
.gitignore

# Exclude Python cache files
__pycache__
.mypy_cache
.pytest_cache
.ruff_cache

# Exclude Python virtual environment
/venv
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
__pycache__
.cog
cache/
13 changes: 13 additions & 0 deletions README.md
@@ -0,0 +1,13 @@
# TheBloke/WizardCoder-Python-34B-V1.0-GPTQ Cog model

This is an implementation of [TheBloke/WizardCoder-Python-34B-V1.0-GPTQ](https://huggingface.co/TheBloke/WizardCoder-Python-34B-V1.0-GPTQ) as a Cog model. [Cog packages machine learning models as standard containers.](https://github.com/replicate/cog)

First, download the pre-trained weights:

cog run script/download-weights

Then run the git clone command found at the end of the script/download-weights file to fetch the quantized weights into the local cache/ directory, as shown below.
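For reference, the command at the end of that script is:

git clone --branch gptq-4bit-32g-actorder_True https://huggingface.co/TheBloke/WizardCoder-Python-34B-V1.0-GPTQ cache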

Then, you can run predictions:

cog predict -i prompt="Tell me about AI"
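
The other inputs defined in predict.py (system_prompt, temperature, max_new_tokens, top_p, repetition_penalty) can be passed the same way; the values below are only illustrative:

cog predict -i prompt="Write a quicksort function in Python" -i temperature=0.2 -i max_new_tokens=256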
15 changes: 15 additions & 0 deletions cog.yaml
@@ -0,0 +1,15 @@
build:
  # set to true if your model requires a GPU
  gpu: true
  cuda: "11.8"
  python_version: "3.10"
  python_packages:
    - "torch==2.0.1"
    - "transformers==4.31.0"

  run:
    - "wget https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.0/auto_gptq-0.4.0+cu118-cp310-cp310-linux_x86_64.whl"
    - "pip install auto_gptq-0.4.0+cu118-cp310-cp310-linux_x86_64.whl"

# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
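
To verify that the image builds and that the AutoGPTQ wheel installed by the run steps imports cleanly, a quick check (not part of the committed files; the image tag is arbitrary) is:

cog build -t wizardcoder-python-34b-gptq
cog run python -c "import torch, auto_gptq; print(torch.__version__, torch.cuda.is_available())"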
54 changes: 54 additions & 0 deletions predict.py
@@ -0,0 +1,54 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

from cog import BasePredictor, Input
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

MODEL_NAME = "TheBloke/WizardCoder-Python-34B-V1.0-GPTQ"
MODEL_CACHE = "cache"
use_triton = False

class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_CACHE,
            use_fast=True
        )
        self.model = AutoGPTQForCausalLM.from_quantized(
            MODEL_CACHE,
            use_safetensors=True,
            trust_remote_code=False,
            device="cuda:0",
            use_triton=use_triton,
            quantize_config=None,
            inject_fused_attention=False
        )

    def predict(
        self,
        prompt: str = Input(description="Your prompt", default="Tell me about AI"),
        system_prompt: str = Input(description="System prompt that helps guide system behavior", default="Below is an instruction that describes a task. Write a response that appropriately completes the request."),
        temperature: float = Input(description="Randomness of outputs, 0 is deterministic, greater than 1 is random", ge=0, le=5, default=0.7),
        max_new_tokens: int = Input(description="Number of new tokens", ge=1, le=4096, default=512),
        top_p: float = Input(description="When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens", ge=0.01, le=1, default=0.95),
        repetition_penalty: float = Input(description="Penalty for repeated words in generated text; 1 is no penalty, values greater than 1 discourage repetition, less than 1 encourage it", ge=0, le=5, default=1.15),
    ) -> str:
        """Run a single prediction on the model"""
        prompt_template = f'''{system_prompt}
### Instruction:
{prompt}
### Response:
'''
        input_ids = self.tokenizer(prompt_template, return_tensors='pt').input_ids.to("cuda")
        outputs = self.model.generate(
            inputs=input_ids,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        output = self.tokenizer.decode(outputs[0])
        # keep only the text generated after the "### Response:" marker
        parts = output.split("### Response:", 1)
        response = parts[1]

        return response
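
For local debugging outside the Cog container, the Predictor class can also be exercised directly. A minimal sketch, assuming the quantized weights have already been cloned into ./cache and a CUDA GPU is available (not part of this commit):

# hypothetical local smoke test -- assumes weights in ./cache and a CUDA GPU
from predict import Predictor

predictor = Predictor()
predictor.setup()
print(predictor.predict(
    prompt="Write a function that reverses a string in Python",
    system_prompt="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
    temperature=0.7,
    max_new_tokens=128,
    top_p=0.95,
    repetition_penalty=1.15,
))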

32 changes: 32 additions & 0 deletions script/download-weights
@@ -0,0 +1,32 @@
#!/usr/bin/env python

import os
import sys
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# append project directory to path so predict.py can be imported
sys.path.append('.')

from predict import MODEL_NAME, MODEL_CACHE
use_triton = True

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True
)
tokenizer.save_pretrained(MODEL_CACHE)

# model = AutoGPTQForCausalLM.from_quantized(
#     MODEL_NAME,
#     model_basename=MODEL_BASENAME,
#     use_safetensors=True,
#     trust_remote_code=True,
#     device="cuda:0",
#     use_triton=use_triton,
#     quantize_config=None,
# )
# model.save_quantized(MODEL_CACHE, use_safetensors=True)

# Manually run:
# os.system("git clone --branch gptq-4bit-32g-actorder_True https://huggingface.co/TheBloke/WizardCoder-Python-34B-V1.0-GPTQ cache")
