First commit
lucataco committed Jul 24, 2023
0 parents commit 91b480c
Showing 5 changed files with 121 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
__pycache__
.cog
cache/
Llama-2-7b-Chat-GPTQ/
11 changes: 11 additions & 0 deletions README.md
@@ -0,0 +1,11 @@
# TheBloke/Llama-2-7b-Chat-GPTQ Cog model

This is an implementation of [TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) as a Cog model. [Cog packages machine learning models as standard containers.](https://github.com/replicate/cog)

First, download the pre-trained weights:

    cog run script/download-weights

Then, you can run predictions:

    cog predict -i prompt="Tell me about AI"
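
The predictor also exposes optional inputs (see predict.py). As a sketch, assuming the defaults are acceptable for everything else, they can be overridden with additional `-i` flags:

    cog predict -i prompt="Tell me about AI" -i max_new_tokens=256 -i temperature=0.5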
22 changes: 22 additions & 0 deletions cog.yaml
@@ -0,0 +1,22 @@
build:
  # set to true if your model requires a GPU
  gpu: true
  cuda: "11.8"
  python_version: "3.10"
  python_packages:
    - "torch==2.0.0"
    # - "transformers==4.31.0"
    - "accelerate==0.21.0"
    - "safetensors==0.3.1"
    - "auto_gptq"
    - "peft"
    - "numpy"
    - "rouge"
    - "datasets"

  run:
    - pip install git+https://github.com/huggingface/transformers
    - pip install xformers

# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
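
With this configuration in place, the container can also be built and exercised locally; a minimal sketch, assuming `llama-2-7b-chat-gptq` as an arbitrary local image name:

    cog build -t llama-2-7b-chat-gptq
    cog predict -i prompt="Tell me about AI"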
54 changes: 54 additions & 0 deletions predict.py
@@ -0,0 +1,54 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

from cog import BasePredictor, Input
import torch
from transformers import AutoTokenizer, pipeline
from auto_gptq import AutoGPTQForCausalLM

MODEL_NAME = "TheBloke/Llama-2-7b-Chat-GPTQ"
MODEL_BASENAME = "gptq_model-4bit-128g"
MODEL_CACHE = "cache"
use_triton = True

class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            use_fast=True,
            cache_dir=MODEL_CACHE
        )
        model = AutoGPTQForCausalLM.from_quantized(
            "Llama-2-7b-Chat-GPTQ",
            use_safetensors=True,
            device="cuda:0",
            use_triton=use_triton,
            quantize_config=None
        )
        # Pytorch 2 optimization
        self.model = torch.compile(model)

    def predict(
        self,
        prompt: str = Input(description="Prompt to send to Llama v2", default="Tell me about AI"),
        system_prompt: str = Input(description="System prompt that helps guide system behavior", default="You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."),
        max_new_tokens: int = Input(description="Number of new tokens", ge=1, le=4096, default=512),
        temperature: float = Input(description="Randomness of outputs, 0 is deterministic, greater than 1 is random", ge=0, le=5, default=0.75),
        # top_p: float = Input(description="When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens", ge=0.01, le=1, default=0.95),
        # repetition_penalty: float = Input(description="Penalty for repeated words in generated text; 1 is no penalty, values greater than 1 discourage repetition, less than 1 encourage it", ge=0, le=5, default=1.1),
    ) -> str:
        """Run a single prediction on the model"""
        prompt_template = f'''[INST] <<SYS>>
{system_prompt}
<</SYS>>
{prompt}[/INST]'''

        input_ids = self.tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
        outputs = self.model.generate(inputs=input_ids, temperature=temperature, max_new_tokens=max_new_tokens)
        output = self.tokenizer.decode(outputs[0])
        parts = output.split("[/INST]", 1)
        final = parts[1]

        return final
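
For reference, a minimal local smoke test of this Predictor might look as follows. This is an illustration only (not part of the commit) and assumes the Llama-2-7b-Chat-GPTQ weights are already present in the working directory and a CUDA GPU is available:

from predict import Predictor

predictor = Predictor()
predictor.setup()  # loads the tokenizer and the quantized model onto the GPU
answer = predictor.predict(
    prompt="Tell me about AI",
    system_prompt="You are a helpful, respectful and honest assistant.",
    max_new_tokens=128,
    temperature=0.75,
)
print(answer)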

30 changes: 30 additions & 0 deletions script/download-weights
@@ -0,0 +1,30 @@
#!/usr/bin/env python

import os
import sys
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# append project directory to path so predict.py can be imported
sys.path.append('.')

from predict import MODEL_NAME, MODEL_BASENAME, MODEL_CACHE
use_triton = True

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    cache_dir=MODEL_CACHE,
)

# model = AutoGPTQForCausalLM.from_quantized(
#     MODEL_NAME,
#     model_basename=MODEL_BASENAME,
#     use_safetensors=True,
#     trust_remote_code=True,
#     device="cuda:0",
#     use_triton=use_triton,
#     quantize_config=None,
# )
# model.save_quantized("cache-gptq", use_safetensors=True)
os.system("git clone --branch main https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ")
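
A hypothetical follow-up check (not in this commit) could confirm that the clone produced the quantized weights; the .safetensors file name below is an assumption based on MODEL_BASENAME in predict.py:

weights_path = os.path.join("Llama-2-7b-Chat-GPTQ", MODEL_BASENAME + ".safetensors")
if not os.path.exists(weights_path):
    raise RuntimeError(f"expected quantized weights at {weights_path}")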
