### Load the Original Mistral 7B model

In [1]:
import torch
import cat

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"  # device that the model and its inputs are moved onto

# Release unused memory cached by PyTorch's CUDA allocator before loading a model.
torch.cuda.empty_cache()

# Reference snippet (disabled): load the original, unquantized model.
# On AWS EC2 G5 instances pass torch_dtype=torch.bfloat16; otherwise loading runs out of memory.
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.bfloat16)
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# messages = [
#     {"role": "user", "content": "What is your favourite condiment?"},
#     {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
#     {"role": "user", "content": "Do you have mayonnaise recipes?"}
# ]

# encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

# model_inputs = encodeds.to(device)
# model.to(device)

# generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
# decoded = tokenizer.batch_decode(generated_ids)
# print(decoded[0])

## Load AWQ Model 

In [4]:
import os

# The Hugging Face tokenizers library reads this setting from the process
# environment; assigning a Python variable of the same name has no effect on it.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Kept so that any cell referring to the old notebook-level name still works.
TOKENIZERS_PARALLELISM = False

In [None]:
# !pip3 install git+https://github.com/huggingface/transformers.git@72958fcd3c98a7afdc61f953aa58c544ebda2f79

In [None]:
# !pip3 install git+https://github.com/casper-hansen/AutoAWQ.git@1c5ccc791fa2cb0697db3b4070df1813f1736208

In [5]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Identifier of the AWQ-quantized checkpoint; passed to both the model and the
# tokenizer loaders in the next cell.
model_name_or_path = "TheBloke/Mistral-7B-v0.1-AWQ"

In [6]:
# Load the quantized model weights (safetensors format, fused layers enabled,
# no remote code execution).
awq_model = AutoAWQForCausalLM.from_quantized(
    model_name_or_path,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True,
)

# Load the tokenizer published alongside the same checkpoint.
awq_tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=False,
)


Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

Replacing layers...: 100%|███████████████████████████████████████████████████████████████████| 32/32 [00:03<00:00,  9.31it/s]


In [7]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably this tokenizer ships without a pad token — confirm.
awq_tokenizer.pad_token = awq_tokenizer.eos_token

In [None]:
# messages = [
#     {"role": "user", "content": "What is your favourite condiment?"},
#     {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
#     {"role": "user", "content": "Do you have mayonnaise recipes?"}
# ]

# encodeds = awq_tokenizer.apply_chat_template(messages, return_tensors="pt")

# model_inputs = encodeds.to(device)

# awq_model.to(device)

# generated_ids = awq_model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
# decoded = awq_tokenizer.batch_decode(generated_ids)
# print(decoded[0])

### Log in to Hugging Face with an access token in order to read from / write to your repo

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

### Push the model to the repo

In [None]:
# awq_model.push_to_hub('AWQ4bit')

## Evaluation

In [8]:
import evaluate_awq
import argparse

In [None]:
# Command-line style options for the project-local `evaluate_awq` module,
# split on single spaces into an argv-like list.
# NOTE(review): `--model` names the original checkpoint while the AWQ model is
# passed in via `pass_model` — confirm how evaluate_awq uses the `--model` value.
argument = "--ntrain 5 --data_dir data --save_dir awq_results --model mistralai/Mistral-7B-v0.1".split(" ")

parser = evaluate_awq.get_parser()
args = parser.parse_args(argument)

# Run the evaluation against the AWQ model that is already loaded in this kernel.
accuracy_result = evaluate_awq.main(args, pass_model=awq_model)

In [9]:
import cat

In [11]:
# Overall accuracy, computed by the project-local `cat` helper from the
# evaluation results in `accuracy_result`; shown as this cell's output.
cat.get_overall_acc(accuracy_result)

0.6287719298245615


In [13]:
# Per-category accuracy from the same evaluation results. The call's return
# value is kept in `cat_acc`; the per-category lines appear in the cell output.
cat_acc = cat.get_cat_acc(accuracy_result)

STEM : 0.513
humanities : 0.674
social sciences : 0.727
other (business, health, misc.) : 0.651


## Training 

# Memory and latency benchmark

### Benchmark across different batch sizes

In [None]:
from optimum_benchmark.logging_utils import setup_logging
from optimum_benchmark.experiment import launch, ExperimentConfig
from optimum_benchmark.backends.pytorch.config import PyTorchConfig
from optimum_benchmark.launchers.torchrun.config import TorchrunConfig
from optimum_benchmark.benchmarks.inference.config import InferenceConfig

# Input shapes handed to every benchmark run; "batch_size" is overwritten per run.
# (The key was previously misspelled "seqeunce_length".)
input_shapes = {"batch_size": 1, "num_choices": 2, "sequence_length": 16}

# One list of report dicts per model variant, in ascending batch-size order.
l_ori = []
l_8bit = []
l_4bit = []

# Batch sizes benchmarked are batch_step, 2 * batch_step, ..., 9 * batch_step.
batch_step = 5


def _run_batch_size_sweep(results, repo_prefix, make_backend_config):
    """Benchmark one model variant over increasing batch sizes.

    For each batch size an inference benchmark (latency and memory) is launched
    through torchrun with one process per node, the report is pushed to the Hub
    repo ``repo_prefix + str(batch_size)``, and its dict form is appended to
    ``results``.

    The sweep stops at the first run that raises, keeping the reports collected
    so far. Unlike a bare ``except:``, the error is printed and a
    ``KeyboardInterrupt`` is not swallowed, so interrupting the kernel no longer
    silently moves on to the next sweep.

    Args:
        results: list that receives one report dict per successful run.
        repo_prefix: Hub repo id prefix; the batch size is appended to it.
        make_backend_config: zero-argument callable returning a fresh backend
            config for each run.
    """
    for i in range(1, 10):
        batch_size = i * batch_step
        input_shapes["batch_size"] = batch_size

        try:
            setup_logging(level="INFO")
            launcher_config = TorchrunConfig(nproc_per_node=1)
            # Pass a copy so each run owns its shapes; every sweep (including the
            # original-model one, which previously omitted it) gets the batch size.
            benchmark_config = InferenceConfig(latency=True, memory=True, input_shapes=dict(input_shapes))
            experiment_config = ExperimentConfig(
                experiment_name="api-launch",
                benchmark=benchmark_config,
                launcher=launcher_config,
                backend=make_backend_config(),
            )
            benchmark_report = launch(experiment_config)

            # experiment_config.push_to_hub(repo_prefix + str(batch_size))
            benchmark_report.push_to_hub(repo_prefix + str(batch_size))

            results.append(benchmark_report.to_dict())
        except Exception as exc:
            print(f"Stopping sweep for {repo_prefix} at batch size {batch_size}: {exc!r}")
            break


# 4-bit bitsandbytes quantization of the original checkpoint.
_run_batch_size_sweep(
    l_4bit,
    "AwAppp/benchmarks_4bit_batch_size",
    lambda: PyTorchConfig(model="mistralai/Mistral-7B-v0.1", device="cuda", device_ids="0",
                          torch_dtype="bfloat16", quantization_scheme="bnb",
                          quantization_config={"load_in_4bit": True}),
)

# Original bfloat16 model, benchmarked with no_weights=True.
_run_batch_size_sweep(
    l_ori,
    "AwAppp/benchmark_original_batch_size",
    lambda: PyTorchConfig(model="mistralai/Mistral-7B-v0.1", device="cuda", device_ids="0",
                          no_weights=True, torch_dtype='bfloat16'),
)

# 8-bit bitsandbytes quantization of the AwAppp/q8bit checkpoint.
_run_batch_size_sweep(
    l_8bit,
    "AwAppp/benchmarks_8bit_batch_size",
    lambda: PyTorchConfig(model="AwAppp/q8bit", device="cuda", device_ids="0",
                          torch_dtype="bfloat16", quantization_scheme="bnb",
                          quantization_config={"load_in_8bit": True}),
)

# experiment_config.push_to_hub("IlyasMoutawwakil/benchmarks") # pushes experiment_config.json to the hub
# benchmark_report.push_to_hub("IlyasMoutawwakil/benchmarks") # pushes benchmark_report.json to the hub

### Plot of the prefill benchmark

In [None]:
# read the test result from hugging face
from huggingface_hub import hf_hub_download
import json

# Must match the step used when the benchmarks were run and pushed.
batch_step = 5


def _load_reports(base_repo_name):
    """Download and parse the benchmark report for every benchmarked batch size.

    Fetches ``benchmark_report.json`` from the Hub repos
    ``base_repo_name + str(batch_size)`` for batch sizes batch_step,
    2 * batch_step, ..., 9 * batch_step.

    The file is opened from the path returned by ``hf_hub_download`` instead of
    forcing every download onto the same ``./benchmark_report.json``, so reports
    from different repos never share one local file.

    Args:
        base_repo_name: Hub repo id prefix; the batch size is appended to it.

    Returns:
        List of parsed JSON reports in ascending batch-size order.
    """
    reports = []
    for i in range(1, 10):
        batch_size = i * batch_step
        report_path = hf_hub_download(repo_id=base_repo_name + str(batch_size),
                                      filename="benchmark_report.json")
        with open(report_path, encoding="utf-8") as json_file:
            reports.append(json.load(json_file))
    return reports


l_ori = _load_reports("AwAppp/benchmark_original_batch_size")
l_8bit = _load_reports("AwAppp/benchmarks_8bit_batch_size")
l_4bit = _load_reports("AwAppp/benchmarks_4bit_batch_size")

In [None]:
import matplotlib.pyplot as plt

# Batch sizes used as tick labels; the curves are plotted against their index.
x = [multiple * batch_step for multiple in range(1, 10)]

plt.figure(figsize=(8, 10))

# Panel 1 of 3: mean prefill latency per model variant.
prefill_latency_ori = [report['prefill']['latency']['mean'] for report in l_ori]
prefill_latency_8bit = [report['prefill']['latency']['mean'] for report in l_8bit]
prefill_latency_4bit = [report['prefill']['latency']['mean'] for report in l_4bit]
plt.subplot(311)
plt.xticks(range(len(x)), x)
plt.xlabel('batch size')
plt.ylabel('prefill latency')
for series, label in (
    (prefill_latency_ori, '7B latency'),
    (prefill_latency_8bit, 'INT8 7B latency'),
    (prefill_latency_4bit, 'INT4 7B latency'),
):
    plt.plot(series, label=label)
plt.legend()

# Panel 2 of 3: prefill throughput per model variant.
prefill_throughput_ori = [report['prefill']['throughput']['value'] for report in l_ori]
prefill_throughput_8bit = [report['prefill']['throughput']['value'] for report in l_8bit]
prefill_throughput_4bit = [report['prefill']['throughput']['value'] for report in l_4bit]
plt.subplot(312)
plt.xticks(range(len(x)), x)
plt.xlabel('batch size')
plt.ylabel('prefill throughput')
for series, label in (
    (prefill_throughput_ori, '7B throughput'),
    (prefill_throughput_8bit, 'INT8 7B throughput'),
    (prefill_throughput_4bit, 'INT4 7B throughput'),
):
    plt.plot(series, label=label)
plt.legend()

# Panel 3 of 3: `max_vram` reported for the prefill phase per model variant.
prefill_vram_ori = [report['prefill']['memory']['max_vram'] for report in l_ori]
prefill_vram_8bit = [report['prefill']['memory']['max_vram'] for report in l_8bit]
prefill_vram_4bit = [report['prefill']['memory']['max_vram'] for report in l_4bit]
plt.subplot(313)
plt.xticks(range(len(x)), x)
plt.xlabel('batch size')
plt.ylabel('prefill vRAM')
for series, label in (
    (prefill_vram_ori, '7B vram'),
    (prefill_vram_8bit, 'INT8 7B vram'),
    (prefill_vram_4bit, 'INT4 7B vram'),
):
    plt.plot(series, label=label)
plt.legend()

### Plot of the decode benchmark

In [None]:
# Three stacked panels for the decode phase: latency, throughput and vRAM.
# Relies on `x` (batch-size tick labels) and the report lists `l_ori`,
# `l_8bit`, `l_4bit` defined in earlier cells.
plt.figure(figsize=(8, 10))

# Panel 1 of 3: mean decode latency per model variant.
plt.subplot(311)
plt.xticks(range(len(x)), x)
plt.xlabel('batch size')
plt.ylabel('decode latency')
decode_latency_ori = [report['decode']['latency']['mean'] for report in l_ori]
decode_latency_8bit = [report['decode']['latency']['mean'] for report in l_8bit]
decode_latency_4bit = [report['decode']['latency']['mean'] for report in l_4bit]
plt.plot(decode_latency_ori, label='7B latency')
plt.plot(decode_latency_8bit, label='INT8 7B latency')
plt.plot(decode_latency_4bit, label='INT4 7B latency')
plt.legend()

# Panel 2 of 3: decode throughput per model variant.
plt.subplot(312)
plt.xticks(range(len(x)), x)
plt.xlabel('batch size')
plt.ylabel('decode throughput')
decode_throughput_ori = [report['decode']['throughput']['value'] for report in l_ori]
decode_throughput_8bit = [report['decode']['throughput']['value'] for report in l_8bit]
decode_throughput_4bit = [report['decode']['throughput']['value'] for report in l_4bit]
plt.plot(decode_throughput_ori, label='7B throughput')
plt.plot(decode_throughput_8bit, label='INT8 7B throughput')
plt.plot(decode_throughput_4bit, label='INT4 7B throughput')
plt.legend()

# Panel 3 of 3: `max_vram` reported for the decode phase per model variant.
plt.subplot(313)
plt.xticks(range(len(x)), x)
plt.xlabel('batch size')
# This panel plots decode memory; the label previously read 'prefill vRAM'.
plt.ylabel('decode vRAM')
decode_vram_ori = [report['decode']['memory']['max_vram'] for report in l_ori]
decode_vram_8bit = [report['decode']['memory']['max_vram'] for report in l_8bit]
decode_vram_4bit = [report['decode']['memory']['max_vram'] for report in l_4bit]
plt.plot(decode_vram_ori, label='7B vram')
plt.plot(decode_vram_8bit, label='INT8 7B vram')
plt.plot(decode_vram_4bit, label='INT4 7B vram')
plt.legend()