In [1]:
# Note: this notebook uses version 1.7 of DeepSparse and SparseML

from sparseml.transformers import oneshot, SparseAutoModel, SparseAutoModelForCausalLM
from datasets import load_dataset
from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from typing import Union
from evaluate import evaluator
from sparseml import export
import sparseml.core.session as session_manager

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Setup

# Identifiers for the checkpoint and dataset used throughout the notebook
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
dataset_name = "tweet_eval"
dataset_subname = "sentiment"

# Number of training examples kept for calibration
num_calibration_samples = 512

# Config and tokenizer of the baseline checkpoint
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Calibration set: shuffled train split truncated to num_calibration_samples rows
dataset_train = (
    load_dataset(dataset_name, dataset_subname, split="train")
    .shuffle(seed=69)
    .select(range(num_calibration_samples))
)

# Evaluation set: the whole shuffled test split
# (append .select(range(500)) here for a quicker run on a subset)
dataset_test = load_dataset(
    dataset_name, dataset_subname, split="test"
).shuffle(seed=420)

In [3]:
# One-shot recipe passed to `oneshot` as a YAML string.
#
# - QuantizationModifier: skips `classifier`, `LayerNorm` and `GELUActivation`;
#   Embedding weights are 8-bit asymmetric with no input-activation scheme;
#   Linear layers use 8-bit asymmetric input activations and 8-bit symmetric
#   weights.
# - SparseGPTModifier: sparsity 0.0 with quantize enabled, targeting the
#   modules whose names match the `roberta.encoder.layer.<N>` regex.
#
# The string is raw so the regex backslashes reach the YAML parser verbatim:
# the text contains `\\d`, which the YAML double-quoted scalar decodes to the
# regex token `\d`. In a non-raw Python string the same text had to be spelled
# `\\\d`, whose trailing `\d` is an invalid escape sequence that newer Python
# versions warn about.
recipe = r"""
test_stage:
  obcq_modifiers:
    QuantizationModifier:
      ignore:
      - classifier
      - LayerNorm
      - GELUActivation
      scheme_overrides:
        Embedding:
          input_activations: null
          weights:
            num_bits: 8
            symmetric: false
        Linear:
          input_activations:
            num_bits: 8
            symmetric: false
          weights:
            num_bits: 8
            symmetric: true
    SparseGPTModifier:
      sparsity: 0.0
      quantize: true
      targets: ["re:roberta.encoder.layer.\\d+$"]
"""

In [4]:
### Apply One-Shot

def format_data(data):
    """Return the example's text plus its label under the key "labels".

    Reads `data["text"]` and `data["label"]` and returns a new dict with the
    keys "text" and "labels".
    """
    text, label = data["text"], data["label"]
    return {"text": text, "labels": label}


# Fresh copy of the baseline checkpoint to be compressed
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Apply the recipe in one shot, calibrating on the training subset.
# The result is written to ./oneshot_output, which the evaluation,
# recipe-printing and export cells read back.
oneshot(
    model=model,
    dataset=dataset_train,
    recipe=recipe,
    preprocessing_func=format_data,
    output_dir="./oneshot_output",
    pad_to_max_length=False,
    num_calibration_samples=num_calibration_samples,
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Logging all SparseML modifier-level logs to sparse_logs/14-05-2024_15.22.43.log
2024-05-14 15:22:43 sparseml.core.logger.logger INFO     Logging all SparseML modifier-level logs to sparse_logs/14-05-2024_15.22.43.log
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
2

['label', 'labels', 'input_ids', 'attention_mask']
{'labels': tensor([2], device='cuda:0'), 'input_ids': tensor([[    0,   113,   387, 11702,   324,  4966, 12019,  4428, 13910,   205,
           112,   620,   631,    11,     5,   475,  4244,     4,  3180,  7428,
            53,    11,    10,   203,   357,  6711,     4,  1009, 26003,  1420,
            11,     5,   935,     6,   101,    38,   437,    10,  1528,   310,
           102,  3226,   113,     2]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}


test_stage:
  obcq_modifiers:
    QuantizationModifier:
      ignore:
      - classifier
      - LayerNorm
      - GELUActivation
      scheme_overrides:
        Embedding:
          input_activations: null
          weights:
            num_bits: 8
            symmetric: false
        Linear:
          input_activations:
            num_bits: 8
            symmetric: false
          weights:
            num_bits: 8
            symmetric: true
    SparseGPTModifier:
      sparsity: 0.0
      quantize: true
      targets: ["re:roberta.encoder.layer.\\d+$"]

2024-05-14 15:22:44 sparseml.modifiers.quantization.pytorch INFO     Running QuantizationModifier calibration with 512 samples...
100%|██████████| 512/512 [00:37<00:00, 13.61it/s]
2024-05-14 15:23:21 sparseml.modifiers.pruning.wanda.pytorch INFO     Preparing roberta.encoder.layer.0 for compression
2024-05-14 15:23:21 sparseml.modifiers.pruning.wanda.pytorch INFO     Preparing roberta.encoder.layer.1 for compression
2024-05-14 15:23:

In [10]:
### Evaluate

# Fetch the currently active SparseML session and reset it before loading the
# saved model. NOTE(review): presumably this clears state left over from the
# one-shot run so the recipe can be re-applied on load — confirm.
active_session = session_manager.active_session()
active_session.reset()

def evaluate_model(model: Union[str, AutoModel]):
    """Measure accuracy of a text-classification model on `dataset_test`.

    Args:
        model: either a model name/path string or an already-loaded model;
            it is handed to the evaluator as `model_or_pipeline`.

    Returns:
        Whatever the "text-classification" evaluator's `compute` returns.

    Relies on the notebook-level `tokenizer`, `dataset_test` and `config`;
    `config.label2id` is used as the label mapping.
    """
    return evaluator("text-classification").compute(
        model_or_pipeline=model,
        tokenizer=tokenizer,
        data=dataset_test,
        metric="accuracy",
        label_mapping=config.label2id,
    )

# Load the model saved by the one-shot run; `m` is also inspected by later cells.
m = SparseAutoModel.text_classification_from_pretrained("./oneshot_output")
# Evaluate the loaded model object, then the original checkpoint by its name.
eval_quant = evaluate_model(m)
eval_baseline = evaluate_model(model_name)

# NOTE(review): in the recorded run both results report exactly the same
# accuracy — confirm the quantized weights are really the ones being evaluated.
print(f"Evaluation quantized model:\n{eval_quant}")
print(f"Evaluation baseline model:\n{eval_baseline}")



- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-05-14 15:44:23 sparseml.transformers.utils.helpers INFO     Found recipe in the model_path: ./oneshot_output/recipe.yaml
2024-05-14 15:44:23 sparseml.core.recipe.recipe INFO     Loading recipe from file ./oneshot_output/recipe.yaml
manager stage: Model structure initialized
2024-05-14 15:44:23 sparseml.pytorch.model_load.helpers INFO     Applied an unstaged recipe to the model at ./

Evaluation quantized model:
{'accuracy': 0.7218332790621947, 'total_time_in_seconds': 296.39042346703354, 'samples_per_second': 41.445333679501644, 'latency_in_seconds': 0.02412816863131175}
Evaluation baseline model:
{'accuracy': 0.7218332790621947, 'total_time_in_seconds': 73.40699948498514, 'samples_per_second': 167.34099045299627, 'latency_in_seconds': 0.005975822165824254}


In [6]:
# Show the recipe.yaml stored in ./oneshot_output
with open("./oneshot_output/recipe.yaml", mode="r") as recipe_file:
    print(recipe_file.read())

test_stage:
  obcq_modifiers:
    QuantizationModifier:
      ignore: [classifier, LayerNorm, GELUActivation]
      scheme_overrides:
        Embedding:
          input_activations: null
          weights: {num_bits: 8, symmetric: false}
        Linear:
          input_activations: {num_bits: 8, symmetric: false}
          weights: {num_bits: 8, symmetric: true}
    SparseGPTModifier:
      sparsity: 0.0
      quantize: true
      targets: ['re:roberta.encoder.layer.\d+$']



In [7]:
# Bare expression: shows the repr of the model loaded from ./oneshot_output
m

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(
        50265, 768, padding_idx=1
        (activation_post_process): Identity()
        (weight_fake_quant): FakeQuantizeWrapper(
          fake_quant_enabled=tensor([1], device='cuda:0', dtype=torch.uint8), observer_enabled=tensor([0], device='cuda:0', dtype=torch.uint8), quant_min=-128, quant_max=127, dtype=torch.qint8, qscheme=torch.per_tensor_affine, ch_axis=-1, scale=tensor([0.0109], device='cuda:0'), zero_point=tensor([-3], device='cuda:0', dtype=torch.int32)
          (activation_post_process): MovingAverageMinMaxObserver(min_val=-1.3561680316925049, max_val=1.420897126197815)
        )
      )
      (position_embeddings): Embedding(
        514, 768, padding_idx=1
        (activation_post_process): Identity()
        (weight_fake_quant): FakeQuantizeWrapper(
          fake_quant_enabled=tensor([1], device='cuda:0', dtype=torch.uint8), observer_en

In [8]:
# List the qualified name of every submodule of the loaded model
for module_name, _module in m.named_modules():
    print(module_name)


roberta
roberta.embeddings
roberta.embeddings.word_embeddings
roberta.embeddings.word_embeddings.activation_post_process
roberta.embeddings.word_embeddings.weight_fake_quant
roberta.embeddings.word_embeddings.weight_fake_quant.activation_post_process
roberta.embeddings.position_embeddings
roberta.embeddings.position_embeddings.activation_post_process
roberta.embeddings.position_embeddings.weight_fake_quant
roberta.embeddings.position_embeddings.weight_fake_quant.activation_post_process
roberta.embeddings.token_type_embeddings
roberta.embeddings.token_type_embeddings.activation_post_process
roberta.embeddings.token_type_embeddings.weight_fake_quant
roberta.embeddings.token_type_embeddings.weight_fake_quant.activation_post_process
roberta.embeddings.LayerNorm
roberta.embeddings.dropout
roberta.encoder
roberta.encoder.layer
roberta.encoder.layer.0
roberta.encoder.layer.0.attention
roberta.encoder.layer.0.attention.self
roberta.encoder.layer.0.attention.self.query
roberta.encoder.layer.0.

In [9]:
### Export
# Export the one-shot output directory into ./oneshot_deployment
export(
    source_path="./oneshot_output",
    target_path="./oneshot_deployment",
    task="text-classification",
)

2024-05-14 15:30:14 sparseml.export.export INFO     Starting export for transformers model...
2024-05-14 15:30:14 sparseml.transformers.integration_helper_functions INFO     Fetching default helper functions for transformers integration
2024-05-14 15:30:14 sparseml.export.export INFO     Creating model for the export...


- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-05-14 15:30:15 sparseml.transformers.utils.helpers INFO     Found recipe in the model_path: /root/sparseml/oneshot_output/recipe.yaml
2024-05-14 15:30:15 sparseml.core.recipe.recipe INFO     Loading recipe from file /root/sparseml/oneshot_output/recipe.yaml
manager stage: Model structure initialized
2024-05-14 15:30:15 sparseml.pytorch.model_load.helpers INFO     Applied an unstaged