In [19]:
import torch

# Target device for inference; later cells pass this object to `.to(device)`
# when loading the quantized model and when moving tokenized inputs.
device = torch.device("cpu")

In [None]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer
import torch

# Local checkpoint directory to convert, and the directory that receives the ONNX artifacts.
model_id = "./sougou_test_trainer_256/checkpoint-96"
onnx_path = "./sougou_test_trainer_256/onnx_256"

# Load the Transformers checkpoint and convert it to ONNX while loading.
# `export=True` is the current spelling of this option; the older
# `from_transformers=True` flag is deprecated in Optimum.
model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Save the ONNX model and the tokenizer into the same directory so that later
# cells can reload both from `onnx_path`.
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

In [2]:
from transformers import pipeline

# Baseline text-classification pipeline built on `model` and `tokenizer`
# (presumably the un-optimized ONNX export created in the conversion cell —
# confirm against the kernel state), followed by a smoke test on one sample text.
vanilla_clf = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)
vanilla_clf(
    "这期节目继续关注中国篮球的话题。众所周知，我们已经结束了男篮世界杯的所有赛程，一胜四负的一个成绩，甚至比上一届的世界杯成绩还要差。因为这一次我们连奥运会落选赛也都没有资格参加，所以，连续两次错过了巴黎奥运会的话，对于中国篮协，还有对于姚明来说，确实成为了他任职的一个最大的败笔。对于球迷非常关注的一个话题，乔尔杰维奇是否下课，可能对于这个悬念来说也都是暂时有答案了。"
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


[{'label': 'LABEL_0', 'score': 0.9963239431381226}]

In [3]:
# Second smoke test of the baseline pipeline on a different sample text;
# the prediction is the cell's displayed value.
vanilla_clf(
    "9月4日上午，参加中泰“蓝色突击-2023”海军联合训练的双方海军陆战分队，在泰军陆战队轻武器射击场展开冲锋枪、手枪实弹射击，标志着本次联训活动全面展开。按照本次双方达成的联训计划，联训双方在混合编组、示范讲解、相互体验对方武器后，随即在陆上、水面等多个场地同时展开装甲战术、防化、土制炸弹搜排、机降等课目训练。在实弹射击现场，记者看到中泰两军队员统一编组，每组各5名队员上场射击，在规定的时间内完成立姿、跪姿，向左向右转向的轻武器射击。"
)

[{'label': 'LABEL_2', 'score': 0.9959322810173035}]

In [4]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# Build an optimizer around the exported model; optimization level 99 enables
# all available optimizations.
optimizer = ORTOptimizer.from_pretrained(model)
optimization_config = OptimizationConfig(optimization_level=99)

# Run the optimization and write the result into the ONNX output directory.
# The call's return value is left as the cell's displayed output.
optimizer.optimize(optimization_config=optimization_config, save_dir=onnx_path)

Optimizing model...
Configuration saved in sougou_test_trainer_256\onnx_256\ort_config.json
Optimized model saved at: sougou_test_trainer_256\onnx_256 (external data format: False; saved all tensor to one file: True)


WindowsPath('sougou_test_trainer_256/onnx_256')

In [6]:
from transformers import pipeline

# Reload the graph-optimized ONNX file from the output directory.
onnx_model = ORTModelForSequenceClassification.from_pretrained(
    onnx_path,
    file_name="model_optimized.onnx",
)

# Wrap the optimized model in a text-classification pipeline and classify one
# sample text; the prediction is the cell's displayed value.
optimized_clf = pipeline(
    task="text-classification",
    model=onnx_model,
    tokenizer=tokenizer,
)
optimized_clf(
    "今年7月，教育部等四部门联合印发了《关于在深化非学科类校外培训治理中加强艺考培训规范管理的通知》（以下简称《通知》）。《通知》针对近年来校外艺术培训的状况而发布，并从源头就校外艺术培训机构的“培训主体、从业人员、招生行为、安全底线”等方面进行严格规范。校外艺术培训之所以火热，主要在于高中阶段艺术教育发展迟滞于学生需求。分析教育部数据，2021年艺术学科在校生占比为9.84%，高于2020年的9.73%；2020至2021年艺术学科在校生的年增长率为5.04%，远高于4.28%的总在校生年增长率。增长的数据，是近年来艺考招生连年火热的缩影，在未来一段时间内，艺考或将在全国范围内继续保持高热度。"
)


[{'label': 'LABEL_3', 'score': 0.9926980137825012}]

In [7]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# Dynamic (is_static=False), non-per-channel quantization settings from the AVX2 preset.
dqconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)

# Quantizer bound to the already-optimized ONNX model.
dynamic_quantizer = ORTQuantizer.from_pretrained(onnx_model)

# Quantize and write the result next to the other ONNX artifacts; keep the
# value returned by `quantize` for later reference.
model_quantized_path = dynamic_quantizer.quantize(
    quantization_config=dqconfig,
    save_dir=onnx_path,
)


Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/u8, channel-wise: False)
Quantizing model...
Saving quantized model at: sougou_test_trainer_256\onnx_256 (external data format: False)
Configuration saved in sougou_test_trainer_256\onnx_256\ort_config.json


In [8]:
import os

# Compare the on-disk size (in MB) of the optimized model and its quantized counterpart.
# The quantized size is kept in `quantized_size` rather than `quantized_model`:
# `quantized_model` is the name a later cell binds to the loaded ORT model, and
# reusing it for a float here would break that cell's consumers whenever this
# cell is re-run after the model has been loaded.
size = os.path.getsize(os.path.join(onnx_path, "model_optimized.onnx")) / (1024 * 1024)
quantized_size = os.path.getsize(os.path.join(onnx_path, "model_optimized_quantized.onnx")) / (1024 * 1024)

print(f"Model file size: {size:.2f} MB")
print(f"Quantized Model file size: {quantized_size:.2f} MB")

Model file size: 390.17 MB
Quantized Model file size: 97.98 MB


In [20]:
# load the quantized model
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer

# Load the quantized ONNX file from the output directory and place it on `device`.
quantized_model = ORTModelForSequenceClassification.from_pretrained(
    onnx_path,
    file_name="model_optimized_quantized.onnx",
).to(device)

# Reload the tokenizer from the same directory the ONNX artifacts live in.
tokenizer = AutoTokenizer.from_pretrained(onnx_path)

The ONNX file model_optimized_quantized.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


In [10]:
import pandas as pd

# Load the test CSV; the evaluation loops later in the notebook read its
# `text` and `label` columns row by row.
test_df = pd.read_csv("./data/sougou/test.csv")

In [32]:
# evaluate the original (non-quantized) model
import numpy as np
import time

# Evaluate `model` (the non-quantized baseline) on the test set one row at a
# time, collecting predictions and per-row latency.
#
# Timing uses time.perf_counter(): a monotonic, high-resolution clock intended
# for measuring short durations, unlike time.time(), which follows the system
# clock and can jump when it is adjusted.
cost_time_list = []  # per-row latency in milliseconds
s_time = time.perf_counter()
true_labels, pred_labels = [], []
for i, row in test_df.iterrows():
    row_s_time = time.perf_counter()
    true_labels.append(row["label"])
    encoded_text = tokenizer(row['text'], max_length=256, truncation=True, padding=True, return_tensors='pt')
    logits = model(**encoded_text)
    # The first output element holds the logits; argmax over axis 1 gives the
    # predicted class id, and [0] selects the single row that was passed in.
    label_id = np.argmax(logits[0].detach().numpy(), axis=1)[0]
    pred_labels.append(label_id)
    # Measure once so the logged latency is exactly the value that feeds the
    # percentile statistics below.
    row_cost_ms = (time.perf_counter() - row_s_time) * 1000
    cost_time_list.append(row_cost_ms)
    print(i, row_cost_ms, label_id)

# "avg time" is total wall-clock time per row (it includes loop and print
# overhead); the percentiles are computed from the per-row measurements.
print("avg time:", (time.perf_counter() - s_time) * 1000 / test_df.shape[0])
print("P50 time:", np.percentile(np.array(cost_time_list), 50))
print("P95 time:", np.percentile(np.array(cost_time_list), 95))

0 266.28804206848145 0
1 265.2897834777832 0
2 251.33323669433594 0
3 259.30285453796387 0
4 259.3092918395996 0
5 238.36255073547363 0
6 251.3282299041748 0
7 278.2611846923828 0
8 270.27368545532227 0
9 272.27258682250977 0
10 364.0255928039551 0
11 475.73208808898926 0
12 469.7413444519043 0
13 497.67327308654785 0
14 469.7418212890625 0
15 502.657413482666 0
16 552.5238513946533 0
17 487.69617080688477 0
18 461.7640972137451 0
19 437.83068656921387 0
20 487.6973628997803 0
21 468.75452995300293 0
22 615.3562068939209 0
23 555.518388748169 0
24 551.5255928039551 0
25 493.6814308166504 0
26 407.9113006591797 0
27 449.68342781066895 0
28 418.87831687927246 0
29 346.07386589050293 0
30 370.0118064880371 0
31 464.76197242736816 0
32 443.81260871887207 0
33 461.7633819580078 0
34 561.5012645721436 0
35 639.2879486083984 0
36 417.88244247436523 0
37 301.19776725769043 0
38 537.5597476959229 0
39 431.84852600097656 0
40 527.5919437408447 0
41 490.6880855560303 0
42 545.5381870269775 0
43 5

343 448.80080223083496 3
344 448.80008697509766 3
345 494.6784973144531 0
346 469.7451591491699 3
347 325.131893157959 3
348 463.7606143951416 3
349 456.78234100341797 3
350 497.67041206359863 3
351 556.5090179443359 3
352 477.7259826660156 3
353 369.01307106018066 3
354 488.6951446533203 3
355 496.6714382171631 3
356 485.7010841369629 3
357 464.76078033447266 3
358 482.7096462249756 3
359 441.82300567626953 3
360 438.8251304626465 3
361 517.6148414611816 3
362 492.68245697021484 3
363 478.72018814086914 3
364 556.9519996643066 3
365 519.1981792449951 3
366 608.0160140991211 3
367 512.6278400421143 3
368 518.6138153076172 3
369 618.5324192047119 3
370 618.9062595367432 3
371 410.9005928039551 3
372 455.81889152526855 3
373 424.8650074005127 3
374 494.7049617767334 3
375 479.7511100769043 3
376 494.71354484558105 3
377 470.703125 3
378 477.72693634033203 3
379 466.79186820983887 3
380 446.8052387237549 3
381 455.7766914367676 3
382 511.63268089294434 3
383 484.7452640533447 3
384 392.95

In [33]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 (4 decimal places) for whichever evaluation loop
# most recently filled `true_labels` / `pred_labels`. Both evaluation cells write
# to these same names, so run this immediately after the loop it should report on.
print(classification_report(true_labels, pred_labels, digits=4))

              precision    recall  f1-score   support

           0     0.9802    1.0000    0.9900        99
           1     0.9789    0.9394    0.9588        99
           2     0.9900    1.0000    0.9950        99
           3     0.9223    0.9596    0.9406        99
           4     0.9896    0.9596    0.9744        99

    accuracy                         0.9717       495
   macro avg     0.9722    0.9717    0.9717       495
weighted avg     0.9722    0.9717    0.9717       495



In [30]:
import numpy as np
import time

# Evaluate `quantized_model` on the test set one row at a time, collecting
# predictions and per-row latency.
#
# NOTE: results are written to `true_labels` / `pred_labels`, the same names the
# baseline evaluation cell uses. Run the classification report right after this
# cell, otherwise the report reflects whichever evaluation ran last.
#
# Timing uses time.perf_counter(): a monotonic, high-resolution clock intended
# for measuring short durations, unlike time.time(), which follows the system
# clock and can jump when it is adjusted.
cost_time_list = []  # per-row latency in milliseconds
s_time = time.perf_counter()
true_labels, pred_labels = [], []
for i, row in test_df.iterrows():
    row_s_time = time.perf_counter()
    true_labels.append(row["label"])
    encoded_text = tokenizer(row['text'], max_length=256, truncation=True, padding=True, return_tensors='pt').to(device)
    logits = quantized_model(**encoded_text)
    # The first output element holds the logits; argmax over axis 1 gives the
    # predicted class id, and [0] selects the single row that was passed in.
    label_id = np.argmax(logits[0].detach().numpy(), axis=1)[0]
    pred_labels.append(label_id)
    # Measure once so the logged latency is exactly the value that feeds the
    # percentile statistics below.
    row_cost_ms = (time.perf_counter() - row_s_time) * 1000
    cost_time_list.append(row_cost_ms)
    print(i, row_cost_ms, label_id)

# "avg time" is total wall-clock time per row (it includes loop and print
# overhead); the percentiles are computed from the per-row measurements.
print("avg time:", (time.perf_counter() - s_time) * 1000 / test_df.shape[0])
print("P50 time:", np.percentile(np.array(cost_time_list), 50))
print("P95 time:", np.percentile(np.array(cost_time_list), 95))

0 224.39861297607422 0
1 195.4786777496338 0
2 216.42160415649414 0
3 188.49682807922363 0
4 218.41883659362793 0
5 217.41938591003418 0
6 215.4245376586914 0
7 198.46844673156738 0
8 289.2274856567383 0
9 329.12421226501465 0
10 343.0807590484619 0
11 371.0131645202637 0
12 395.94101905822754 0
13 384.9761486053467 0
14 370.00584602355957 0
15 350.0640392303467 0
16 346.07481956481934 0
17 371.01197242736816 0
18 363.02638053894043 0
19 348.0691909790039 0
20 377.9919147491455 0
21 328.1214237213135 0
22 357.04874992370605 0
23 365.0205135345459 0
24 372.0111846923828 0
25 384.97114181518555 0
26 325.1299858093262 0
27 374.00364875793457 0
28 317.1520233154297 0
29 282.2449207305908 0
30 321.16079330444336 0
31 386.9659900665283 0
32 366.02139472961426 0
33 394.9465751647949 0
34 379.9870014190674 0
35 333.10651779174805 0
36 327.12578773498535 0
37 203.45687866210938 0
38 381.97803497314453 0
39 346.07481956481934 0
40 360.04137992858887 0
41 362.0312213897705 0
42 346.07720375061035

341 391.9398784637451 3
342 370.0103759765625 3
343 360.04066467285156 3
344 326.1291980743408 3
345 353.0542850494385 0
346 383.9700222015381 3
347 260.3030204772949 3
348 353.0585765838623 3
349 370.0098991394043 3
350 346.07625007629395 3
351 436.8321895599365 3
352 385.96653938293457 4
353 280.25102615356445 3
354 352.05864906311035 3
355 353.0569076538086 3
356 352.05888748168945 3
357 378.9849281311035 3
358 355.05175590515137 3
359 339.0927314758301 3
360 343.0821895599365 3
361 367.0196533203125 3
362 387.96186447143555 3
363 407.909631729126 3
364 450.79874992370605 3
365 376.98960304260254 3
366 361.0363006591797 3
367 327.12292671203613 3
368 378.98707389831543 3
369 332.11231231689453 3
370 386.96742057800293 3
371 257.3127746582031 3
372 330.11460304260254 3
373 380.9845447540283 3
374 396.93641662597656 3
375 387.9661560058594 3
376 341.08638763427734 3
377 359.04479026794434 3
378 364.0255928039551 3
379 366.0242557525635 3
380 371.0064888000488 3
381 405.9147834777832 3

In [34]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 (4 decimal places) for the predictions currently
# held in `true_labels` / `pred_labels`.
# NOTE(review): the recorded output of this cell is probably NOT the quantized
# model's report. The execution counts show the quantized loop ran as In [30],
# the baseline loop as In [32], and this cell as In [34]; both loops overwrite
# the same two lists, so this call saw the baseline predictions. The printed
# report is identical to the baseline one even though the per-row logs differ
# (e.g. row 352 is predicted 3 by the baseline and 4 by the quantized model).
# Re-run the quantized evaluation cell immediately before this cell.
print(classification_report(true_labels, pred_labels, digits=4))

              precision    recall  f1-score   support

           0     0.9802    1.0000    0.9900        99
           1     0.9789    0.9394    0.9588        99
           2     0.9900    1.0000    0.9950        99
           3     0.9223    0.9596    0.9406        99
           4     0.9896    0.9596    0.9744        99

    accuracy                         0.9717       495
   macro avg     0.9722    0.9717    0.9717       495
weighted avg     0.9722    0.9717    0.9717       495

