# huggingface demo repo with bert base chinese 

primary demo for `AutoModelForSequenceClassification`

In [1]:
import os
import torch
import pandas as pd
import evaluate
import numpy as np
import joblib
from datasets import Dataset, concatenate_datasets, load_dataset

from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    pipeline, AutoModel 
)

2025-09-30 14:54:46.473405: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-30 14:54:46.518206: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


defining model

In [2]:
MODEL = 'google-bert/bert-base-chinese'
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
model = AutoModelForSequenceClassification.from_pretrained(
            MODEL, 
            num_labels=14,
            trust_remote_code=True, 
            ).to(DEVICE)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

tokenizer & data collator

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer

BertTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [5]:
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer
    )
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

data loading, tokenizing, sorting labels, encoding

In [6]:
# Methods and `google-bert/bert-base-chinese`
dataset = load_dataset(
    'parquet', 
    data_files={
        'train': 'data/train_zh.parquet',
        'test': 'data/test_zh.parquet'
    })
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 4917
    })
    test: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 258
    })
})

In [7]:
# dataset = dataset.sort(column_names='label')
# dataset['train']['label']

In [8]:
# encoded_dataset = dataset.class_encode_column('label')
# encoded_dataset

In [9]:
def tokenize_dataset(dataset):
    return tokenizer(
        dataset["text"],
        max_length=tokenizer.model_max_length, 
        truncation=True,
        padding='max_length')

In [10]:
# tok_dataset = encoded_dataset.map(tokenize_dataset, batched=True)
tok_dataset = dataset.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/258 [00:00<?, ? examples/s]

In [11]:
tok_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4917
    })
    test: Dataset({
        features: ['label', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 258
    })
})

In [12]:
# Before creating the model, check your labels
unique_labels = set(dataset['train']['label'])
num_labels = len(unique_labels)
print(f"Number of unique labels: {num_labels}")
print(f"Label values: {sorted(unique_labels)}")


Number of unique labels: 14
Label values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


defining data collator, training args, and trainer

In [13]:
training_args = TrainingArguments(
    output_dir='train/demo',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_dataset['train'],
    eval_dataset=tok_dataset['test'],
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [15]:
trainer.train()

Step,Training Loss
500,0.5816
1000,0.3079


TrainOutput(global_step=1230, training_loss=0.40971578272377573, metrics={'train_runtime': 241.9031, 'train_samples_per_second': 40.653, 'train_steps_per_second': 5.085, 'total_flos': 2587712896733184.0, 'train_loss': 0.40971578272377573, 'epoch': 2.0})

In [16]:
eval_results = trainer.evaluate()
print("Final metrics:", eval_results)

Final metrics: {'eval_loss': 0.22594906389713287, 'eval_runtime': 1.9793, 'eval_samples_per_second': 130.352, 'eval_steps_per_second': 16.673, 'epoch': 2.0}


In [15]:
trainer.push_to_hub('mxngjxa/demo-model')

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...l-demo/train/demo/model.safetensors:   6%|5         | 23.0MB /  409MB            

  ...nts.1759178625.workstation.191437.0:   1%|          |  47.0B / 5.11kB            

  ...nts.1759178796.workstation.191437.1:   1%|          |  47.0B / 5.11kB            

  ...nts.1759178989.workstation.191437.2:   1%|          |  47.0B / 5.11kB            

  ...nts.1759179027.workstation.191437.3:   1%|          |  47.0B / 5.11kB            

  ...nts.1759179109.workstation.191437.4:   1%|          |  47.0B / 5.11kB            

  ...nts.1759179182.workstation.191437.5:   1%|          |  47.0B / 5.11kB            

  ...nts.1759180861.workstation.191437.6:   1%|          |  93.0B / 10.1kB            

  ...nts.1759181037.workstation.191437.7:   1%|          |  47.0B / 5.11kB            

  ...nts.1759181262.workstation.254654.0:   1%|          |  47.0B / 5.11kB            

CommitInfo(commit_url='https://huggingface.co/mxngjxa/demo/commit/74f369cbec6abe875284fa9cefc1754768945079', commit_message='mxngjxa/demo-model', commit_description='', oid='74f369cbec6abe875284fa9cefc1754768945079', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mxngjxa/demo', endpoint='https://huggingface.co', repo_type='model', repo_id='mxngjxa/demo'), pr_revision=None, pr_num=None)