# Install transformers and nlp library

In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [

# Import tokenizer and model

In [2]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import numpy as np

# Load dataset

In [3]:
dataset = load_dataset('csv', data_files='./drive/MyDrive/cn-spam-detection/output.csv', split='train')
type(dataset)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

datasets.arrow_dataset.Dataset

# Process dataset

In [4]:
dataset = dataset.train_test_split(test_size=0.3)
dataset
train_set = dataset['train']
test_set = dataset['test']

# Download BERT model

In [5]:
model = BertForSequenceClassification.from_pretrained('bert-base-chinese')

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Create tokenizer

In [6]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [7]:
def preprocess(data):
    return tokenizer(data['text'], padding=True, truncation=True)

train_set = train_set.map(preprocess, batched=True, batch_size=len(train_set))
test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set))
train_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

# Train model

## Define parameters

In [8]:
batchSize = 8
epochs = 5
warmupSteps = 500
weightDecay = 0.01
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batchSize,
    per_device_eval_batch_size=batchSize,
    warmup_steps=warmupSteps,
    weight_decay=weightDecay,
    evaluation_strategy='epoch',
    logging_dir='./logs',
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set
)

trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss
1,0.104,0.07205
2,0.0694,0.109814
3,0.0359,0.075755
4,0.0183,0.078018
5,0.0135,0.064773


{'eval_loss': 0.06477342545986176,
 'eval_runtime': 56.4183,
 'eval_samples_per_second': 106.348,
 'eval_steps_per_second': 13.294,
 'epoch': 5.0}

In [13]:
trainer.save_model("./drive/MyDrive/cn-spam-detection/model")

In [12]:
model.save_pretrained("./pretrained_model")