# train

In [None]:
# train.py
import os.path as osp
from modelscope.trainers import build_trainer
from modelscope.msdatasets import MsDataset
from modelscope.utils.hub import read_config


# Hub id of the pre-trained sentence-similarity model that is fine-tuned below.
model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'

# Directory that holds the dumped config; also used as the trainer's work_dir.
WORK_DIR = './workspace'

# Start from the model's own configuration and override the training settings.
cfg = read_config(model_id)
cfg.train.max_epochs = 10
cfg.train.work_dir = WORK_DIR
# Set the hook list to a single text logger with interval 100.
# (The original assigned the same target twice in one chained statement.)
cfg.train.hooks = [{
    'type': 'TextLoggerHook',
    'interval': 100,
}]
# Write the edited configuration to disk; its path is passed on as `cfg_file`.
cfg_file = osp.join(WORK_DIR, 'train_config.json')
cfg.dump(cfg_file)

# Load the local CSV files as the 'train' and 'test' splits.
local_data = MsDataset.load(
    'csv',
    data_files={'train': ['./train.csv'], 'test': ['./test.csv']},
)

train_dataset = local_data['train'].to_hf_dataset()
eval_dataset = local_data['test'].to_hf_dataset()

# Map a numeric label (0 / 1) to its label-name string.
def map_labels(examples):
    """Replace ``examples['label']`` with its label name, in place.

    The current label value is converted with ``int()`` and looked up in a
    fixed two-entry table (0 and 1). Any other value raises ``KeyError``.

    Args:
        examples: Mutable mapping with a ``'label'`` entry.

    Returns:
        The same ``examples`` object, with ``'label'`` replaced by a string.
    """
    label_names = {0: "不相似", 1: "相似"}
    label_id = int(examples['label'])
    examples['label'] = label_names[label_id]
    return examples

# Apply map_labels to every example of both splits.
train_dataset = train_dataset.map(map_labels)
eval_dataset = eval_dataset.map(map_labels)

# Default arguments handed to the trainer builder.
kwargs = {
    'model': model_id,
    'train_dataset': train_dataset,
    'eval_dataset': eval_dataset,
    'cfg_file': cfg_file,
}


trainer = build_trainer(name='nlp-base-trainer', default_args=kwargs)

# Separator line printed above and below each status message.
banner = '==============================================================='

print(banner)
print('pre-trained model loaded, training started:')
print(banner)

trainer.train()

print(banner)
print('train success.')
print(banner)

# Evaluate one checkpoint per epoch. The checkpoint files read here are
# numbered from 1 (epoch_1.pth ... epoch_<max_epochs>.pth), so iterate over
# 1-based epoch numbers and print the same number that names the file being
# evaluated (the original printed a label that was one lower than the file).
for epoch in range(1, cfg.train.max_epochs + 1):
    eval_results = trainer.evaluate(f'{WORK_DIR}/epoch_{epoch}.pth')
    print(f'epoch {epoch} evaluation result:')
    print(eval_results)


print(banner)
print('evaluate success')
print(banner)

2023-05-17 20:46:04,609 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
Using custom data configuration default-de0d0344efc1eb27
Found cached dataset csv (/root/.cache/modelscope/hub/datasets/csv/default-de0d0344efc1eb27/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/modelscope/hub/datasets/csv/default-de0d0344efc1eb27/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-78ac892ea7e407b2.arrow
Loading cached processed dataset at /root/.cache/modelscope/hub/datasets/csv/default-de0d0344efc1eb27/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-de2d2f855260bd34.arrow
2023-05-17 20:46:06,540 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
2023-05-17 20:46:07,156 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
2023-05-17 20:46:07,568 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
2023-05-17 20:46:07,785 - modelscope - INFO - initialize model from /mnt/workspace/.cache/modelscope/damo/nlp_structbert_sentence-similarity_chinese-base
2023-05-17 20:46:09,744 - modelscope - INFO - The key of sentence1: sentence1, The key of sentence2: sentence2, The ke

** build_dataset error log: 'structbert is not in the custom_datasets registry group sentence-similarity. Please make sure the correct version of ModelScope library is used.'
** build_dataset error log: 'structbert is not in the custom_datasets registry group sentence-similarity. Please make sure the correct version of ModelScope library is used.'
pre-trained model loaded, training started:


2023-05-17 20:46:09,979 - modelscope - INFO - Checkpoints will be saved to ./workspace
2023-05-17 20:46:09,980 - modelscope - INFO - Text logs will be saved to ./workspace
2023-05-17 20:46:30,510 - modelscope - INFO - epoch [1][100/5660]	lr: 1.000e-03, eta: 3:13:10, iter_time: 0.205, data_load_time: 0.016, memory: 7484, loss: 0.9874
2023-05-17 20:46:50,896 - modelscope - INFO - epoch [1][200/5660]	lr: 1.000e-03, eta: 3:12:13, iter_time: 0.204, data_load_time: 0.014, memory: 7484, loss: 0.5434
2023-05-17 20:47:11,386 - modelscope - INFO - epoch [1][300/5660]	lr: 1.000e-03, eta: 3:12:00, iter_time: 0.205, data_load_time: 0.014, memory: 7484, loss: 0.5358
2023-05-17 20:47:31,950 - modelscope - INFO - epoch [1][400/5660]	lr: 1.000e-03, eta: 3:11:54, iter_time: 0.206, data_load_time: 0.014, memory: 7484, loss: 0.5049
2023-05-17 20:47:52,564 - modelscope - INFO - epoch [1][500/5660]	lr: 1.000e-03, eta: 3:11:47, iter_time: 0.206, data_load_time: 0.014, memory: 7484, loss: 0.5176
2023-05-17 20

# test

In [2]:
# test.py
# Build a trainer from the shared arguments and evaluate one saved checkpoint.
checkpoint_path = f'{WORK_DIR}/epoch_6.pth'
trainer = build_trainer(name='nlp-base-trainer', default_args=kwargs)
eval_results = trainer.evaluate(checkpoint_path)

print(eval_results)

2023-05-21 15:05:25,541 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
2023-05-21 15:05:26,309 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
2023-05-21 15:05:27,228 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
2023-05-21 15:05:27,448 - modelscope - INFO - initialize model from /mnt/workspace/.cache/modelscope/damo/nlp_structbert_sentence-similarity_chinese-base
2023-05-21 15:05:33,281 - modelscope - INFO - The key of sentence1: sentence1, The key of sentence2: sentence2, The key of label: label
2023-05-21 15:05:33,292 - modelscope - INFO - The key of sentence1: sentence1, The key of sentence2: sentence2, The key of label: label


** build_dataset error log: 'structbert is not in the custom_datasets registry group sentence-similarity. Please make sure the correct version of ModelScope library is used.'
** build_dataset error log: 'structbert is not in the custom_datasets registry group sentence-similarity. Please make sure the correct version of ModelScope library is used.'


2023-05-21 15:05:36,947 - modelscope - INFO - {
    "framework": "pytorch",
    "task": "sentence-similarity",
    "preprocessor": {
        "train": {
            "type": "sen-sim-tokenizer",
            "first_sequence": "sentence1",
            "second_sequence": "sentence2",
            "mode": "train",
            "use_fast": true
        },
        "val": {
            "type": "sen-sim-tokenizer",
            "first_sequence": "sentence1",
            "second_sequence": "sentence2",
            "mode": "eval",
            "use_fast": true
        }
    },
    "model": {
        "type": "structbert"
    },
    "pipeline": {
        "type": "sentence-similarity"
    },
    "train": {
        "work_dir": "./workspace",
        "max_epochs": 10,
        "dataloader": {
            "batch_size_per_gpu": 16,
            "workers_per_gpu": 1
        },
        "optimizer": {
            "type": "SGD",
            "lr": 0.01,
            "options": {
                "grad_clip": {
      

{'accuracy': 0.8385862634359629, 'binary-f1': 0.9009230644111906, 'f1': 0.9009230644111906}





# predict

In [1]:
# predict.py
import os.path as osp
from modelscope.trainers import build_trainer
from modelscope.msdatasets import MsDataset
from modelscope.utils.hub import read_config


# Hub id of the sentence-similarity model whose configuration is loaded below.
model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'

# Directory that holds the dumped config; also used as the trainer's work_dir.
WORK_DIR = './workspace'

# Start from the model's own configuration and override the training settings.
cfg = read_config(model_id)
cfg.train.max_epochs = 10
cfg.train.work_dir = WORK_DIR
# Set the hook list to a single text logger with interval 100.
# (The original assigned the same target twice in one chained statement.)
cfg.train.hooks = [{
    'type': 'TextLoggerHook',
    'interval': 100,
}]
# Write the edited configuration to disk; its path is passed on as `cfg_file`.
cfg_file = osp.join(WORK_DIR, 'train_config.json')
cfg.dump(cfg_file)

# Load the local CSV files as the 'train' and 'test' splits.
local_data = MsDataset.load(
    'csv',
    data_files={'train': ['./train.csv'], 'test': ['./test.csv']},
)

train_dataset = local_data['train'].to_hf_dataset()
eval_dataset = local_data['test'].to_hf_dataset()

# Map a numeric label (0 / 1) to its label-name string.
def map_labels(examples):
    """Replace ``examples['label']`` with its label name, in place.

    The current label value is converted with ``int()`` and looked up in a
    fixed two-entry table (0 and 1). Any other value raises ``KeyError``.

    Args:
        examples: Mutable mapping with a ``'label'`` entry.

    Returns:
        The same ``examples`` object, with ``'label'`` replaced by a string.
    """
    label_names = {0: "不相似", 1: "相似"}
    label_id = int(examples['label'])
    examples['label'] = label_names[label_id]
    return examples

# Apply map_labels to every example of both splits.
train_dataset = train_dataset.map(map_labels)
eval_dataset = eval_dataset.map(map_labels)

# Default arguments handed to the trainer builder in the following cells.
kwargs = {
    'model': model_id,
    'train_dataset': train_dataset,
    'eval_dataset': eval_dataset,
    'cfg_file': cfg_file,
}

2023-05-21 15:05:03,405 - modelscope - INFO - PyTorch version 1.11.0+cu113 Found.
2023-05-21 15:05:03,412 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer
2023-05-21 15:05:03,477 - modelscope - INFO - Loading done! Current index file version is 1.5.0, with md5 ca25ccc146d421f40d58ac06319b4460 and a total number of 860 components indexed
2023-05-21 15:05:04,490 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
Using custom data configuration default-de0d0344efc1eb27


Downloading and preparing dataset csv/default to /root/.cache/modelscope/hub/datasets/csv/default-de0d0344efc1eb27/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/modelscope/hub/datasets/csv/default-de0d0344efc1eb27/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/90560 [00:00<?, ?ex/s]

  0%|          | 0/60379 [00:00<?, ?ex/s]

In [2]:
# predict
import os
# Make sure the directory for the prediction results exists. makedirs with
# exist_ok=True also creates a missing parent directory and does not raise
# when the directory already exists, so no separate existence check is needed.
os.makedirs('./workspace/tmp/', exist_ok=True)

trainer = build_trainer(name='nlp-base-trainer', default_args=kwargs)
# Load the CSV file to run prediction on.
pred_data = MsDataset.load('csv', data_files='dev_pred01.csv')

class SavingFn:
    """Append the logits and the predicted class of a batch to a text file.

    Constructing an instance with ``(inputs, outputs)`` performs the write,
    because ``__init__`` forwards straight to ``__call__``. Every row of the
    logits becomes one line ``<logit 0>, <logit 1>, <argmax index>`` appended
    to ``./workspace/tmp/results01.txt``.

    NOTE(review): the class object itself is passed as ``saving_fn``;
    presumably the trainer invokes it as ``saving_fn(inputs, outputs)`` for
    each batch - confirm against the trainer implementation.
    """

    def __init__(self, inputs, outputs):
        """Set the output path and immediately save the given batch.

        Args:
            inputs: Passed through to ``__call__``; not otherwise used.
            outputs: Mapping whose ``'logits'`` entry supports
                ``.cpu().numpy()``.
        """
        self.filename = './workspace/tmp/results01.txt'
        self.__call__(inputs, outputs)

    def __call__(self, inputs, outputs):
        """Append one line per logits row to ``self.filename``.

        Args:
            inputs: Unused.
            outputs: Mapping whose ``'logits'`` entry supports
                ``.cpu().numpy()`` and yields a 2-D array with at least two
                columns (columns 0 and 1 are written).
        """
        import numpy as np

        # Convert the logits to a NumPy array once and reuse it for both the
        # raw scores and the argmax (the original converted it twice).
        weights = outputs['logits'].cpu().numpy()
        predictions = np.argmax(weights, axis=1)
        # Append mode: results written by earlier calls are kept.
        with open(self.filename, 'a') as f:
            # writelines expects an iterable of lines; give it one formatted
            # line per row instead of calling it once per row with a str.
            f.writelines(
                f'{weight[0]}, {weight[1]}, {pred}\n'
                for weight, pred in zip(weights, predictions)
            )

# Run prediction with the epoch-6 checkpoint; SavingFn is supplied as saving_fn.
trainer.predict(
    predict_datasets=pred_data,
    checkpoint_path='./workspace/epoch_6.pth',
    saving_fn=SavingFn,
)

2023-05-19 01:42:24,574 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
2023-05-19 01:42:25,156 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
2023-05-19 01:42:26,031 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.1
2023-05-19 01:42:26,239 - modelscope - INFO - initialize model from /mnt/workspace/.cache/modelscope/damo/nlp_structbert_sentence-similarity_chinese-base
2023-05-19 01:42:32,237 - modelscope - INFO - The key of sentence1: sentence1, The key of sentence2: sentence2, The key of label: label
2023-05-19 01:42:32,243 - modelscope - INFO - The key of sentence1: sentence1, The key of sentence2: sentence2, The key of label: label


** build_dataset error log: 'structbert is not in the custom_datasets registry group sentence-similarity. Please make sure the correct version of ModelScope library is used.'
** build_dataset error log: 'structbert is not in the custom_datasets registry group sentence-similarity. Please make sure the correct version of ModelScope library is used.'


2023-05-19 01:42:36,093 - modelscope - INFO - {
    "framework": "pytorch",
    "task": "sentence-similarity",
    "preprocessor": {
        "train": {
            "type": "sen-sim-tokenizer",
            "first_sequence": "sentence1",
            "second_sequence": "sentence2",
            "mode": "train",
            "use_fast": true
        },
        "val": {
            "type": "sen-sim-tokenizer",
            "first_sequence": "sentence1",
            "second_sequence": "sentence2",
            "mode": "eval",
            "use_fast": true
        }
    },
    "model": {
        "type": "structbert"
    },
    "pipeline": {
        "type": "sentence-similarity"
    },
    "train": {
        "work_dir": "./workspace",
        "max_epochs": 10,
        "dataloader": {
            "batch_size_per_gpu": 16,
            "workers_per_gpu": 1
        },
        "optimizer": {
            "type": "SGD",
            "lr": 0.01,
            "options": {
                "grad_clip": {
      

Downloading and preparing dataset csv/default to /root/.cache/modelscope/hub/datasets/csv/default-4c5039aca13d854c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/modelscope/hub/datasets/csv/default-4c5039aca13d854c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

2023-05-19 01:42:37,932 - modelscope - INFO - Stage: before_run:
    (LOW         ) CheckpointHook                     
    (VERY_LOW    ) TextLoggerHook                     
 -------------------- 
Stage: after_train_iter:
    (LOW         ) CheckpointHook                     
    (VERY_LOW    ) TextLoggerHook                     
 -------------------- 
Stage: after_train_epoch:
    (LOW         ) CheckpointHook                     
    (VERY_LOW    ) TextLoggerHook                     
 -------------------- 
Stage: after_val_epoch:
    (VERY_LOW    ) TextLoggerHook                     
 -------------------- 
Stage: after_run:
    (LOW         ) CheckpointHook                     
 -------------------- 
 --- Hook strategies info --- 

 --- Hook strategies info end --- 

2023-05-19 01:42:39,615 - modelscope - INFO - Checkpoint ./workspace/epoch_6.pth saving time: Wed May 17 22:42:58 2023, modelscope version: 1.5.0
Total test samples:   0%|          | 1/21089 [00:00<1:07:06,  5.24it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Total test samples: 100%|██████████| 21089/21089 [01:54<00:00, 183.69it/s]
