In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer
import evaluate
import numpy as np
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification


# 假设你的JSON文件名为 'data.json' 并且每行是一个独立的JSON对象
raw_datasets = load_dataset('paws-x', 'en')  # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True,max_length=128, padding="max_length")


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the dataset since paws-x couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'en' at /home/maris/.cache/huggingface/datasets/paws-x/en/0.0.0/4cd8187c404bda33cb1f62b49b001115862acf37 (last modified on Tue Dec 10 00:22:45 2024).
Map: 100%|██████████████████████████████████████████████████████████████| 49401/49401 [00:02<00:00, 19365.12 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 20562.08 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 21314.63 examples/s]


In [2]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [3]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy': (predictions==labels).sum() / len(labels)}

In [4]:
training_args = TrainingArguments(
    output_dir="ds_job_dna_2222",
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_ratio=0.1,
    optim='adamw_torch',
    weight_decay=0.0,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True
)

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
#model.config.pad_token_id = model.config.eos_token_id

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[2024-12-10 22:10:03,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/maris/miniconda3/envs/dnagpt/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/maris/miniconda3/envs/dnagpt/compiler_compat/ld: /usr/local/cuda-12/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/maris/miniconda3/envs/dnagpt/compiler_compat/ld: /usr/local/cuda-12/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/maris/miniconda3/envs/dnagpt/compiler_compat/ld: /usr/local/cuda-12/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/maris/miniconda3/envs/dnagpt/compiler_compat/ld: /usr/local/cuda-12/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/maris/miniconda3/envs/dnagpt/compiler_compat/ld: /usr/local/cuda-12/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/ho

In [5]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.568,0.383161,0.8375
2,0.2882,0.288868,0.896
3,0.1916,0.348355,0.8985
4,0.1471,0.330516,0.916


TrainOutput(global_step=9884, training_loss=0.2987214462422302, metrics={'train_runtime': 592.8745, 'train_samples_per_second': 333.298, 'train_steps_per_second': 16.671, 'total_flos': 1.299794924583936e+16, 'train_loss': 0.2987214462422302, 'epoch': 4.0})

In [6]:
predictions = trainer.predict(tokenized_datasets["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

Using the latest cached version of the module from /home/maris/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Sun Dec  1 21:21:15 2024) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.


{'accuracy': 0.9095, 'f1': 0.9045861887190301}

In [7]:
raw_datasets_fr = load_dataset('paws-x', 'fr')  # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets_fr = raw_datasets_fr.map(tokenize_function, batched=True)

Using the latest cached version of the dataset since paws-x couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'fr' at /home/maris/.cache/huggingface/datasets/paws-x/fr/0.0.0/4cd8187c404bda33cb1f62b49b001115862acf37 (last modified on Tue Dec 10 00:34:00 2024).
Map: 100%|██████████████████████████████████████████████████████████████| 49401/49401 [00:01<00:00, 25250.04 examples/s]


In [8]:
predictions = trainer.predict(tokenized_datasets_fr["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

Using the latest cached version of the module from /home/maris/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Sun Dec  1 21:21:15 2024) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.


{'accuracy': 0.7575, 'f1': 0.7201384881708021}

In [9]:
raw_datasets_de = load_dataset('google-research-datasets/paws-x', 'de')  # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-
tokenized_datasets_de = raw_datasets_de.map(tokenize_function, batched=True)

Using the latest cached version of the dataset since google-research-datasets/paws-x couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'de' at /home/maris/.cache/huggingface/datasets/google-research-datasets___paws-x/de/0.0.0/4cd8187c404bda33cb1f62b49b001115862acf37 (last modified on Mon Dec  9 21:16:47 2024).


In [10]:
predictions = trainer.predict(tokenized_datasets_de["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

Using the latest cached version of the module from /home/maris/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Sun Dec  1 21:21:15 2024) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.


{'accuracy': 0.725, 'f1': 0.6609124537607891}

In [11]:
raw_datasets_zh = load_dataset('google-research-datasets/paws-x', 'zh')  # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-
tokenized_datasets_zh = raw_datasets_zh.map(tokenize_function, batched=True)

Using the latest cached version of the dataset since google-research-datasets/paws-x couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'zh' at /home/maris/.cache/huggingface/datasets/google-research-datasets___paws-x/zh/0.0.0/4cd8187c404bda33cb1f62b49b001115862acf37 (last modified on Mon Dec  9 21:17:45 2024).


In [12]:
predictions = trainer.predict(tokenized_datasets_zh["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

Using the latest cached version of the module from /home/maris/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Sun Dec  1 21:21:15 2024) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.


{'accuracy': 0.595, 'f1': 0.5539647577092511}

In [13]:
raw_datasets_zh

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

In [14]:
raw_datasets_zh["train"][110]

{'id': 111,
 'sentence1': '这在澳大利亚地区和澳大利亚南部更为常见，但在澳大利亚城市已经普遍存在了数十年。',
 'sentence2': '这种情况在澳大利亚城市更为常见，但几十年来一直在澳大利亚和澳大利亚南部地区普遍使用。',
 'label': 0}

In [19]:
#dna
raw_datasets_dna = load_dataset('json', data_files='dna_150.json')['train'].train_test_split(test_size=0.05) #默认已经shuffle
tokenized_datasets_dna = raw_datasets_dna.map(tokenize_function, batched=True)

predictions = trainer.predict(tokenized_datasets_dna["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

Map: 100%|██████████████████████████████████████████████████████████████| 17100/17100 [00:01<00:00, 16027.35 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 900/900 [00:00<00:00, 18288.76 examples/s]


Using the latest cached version of the module from /home/maris/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Sun Dec  1 21:21:15 2024) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.


{'accuracy': 0.5655555555555556, 'f1': 0.22879684418145957}

In [16]:
raw_datasets_dna_protein = load_dataset('json', data_files='dna_protein_150.json')['train'].train_test_split(test_size=0.05) #默认已经shuffle

In [17]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True,max_length=128, padding="max_length")


tokenized_datasets_dna_protein = raw_datasets_dna_protein.map(tokenize_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████| 800/800 [00:00<00:00, 15244.60 examples/s]


In [18]:
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

Using the latest cached version of the module from /home/maris/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Sun Dec  1 21:21:15 2024) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.


{'accuracy': 0.53375, 'f1': 0.1729490022172949}