#### Import packages and init classes

In [None]:
import logging
import os
import sys
import pdb
import subprocess

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score
from torch import nn

# from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoModel,
    AutoTokenizer,
    AutoModelForCausalLM,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)
from utils_ner__new import NerDataset, Split, get_labels

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

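# In the code below, the @dataclass decorator generates __init__ automatically, so there is no need to write it by hand or to define attributes through self.x / self.y. This keeps the code shorter and easier to read.
# With @dataclass it is also easy to create instances of these classes from outside and pass the arguments in directly.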
@dataclass
class ModelArguments:
    """
    Model-side settings: which pretrained model, config and tokenizer to load.

    Only ``model_name_or_path`` is required; every other field has a default.
    The ``help`` entry in each field's metadata is its human-readable description.
    """

    # Required: local path or hub identifier of the pretrained model.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )
    # Optional overrides; None means "not specified".
    config_name: Optional[str] = field(
        default=None,
        metadata=dict(help="Pretrained config name or path if not the same as model_name"),
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata=dict(help="Pretrained tokenizer name or path if not the same as model_name"),
    )
    use_fast: bool = field(
        default=False,
        metadata=dict(help="Set this flag to use fast tokenization."),
    )
    # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
    # or just modify its tokenizer_config.json.
    cache_dir: Optional[str] = field(
        default=None,
        metadata=dict(help="Where do you want to store the pretrained models downloaded from s3"),
    )


@dataclass
class DataTrainingArguments:
    """
    Data-side settings: where the input files live and how they are prepared.

    Only ``data_dir`` is required; every other field has a default.
    The ``help`` entry in each field's metadata is its human-readable description.
    """

    # Required: directory holding the input .txt files.
    data_dir: str = field(
        metadata=dict(
            help="The input data dir. Should contain the .txt files for a CoNLL-2003-formatted task."
        ),
    )
    # Optional path to a label file; None means "not specified".
    labels: Optional[str] = field(
        default=None,
        metadata=dict(
            help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
        ),
    )
    max_seq_length: int = field(
        default=128,
        metadata=dict(
            help="The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        ),
    )
    overwrite_cache: bool = field(
        default=False,
        metadata=dict(help="Overwrite the cached training and evaluation sets"),
    )



### Set directory path and arguments

In [2]:
# Root of the BioBERT working tree. It ends with a slash, so the sub-paths below are
# appended to it directly.
root_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/"

# Input data directory. Previously used alternatives are kept for reference:
# data_path = root_dir + "datasets/for_train/datasets_from_download/NER/lipid/2_LipidCorpus_Normalized.Name"
# data_path = root_dir + "datasets/for_train/dir_for_test/only_test"
data_path = f"{root_dir}datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/recognization_test"

# Label file and the checkpoint directory to load from.
labels_path = f"{root_dir}datasets/for_train/datasets_from_download/NER/lipid/1_LipidCorpus/labels.txt"
model_path = f"{root_dir}biobertModelWarehouse/model_from_trained/NER_add_words_change_split_way/1_LipidCorpus"

# Directory handed to TrainingArguments as output_dir. Previously used alternative:
# output_path = root_dir + "biobertModelWarehouse/model_from_trained/NER_add_words_change_split_way/2_LipidCorpus_Normalized.Name"
output_path = f"{root_dir}datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/recognization_result"

# Build the three argument objects directly instead of parsing a command line.
model_args = ModelArguments(model_name_or_path=model_path)
data_args = DataTrainingArguments(
    data_dir=data_path,
    labels=labels_path,
    max_seq_length=512,
)
training_args = TrainingArguments(
    output_dir=output_path,
    num_train_epochs=5,
    learning_rate=3e-5,
    do_predict=True,
)


### load tokenizer and model

In [None]:
# Seed from the training arguments before any model object is created.
set_seed(training_args.seed)

# Prepare CONLL-2003 task: the label list comes from get_labels, and label_map maps
# each label's position in that list to the label string.
labels = get_labels(data_args.labels)
label_map: Dict[int, str] = dict(enumerate(labels))
num_labels = len(labels)

# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

# An explicit config/tokenizer name wins; otherwise fall back to the model path.
config = AutoConfig.from_pretrained(
    model_args.config_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    num_labels=num_labels,
    id2label=label_map,
    label2id={name: idx for idx, name in label_map.items()},
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name or model_args.model_name_or_path,
    use_fast=model_args.use_fast,
    cache_dir=model_args.cache_dir,
)
model = AutoModelForTokenClassification.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
    # from_tf is True only when the model path contains ".ckpt".
    from_tf=".ckpt" in model_args.model_name_or_path,
)


### load train and eval dataset, init trainer

In [None]:
# Training set: only built when training was requested, otherwise left as None.
train_dataset = None
if training_args.do_train:
    train_dataset = NerDataset(
        data_dir=data_args.data_dir,
        data_file_name='train.txt',
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
    )

# Evaluation set: only built when evaluation was requested, otherwise left as None.
# NOTE(review): unlike the train and test datasets, no data_file_name is passed here --
# presumably NerDataset has a default for it; confirm in utils_ner__new.
eval_dataset = None
if training_args.do_eval:
    eval_dataset = NerDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    )
def align_predictions(
    predictions: np.ndarray,
    label_ids: np.ndarray,
    id2label: Optional[Dict[int, str]] = None,
) -> Tuple[List[List[str]], List[List[str]]]:
    """Convert raw model scores and gold label ids into per-example label strings.

    Positions whose gold label id equals the ``ignore_index`` of
    ``nn.CrossEntropyLoss`` are dropped from both outputs, so the two returned
    lists always line up position by position.

    Args:
        predictions: Score array of shape (batch, seq_len, num_labels); the
            predicted label id is the argmax over the last axis.
        label_ids: Gold label ids of shape (batch, seq_len).
        id2label: Mapping from label id to label string. When omitted, the
            module-level ``label_map`` is used (the original behaviour).

    Returns:
        ``(preds_list, out_label_list)``: two lists with one entry per example,
        each entry being the list of label strings for the kept positions.

    Raises:
        KeyError: If an id that has to be converted is missing from the mapping.
    """
    mapping = label_map if id2label is None else id2label
    # Look the ignore index up once instead of building a loss object per token.
    ignore_index = nn.CrossEntropyLoss().ignore_index

    preds = np.argmax(predictions, axis=2)
    gold = np.asarray(label_ids)
    keep = gold != ignore_index

    preds_list: List[List[str]] = []
    out_label_list: List[List[str]] = []
    for pred_row, gold_row, keep_row in zip(preds, gold, keep):
        # Boolean indexing keeps the original left-to-right position order.
        preds_list.append([mapping[label_id] for label_id in pred_row[keep_row]])
        out_label_list.append([mapping[label_id] for label_id in gold_row[keep_row]])

    return preds_list, out_label_list

def compute_metrics(p: EvalPrediction) -> Dict:
    """Score a prediction batch with seqeval precision, recall and F1.

    The raw predictions and label ids in ``p`` are first turned into label
    strings by ``align_predictions``. Each scorer is then called with the
    gold labels as first argument and the predicted labels as second.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    predicted, gold = align_predictions(p.predictions, p.label_ids)

    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {metric_name: scorer(gold, predicted) for metric_name, scorer in scorers}

# Build the Trainer from the objects prepared above. The two datasets may be None
# when the corresponding do_train / do_eval flag is off.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)



In [None]:
import os
import glob

# Predict labels for test.txt in the configured data directory and write them next to
# the first whitespace-separated field of every input line.
dir_ = data_args.data_dir
input_file = 'test.txt'
test_dataset = NerDataset(
    data_dir=dir_,
    data_file_name=input_file,
    tokenizer=tokenizer,
    labels=labels,
    model_type=config.model_type,
    max_seq_length=data_args.max_seq_length,
    overwrite_cache=data_args.overwrite_cache,
    mode=Split.test,
)

predictions, label_ids, metrics = trainer.predict(test_dataset)
# preds_list is the prediction result: one list of predicted labels per sentence
# (per input line of the sentence, not per sub-token).
preds_list, _ = align_predictions(predictions, label_ids)

# The result file cannot be opened for writing unless its directory exists.
os.makedirs(training_args.output_dir, exist_ok=True)
output_test_predictions_file = os.path.join(training_args.output_dir, input_file + "_predictions.txt")

with open(output_test_predictions_file, "w") as writer:
    with open(os.path.join(dir_, input_file), "r") as f:
        example_id = 0  # index of the sentence currently being written
        token_pos = 0   # next unused label inside preds_list[example_id]
        for line in f:
            # Guard both indices: extra separator lines after the last sentence used to
            # push example_id past the end of preds_list and raise IndexError.
            has_example = example_id < len(preds_list)
            labels_left = has_example and token_pos < len(preds_list[example_id])
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                writer.write(line)
                # Move on to the next sentence only once the current one is used up.
                if has_example and not labels_left:
                    example_id += 1
                    token_pos = 0
            elif labels_left:
                # Read by position instead of pop(0), so preds_list is left intact.
                entity_label = preds_list[example_id][token_pos]
                token_pos += 1
                output_line = line.split()[0] + " " + entity_label + "\n"
                writer.write(output_line)
            else:
                # No label is left for this line (also reached when the input has
                # more sentences than there are predictions).
                logger.warning(
                    "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
                )

# Remove every file in the data directory whose name contains "BertTokenizer"
# (presumably the feature cache written by NerDataset -- confirm in utils_ner__new).
files_to_delete = glob.glob(data_args.data_dir + '/*BertTokenizer*')
for file in files_to_delete:
    os.remove(file)

Pipeline done. The cells below are test code.

In [None]:
# tokenizer.convert_ids_to_tokens(test_dataset[0].input_ids)
# root_dir + "datasets/for_train/datasets_from_download/NER/lipid/2_LipidCorpus_Normalized.Name"
# Notebook echo: display the directory the prediction files are written to.
training_args.output_dir

In [None]:
# if_add_words_in_tokenizer = True
# if if_add_words_in_tokenizer:
#     added_lipid_list_filename = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/pytorch-biobert/named-entity-recognition/added_lipid_list.txt'
#     with open(added_lipid_list_filename, 'r') as f:
#         added_lipid_list = f.read().splitlines()
#     for lipid in added_lipid_list:
#         tokenizer.add_tokens(lipid)
#     model.resize_token_embeddings(len(tokenizer))
#     model.save_pretrained(model_args.model_name_or_path)
#     tokenizer.save_pretrained(model_args.model_name_or_path)

In [None]:
# text = "Thus, changes in plasma S1P d16:1 levels, plasma S1P d18:1 levels, plasma MonCer d18:1 levels or plasma LacCer d18:1 levels were inferred to be disease-induced changes in Alzheimer's disease or DLB"
# text_split = text.split()
# print(text_split)
# with open(data_args.data_dir+ '/' + 'text_.txt', 'w') as f:
#     for i in text_split:
#         f.write(i+' '+'O'+'\n')


In [None]:
# Predict labels for one extracted paper file and write them next to the first
# whitespace-separated field of every input line.
import os
import glob

dir_ = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/extract_result_txt'
input_file = 'PMC9440283_bioc.json.txt_one_sentence'
test_dataset = NerDataset(
    data_dir=dir_,
    data_file_name=input_file,
    tokenizer=tokenizer,
    labels=labels,
    model_type=config.model_type,
    max_seq_length=data_args.max_seq_length,
    overwrite_cache=data_args.overwrite_cache,
    mode=Split.test,
)

predictions, label_ids, metrics = trainer.predict(test_dataset)
# preds_list is the prediction result: one list of predicted labels per sentence
# (per input line of the sentence, not per sub-token).
preds_list, _ = align_predictions(predictions, label_ids)

# The result file cannot be opened for writing unless its directory exists.
os.makedirs(training_args.output_dir, exist_ok=True)
output_test_predictions_file = os.path.join(training_args.output_dir, input_file + "_predictions.txt")

with open(output_test_predictions_file, "w") as writer:
    with open(os.path.join(dir_, input_file), "r") as f:
        example_id = 0  # index of the sentence currently being written
        token_pos = 0   # next unused label inside preds_list[example_id]
        for line in f:
            # Guard both indices: extra separator lines after the last sentence used to
            # push example_id past the end of preds_list and raise IndexError.
            has_example = example_id < len(preds_list)
            labels_left = has_example and token_pos < len(preds_list[example_id])
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                writer.write(line)
                # Move on to the next sentence only once the current one is used up.
                if has_example and not labels_left:
                    example_id += 1
                    token_pos = 0
            elif labels_left:
                # Read by position instead of pop(0), so preds_list is left intact.
                entity_label = preds_list[example_id][token_pos]
                token_pos += 1
                output_line = line.split()[0] + " " + entity_label + "\n"
                writer.write(output_line)
            else:
                # No label is left for this line (also reached when the input has
                # more sentences than there are predictions).
                logger.warning(
                    "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
                )

# Remove every file in the input directory whose name contains "BertTokenizer"
# (presumably the feature cache written by NerDataset -- confirm in utils_ner__new).
files_to_delete = glob.glob(dir_ + '/*BertTokenizer*')
for file in files_to_delete:
    os.remove(file)

In [None]:
# Notebook echo: display the directory the prediction files are written to.
training_args.output_dir

In [None]:
# Loop prediction: run the model on every file in dir_ and write one
# "<input name>_predictions.txt" file per input into the output directory.
import os
import glob
import time

dir_ = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/extract_result_txt'

# The result files cannot be opened for writing unless their directory exists.
os.makedirs(training_args.output_dir, exist_ok=True)

for input_file in os.listdir(dir_):
    # Skip sub-directories, and skip anything the clean-up at the end of each iteration
    # deletes: such a leftover would be gone by the time the loop reaches it.
    if 'BertTokenizer' in input_file or not os.path.isfile(os.path.join(dir_, input_file)):
        continue

    test_dataset = NerDataset(
        data_dir=dir_,
        data_file_name=input_file,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.test,
    )

    predictions, label_ids, metrics = trainer.predict(test_dataset)
    # preds_list is the prediction result: one list of predicted labels per sentence
    # (per input line of the sentence, not per sub-token).
    preds_list, _ = align_predictions(predictions, label_ids)

    output_test_predictions_file = os.path.join(training_args.output_dir, input_file + "_predictions.txt")

    with open(output_test_predictions_file, "w") as writer:
        with open(os.path.join(dir_, input_file), "r") as f:
            example_id = 0  # index of the sentence currently being written
            token_pos = 0   # next unused label inside preds_list[example_id]
            for line in f:
                # Guard both indices: extra separator lines after the last sentence used
                # to push example_id past the end of preds_list and raise IndexError.
                has_example = example_id < len(preds_list)
                labels_left = has_example and token_pos < len(preds_list[example_id])
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    writer.write(line)
                    # Move on to the next sentence only once the current one is used up.
                    if has_example and not labels_left:
                        example_id += 1
                        token_pos = 0
                elif labels_left:
                    # Read by position instead of pop(0), so preds_list is left intact.
                    entity_label = preds_list[example_id][token_pos]
                    token_pos += 1
                    output_line = line.split()[0] + " " + entity_label + "\n"
                    writer.write(output_line)
                else:
                    # No label is left for this line (also reached when the input has
                    # more sentences than there are predictions).
                    logger.warning(
                        "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
                    )

    # Remove every file in the input directory whose name contains "BertTokenizer"
    # (presumably the feature cache written by NerDataset -- confirm in utils_ner__new).
    files_to_delete = glob.glob(dir_ + '/*BertTokenizer*')
    for file in files_to_delete:
        os.remove(file)
    # NOTE(review): the reason for this one-second pause is not evident from the code.
    time.sleep(1)

In [None]:
# Notebook echo: path of the input file that dir_ and input_file currently point to.
os.path.join(dir_, input_file)

In [None]:
# Debug cell: rewrite the prediction file from the `predictions` / `label_ids` that an
# earlier cell left behind. It also reuses dir_, input_file and
# output_test_predictions_file from that earlier cell.
# predictions, label_ids, metrics = trainer.predict(test_dataset)
preds_list, _ = align_predictions(predictions, label_ids)  # preds_list is the prediction result: one label list per sentence, not per token

# output_test_predictions_file = os.path.join(training_args.output_dir, input_file+"_predictions.txt")

with open(output_test_predictions_file, "w") as writer:
    with open(os.path.join(dir_, input_file), "r") as f:
        example_id = 0
        for line in f:
            # Separator lines are copied through unchanged.
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                writer.write(line)
                # Advance to the next sentence once the current label list is empty.
                if not preds_list[example_id]:
                    example_id += 1
            elif preds_list[example_id]:
                # pop(0) consumes the labels, which is why preds_list is rebuilt above.
                entity_label = preds_list[example_id].pop(0)
                output_line = line.split()[0] + " " + entity_label + "\n"
                writer.write(output_line)
            else:
                # Debug aid: number of whitespace-separated fields on the unlabeled line.
                print(len(line.split()))
                logger.warning(
                    "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
                )
# NOTE(review): this cleans data_args.data_dir, while the file read above comes from
# dir_ -- confirm which directory is meant when the two differ.
files_to_delete = glob.glob(data_args.data_dir + '/*BertTokenizer*')
for file in files_to_delete:
    os.remove(file)

In [None]:
# Debug cell: run prediction again on the test_dataset left by an earlier cell and
# recompute the output file name, without writing anything.
predictions, label_ids, metrics = trainer.predict(test_dataset)
preds_list, _ = align_predictions(predictions, label_ids)  # preds_list is the prediction result: one label list per sentence, not per token

output_test_predictions_file = os.path.join(training_args.output_dir, input_file+"_predictions.txt")


In [None]:
# Debug cell: same writing loop as in the prediction cells, but it prints the index of
# the sentence that ran out of labels instead of logging a warning. It reuses
# predictions, label_ids, dir_, input_file and output_test_predictions_file from
# earlier cells.
preds_list, _ = align_predictions(predictions, label_ids)  # preds_list is the prediction result: one label list per sentence, not per token

with open(output_test_predictions_file, "w") as writer:
    with open(os.path.join(dir_, input_file), "r") as f:
        example_id = 0
        for line in f:
            # Separator lines are copied through unchanged.
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                writer.write(line)
                # Advance to the next sentence once the current label list is empty.
                if not preds_list[example_id]:
                    example_id += 1
            elif preds_list[example_id]:
                # pop(0) consumes the labels, which is why preds_list is rebuilt above.
                entity_label = preds_list[example_id].pop(0)
                output_line = line.split()[0] + " " + entity_label + "\n"
                writer.write(output_line)
            else:
                print(example_id)
                # logger.warning(
                #     "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
                # )


In [15]:
# Delete every file in the data directory that matches the cached-test-features pattern.
cache_pattern = data_args.data_dir + '/cached_test_BertTokenizer_max_seq_length_512*'
files_to_delete = glob.glob(cache_pattern)
for stale_cache in files_to_delete:
    os.remove(stale_cache)
        

Open pkl files.

In [None]:
import pickle

# Directory holding the pickled per-paper dictionary, and the paper to inspect.
path_ = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/extract_result/'
PMC_bioc_name = 'PMC9481132_bioc.json'

# NOTE: pickle.load executes whatever the file encodes; only use it on trusted files.
pkl_file = path_ + 'df_dict.pkl'
with open(pkl_file, 'rb') as f:
    df_dict_only_lipid = pickle.load(f)

# Notebook echo: show the entry stored for the selected paper.
df_dict_only_lipid[PMC_bioc_name]


In [43]:
import re

# Dump the selected paper's split sentences to q.txt: " O" is inserted before every
# newline inside a sentence, and each sentence is followed by a blank line.
# NOTE(review): a sentence that does not end with a newline keeps its last line
# without the " O" suffix -- confirm whether that is intended.
with open('q.txt', 'w') as f:
    for raw_sentence in df_dict_only_lipid[PMC_bioc_name]['split_sentence']:
        labelled = raw_sentence.replace('\n', ' O\n')
        f.write(labelled + '\n\n')


    # sentence
# sentence.replace('\n', ' O\n')


In [None]:
import os
# Notebook echo: current working directory (relative paths such as 'q.txt' resolve here).
os.getcwd()