In [6]:
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import sys

def load_model_and_tokenizer(model_dir, device):
    # 加载配置文件、分词器和模型
    config = AutoConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    
    model.to(device)
    model.eval()

    return config, tokenizer, model

def predict(texts, tokenizer, model, label_map, device):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()
    return [label_map[pred] for pred in predictions]

model_dir = './output/GAD-1'  # 从命令行参数获取模型目录
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config, tokenizer, model = load_model_and_tokenizer(model_dir, device)
label_map = config.id2label

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [None]:
# 示例输入文本
texts = [
    "The @GENE$ gene is less likely to play a substantial role in the development of atopy and @DISEASE$ in the Japanese population.",
    "The PC36:0  is less likely to play a substantial role in the development of atopy and @DISEASE$ in the Japanese population."
]

# 进行预测
predictions = predict(texts, tokenizer, model, label_map, device)

# 打印预测结果
for text, prediction in zip(texts, predictions):
    print(f"Text: {text}\nPrediction: {prediction}\n")


In [4]:
import pickle

ner_extract_dir = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/extract_result/'
ner_extract_file = ner_extract_dir +'df_dict_lipid_and_disease_4_combine_lipid_disease_ture_combine_2.pkl'
with open(ner_extract_file,'rb') as f:
    df_dict = pickle.load(f)


In [5]:
df_ = df_dict['PMC10916870_bioc.json']
df_.head()
# df_['have_lipid_and_disease'] = ((df_['predictions_lipid'].notna()) & (df_['predictions_disease'].notna())).astype(int)
# filtered_df = df_[df_['have_lipid_and_disease'] == 1]
# filtered_df.head()
df_[]


Unnamed: 0,iao_name_1,iao_id_1,sentence,sentence_number,split_sentence,predictions,predictions_lipid,predictions_disease
0,document title,IAO:0000305,Cationic amphiphilic drugs induce accumulation...,1,Cationic\namphiphilic\ndrugs\ninduce\naccumula...,,,
1,textual abstract section,IAO:0000315,Lysosomes are acidic organelles responsible fo...,1,Lysosomes\nare\nacidic\norganelles\nresponsibl...,,,
2,textual abstract section,IAO:0000315,These drugs can also induce lysosomal membrane...,2,These\ndrugs\ncan\nalso\ninduce\nlysosomal\nme...,,,"{'tokens': ['[CLS]', 'these', 'drugs', 'can', ..."
3,textual abstract section,IAO:0000315,"Here, we uncover that the cationic amphiphilic...",3,"Here,\nwe\nuncover\nthat\nthe\ncationic\namphi...",,,"{'tokens': ['[CLS]', 'here', ',', 'we', 'un', ..."
4,textual abstract section,IAO:0000315,Using quantitative mass spectrometry-based sho...,4,Using\nquantitative\nmass\nspectrometry-based\...,,,"{'tokens': ['[CLS]', 'using', 'quantitative', ..."


In [8]:
sentence= df_.iloc[2]['sentence']
print(sentence)

tokens2label = df_.iloc[2]['predictions_disease']
print(tokens2label)



These drugs can also induce lysosomal membrane permeabilization and cancer cell death, but the underlying mechanism remains elusive.
{'tokens': ['[CLS]', 'these', 'drugs', 'can', 'also', 'induce', 'l', '##ys', '##oso', '##mal', 'membrane', 'per', '##me', '##abi', '##li', '##zation', 'and', 'cancer', 'cell', 'death', ',', 'but', 'the', 'underlying', 'mechanism', 'remains', 'el', '##usive', '.', '[SEP]'], 'labels': ['B-disease']}


In [None]:
len(tokens2label['tokens'])
tokens2label['labels']

In [14]:
# tokenizer.encode(tokens2label['tokens'])
# tokenizer.convert_ids_to_tokens(tokens2label['tokens'])

aa = tokenizer.tokenize(sentence)

In [15]:
type(aa)

list

In [3]:
label_map = config.id2label
for PMC_bioc_name in df_dict.keys():    
    if 'predictions_lipid' in df_dict[PMC_bioc_name].columns:
        continue
    else:
        print(PMC_bioc_name)
        results = df_dict[PMC_bioc_name]['sentence'].apply(
            lambda x: predict(x, tokenizer, model, label_map, device='cuda')
        )
        
        df_dict[PMC_bioc_name]['predictions_lipid'], df_dict[PMC_bioc_name]['predictions_disease'] = zip(*results)
        with open(f'df_dict_lipid_and_disease_{model_name}.pkl','wb') as f:
            pickle.dump(df_dict, f)


In [1]:
import re

# 示例数据
tokens = ['[CLS]', 'our', 'previous', 'studies', 'have', 'shown', 'that', 'ca', '##d', '-', 'induced', 'l', '##mp', 'and', 'cell', 'death', 'rely', 'on', 'the', 'in', '##hibition', 'of', 'l', '##ys', '##oso', '##mal', 'acid', 's', '##phi', '##ngo', '##my', '##elin', '##ase', '(', 'as', '##m', '/', 's', '##mp', '##d', '##1', ';', 'pet', '##erse', '##n', 'et', 'al', '.', ',', '2013', ')', ',', 'while']
labels = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

# 将子词组合成完整的单词，并为每个完整的单词分配标签
def combine_subwords(tokens, labels):
    combined_tokens = []
    combined_labels = []
    
    current_token = ""
    current_label = labels[0]
    
    for token, label in zip(tokens, labels):
        if token.startswith("##"):
            current_token += token[2:]
        else:
            if current_token:
                combined_tokens.append(current_token)
                combined_labels.append(current_label)
            current_token = token
            current_label = label
    
    # 添加最后一个 token
    if current_token:
        combined_tokens.append(current_token)
        combined_labels.append(current_label)
    
    return combined_tokens, combined_labels

# 执行函数
combined_tokens, combined_labels = combine_subwords(tokens, labels)

# 输出结果
for token, label in zip(combined_tokens, combined_labels):
    print(f"{token}: {label}")


[CLS]: O
our: O
previous: O
studies: O
have: O
shown: O
that: O
cad: O
-: O
induced: O
lmp: O
and: O
cell: O
death: O
rely: O
on: O
the: O
inhibition: O
of: O
lysosomal: O
acid: O
sphingomyelinase: O
(: O
asm: O
/: O
smpd1: O
;: O
petersen: O
et: O
al: O
.: O
,: O
2013: O
): O
,: O
while: O


In [2]:
combined_tokens

['[CLS]',
 'our',
 'previous',
 'studies',
 'have',
 'shown',
 'that',
 'cad',
 '-',
 'induced',
 'lmp',
 'and',
 'cell',
 'death',
 'rely',
 'on',
 'the',
 'inhibition',
 'of',
 'lysosomal',
 'acid',
 'sphingomyelinase',
 '(',
 'asm',
 '/',
 'smpd1',
 ';',
 'petersen',
 'et',
 'al',
 '.',
 ',',
 '2013',
 ')',
 ',',
 'while']

In [16]:
# from transformers import RobertaTokenizerFast
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import sys
config = AutoConfig.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')

example = "This is a tokenization example"

enc = tokenizer(example, add_special_tokens=False)

desired_output = []

#BatchEncoding.word_ids returns a list mapping words to tokens
for w_idx in set(enc.word_ids()):
    #BatchEncoding.word_to_tokens tells us which and how many tokens are used for the specific word
    start, end = enc.word_to_tokens(w_idx)
    # we add +1 because you wanted to start with 1 and not with 0
    start+=1
    end+=1
    desired_output.append(list(range(start,end)))

In [18]:
enc , desired_output

({'input_ids': [1142, 1110, 170, 22559, 2734, 1859], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]},
 [[1], [2], [3], [4, 5], [6]])

In [22]:
print({x : tokenizer.encode(x, add_special_tokens=False, add_prefix_space=True) for x in example.split()})

TypeError: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'add_prefix_space'

In [23]:
example.split()

['This', 'is', 'a', 'tokenization', 'example']

In [31]:
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification

import sys

def load_model_and_tokenizer(model_dir,device = 'cuda'):
    # 加载配置文件、分词器和模型
    config = AutoConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    
    model.to(device)
    model.eval()

    return config, tokenizer, model

def predict(text, tokenizer, model, label_map, device = 'cuda'):
    # 将输入文本分词
    text_split = text.split()
    inputs = tokenizer(text_split, return_tensors="pt", truncation=True, is_split_into_words=True, max_length=512)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # 使用模型进行预测
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # 兼容不同版本的transformers库
        if isinstance(outputs, tuple):
            logits = outputs[0]
        else:
            logits = outputs.logits

    # 获取预测结果
    predictions = torch.argmax(logits, dim=2)
    
    predictions = predictions.cpu().detach().numpy()  #gpu中的torch.tensor,需要先把它放进cpu才可以转化
    

    # 将预测结果转换为标签
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    predicted_labels = [label_map[pred] for pred in predictions[0]]
    prediction ={'tokens':tokens, 'labels':predicted_labels}
    # 
    # 分别提取 lipid 和 disease 标签
    if_have_lipid_predictions = list(set(predicted_labels).intersection(['B-lipid','I-lipid']))
    if_have_disease_predictions = list(set(predicted_labels).intersection(['B-disease', 'I-disease']))

    lipid_result = {'tokens': tokens, 'labels': predicted_labels} if if_have_lipid_predictions else None
    disease_result = {'tokens': tokens, 'labels': predicted_labels} if if_have_disease_predictions else None
    # print(lipid_result, disease_result)
    return lipid_result, disease_result

model_dir = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_2'  # 从命令行参数获取模型目录
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config, tokenizer, model = load_model_and_tokenizer(model_dir, device)
label_map = config.id2label

In [69]:
text = "Thus, changes in plasma  PC(20:1) levels, plasma S1P d18:1 levels, plasma MonCer d18:1 levels or plasma LacCer d18:1 levels were inferred to be disease-induced changes in Alzheimer's disease or DLB"
# predict(sentence, tokenizer, model, label_map, device='cuda')


text_split = text.split()
inputs = tokenizer(text_split, return_tensors="pt", truncation=True, is_split_into_words=True, max_length=512)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# 使用模型进行预测
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    
    # 兼容不同版本的transformers库
    if isinstance(outputs, tuple):
        logits = outputs[0]
    else:
        logits = outputs.logits

# 获取预测结果
predictions = torch.argmax(logits, dim=2)

predictions = predictions.cpu().detach().numpy()  #gpu中的torch.tensor,需要先把它放进cpu才可以转化


# desired_output = []
# for word_id in inputs.word_ids():
#     if word_id is not None:
#         start, end = inputs.word_to_tokens(word_id)
#         if start == end - 1:
#             tokens = [start]
#         else:
#             tokens = [start, end-1]
#         if len(desired_output) == 0 or desired_output[-1] != tokens:
#             desired_output.append(tokens)

# label_list = []
# for range_ in  desired_output:
#     if len(range_) == 1:
#         label_list.append(int(predictions[0][range_]))

#     else:
#         label_list.append(int(predictions[0][range_[0]:range_[1]+1]))


# 将预测结果转换为标签
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
predicted_labels = [label_map[pred] for pred in predictions[0]]
prediction ={'tokens':tokens, 'labels':predicted_labels}
# 
# 分别提取 lipid 和 disease 标签
if_have_lipid_predictions = list(set(predicted_labels).intersection(['B-lipid','I-lipid']))
if_have_disease_predictions = list(set(predicted_labels).intersection(['B-disease', 'I-disease']))

lipid_result = {'tokens': tokens, 'labels': predicted_labels} if if_have_lipid_predictions else None
disease_result = {'tokens': tokens, 'labels': predicted_labels} if if_have_disease_predictions else None

In [70]:
for (token,label) in zip(tokens,predicted_labels):
    print(token,label)
# tokens, predicted_labels

[CLS] O
thus O
, O
changes O
in O
plasma O
p B-lipid
##c O
( I-lipid
20 O
: O
1 O
) O
levels O
, O
plasma O
s O
##1 O
##p I-lipid
d I-lipid
##18 I-lipid
: O
1 O
levels O
, O
plasma O
mon O
##cer O
d O
##18 O
: O
1 O
levels O
or O
plasma O
la B-lipid
##cc B-lipid
##er O
d O
##18 O
: O
1 O
levels O
were O
in O
##ferred O
to O
be O
disease O
- O
induced O
changes O
in O
al B-disease
##z I-disease
##heimer I-disease
' I-disease
s I-disease
disease I-disease
or O
d O
##l O
##b O
[SEP] O


In [60]:
lipid_result

{'tokens': ['[CLS]',
  'thus',
  ',',
  'changes',
  'in',
  'plasma',
  's',
  '##1',
  '##p',
  'd',
  '##16',
  ':',
  '1',
  'levels',
  ',',
  'plasma',
  's',
  '##1',
  '##p',
  'd',
  '##18',
  ':',
  '1',
  'levels',
  ',',
  'plasma',
  'mon',
  '##cer',
  'd',
  '##18',
  ':',
  '1',
  'levels',
  'or',
  'plasma',
  'la',
  '##cc',
  '##er',
  'd',
  '##18',
  ':',
  '1',
  'levels',
  'were',
  'in',
  '##ferred',
  'to',
  'be',
  'disease',
  '-',
  'induced',
  'changes',
  'in',
  'al',
  '##z',
  '##heimer',
  "'",
  's',
  'disease',
  'or',
  'd',
  '##l',
  '##b',
  '[SEP]'],
 'labels': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-lipid',
  'B-lipid',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  '

In [None]:
len(tokens2label['tokens'])
tokens2label['labels']

In [40]:
predict_ = predict(sentence, tokenizer, model, label_map, device='cuda')

In [49]:
encoded.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 5,
 5,
 5,
 6,
 7,
 7,
 7,
 7,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 18,
 19,
 None]

In [50]:
predicted_labels


['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-disease',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [36]:
inputs #, input_ids, predictions

{'input_ids': tensor([[  101,  1292,  5557,  1169,  1145, 21497,   181,  6834, 22354,  7435,
         10936,  1679,  3263, 23156,  2646,  8569,  1105,  4182,  2765,  1473,
           117,  1133,  1103, 10311,  6978,  2606,  8468, 17849,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]])}

In [None]:
len(tokens2label['tokens'])
tokens2label['labels']

In [39]:
example = "These drugs can also induce lysosomal membrane permeabilization and cancer cell death, but the underlying mechanism remains elusive."

encoded = tokenizer(example)

desired_output = []
for word_id in encoded.word_ids():
    if word_id is not None:
        start, end = encoded.word_to_tokens(word_id)
        if start == end - 1:
            tokens = [start]
        else:
            tokens = [start, end-1]
        if len(desired_output) == 0 or desired_output[-1] != tokens:
            desired_output.append(tokens)
desired_output

[[1],
 [2],
 [3],
 [4],
 [5],
 [6, 9],
 [10],
 [11, 15],
 [16],
 [17],
 [18],
 [19],
 [20],
 [21],
 [22],
 [23],
 [24],
 [25],
 [26, 27],
 [28]]

In [37]:
encoded = tokenizer(example)


In [38]:
encoded

{'input_ids': [101, 1292, 5557, 1169, 1145, 21497, 181, 6834, 22354, 7435, 10936, 1679, 3263, 23156, 2646, 8569, 1105, 4182, 2765, 1473, 117, 1133, 1103, 10311, 6978, 2606, 8468, 17849, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [48]:
encoded = tokenizer(example)

desired_output = []
for word_id in encoded.word_ids():
    if word_id is not None:
        start, end = encoded.word_to_tokens(word_id)
        if start == end - 1:
            tokens = [start]
        else:
            tokens = [start, end-1]
        if len(desired_output) == 0 or desired_output[-1] != tokens:
            desired_output.append(tokens)
# desired_output,predict_[1]['tokens']

In [44]:
predict_

(None,
 {'tokens': ['[CLS]',
   'these',
   'drugs',
   'can',
   'also',
   'induce',
   'l',
   '##ys',
   '##oso',
   '##mal',
   'membrane',
   'per',
   '##me',
   '##abi',
   '##li',
   '##zation',
   'and',
   'cancer',
   'cell',
   'death',
   ',',
   'but',
   'the',
   'underlying',
   'mechanism',
   'remains',
   'el',
   '##usive',
   '.',
   '[SEP]'],
  'labels': ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-disease',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']})

In [71]:
# coding=utf-8
import logging
import os
import numpy as np
from torch import nn
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)
from utils_ner import NerDataset, Split, get_labels

logger = logging.getLogger(__name__)

def main():
    # 固定参数
    model_name_or_path = "~/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_2"
    data_dir = "~/bioinfo/P_subject/NLP/biobert/datasets/for_train/datasets_from_download/NER/lipid/2_LipidCorpus_Normalized.Name"
    output_dir = "~/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_2_for_test"
    labels_path = None  # 或者 "path/to/labels.txt"
    max_seq_length = 128
    overwrite_cache = False
    cache_dir = None
    use_fast_tokenizer = False
    seed = 42

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info("Training/evaluation parameters %s", output_dir)

    set_seed(seed)

    labels = get_labels(labels_path)
    label_map = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    config = AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        cache_dir=cache_dir,
        use_fast=use_fast_tokenizer,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_name_or_path,
        config=config,
        cache_dir=cache_dir,
    )

    def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
        preds = np.argmax(predictions, axis=2)
        batch_size, seq_len = preds.shape

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    # 固定的训练参数
    training_args = TrainingArguments(
        output_dir=output_dir,
        do_predict=True,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
    )

    # Predict
    test_dataset = NerDataset(
        data_dir=data_dir,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=max_seq_length,
        overwrite_cache=overwrite_cache,
        mode=Split.test,
    )

    predictions, label_ids, metrics = trainer.predict(test_dataset)
    preds_list, _ = align_predictions(predictions, label_ids)

    # Save predictions
    output_test_results_file = os.path.join(output_dir, "test_results.txt")
    with open(output_test_results_file, "w") as writer:
        logger.info("***** Test results *****")
        for key, value in metrics.items():
            logger.info("  %s = %s", key, value)
            writer.write("%s = %s\n" % (key, value))

    output_test_predictions_file = os.path.join(output_dir, "test_predictions.txt")
    with open(output_test_predictions_file, "w") as writer:
        with open(os.path.join(data_dir, "test.txt"), "r") as f:
            example_id = 0
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    writer.write(line)
                    if not preds_list[example_id]:
                        example_id += 1
                elif preds_list[example_id]:
                    entity_label = preds_list[example_id].pop(0)
                    output_line = line.split()[0] + " " + entity_label + "\n"
                    writer.write(output_line)
                else:
                    logger.warning(
                        "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
                    )

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'utils_ner'