In [4]:
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification

def load_model_and_tokenizer(model_dir, device='cuda'):
    # 加载配置文件、分词器和模型
    config = AutoConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    
    model.to(device)
    model.eval()

    return config, tokenizer, model

def predict(text, tokenizer, model, label_map, device='cuda'):
    # 将输入文本分词
    text_split = text.split()
    inputs = tokenizer(text_split, return_tensors="pt", truncation=True, is_split_into_words=True, max_length=512)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # 使用模型进行预测
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # 兼容不同版本的transformers库
        if isinstance(outputs, tuple):
            logits = outputs[0]
        else:
            logits = outputs.logits

    # 获取预测结果
    predictions = torch.argmax(logits, dim=2)
    
    predictions = predictions.cpu().detach().numpy()  # 将GPU中的tensor移至CPU以便转换为numpy数组

    # 将预测结果映射回标签
    token_predictions = [label_map[pred] for pred in predictions[0]]
    word_predictions = []   

    # 将预测结果重新组合成原始单词形式
    for word, prediction in zip(text_split, token_predictions):
        word_predictions.append((word, prediction))

    return word_predictions

model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_3"  # 替换为你模型的保存路径
config, tokenizer, model = load_model_and_tokenizer(model_dir)
label_map = config.id2label
# text = '''This severe pulmonary defect in Pla2g5-Tg mice was attributable to marked reduction of the lung surfactant phospholipids, phosphatidylcholine (PC) (Figure 1a) and phosphatidylglycerol (PG) (Figure 1b), as demonstrated by ESI-MS (electrospray ionization mass spectrometry) analysis.'''
text = 'Thus, changes in plasma S1P d16:1 levels, plasma S1P d18:1 levels, plasma MonCer d18:1 levels or plasma LacCer d18:1 levels were inferred to be disease-induced changes in AD or DLB'
prediction = predict(text, tokenizer, model, label_map)
print(prediction)


340.82s - Error calculating Smart Step Into Variants.
Traceback (most recent call last):
  File "/home/data/t200404/software/anaconda3/envs/python3_11_gpu/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_comm.py", line 1027, in internal_get_step_in_targets_json
    variants = pydevd_bytecode_utils.calculate_smart_step_into_variants(frame, start_line, end_line)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/data/t200404/software/anaconda3/envs/python3_11_gpu/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_bytecode_utils.py", line 816, in calculate_smart_step_into_variants
    for target in _get_smart_step_into_targets(code):
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/data/t200404/software/anaconda3/envs/python3_11_gpu/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_bytecode_utils.py", line 678, in _get_smart_step_i

[('Thus,', 'O'), ('changes', 'O'), ('in', 'O'), ('plasma', 'O'), ('S1P', 'O'), ('d16:1', 'O'), ('levels,', 'B-lipid'), ('plasma', 'B-lipid'), ('S1P', 'I-lipid'), ('d18:1', 'I-lipid'), ('levels,', 'I-lipid'), ('plasma', 'O'), ('MonCer', 'O'), ('d18:1', 'O'), ('levels', 'O'), ('or', 'O'), ('plasma', 'B-lipid'), ('LacCer', 'B-lipid'), ('d18:1', 'O'), ('levels', 'I-lipid'), ('were', 'I-lipid'), ('inferred', 'O'), ('to', 'O'), ('be', 'O'), ('disease-induced', 'O'), ('changes', 'O'), ('in', 'B-lipid'), ('AD', 'I-lipid'), ('or', 'I-lipid'), ('DLB', 'I-lipid'), ('Thus,', 'O'), ('changes', 'O'), ('in', 'O'), ('plasma', 'O'), ('S1P', 'O'), ('d16:1', 'O'), ('levels,', 'B-lipid'), ('plasma', 'B-lipid'), ('S1P', 'I-lipid'), ('d18:1', 'I-lipid'), ('levels,', 'I-lipid'), ('plasma', 'O'), ('MonCer', 'O'), ('d18:1', 'O'), ('levels', 'O'), ('or', 'O'), ('plasma', 'B-lipid'), ('LacCer', 'B-lipid'), ('d18:1', 'O'), ('levels', 'I-lipid'), ('were', 'I-lipid'), ('inferred', 'O'), ('to', 'O'), ('be', 'O'), ('

In [5]:
#for test, first version.
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification

def load_model_and_tokenizer(model_dir,device = 'cuda'):
    # 加载配置文件、分词器和模型
    config = AutoConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    
    model.to(device)
    model.eval()

    return config, tokenizer, model

def predict(text, tokenizer, model, label_map, device = 'cuda'):
    # 将输入文本分词
    text_split = text.split()
    inputs = tokenizer(text_split, return_tensors="pt", truncation=True, is_split_into_words=True, max_length=512)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # 使用模型进行预测
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # 兼容不同版本的transformers库
        if isinstance(outputs, tuple):
            logits = outputs[0]
        else:
            logits = outputs.logits

    # 获取预测结果
    predictions = torch.argmax(logits, dim=2)
    
    predictions = predictions.cpu().detach().numpy()  #gpu中的torch.tensor,需要先把它放进cpu才可以转化
    

    # 将预测结果转换为标签
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    predicted_labels = [label_map[pred] for pred in predictions[0]]
    prediction ={'tokens':tokens, 'labels':predicted_labels}


    # retB = list(set(predicted_labels).intersection(['B-lipid','I-lipid']))  #
    # print(text)
    # if len(retB) >= 1:
    #     # print(prediction['tokens'])
    #     print(text)
    #     print(prediction['labels'])
    # if len(set(predicted_labels))  > 1:

    # 分别提取 lipid 和 disease 标签
    if_have_lipid_predictions = list(set(predicted_labels).intersection(['B-lipid','I-lipid']))
    if_have_disease_predictions = list(set(predicted_labels).intersection(['B-disease', 'I-disease']))

    lipid_result = {'tokens': tokens, 'labels': predicted_labels} if if_have_lipid_predictions else None
    disease_result = {'tokens': tokens, 'labels': predicted_labels} if if_have_disease_predictions else None
    # print(lipid_result, disease_result)
    return lipid_result, disease_result


model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_3"  # 替换为你模型的保存路径
config, tokenizer, model = load_model_and_tokenizer(model_dir)
label_map = config.id2label

In [11]:
#for test
text = ['''Thus, changes in plasma S1P d16:1 levels, plasma S1P d18:1 levels, plasma MonCer d18:1 levels or plasma LacCer d18:1 levels were inferred to be disease-induced changes in Alzheimer's disease or DLB''']
for sentence in text:
    prediction = predict(sentence, tokenizer, model, label_map) 

{0: 'B-disease', 1: 'I-disease', 2: 'B-lipid', 3: 'I-lipid', 4: 'O'}

In [10]:
for (token, label) in zip(prediction[0]['tokens'][1:],prediction[0]['labels'][1:]):
    print(token, label)


thus O
, O
changes O
in O
plasma O
s B-lipid
##1 B-lipid
##p I-lipid
d I-lipid
##16 I-lipid
: O
1 O
levels O
, O
plasma O
s B-lipid
##1 B-lipid
##p O
d I-lipid
##18 I-lipid
: O
1 O
levels O
, O
plasma O
mon B-lipid
##cer I-lipid
d I-lipid
##18 I-lipid
: O
1 O
levels O
or O
plasma O
la O
##cc B-lipid
##er O
d O
##18 O
: O
1 O
levels O
were O
in O
##ferred O
to O
be O
disease O
- O
induced O
changes O
in O
al O
##z O
##heimer O
' O
s O
disease O
or O
d O
##l O
##b O
[SEP] O


In [12]:
import pickle
path_ = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/extract_result/'
with open(path_ + 'df_dict_only_lipid.pkl','rb') as f:
    df_dict_only_lipid = pickle.load(f)
PMC_bioc_name = 'PMC9481132_bioc.json'

df_dict_only_lipid[PMC_bioc_name]['predictions_lipid_'] = df_dict_only_lipid[PMC_bioc_name]['sentence'].apply(
    lambda x: predict(x, tokenizer, model, label_map)
)

In [33]:
import pickle
with open('df_dict_lipid_and_disease_4_combine_lipid_disease_ture_combine_2.pkl','rb') as f:
    df_dict_only_lipid = pickle.load(f)
# PMC_bioc_name = 'PMC9529825_bioc.json'
# df_dict_only_lipid[PMC_bioc_name].head()

In [43]:
aa = list(df_dict_only_lipid.keys())
aa[3]

'PMC8917834_bioc.json'

In [46]:
PMC_bioc_name = aa[5]
df_dict_only_lipid[PMC_bioc_name].head()
# df_dict_only_lipid[PMC_bioc_name].iloc[2]['predictions_lipid']
# df_dict_only_lipid[PMC_bioc_name].iloc[2]['sentence']

Unnamed: 0,iao_name_1,iao_id_1,sentence,sentence_number,split_sentence,predictions,predictions_lipid,predictions_disease
0,document title,IAO:0000305,Integrative brain omics approach reveals key r...,1,Integrative\nbrain\nomics\napproach\nreveals\n...,,,"{'tokens': ['[CLS]', 'in', '##te', '##gra', '#..."
1,document part,IAO:0000314,ROSMAP resources can be requested at https://w...,1,ROSMAP\nresources\ncan\nbe\nrequested\nat\nhtt...,,,
2,document part,IAO:0000314,Data available upon reasonable request.,2,Data\navailable\nupon\nreasonable\nrequest.,,,
3,textual abstract section,IAO:0000315,The biology of individual lipid species and th...,1,The\nbiology\nof\nindividual\nlipid\nspecies\n...,,,"{'tokens': ['[CLS]', 'the', 'biology', 'of', '..."
4,textual abstract section,IAO:0000315,We utilized non-targeted mass spectrometry to ...,2,We\nutilized\nnon-targeted\nmass\nspectrometry...,,,


In [25]:
with open('df_dict_lipid_and_disease_4_combine_lipid_disease_ture_combine_2.pkl','rb') as f:
    df_dict_only_lipid = pickle.load(f)

In [24]:
a = list(df_dict_only_lipid.keys())
a.index('PMC9499334_bioc.json')

378

In [31]:
keys_name = a[800]
df_dict_only_lipid[keys_name]
# df_dict_only_lipid[keys_name].iloc[5]['predictions_disease']

Unnamed: 0,iao_name_1,iao_id_1,sentence,sentence_number,split_sentence,predictions
0,document title,IAO:0000305,Mapping Proteome and Lipidome Changes in Early...,1,Mapping\nProteome\nand\nLipidome\nChanges\nin\...,
1,keywords section,IAO:0000630,"3-dimensional cell culture, spheroids, human h...",1,"3-dimensional\ncell\nculture,\nspheroids,\nhum...",
2,document part,IAO:0000314,Lipidomics and proteomics datasets generated a...,1,Lipidomics\nand\nproteomics\ndatasets\ngenerat...,
3,document part,IAO:0000314,The data can be found at https://www.ebi.ac.uk...,2,The\ndata\ncan\nbe\nfound\nat\nhttps://www.ebi...,
4,textual abstract section,IAO:0000315,Non-alcoholic fatty liver disease affects one-...,1,Non-alcoholic\nfatty\nliver\ndisease\naffects\...,
...,...,...,...,...,...,...
1316,references section,IAO:0000320,Toxicol.,4,Toxicol.,
1317,references section,IAO:0000320,Res.,5,Res.,
1318,references section,IAO:0000320,2013;2:163–172.,6,2013;2:163–172.,
1319,references section,IAO:0000320,doi: 10.1039/c3tx20086h.,7,doi:\n10.1039/c3tx20086h.,


In [4]:
# import pickle
# with open('df_dict.pkl','rb') as f:
#     df_dict = pickle.load(f)

for pmc_ in df_dict.keys():
    df = df_dict[pmc_]
    list_ = list(set(df['predictions']))
    if len(list_) > 1:
        print(list_)
    # print(df.head())
    # break

In [15]:
sentence = '''
Thus, changes in plasma S1P d16:1 levels, plasma S1P d18:1 levels, plasma MonCer d18:1 levels or plasma LacCer d18:1 levels were inferred to be disease-induced changes in Alzheimer's disease or DLB
'''
prediction = predict(sentence.split(), tokenizer, model, label_map)
print(list(set(prediction['labels'])))


['B-lipid', 'B-disease', 'I-disease', 'O', 'I-lipid']


In [None]:
from PIL import Image, ImageDraw, ImageFont

def highlight_text(text, word, highlight_color="yellow", font_path="arial.ttf", font_size=24, output_path="output.png"):
    # Initialize font
    font = ImageFont.truetype(font_path, font_size)
    
    # Create image with white background
    lines = text.split('\n')
    max_line_width = max([font.getsize(line)[0] for line in lines])
    image_height = font_size * len(lines) + 20
    image = Image.new('RGB', (max_line_width + 20, image_height), color = (255, 255, 255))
    draw = ImageDraw.Draw(image)
    
    y_text = 10
    for line in lines:
        words = line.split(' ')
        x_text = 10
        for w in words:
            word_width, word_height = draw.textsize(w + ' ', font=font)
            if w == word:
                # Draw highlighted rectangle
                draw.rectangle([x_text, y_text, x_text + word_width, y_text + word_height], fill=highlight_color)
            draw.text((x_text, y_text), w + ' ', font=font, fill=(0, 0, 0))
            x_text += word_width
        y_text += word_height

    # Save the image
    image.save(output_path)
    print(f"Image saved to {output_path}")

# Example usage
text = "This is a sample text where a specific word is highlighted."
word_to_highlight = "specific"
highlight_text(text, word_to_highlight, highlight_color="yellow", font_path="arial.ttf", font_size=24, output_path="output.png")


## Become image

In [12]:
from PIL import Image, ImageDraw, ImageFont

def highlight_text(text, word, highlight_color="yellow", font_size=24, output_path="output.png"):
    # Initialize default font
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except OSError:
        font = ImageFont.load_default()
    
    # Create image with white background
    lines = text.split('\n')
    max_line_width = max([font.getsize(line)[0] for line in lines])
    image_height = font_size * len(lines) + 20
    image = Image.new('RGB', (max_line_width + 20, image_height), color=(255, 255, 255))
    draw = ImageDraw.Draw(image)
    
    y_text = 10
    for line in lines:
        words = line.split(' ')
        x_text = 10
        for w in words:
            word_width, word_height = draw.textsize(w + ' ', font=font)
            if w == word:
                # Draw highlighted rectangle
                draw.rectangle([x_text, y_text, x_text + word_width, y_text + word_height], fill=highlight_color)
            draw.text((x_text, y_text), w + ' ', font=font, fill=(0, 0, 0))
            x_text += word_width
        y_text += word_height

    # Save the image
    image.save(output_path)
    print(f"Image saved to {output_path}")

# Example usage
text = "This is a sample text where a specific word is highlighted."
word_to_highlight = "specific"
highlight_text(text, word_to_highlight, highlight_color="yellow", font_size=24, output_path="output.png")


Image saved to output.png


  max_line_width = max([font.getsize(line)[0] for line in lines])
  word_width, word_height = draw.textsize(w + ' ', font=font)
  word_width, word_height = draw.textsize(w + ' ', font=font)
  word_width, word_height = draw.textsize(w + ' ', font=font)
  word_width, word_height = draw.textsize(w + ' ', font=font)
  word_width, word_height = draw.textsize(w + ' ', font=font)
  word_width, word_height = draw.textsize(w + ' ', font=font)
  word_width, word_height = draw.textsize(w + ' ', font=font)
  word_width, word_height = draw.textsize(w + ' ', font=font)
  word_width, word_height = draw.textsize(w + ' ', font=font)
  word_width, word_height = draw.textsize(w + ' ', font=font)
  word_width, word_height = draw.textsize(w + ' ', font=font)


In [None]:
from PIL import Image, ImageDraw, ImageFont

# 定义要处理的文本和标签
tokens = prediction['tokens']
labels = prediction['labels']
# 获取处理后的words和对应的labels
processed_words = prediction['tokens'][1:-1]  # 去掉[CLS]和[SEP]
processed_labels = prediction['labels'][1:-1]  # 去掉[CLS]和[SEP]

# 定义颜色
color_map = {
    "B-disease": "red",
    "I-disease": "red",
    "B-lipid": "blue",
    "I-lipid": "blue"
}

# 恢复原始文本
def reconstruct_text(tokens):
    words = []
    current_word = ""
    for token in tokens:
        if token.startswith("##"):
            current_word += token[2:]
        else:
            if current_word:
                words.append(current_word)
            current_word = token
    if current_word:
        words.append(current_word)
    return words

words = reconstruct_text(tokens[1:-1])  # 去掉[CLS]和[SEP]标记

# 创建一个函数来绘制带有高亮的文本
def draw_highlighted_text(words, labels, color_map, font_path="DejaVuSans-Bold.ttf", font_size=24, output_path="output.png"):
    # 初始化字体
    try:
        font = ImageFont.truetype(font_path, font_size)
    except OSError:
        font = ImageFont.load_default()
    
    # 创建白色背景图像
    image_width = 600  # 增加宽度以提高分辨率
    image_height = 600  # 增加高度以提高分辨率
    image = Image.new('RGB', (image_width, image_height), color=(255, 255, 255))
    draw = ImageDraw.Draw(image)
    
    x_text = 10
    y_text = 10
    
    for word, label in zip(words, labels):
        word_width, word_height = draw.textsize(word, font=font)
        color = color_map.get(label, None)
        if color:
            draw.rectangle([x_text, y_text, x_text + word_width, y_text + word_height], fill=color)
        draw.text((x_text, y_text), word, font=font, fill=(0, 0, 0))
        x_text += word_width + 5  # 添加一些空隙
        
        # 换行处理
        if x_text > image_width - 100:
            x_text = 10
            y_text += word_height + 10
    
    # 保存图片
    image.save(output_path)
    print(f"Image saved to {output_path}")


# 绘制并保存图片
draw_highlighted_text(processed_words, processed_labels, color_map, font_size=24, output_path="output.png")

In [33]:
for token, label in zip(tokens, labels):
    if label != 'O':
        print(token, label)

d I-lipid
la B-lipid
##cc B-lipid
##er I-lipid
d I-lipid
##18 I-lipid
al B-disease
##heimer I-disease
' I-disease
s I-disease
disease I-disease


In [9]:
import pickle
with open('df_dict_only_lipid.pkl','rb') as f:
    df_dict_only_lipid = pickle.load(f)

have = 0
nohave = 0
for key in df_dict_only_lipid.keys():
    # print(key)
    if 'predictions_lipid' in df_dict_only_lipid[key].columns:
        if len(set(df_dict_only_lipid[key]['predictions_lipid'])) > 1:
            print(set(df_dict_only_lipid[key]['predictions_lipid']))
        have += 1
    else:
        print(f'{key},no predictions_lipid')
        nohave += 1
print(f'have predictions_lipid: {have}, no predictions_lipid: {nohave}')

PMC9481132_bioc.json,no predictions_lipid
PMC9552013_bioc.json,no predictions_lipid
PMC10361545_bioc.json,no predictions_lipid
PMC10497391_bioc.json,no predictions_lipid
PMC9305268_bioc.json,no predictions_lipid
PMC10667563_bioc.json,no predictions_lipid
PMC10760103_bioc.json,no predictions_lipid
PMC7345851_bioc.json,no predictions_lipid
PMC10808295_bioc.json,no predictions_lipid
PMC10412683_bioc.json,no predictions_lipid
PMC10537536_bioc.json,no predictions_lipid
PMC9668183_bioc.json,no predictions_lipid
PMC10499237_bioc.json,no predictions_lipid
PMC9225104_bioc.json,no predictions_lipid
PMC9319954_bioc.json,no predictions_lipid
PMC9406929_bioc.json,no predictions_lipid
PMC11141675_bioc.json,no predictions_lipid
PMC10954412_bioc.json,no predictions_lipid
PMC9127619_bioc.json,no predictions_lipid
PMC10746288_bioc.json,no predictions_lipid
PMC9137556_bioc.json,no predictions_lipid
PMC9280915_bioc.json,no predictions_lipid
PMC10846697_bioc.json,no predictions_lipid
PMC10479895_bioc.json,

In [5]:
df_dict_only_lipid.keys()
df_dict_only_lipid['PMC10916870_bioc.json']['predictions_lipid']

0      None
1      None
2      None
3      None
4      None
       ... 
509    None
510    None
511    None
512    None
513    None
Name: predictions_lipid, Length: 514, dtype: object

In [4]:
#lipid and disease
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification
import pickle
import pandas as pd

def load_model_and_tokenizer(model_dir,device):
    # 加载配置文件、分词器和模型
    config = AutoConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    
    model.to(device)
    model.eval()

    return config, tokenizer, model

def predict(text, tokenizer, model, label_map, device):
    # 将输入文本分词
    text_split = text.split()
    inputs = tokenizer(text_split, return_tensors="pt", truncation=True, is_split_into_words=True, max_length=512, add_special_tokens=False)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # 使用模型进行预测
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # 兼容不同版本的transformers库
        if isinstance(outputs, tuple):
            logits = outputs[0]
        else:
            logits = outputs.logits

    # 获取预测结果
    predictions = torch.argmax(logits, dim=2)
    
    predictions = predictions.cpu().detach().numpy()  #gpu中的torch.tensor,需要先把它放进cpu才可以转化
    print(predictions)

    # 将预测结果转换为标签
    # example = "This is a tokenization example"
    txt_label_index_list = []
    #BatchEncoding.word_ids returns a list mapping words to tokens
    for w_idx in set(inputs.word_ids()):
        start, _ = inputs.word_to_tokens(w_idx)  #BatchEncoding.word_to_tokens tells us which and how many tokens are used for the specific word
        txt_label_index_list.append(start)         # we add +1 because you wanted to start with 1 and not with 0
    txt_predictions  = [predictions[0][index] for index in txt_label_index_list]
    return text_split,txt_predictions


# path_ = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/extract_result/'
# with open(path_ + 'df_dict_lipid_and_disease_4_combine_lipid_disease_ture_combine_2.pkl','rb') as f:
    # df_dict = pickle.load(f)

model_name = '4_combine_lipid_disease_ture_combine_3'
model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/"  # 替换为你模型的保存路径
model_dir = model_dir + model_name + "/"
config, tokenizer, model = load_model_and_tokenizer(model_dir,device = 'cuda')

label_map = config.id2label

# PMC_bioc_name = 'PMC11130959_bioc.json'
# dict_keys_list = list(df_dict.keys())
# start_index = dict_keys_list.index(PMC_bioc_name)
# for PMC_bioc_name in dict_keys_list[start_index:]:    
# save_batch = 10
# index_ = 0
# for PMC_bioc_name in dict_keys_list:    

#     if 'predictions_lipid' in df_dict[PMC_bioc_name].columns:
#         continue
#     else:
#         print(PMC_bioc_name)
#         results = df_dict[PMC_bioc_name]['sentence'].apply(
#             lambda x: predict(x, tokenizer, model, label_map, device='cuda')
#         )
#         df_dict[PMC_bioc_name]['text_split'], df_dict[PMC_bioc_name]['txt_predictions'] = zip(*results)
#         index_ += 1
#         if index_ % save_batch == 0:
#             with open(path_+ f'df_dict_lipid_and_disease_{model_name}'+'_word2label'+'.pkl','wb') as f:
#                 pickle.dump(df_dict, f)


In [6]:
x= '''Thus, changes in plasma S1P d16:1 levels, plasma Cer d18:1 levels, plasma MonCer d18:1 levels or plasma LacCer (d18:1/16:0) levels were inferred to be disease-induced changes in Alzheimer's disease or DLB'''
text_, pred_label = predict(x, tokenizer, model, label_map, device='cuda')
df_ = pd.DataFrame({'text': text_, 'pred_label': pred_label})
df_