## Extract entities label from biobert model output.

In [41]:
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification
from collections import defaultdict
import numpy as np
import re 
def load_model_and_tokenizer(model_dir, device):
    # 加载配置文件、分词器和模型
    config = AutoConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    
    #add tokenizer
    new_tokens = ['COVID', 'hospitalization','LacCer','Phosphatidylcholine','S1P d18:1']
    num_added_toks = tokenizer.add_tokens(new_tokens)
    model.resize_token_embeddings(len(tokenizer))

    model.to(device)
    model.eval()

    return config, tokenizer, model

def predict(text, tokenizer, model, label_map, device):
    # 将输入文本分词
    text_split = text.split()
    # text_split = re.findall(r'\b\w+\s*\([^)]*\)|\b\w+', text)
    inputs = tokenizer(text_split, return_tensors="pt", truncation=True, is_split_into_words=True, max_length=512, add_special_tokens=False)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # 使用模型进行预测
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # 兼容不同版本的transformers库
        if isinstance(outputs, tuple):
            logits = outputs[0]
        else:
            logits = outputs.logits

    # 获取预测结果
    predictions = logits[0].cpu().detach().numpy()

    # 将预测结果转换为标签
    label_probability = defaultdict(float)
    for idx, word_index in enumerate(inputs.word_ids()):
        label_probability[word_index] += predictions[idx]
    label_probability = np.array(list(label_probability.values()))
    text_label_predictions = np.argmax(label_probability, axis=1)

    return text_split,text_label_predictions


# def predict(text, tokenizer, model, label_map, device='cpu'):
#     # 将输入文本分词
#     text_split = text.split()
#     inputs = tokenizer(text_split, return_tensors="pt", truncation=True, is_split_into_words=True, max_length=512)
#     input_ids = inputs["input_ids"].to(device)
#     attention_mask = inputs["attention_mask"].to(device)

#     # 使用模型进行预测
#     with torch.no_grad():
#         outputs = model(input_ids, attention_mask=attention_mask)
        
#         # 兼容不同版本的transformers库
#         if isinstance(outputs, tuple):
#             logits = outputs[0]
#         else:
#             logits = outputs.logits

#     # 获取预测结果
#     predictions = torch.argmax(logits, dim=2)
    
#     predictions = predictions.cpu().detach().numpy()  # 将GPU中的tensor移至CPU以便转换为numpy数组

#     # 将预测结果映射回标签
#     token_predictions = [label_map[pred] for pred in predictions[0]]
#     word_predictions = []   

#     # 将预测结果重新组合成原始单词形式
#     print('text_split length:     ' + str(len(text_split)))
#     print('tokens_predictions length:' + str(len(token_predictions)))
    
#     for word, prediction in zip(text_split, token_predictions):  #这种方式应该是不行的。
#         word_predictions.append((word, prediction))

#     return word_predictions

# model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_3.3_data_from_MetaboliteNER_replace"  # 替换为你模型的保存路径
# model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_3"  # 替换为你模型的保存路径
model_name = '1_LipidCorpus'
model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER_add_words_change_split_way/" + model_name
model_name = '4_combine_lipid_disease_ture_combine_3'
model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/" + model_name


config, tokenizer, model = load_model_and_tokenizer(model_dir,device = 'cuda')
label_map = config.id2label
# text = '''This severe pulmonary defect in Pla2g5-Tg mice was attributable to marked reduction of the lung surfactant phospholipids, phosphatidylcholine (PC) (Figure 1a) and phosphatidylglycerol (PG) (Figure 1b), as demonstrated by ESI-MS (electrospray ionization mass spectrometry) analysis.'''
# text = 'Thus, changes in plasma PC (16:0) levels, plasma S1P d18:1 levels, plasma MonCer d18:1 levels or plasma LacCer d18:1 levels were inferred to be disease-induced changes in AD or DLB'
# prediction = predict(text, tokenizer, model, label_map,device = 'cuda')
# print(prediction)


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 29001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


In [45]:
text = 'Thus, changes in plasma PC (16:0/18:2) levels, Cer d18:2/21:0， Phosphatidylcholine (22:4w6/22:0)'
# text = 'Additionally we searched for a between SL metabolites and key AERD biomarkers, FA 20:4;15OH (15-HETE), human leukcyte antigen HLA-DBP1*0301'
# text = 'In contrast, ceramide (Cer) Cer 18:1;O2/21:0 and Cer 18:2;O2/23:0, ursodeoxycholic acid (UDCA), and the ratios BCAA to ArAA, and BCAA to tyrosine (BTR) decreased along with the fibrosis stages of the studied patients (absolute PCC'
# text = 'The MS/MS spectrum of PC(17:0/17:0) is shown in the,which showed'
# text = 'This severe pulmonary defect in Pla2g5-Tg mice was attributable to marked reduction of the lung surfactant phospholipids, phosphatidylcholine (PC) (Figure 1a) and phosphatidylglycerol (PG) (Figure 1b), as demonstrated by ESI-MS (electrospray ionization mass spectrometry) analysis.'
prediction = predict(text, tokenizer, model, label_map,device = 'cuda')
for (word, label) in zip(prediction[0],prediction[1]):
    print(word, label_map[label])
# print(prediction)

Thus, O
changes O
in O
plasma O
PC B-lipid
(16:0/18:2) O
levels, O
Cer B-lipid
d18:2/21:0， O
Phosphatidylcholine O
(22:4w6/22:0) O


In [13]:
label_map

{0: 'B-disease', 1: 'I-disease', 2: 'B-lipid', 3: 'I-lipid', 4: 'O'}

In [5]:
len(tokenizer)

29230

In [None]:
import re

sentence = "Hello world (do not split this part) and keep (this together too)"
# 使用正则表达式匹配：普通单词，或者包含括号的部分和前面的单词
parts = re.findall(r'\b\w+\s*\([^)]*\)|\b\w+', sentence)

print(parts)


In [None]:
model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_3"
config = AutoConfig.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(model_dir)
new_tokens = ['COVID', 'hospitalization','Cer','Phosphatidylcholine']
num_added_toks = tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))
model.to('cpu')
model.eval()


In [16]:
from transformers import BertForMaskedLM, BertTokenizer
tokenizer = BertTokenizer.from_pretrained(model_dir, use_fast=True)
model = BertForMaskedLM.from_pretrained(model_dir)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at /home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_3_add_words_test and are newly initialized: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
vocab_size = tokenizer.vocab_size
vocab_size
# print(f"词表大小：{vocab_size}")
# print(f"最大索引：{input_ids.max().item()}")
# if input_ids.max().item() >= vocab_size:
#     print("输入中存在超出词表范围的索引。")

28996

In [None]:
#for test, first version.
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification

def load_model_and_tokenizer(model_dir,device):
    # 加载配置文件、分词器和模型
    config = AutoConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    
    model.to(device)
    model.eval()

    return config, tokenizer, model

def predict(text, tokenizer, model, label_map, device):
    # 将输入文本分词
    text_split = text.split()
    inputs = tokenizer(text_split, return_tensors="pt", truncation=True, is_split_into_words=True, max_length=512, add_special_tokens=False)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # 使用模型进行预测
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # 兼容不同版本的transformers库
        if isinstance(outputs, tuple):
            logits = outputs[0]
        else:
            logits = outputs.logits

    # 获取预测结果
    predictions = torch.argmax(logits, dim=2)
    
    predictions = predictions.cpu().detach().numpy()  #gpu中的torch.tensor,需要先把它放进cpu才可以转化
    

    # 将预测结果转换为标签
    # example = "This is a tokenization example"
    txt_label_index_list = []
    #BatchEncoding.word_ids returns a list mapping words to tokens
    for w_idx in set(inputs.word_ids()):
        start, _ = inputs.word_to_tokens(w_idx)  #BatchEncoding.word_to_tokens tells us which and how many tokens are used for the specific word
        txt_label_index_list.append(start)         # we add +1 because you wanted to start with 1 and not with 0
    txt_predictions  = [predictions[0][index] for index in txt_label_index_list]
    return text_split,txt_predictions


model_name = "4_combine_lipid_disease_ture_combine_3.3_data_from_MetaboliteNER_replace"
# model_name = "4_combine_lipid_disease_ture_combine_3"
model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/" + model_name  # 替换为你模型的保存路径
config, tokenizer, model = load_model_and_tokenizer(model_dir,device="cuda")
label_map = config.id2label

In [23]:
#for test
text = ['''Thus, changes in plasma S1P d16:1 levels, plasma S1P d18:1 levels, plasma MonCer d18:1 levels or plasma LacCer d18:1 levels were inferred to be disease-induced changes in Alzheimer's disease or DLB''']
for sentence in text:
    prediction = predict(sentence, tokenizer, model, label_map, device = 'cuda') 

for word, number in zip(prediction[0], prediction[1]):
    print(word, number)

Thus, 4
changes 4
in 4
plasma 4
S1P 4
d16:1 4
levels, 4
plasma 4
Ceremide(d18:1) 4
levels, 4
plasma 4
MonCer 4
d18:1 4
levels 4
or 4
plasma 4
LacCer 4
d18:1 4
levels 4
were 4
inferred 4
to 4
be 4
disease-induced 4
changes 4
in 4
Alzheimer's 4
disease 4
or 4
DLB 4


In [None]:
for (token, label) in zip(prediction[0]['tokens'][1:],prediction[0]['labels'][1:]):
    print(token, label)


In [12]:
import pickle
path_ = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/extract_result/'
with open(path_ + 'df_dict_only_lipid.pkl','rb') as f:
    df_dict_only_lipid = pickle.load(f)
PMC_bioc_name = 'PMC9481132_bioc.json'

df_dict_only_lipid[PMC_bioc_name]['predictions_lipid_'] = df_dict_only_lipid[PMC_bioc_name]['sentence'].apply(
    lambda x: predict(x, tokenizer, model, label_map)
)

In [33]:
import pickle
with open('df_dict_lipid_and_disease_4_combine_lipid_disease_ture_combine_2.pkl','rb') as f:
    df_dict_only_lipid = pickle.load(f)
# PMC_bioc_name = 'PMC9529825_bioc.json'
# df_dict_only_lipid[PMC_bioc_name].head()

In [43]:
aa = list(df_dict_only_lipid.keys())
aa[3]

'PMC8917834_bioc.json'

In [None]:
PMC_bioc_name = aa[5]
df_dict_only_lipid[PMC_bioc_name].head()
# df_dict_only_lipid[PMC_bioc_name].iloc[2]['predictions_lipid']
# df_dict_only_lipid[PMC_bioc_name].iloc[2]['sentence']

In [25]:
with open('df_dict_lipid_and_disease_4_combine_lipid_disease_ture_combine_2.pkl','rb') as f:
    df_dict_only_lipid = pickle.load(f)

In [24]:
a = list(df_dict_only_lipid.keys())
a.index('PMC9499334_bioc.json')

378

In [None]:
keys_name = a[800]
df_dict_only_lipid[keys_name]
# df_dict_only_lipid[keys_name].iloc[5]['predictions_disease']

In [4]:
# import pickle
# with open('df_dict.pkl','rb') as f:
#     df_dict = pickle.load(f)

for pmc_ in df_dict.keys():
    df = df_dict[pmc_]
    list_ = list(set(df['predictions']))
    if len(list_) > 1:
        print(list_)
    # print(df.head())
    # break

In [40]:
sentence = '''
Thus, changes in plasma S1P d16:1 levels, plasma S1P d18:1 levels, plasma MonCer d18:1 levels or plasma LacCer d18:1 levels were inferred to be disease-induced changes in Alzheimer's disease or DLB
'''
prediction = predict(sentence, tokenizer, model, label_map)
print(list(set(prediction['labels'])))


TypeError: predict() missing 1 required positional argument: 'device'

## The tokens score is be used for output.

In [10]:
#for test, first version.
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification

def load_model_and_tokenizer(model_dir,device = 'cuda'):
    # 加载配置文件、分词器和模型
    config = AutoConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    
    model.to(device)
    model.eval()

    return config, tokenizer, model

def predict(text, tokenizer, model, label_map, device = 'cuda'):
    # 将输入文本分词
    text_split = text.split()
    inputs = tokenizer(text_split, return_tensors="pt", truncation=True, is_split_into_words=True, max_length=512)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # 使用模型进行预测
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    # 获取预测结果matrix
        
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    predictions = outputs[0][0].cpu()
    return tokens, predictions


model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_3.2_data_from_MetaboliteNER_replace"  # 替换为你模型的保存路径
config, tokenizer, model = load_model_and_tokenizer(model_dir)
label_map = config.id2label

TypeError: load_model_and_tokenizer() missing 1 required positional argument: 'device'

In [4]:
import pickle
path_ = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/extract_result/'
with open(path_ + 'df_dict_lipid_and_disease_4_combine_lipid_disease_ture_combine_3.1_data_from_MetaboliteNER_replace_word2label.pkl','rb') as f:
    df_dict = pickle.load(f)

In [20]:
df_dict.keys()

dict_keys(['PMC10916870_bioc.json', 'PMC9529825_bioc.json', 'PMC9029192_bioc.json', 'PMC8917834_bioc.json', 'PMC10524330_bioc.json', 'PMC10925467_bioc.json', 'PMC10643672_bioc.json', 'PMC9599269_bioc.json', 'PMC11130959_bioc.json', 'PMC10436799_bioc.json', 'PMC10522101_bioc.json', 'PMC10510131_bioc.json', 'PMC9512465_bioc.json', 'PMC10588952_bioc.json', 'PMC10511619_bioc.json', 'PMC10888776_bioc.json', 'PMC10351449_bioc.json', 'PMC10531690_bioc.json', 'PMC9440283_bioc.json', 'PMC10605133_bioc.json', 'PMC9052151_bioc.json', 'PMC10909962_bioc.json', 'PMC9322850_bioc.json', 'PMC9134818_bioc.json', 'PMC10820332_bioc.json', 'PMC10513930_bioc.json', 'PMC8994740_bioc.json', 'PMC9083465_bioc.json', 'PMC10418585_bioc.json', 'PMC10495680_bioc.json', 'PMC10421736_bioc.json', 'PMC10418258_bioc.json', 'PMC9499571_bioc.json', 'PMC9693522_bioc.json', 'PMC10499024_bioc.json', 'PMC10785251_bioc.json', 'PMC10489812_bioc.json', 'PMC9523326_bioc.json', 'PMC9327366_bioc.json', 'PMC10826307_bioc.json', 'PMC

In [6]:
df_ = df_dict['PMC8917834_bioc.json']
df_.iloc[0,:]

iao_name_1                                            document title
iao_id_1                                                 IAO:0000305
sentence           Sphingolipid-Induced Programmed Cell Death is ...
sentence_number                                                    1
split_sentence     Sphingolipid-Induced\nProgrammed\nCell\nDeath\...
predictions                                                     None
text_split         [Sphingolipid-Induced, Programmed, Cell, Death...
txt_predictions    [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...
Name: 0, dtype: object

In [7]:
for i in range(len(df_['txt_predictions'])):
    if len(set(df_['txt_predictions'][i]))>1:
        # print(i)
        # print(df_['txt_predictions'][i])
        print(df_['sentence'][i])
        for j in range(len(df_['txt_predictions'][i])):
            if df_['txt_predictions'][i][j] != 4:
                print(df_['text_split'][i][j], df_['txt_predictions'][i][j]) 

Hydroxylation at C1 and C3 generates sphinganine (d18:0), the simplest form of LCB.
sphinganine 2
2000, Brodersen et al.
al. 3
Recently it was shown that ENHANCED DISEASE SUSCEPTIBILITY 1 (EDS1) and PHYTOALEXIN DEFICIENT 4 (PAD4), two key players in regulation of SA synthesis in response to biological and abiotic stresses (Wiermer et al.
PHYTOALEXIN 2
DEFICIENT 3
2010, Saucedo-Garcia et al.
et 3
al. 3
The sid2-2, eds1-2 and pad4-1 mutants were kindly provided by Prof. Christiane Gatz (University of Goettingen, Germany).
Christiane 2
Agnieszka Zienkiewicz,  
Department for Plant Biochemistry, Albrecht-von-Haller-Institute for Plant Sciences, University of Goettingen, Justus-von-Liebig-Weg 11, Goettingen 37077, Germany.
Albrecht-von-Haller-Institute 2
(2016) Orosomucoid proteins interact with the small subunit of serine palmitoyltransferase and contribute to sphingolipid homeostasis and stress responses in Arabidopsis.
serine 2
palmitoyltransferase 3
(2011) Different roles of Enhanced Di

In [9]:
text_ = 'Hydroxylation at C1 and C3 generates sphinganine (d18:0), the simplest form of LCB.'
predict(text_,tokenizer,model,label_map,device = 'cuda')

(['[CLS]',
  'h',
  '##ydro',
  '##xy',
  '##lation',
  'at',
  'c',
  '##1',
  'and',
  'c',
  '##3',
  'generates',
  's',
  '##phi',
  '##nga',
  '##nine',
  '(',
  'd',
  '##18',
  ':',
  '0',
  ')',
  ',',
  'the',
  'simplest',
  'form',
  'of',
  'l',
  '##c',
  '##b',
  '.',
  '[SEP]'],
 tensor([[-11.5979, -12.0405,  -4.2480,  -7.0449,   8.7545],
         [-11.3651, -11.9584,  -5.0264,  -7.6798,   9.1329],
         [-11.2527, -12.0310,  -4.8509,  -7.6502,   8.9388],
         [-11.3015, -12.1064,  -4.2448,  -7.6327,   8.4771],
         [-11.3528, -11.9370,  -4.7792,  -7.5037,   8.7867],
         [-11.7667, -12.5819,  -4.8824,  -7.5821,   9.2205],
         [-11.0883, -12.0056,  -5.1577,  -7.5418,   9.2210],
         [-11.3911, -12.3407,  -4.8599,  -7.6260,   9.1165],
         [-11.1128, -11.9133,  -5.1442,  -7.5462,   9.1477],
         [-11.0246, -11.7948,  -5.2795,  -7.4045,   9.2204],
         [-11.5839, -12.6255,  -4.8079,  -7.5948,   9.1376],
         [-11.3385, -12.1225,  -5

BertTokenizerFast(name_or_path='/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/4_combine_lipid_disease_ture_combine_3', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [15]:
model_name = '4_combine_lipid_disease_ture_combine_2'
model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/"  # 替换模型的保存路径
model_dir = model_dir + model_name + "/"
config, tokenizer, model = load_model_and_tokenizer(model_dir,device = 'cuda')
label_map = config.id2label

In [None]:
import pickle

path_ = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/extract_result/'
with open(path_ + 'df_dict_lipid_and_disease_4_combine_lipid_disease_ture_combine_3.1_data_from_MetaboliteNER_replace_word2label.pkl','rb') as f:
    df_dict = pickle.load(f)

# PMC_bioc_name = 'PMC11130959_bioc.json'
dict_keys_list = list(df_dict.keys())
# start_index = dict_keys_list.index(PMC_bioc_name)
# for PMC_bioc_name in dict_keys_list[start_index:]:    
save_batch = 10
index_ = 0
for PMC_bioc_name in dict_keys_list:    

    if 'score' in df_dict[PMC_bioc_name].columns:
        continue
    else:
        print(PMC_bioc_name)
        results = df_dict[PMC_bioc_name]['sentence'].apply(
            lambda x: predict(x, tokenizer, model, label_map, device='cuda')
        )
        df_dict[PMC_bioc_name]['tokens'], df_dict[PMC_bioc_name]['score'] = zip(*results)
        index_ += 1
        if index_ % save_batch == 0:
            with open(path_+ f'lipid_and_disease_{model_name}'+'_token2score'+'.pkl','wb') as f:
                pickle.dump(df_dict, f)


## Become image

In [None]:
from PIL import Image, ImageDraw, ImageFont

def highlight_text(text, word, highlight_color="yellow", font_path="arial.ttf", font_size=24, output_path="output.png"):
    # Initialize font
    font = ImageFont.truetype(font_path, font_size)
    
    # Create image with white background
    lines = text.split('\n')
    max_line_width = max([font.getsize(line)[0] for line in lines])
    image_height = font_size * len(lines) + 20
    image = Image.new('RGB', (max_line_width + 20, image_height), color = (255, 255, 255))
    draw = ImageDraw.Draw(image)
    
    y_text = 10
    for line in lines:
        words = line.split(' ')
        x_text = 10
        for w in words:
            word_width, word_height = draw.textsize(w + ' ', font=font)
            if w == word:
                # Draw highlighted rectangle
                draw.rectangle([x_text, y_text, x_text + word_width, y_text + word_height], fill=highlight_color)
            draw.text((x_text, y_text), w + ' ', font=font, fill=(0, 0, 0))
            x_text += word_width
        y_text += word_height

    # Save the image
    image.save(output_path)
    print(f"Image saved to {output_path}")

# Example usage
text = "This is a sample text where a specific word is highlighted."
word_to_highlight = "specific"
highlight_text(text, word_to_highlight, highlight_color="yellow", font_path="arial.ttf", font_size=24, output_path="output.png")


In [None]:
from PIL import Image, ImageDraw, ImageFont

def highlight_text(text, word, highlight_color="yellow", font_size=24, output_path="output.png"):
    # Initialize default font
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except OSError:
        font = ImageFont.load_default()
    
    # Create image with white background
    lines = text.split('\n')
    max_line_width = max([font.getsize(line)[0] for line in lines])
    image_height = font_size * len(lines) + 20
    image = Image.new('RGB', (max_line_width + 20, image_height), color=(255, 255, 255))
    draw = ImageDraw.Draw(image)
    
    y_text = 10
    for line in lines:
        words = line.split(' ')
        x_text = 10
        for w in words:
            word_width, word_height = draw.textsize(w + ' ', font=font)
            if w == word:
                # Draw highlighted rectangle
                draw.rectangle([x_text, y_text, x_text + word_width, y_text + word_height], fill=highlight_color)
            draw.text((x_text, y_text), w + ' ', font=font, fill=(0, 0, 0))
            x_text += word_width
        y_text += word_height

    # Save the image
    image.save(output_path)
    print(f"Image saved to {output_path}")

# Example usage
text = "This is a sample text where a specific word is highlighted."
word_to_highlight = "specific"
highlight_text(text, word_to_highlight, highlight_color="yellow", font_size=24, output_path="output.png")


In [None]:
from PIL import Image, ImageDraw, ImageFont

# 定义要处理的文本和标签
tokens = prediction['tokens']
labels = prediction['labels']
# 获取处理后的words和对应的labels
processed_words = prediction['tokens'][1:-1]  # 去掉[CLS]和[SEP]
processed_labels = prediction['labels'][1:-1]  # 去掉[CLS]和[SEP]

# 定义颜色
color_map = {
    "B-disease": "red",
    "I-disease": "red",
    "B-lipid": "blue",
    "I-lipid": "blue"
}

# 恢复原始文本
def reconstruct_text(tokens):
    words = []
    current_word = ""
    for token in tokens:
        if token.startswith("##"):
            current_word += token[2:]
        else:
            if current_word:
                words.append(current_word)
            current_word = token
    if current_word:
        words.append(current_word)
    return words

words = reconstruct_text(tokens[1:-1])  # 去掉[CLS]和[SEP]标记

# 创建一个函数来绘制带有高亮的文本
def draw_highlighted_text(words, labels, color_map, font_path="DejaVuSans-Bold.ttf", font_size=24, output_path="output.png"):
    # 初始化字体
    try:
        font = ImageFont.truetype(font_path, font_size)
    except OSError:
        font = ImageFont.load_default()
    
    # 创建白色背景图像
    image_width = 600  # 增加宽度以提高分辨率
    image_height = 600  # 增加高度以提高分辨率
    image = Image.new('RGB', (image_width, image_height), color=(255, 255, 255))
    draw = ImageDraw.Draw(image)
    
    x_text = 10
    y_text = 10
    
    for word, label in zip(words, labels):
        word_width, word_height = draw.textsize(word, font=font)
        color = color_map.get(label, None)
        if color:
            draw.rectangle([x_text, y_text, x_text + word_width, y_text + word_height], fill=color)
        draw.text((x_text, y_text), word, font=font, fill=(0, 0, 0))
        x_text += word_width + 5  # 添加一些空隙
        
        # 换行处理
        if x_text > image_width - 100:
            x_text = 10
            y_text += word_height + 10
    
    # 保存图片
    image.save(output_path)
    print(f"Image saved to {output_path}")


# 绘制并保存图片
draw_highlighted_text(processed_words, processed_labels, color_map, font_size=24, output_path="output.png")

In [33]:
for token, label in zip(tokens, labels):
    if label != 'O':
        print(token, label)

d I-lipid
la B-lipid
##cc B-lipid
##er I-lipid
d I-lipid
##18 I-lipid
al B-disease
##heimer I-disease
' I-disease
s I-disease
disease I-disease


In [9]:
import pickle
with open('df_dict_only_lipid.pkl','rb') as f:
    df_dict_only_lipid = pickle.load(f)

have = 0
nohave = 0
for key in df_dict_only_lipid.keys():
    # print(key)
    if 'predictions_lipid' in df_dict_only_lipid[key].columns:
        if len(set(df_dict_only_lipid[key]['predictions_lipid'])) > 1:
            print(set(df_dict_only_lipid[key]['predictions_lipid']))
        have += 1
    else:
        print(f'{key},no predictions_lipid')
        nohave += 1
print(f'have predictions_lipid: {have}, no predictions_lipid: {nohave}')

PMC9481132_bioc.json,no predictions_lipid
PMC9552013_bioc.json,no predictions_lipid
PMC10361545_bioc.json,no predictions_lipid
PMC10497391_bioc.json,no predictions_lipid
PMC9305268_bioc.json,no predictions_lipid
PMC10667563_bioc.json,no predictions_lipid
PMC10760103_bioc.json,no predictions_lipid
PMC7345851_bioc.json,no predictions_lipid
PMC10808295_bioc.json,no predictions_lipid
PMC10412683_bioc.json,no predictions_lipid
PMC10537536_bioc.json,no predictions_lipid
PMC9668183_bioc.json,no predictions_lipid
PMC10499237_bioc.json,no predictions_lipid
PMC9225104_bioc.json,no predictions_lipid
PMC9319954_bioc.json,no predictions_lipid
PMC9406929_bioc.json,no predictions_lipid
PMC11141675_bioc.json,no predictions_lipid
PMC10954412_bioc.json,no predictions_lipid
PMC9127619_bioc.json,no predictions_lipid
PMC10746288_bioc.json,no predictions_lipid
PMC9137556_bioc.json,no predictions_lipid
PMC9280915_bioc.json,no predictions_lipid
PMC10846697_bioc.json,no predictions_lipid
PMC10479895_bioc.json,

In [5]:
df_dict_only_lipid.keys()
df_dict_only_lipid['PMC10916870_bioc.json']['predictions_lipid']

0      None
1      None
2      None
3      None
4      None
       ... 
509    None
510    None
511    None
512    None
513    None
Name: predictions_lipid, Length: 514, dtype: object

In [4]:
#lipid and disease
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification
import pickle
import pandas as pd

def load_model_and_tokenizer(model_dir,device):
    # 加载配置文件、分词器和模型
    config = AutoConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    
    model.to(device)
    model.eval()

    return config, tokenizer, model

def predict(text, tokenizer, model, label_map, device):
    # 将输入文本分词
    text_split = text.split()
    inputs = tokenizer(text_split, return_tensors="pt", truncation=True, is_split_into_words=True, max_length=512, add_special_tokens=False)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # 使用模型进行预测
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # 兼容不同版本的transformers库
        if isinstance(outputs, tuple):
            logits = outputs[0]
        else:
            logits = outputs.logits

    # 获取预测结果
    predictions = torch.argmax(logits, dim=2)
    
    predictions = predictions.cpu().detach().numpy()  #gpu中的torch.tensor,需要先把它放进cpu才可以转化
    print(predictions)

    # 将预测结果转换为标签
    # example = "This is a tokenization example"
    txt_label_index_list = []
    #BatchEncoding.word_ids returns a list mapping words to tokens
    for w_idx in set(inputs.word_ids()):
        start, _ = inputs.word_to_tokens(w_idx)  #BatchEncoding.word_to_tokens tells us which and how many tokens are used for the specific word
        txt_label_index_list.append(start)         # we add +1 because you wanted to start with 1 and not with 0
    txt_predictions  = [predictions[0][index] for index in txt_label_index_list]
    return text_split,txt_predictions


# path_ = '/home/data/t200404/bioinfo/P_subject/NLP/biobert/datasets/for_recognize/download_paper_and_use_Auto-CORPus_deal_paper/deal/extract_result/'
# with open(path_ + 'df_dict_lipid_and_disease_4_combine_lipid_disease_ture_combine_2.pkl','rb') as f:
    # df_dict = pickle.load(f)

model_name = '4_combine_lipid_disease_ture_combine_3'
model_dir = "/home/data/t200404/bioinfo/P_subject/NLP/biobert/biobertModelWarehouse/model_from_trained/NER/"  # 替换为你模型的保存路径
model_dir = model_dir + model_name + "/"
config, tokenizer, model = load_model_and_tokenizer(model_dir,device = 'cuda')

label_map = config.id2label

# PMC_bioc_name = 'PMC11130959_bioc.json'
# dict_keys_list = list(df_dict.keys())
# start_index = dict_keys_list.index(PMC_bioc_name)
# for PMC_bioc_name in dict_keys_list[start_index:]:    
# save_batch = 10
# index_ = 0
# for PMC_bioc_name in dict_keys_list:    

#     if 'predictions_lipid' in df_dict[PMC_bioc_name].columns:
#         continue
#     else:
#         print(PMC_bioc_name)
#         results = df_dict[PMC_bioc_name]['sentence'].apply(
#             lambda x: predict(x, tokenizer, model, label_map, device='cuda')
#         )
#         df_dict[PMC_bioc_name]['text_split'], df_dict[PMC_bioc_name]['txt_predictions'] = zip(*results)
#         index_ += 1
#         if index_ % save_batch == 0:
#             with open(path_+ f'df_dict_lipid_and_disease_{model_name}'+'_word2label'+'.pkl','wb') as f:
#                 pickle.dump(df_dict, f)


In [6]:
x= '''Thus, changes in plasma S1P d16:1 levels, plasma Cer d18:1 levels, plasma MonCer d18:1 levels or plasma LacCer (d18:1/16:0) levels were inferred to be disease-induced changes in Alzheimer's disease or DLB'''
text_, pred_label = predict(x, tokenizer, model, label_map, device='cuda')
df_ = pd.DataFrame({'text': text_, 'pred_label': pred_label})
df_