In [4]:
import scispacy
import spacy

nlp = spacy.load("en_ner_bionlp13cg_md")
# text = """To investigate the mechanism underlying the anti-inflammatory effects of SGDGs, we tested the NF-κB pathway because it is critical for the LPS-induced expression of pro-inflammatory cytokines. Treatment of HEK293T cells expressing TLR4 and MD2 with SGDG(14:0/16:0) significantly reduced the NF-κB reporter activity while MGDG(14:0/16:0) did not (Fig. 6d), suggesting that SGDG exerts the anti-inflammatory effects via the TLR4/MD2 mediated NF-κB pathway.
# """
text="In line with our findings in patients, a previous study found decreased levels of 18:0-LPC in mice with NASH"
doc = nlp(text)

print(list(doc.sents))
print(doc.ents)



from spacy import displacy
displacy.render(next(doc.sents), style='dep', jupyter=True)

[In line with our findings in patients, a previous study found decreased levels of 18:0-LPC in mice with NASH]
(line, patients, mice)


In [6]:
import spacy

from scispacy.abbreviation import AbbreviationDetector

nlp = spacy.load("en_ner_bc5cdr_md")

# Add the abbreviation pipe to the spacy pipeline.
nlp.add_pipe("abbreviation_detector")

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")
text="In line with our findings in patients, a previous study found decreased levels of 18:0-LPC in mice with NASH"
print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
	print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

Abbreviation 	 Definition
SBMA 	 (33, 34) Spinal and bulbar muscular atrophy
SBMA 	 (6, 7) Spinal and bulbar muscular atrophy
AR 	 (29, 30) androgen receptor


In [16]:
# dir(nlp)
help(nlp)

Help on English in module spacy.lang.en object:

class English(spacy.language.Language)
 |  English(vocab: Union[spacy.vocab.Vocab, bool] = True, *, max_length: int = 1000000, meta: Dict[str, Any] = {}, create_tokenizer: Union[Callable[[ForwardRef('Language')], Callable[[str], spacy.tokens.doc.Doc]], NoneType] = None, create_vectors: Union[Callable[[ForwardRef('Vocab')], spacy.vectors.BaseVectors], NoneType] = None, batch_size: int = 1000, **kwargs) -> None
 |  
 |  A text-processing pipeline. Usually you'll load this once per process,
 |  and pass the instance around your application.
 |  
 |  Defaults (class): Settings, data and factory methods for creating the `nlp`
 |      object and processing pipeline.
 |  lang (str): IETF language code, such as 'en'.
 |  
 |  DOCS: https://spacy.io/api/language
 |  
 |  Method resolution order:
 |      English
 |      spacy.language.Language
 |      builtins.object
 |  
 |  Data and other attributes defined here:
 |  
 |  Defaults = <class 'spac

In [21]:
nlp.__dict__

{'_config': {'paths': {'vectors': 'output/en_core_sci_md_vectors',
   'init_tok2vec': None,
   'parser_tagger_path': 'output/en_core_sci_md_parser_tagger/model-best',
   'dev_path': 'assets/BC5CDR-IOB/devel.tsv',
   'train_path': 'assets/BC5CDR-IOB/train.tsv',
   'vocab_path': 'project_data/vocab_md.jsonl',
   'train': None,
   'dev': None},
  'system': {'gpu_allocator': None, 'seed': 0},
  'nlp': {'lang': 'en',
   'pipeline': ['tok2vec',
    'tagger',
    'attribute_ruler',
    'lemmatizer',
    'parser',
    'ner'],
   'tokenizer': {'@tokenizers': 'spacy.Tokenizer.v1'},
   'disabled': [],
   'before_creation': None,
   'after_creation': None,
   'after_pipeline_creation': None,
   'batch_size': 1000,
   'vectors': {'@vectors': 'spacy.Vectors.v1'}},
  'components': {'attribute_ruler': {'factory': 'attribute_ruler',
    'scorer': {'@scorers': 'spacy.attribute_ruler_scorer.v1'},
    'validate': False},
   'lemmatizer': {'factory': 'lemmatizer',
    'mode': 'rule',
    'model': None,
   

In [32]:
hasattr(nlp,'tok2vec')

False

In [33]:
callable(nlp.Defaults)

True

In [34]:
nlp.add_pipe

<bound method Language.add_pipe of <spacy.lang.en.English object at 0x7f9132e76fd0>>

In [10]:
import torch
from transformers import BertTokenizer, BertForTokenClassification
from torch.nn import Softmax

# 加载预训练的BERT模型和标记器
model = BertForTokenClassification.from_pretrained("./output/MetabolismNER/")
tokenizer = BertTokenizer.from_pretrained("./output/MetabolismNER/")

# 要识别的文本
text = "Go to: Polychlorinated dibenzo-p -dioxins (PCDDs, dioxins), polychlorinated dibenzofurans (PCDFs), and polychlorinated biphenyls (PCBs) are environmental endocrine disruptors that have half-lives of 7–10 years in the human body and have toxicities that probably include carcinogenesis."

# 将文本分词并添加特殊标记
tokens = tokenizer.tokenize(text)
tokens = ['[CLS]'] + tokens + ['[SEP]']

# 将标记转换为模型输入的索引
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids])

# 推理模式
model.eval()

# 使用模型进行推理
with torch.no_grad():
    outputs = model(input_ids)
    logits = outputs[0]

# 对输出进行softmax处理
softmax = Softmax(dim=2)
probs = softmax(logits)

# 获取最高概率的标签
preds = torch.argmax(probs, dim=2)

# 获取预测的标签并将其转换回实体
pred_labels = [tokenizer.convert_ids_to_tokens(pred.item()) for pred in preds[0]]

# 显示识别结果
for token, label in zip(tokens, pred_labels):
    print(f"{token}\t{label}")


[CLS]	[unused2]
go	[unused2]
to	[unused2]
:	[unused2]
p	[PAD]
##oly	[PAD]
##ch	[PAD]
##lor	[unused1]
##inated	[unused1]
di	[unused1]
##ben	[unused1]
##zo	[unused1]
-	[unused1]
p	[unused1]
-	[unused1]
di	[unused1]
##ox	[unused1]
##ins	[unused2]
(	[unused2]
p	[unused2]
##c	[unused2]
##dd	[unused2]
##s	[unused2]
,	[unused2]
di	[unused2]
##ox	[unused2]
##ins	[unused2]
)	[unused2]
,	[unused2]
p	[unused2]
##oly	[unused2]
##ch	[unused2]
##lor	[unused2]
##inated	[unused2]
di	[unused2]
##ben	[unused2]
##zo	[unused2]
##fu	[unused2]
##ran	[unused2]
##s	[unused2]
(	[unused2]
p	[unused2]
##c	[unused2]
##d	[unused2]
##fs	[unused2]
)	[unused2]
,	[unused2]
and	[unused2]
p	[unused2]
##oly	[unused2]
##ch	[unused2]
##lor	[unused2]
##inated	[unused2]
bi	[unused2]
##phe	[unused2]
##ny	[unused2]
##ls	[unused2]
(	[unused2]
p	[unused2]
##c	[unused2]
##bs	[unused2]
)	[unused2]
are	[unused2]
environmental	[unused2]
end	[unused2]
##oc	[unused2]
##rine	[unused2]
disrupt	[unused2]
##ors	[unused2]
that	[unused2]
ha

In [9]:
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis