In [1]:
import torch
from transformers import BioGptTokenizer, BioGptForCausalLM, set_seed

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BioGptTokenizer.from_pretrained("microsoft/BioGPT-Large")
model = BioGptForCausalLM.from_pretrained("microsoft/BioGPT-Large")

if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs")
    model = torch.nn.DataParallel(model)

model.to(device)  # move model to the device

set_seed(42)


  from .autonotebook import tqdm as notebook_tqdm
2023-03-02 23:11:22.022415: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-02 23:11:22.169216: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-02 23:11:22.682066: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-02 23:11:22.682174: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7

Using 8 GPUs


In [3]:
model.module.get_input_embeddings()

Embedding(57717, 1600, padding_idx=1)

In [8]:

sentence = "Ampicillin is"
inputs = tokenizer(sentence, return_tensors="pt").to(device)


with torch.no_grad():
    if torch.cuda.device_count() > 1:
        beam_output = model.module.generate(inputs["input_ids"],
                                             attention_mask=inputs["attention_mask"],
                                             min_length=100,
                                             max_length=300,
                                             num_beams=5,
                                             early_stopping=True
                                            )
    else:
        beam_output = model.generate(inputs["input_ids"],
                                      attention_mask=inputs["attention_mask"],
                                      min_length=100,
                                      max_length=300,
                                      num_beams=5,
                                      early_stopping=True
                                     )

output = tokenizer.decode(beam_output[0], skip_special_tokens=True)
print(output)

Ampicillin is still the drug of choice in the treatment of gonorrhoea in the UK. < / FREETEXT > < / PARAGRAPH > ▃ < PARAGRAPH > < FREETEXT > Keywords: < / FREETEXT > < / PARAGRAPH > ▃ < PARAGRAPH > < FREETEXT > Gonorrhoea, Penicillinase-producing Neisseria gonorrhoeae (PPNG), Ceftriaxone, Azithromycin < / FREETEXT > < / PARAGRAPH > ▃ < PARAGRAPH > < FREETEXT > Gonorrhoea is a sexually transmitted infection (STI) caused by the Gram-negative diplococcus Neisseria gonorrhoeae. It is the second most common bacterial STI in the world, with an estimated 7 8 million new cases per year. In the UK, gonorrhoea is the most common bacterial STI in men who have sex with men (MSM) and the second most common bacterial STI in heterosexuals. < / FREETEXT > < / PARAGRAPH > ▃ < PARAGRAPH > < FREETEXT > Gonorrhoea is a major public health concern in the UK due to the emergence of antimicrobial resistance (AMR) to the recommended first-line treatment, ceftriaxone (CRO). In 2 0 1 6, the UK Department of Hea

In [9]:
from transformers import pipeline, AutoConfig, AutoTokenizer, AutoModelForTokenClassification

# Load the tokenizer and model
config_d = AutoConfig.from_pretrained("alvaroalon2/biobert_diseases_ner")
tokenizer_d = AutoTokenizer.from_pretrained("alvaroalon2/biobert_diseases_ner", use_fast=True, return_offsets_mapping=True)
model_d = AutoModelForTokenClassification.from_pretrained("alvaroalon2/biobert_diseases_ner", config=config_d)

config_c = AutoConfig.from_pretrained("alvaroalon2/biobert_chemical_ner")
tokenizer_c = AutoTokenizer.from_pretrained("alvaroalon2/biobert_chemical_ner", use_fast=True, return_offsets_mapping=True)
model_c = AutoModelForTokenClassification.from_pretrained("alvaroalon2/biobert_chemical_ner", config=config_c)

config_g = AutoConfig.from_pretrained("alvaroalon2/biobert_genetic_ner")
tokenizer_g = AutoTokenizer.from_pretrained("alvaroalon2/biobert_genetic_ner", use_fast=True, return_offsets_mapping=True)
model_g = AutoModelForTokenClassification.from_pretrained("alvaroalon2/biobert_genetic_ner", config=config_g)


# Create the NER pipeline
ner_dis = pipeline(task="ner", model=model_d, tokenizer=tokenizer_d, framework='pt', grouped_entities=True)
ner_chem = pipeline(task="ner", model=model_c, tokenizer=tokenizer_c, framework='pt', grouped_entities=True)
ner_gene = pipeline(task="ner", model=model_g, tokenizer=tokenizer_g, framework='pt', grouped_entities=True)




In [10]:
import re

def filter_entities(results):
    filtered_results = []
    for result in results:
        entity_type = result['entity_group']
        entity_text = result['word']
        # Filter out entities that are too short or are all digits
        if len(entity_text) > 1 and not entity_text.isdigit():
            # Split the entity text into words and filter out short words and stop words
            words = [word for word in entity_text.split() if len(word) > 2 and not re.match(r'^\W+$', word)]
            # If the entity has at least one valid word, add it to the filtered results
            if len(words) > 0:
                filtered_results.append((entity_type, ' '.join(words)))
    return filtered_results


In [11]:
# Define the input text
text = "Xylazine is an alpha 2-adrenoceptor agonist that has been used as a sedative and analgesic in veterinary medicine for many years, but its effects on the cardiovascular system have not been extensively studied in the dog, and its effects on the central nervous system (CNS) have not been well characterized in the dog, despite the fact that xylazine has been widely used as a sedative and analgesic in veterinary medicine for more than 30 years."

# Run NER on the input text
results = []
results += ner_gene(text)
results += ner_chem(text)
results += ner_dis(text)

filtered_results = filter_entities(results)
print(filtered_results)

[('GENETIC', 'alpha adrenoceptor'), ('CHEMICAL', 'Xylazine'), ('CHEMICAL', '##ylazine'), ('0', 'Xylazine alpha adrenoceptor agonist that has been used sedative and analgesic veterinary medicine for many years, but its effects the cardiovascular system have not been extensively studied the dog, and its effects the central nervous system CNS have not been well characterized the dog, despite the fact that xylazine has been widely used sedative and analgesic veterinary medicine for more than years.')]


In [None]:
results

In [None]:
chemical_entities = []

# Iterate over the list of results
for r in results:
    # Check if the entity is a chemical
    if r["entity_group"] == "CHEMICAL":
        # Combine the subword tokens into a single string
        entity_text = "".join(r["word"])
        # Add the entity text to the list
        chemical_entities.append(entity_text)

# Print the list of chemical entities
print(chemical_entities)