In [10]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the pre-trained BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=2)

# Define the input text
input_text = "I have a headache and a fever"

# Tokenize the input text
input_ids = torch.tensor([tokenizer.encode(input_text)])

# Make predictions with the model
with torch.no_grad():
    outputs = model(input_ids)
    predictions = torch.argmax(outputs[0], axis=-1)

# Map the predicted labels to symptom entities
symptoms = []
for i, label in enumerate(predictions[0]):
    if label == 1:
        symptoms.append(tokenizer.convert_ids_to_tokens(input_ids[0][i].item()))




Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [12]:
print("Extracted Symptoms: ", symptoms)

Extracted Symptoms:  ['[CLS]', 'I', 'have', 'a', 'headache', 'and', 'a', 'fever', '[SEP]']


In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the pre-trained BioBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=3)

# Define the input text
input_text = "I have a headache and a fever"

# Tokenize the input text
tokens = tokenizer.tokenize(input_text)
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

# Pad the token ids to a fixed length
max_length = 32
padding_length = max_length - len(tokens_ids)
padded_tokens_ids = tokens_ids + [0] * padding_length
attention_mask = [1] * len(tokens_ids) + [0] * padding_length

# Convert the token ids to a PyTorch tensor
input_ids = torch.tensor([padded_tokens_ids])
attention_mask = torch.tensor([attention_mask])

# Make predictions with the model
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs[0], axis=-1)

# Map the predicted labels to symptom entities
symptoms = []
for i, label in enumerate(predictions[0]):
    if label == 1:
        symptoms.append(tokenizer.convert_ids_to_tokens(input_ids[0][i].item()))

print("Extracted Symptoms: ", symptoms)


Downloading (…)lve/main/config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

Extracted Symptoms:  ['i', 'have', 'a', 'headache', 'and', 'a', 'fever', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [14]:
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scispacy
  Downloading scispacy-0.5.1-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.9/44.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pysbd
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting spacy<3.5.0,>=3.4.0
  Downloading spacy-3.4.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
Collecting conllu
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Collecting nmslib>=1.7.3.6
  Downloading nmslib-2.1.1-cp39-cp39-manylinux2010_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m48.3 MB/s[

In [15]:
import spacy
from spacy import displacy

# Load the COVID-19 specific model from SciSpacy
nlp = spacy.load("en_core_sci_sm")

# Define the input text
input_text = "I have a fever, cough, and shortness of breath. Could this be COVID-19?"

# Process the input text with the SciSpacy model
doc = nlp(input_text)

# Extract COVID-related symptoms from the processed text
symptoms = []
for entity in doc.ents:
    if entity.label_ == "SYMPTOM":
        symptoms.append(entity.text)

print("Extracted COVID Symptoms: ", symptoms)


Extracted COVID Symptoms:  []


In [16]:
# import required libraries
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# define the model names
model_names = ['emilyalsentzer/Bio_ClinicalBERT']#, 'monologg/biobert_v1.1_pubmed', 
               #'allenai/scibert_scivocab_uncased']  #'BiospireDx/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12'

# initialize the models and tokenizers
models = []
tokenizers = []
for model_name in model_names:
    print(model_name)
    models.append(AutoModelForTokenClassification.from_pretrained(model_name))
    tokenizers.append(AutoTokenizer.from_pretrained(model_name))

# define the symptom labels
symptom_labels = ['cough', 'fever', 'shortness of breath', 'fatigue', 'body aches', 'loss of smell', 'loss of taste']

# initialize the NER pipelines
pipelines = []
for model, tokenizer in zip(models, tokenizers):
    nlp = pipeline('ner', model=model, tokenizer=tokenizer, device=0)
    pipelines.append(nlp)

# define the input text
input_text = "The patient presented with a cough, fever, and body aches, but no shortness of breath or loss of smell or taste."

# extract symptoms using each model
for i, nlp in enumerate(pipelines):
    print(f"\n\nUsing {model_names[i]}:")
    symptom_spans = []
    for ent in nlp(input_text):
        if ent['word'] in symptom_labels and ent['score'] > 0.5:
            symptom_spans.append((ent['start'], ent['end'], ent['word']))
    symptoms = [input_text[start:end] for start, end, text in symptom_spans]
    print(f"Extracted Symptoms: {symptoms}")


emilyalsentzer/Bio_ClinicalBERT


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint 



Using emilyalsentzer/Bio_ClinicalBERT:
Extracted Symptoms: ['cough', 'fever']


In [17]:
# import required libraries
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# define the model names
model_names = ['monologg/biobert_v1.1_pubmed']

# initialize the models and tokenizers
models = []
tokenizers = []
for model_name in model_names:
    print(model_name)
    models.append(AutoModelForTokenClassification.from_pretrained(model_name))
    tokenizers.append(AutoTokenizer.from_pretrained(model_name))

# define the symptom labels
symptom_labels = ['cough', 'fever', 'shortness of breath', 'fatigue', 'body aches', 'loss of smell', 'loss of taste']

# initialize the NER pipelines
pipelines = []
for model, tokenizer in zip(models, tokenizers):
    nlp = pipeline('ner', model=model, tokenizer=tokenizer, device=0)
    pipelines.append(nlp)

# define the input text
input_text = "The patient presented with a cough, fever, and body aches, but no shortness of breath or loss of smell or taste."



monologg/biobert_v1.1_pubmed


Some weights of the model checkpoint at monologg/biobert_v1.1_pubmed were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [18]:

# extract symptoms using each model
for i, nlp in enumerate(pipelines):
    print(f"\n\nUsing {model_names[i]}:")
    symptom_spans = []
    for ent in nlp(input_text):
        if ent['word'] in symptom_labels and ent['score'] > 0.5:
            symptom_spans.append((ent['start'], ent['end'], ent['word']))
    symptoms = [input_text[start:end] for start, end, text in symptom_spans]
    print(f"Extracted Symptoms: {symptoms}")



Using monologg/biobert_v1.1_pubmed:
Extracted Symptoms: ['cough', 'fever']


In [19]:
# import required libraries
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# define the model names
model_names = ['allenai/scibert_scivocab_uncased']

# initialize the models and tokenizers
models = []
tokenizers = []
for model_name in model_names:
    print(model_name)
    models.append(AutoModelForTokenClassification.from_pretrained(model_name))
    tokenizers.append(AutoTokenizer.from_pretrained(model_name))

# define the symptom labels
symptom_labels = ['cough', 'fever', 'shortness of breath', 'fatigue', 'body aches', 'loss of smell', 'loss of taste']

# initialize the NER pipelines
pipelines = []
for model, tokenizer in zip(models, tokenizers):
    nlp = pipeline('ner', model=model, tokenizer=tokenizer, device=0)
    pipelines.append(nlp)

# define the input text
input_text = "The patient presented with a cough, fever, and body aches, but no shortness of breath or loss of smell or taste."

# extract symptoms using each model
for i, nlp in enumerate(pipelines):
    print(f"\n\nUsing {model_names[i]}:")
    symptom_spans = []
    for ent in nlp(input_text):
        if ent['word'] in symptom_labels and ent['score'] > 0.5:
            symptom_spans.append((ent['start'], ent['end'], ent['word']))
    symptoms = [input_text[start:end] for start, end, text in symptom_spans]
    print(f"Extracted Symptoms: {symptoms}")


allenai/scibert_scivocab_uncased


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi



Using allenai/scibert_scivocab_uncased:
Extracted Symptoms: ['cough', 'fever']
