In [None]:
# Import packages

from IPython.display import display
from IPython.display import HTML
from pypharma_nlp.pubmed import get_publication_sentences
from pypharma_nlp.pubmed import get_publications
from pypharma_nlp.pubmed import get_publications_table
from pypharma_nlp.pubmed import get_search_results
from pypharma_nlp.bert import get_tokens
from pypharma_nlp.bert import get_token_probabilities
from pypharma_nlp.bert import plot_token_probabilities
import pandas as pd

In [None]:
# Fetch abstracts that contain the text 'T790M'.
#
# Background: T790M is a mutation in exon 20 of the EGFR gene that
# confers resistance to treatment with Tyrosine Kinase Inhibitors.

records = get_publications("T790M[AB]", max_results=10)
abstracts_table = get_publications_table(records)

# Render the table as HTML without the index column.
abstracts_html = abstracts_table.to_html(index=False)
display(HTML(abstracts_html))

In [None]:
# From here on we work with a single publication, PMID: 31442277.

records = get_publications(pmids=["31442277"], max_results=1)
record = next(records)  # first record produced for the requested PMID

# Print the title ("TI") and then the abstract ("AB"), in that order.
for heading, field in (("Title:\n%s", "TI"), ("\nAbstract:\n%s", "AB")):
    print(heading % record[field])

In [None]:
# Split the abstract of this record into a list of sentences.

sentences_generator = get_publication_sentences([record])
sentences = next(sentences_generator)  # first item produced for the one record passed in

# Show the sentences as a one-column pandas table.

table_records = [[sentence] for sentence in sentences]
sentences_table = pd.DataFrame.from_records(
    table_records,
    columns=["Sentences"],
)
sentences_html = sentences_table.to_html(index=False)
display(HTML(sentences_html))

In [None]:
# Take a look at the 3rd and 4th sentences (zero-based positions 2 and 3).

third_position, fourth_position = 2, 3

sentence_3 = sentences[third_position]
print("Sentence 3:\n%s" % sentence_3)

sentence_4 = sentences[fourth_position]
print("\nSentence 4:\n%s" % sentence_4)

In [None]:
# BERT sees these sentences as a sequence of tokens. We can
# generate the list of tokens as follows.

# First, we put the two sentences into a format that BERT can recognize:
# [CLS] opens the text and [SEP] closes each sentence.
sentence_pair = (sentence_3, sentence_4)
formatted_text = "[CLS] %s [SEP] %s [SEP]" % sentence_pair
print(formatted_text)

In [None]:
# Next, we generate tokens from the formatted text.
#
# Token: a sub-word.
#
# Token ID: the index number of the sub-word in the vocabulary.
# There are also special tokens like [CLS], which is found at the
# start of the first sentence, and [SEP], which is found at the end
# of every sentence.
#
# Segment ID: the index of the sentence (segment), 0 for the first
# and 1 for the second sentence.

tokens, token_ids, segment_ids = get_tokens(formatted_text)

# One table column per list returned by get_tokens.
token_columns = {
    "Token" : tokens,
    "Token ID" : token_ids,
    "Segment ID" : segment_ids,
}
tokens_table = pd.DataFrame(token_columns)
tokens_html = tokens_table.to_html(index=False)
display(HTML(tokens_html))

In [None]:
# We can mask one token by setting its value to [MASK]
# and let BERT try to predict what it should be.

# NOTE(review): presumably the position in `tokens` of the token to
# mask -- confirm against pypharma_nlp.bert.get_token_probabilities.
mask_index = 4

# The third return value gets its own name instead of being assigned
# back to `token_ids`. Rebinding `token_ids` here would overwrite the
# ids produced by get_tokens, so re-running this cell would feed its
# previous output back in as input.
# NOTE(review): the exact meaning of the returned ids is not visible
# here -- confirm before using `returned_token_ids`.
probabilities, top_tokens, returned_token_ids, masked_sentence = \
    get_token_probabilities(tokens, token_ids, segment_ids, mask_index)
plot_token_probabilities(probabilities, top_tokens, masked_sentence)