In [None]:
# Import packages

from IPython.display import display
from IPython.display import HTML
from pypharma_nlp.pubmed import get_publication_sentences
from pypharma_nlp.pubmed import get_publications
from pypharma_nlp.pubmed import get_publications_table
from pypharma_nlp.pubmed import get_search_results
from pypharma_nlp.bert import get_next_sentence_probability
from pypharma_nlp.bert import get_tokens
from pypharma_nlp.bert import get_token_probabilities
from pypharma_nlp.bert import format_text
from pypharma_nlp.bert import plot_token_probabilities
from pypharma_nlp.bertviz.pytorch_transformers_attn import BertModel
from pypharma_nlp.bertviz.pytorch_transformers_attn import BertTokenizer
# NOTE(review): the three aliases below all bind the very same function,
# head_view_bert.show, so show_head, show_model and show_neuron are
# interchangeable as written. Presumably show_model and show_neuron were
# meant to come from separate model-view / neuron-view modules -- confirm
# the module names available in pypharma_nlp.bertviz before changing.
from pypharma_nlp.bertviz.head_view_bert import show as show_head
from pypharma_nlp.bertviz.head_view_bert import show as show_model
from pypharma_nlp.bertviz.head_view_bert import show as show_neuron
from pypharma_nlp.bertviz.colab import configure_plotly_browser_state_head
from pypharma_nlp.bertviz.colab import configure_plotly_browser_state_model
from pypharma_nlp.bertviz.colab import configure_plotly_browser_state_neuron
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Search PubMed for abstracts containing the text 'T790M' and show the
# hits as an HTML table (at most 10 results are requested).
#
# Background: T790M is a mutation in exon 20 of the EGFR gene that
# confers resistance to treatment with Tyrosine Kinase Inhibitors.

records = get_publications("T790M[AB]", max_results=10)
abstracts_table = get_publications_table(records)

# Render without the index column so only the publication fields show.
abstracts_html = abstracts_table.to_html(index=False)
display(HTML(abstracts_html))

In [None]:
# We now focus on a single publication, looked up by its PMID.
#
# NOTE(review): an earlier version of this comment cited PMID 31442277,
# which does not match the PMID actually requested below. The PMID is now
# defined in exactly one place; confirm which publication was intended.

pmid = "31426517"
records = get_publications(pmids=[pmid], max_results=1)

# Take the first (and only requested) record. Fail with a readable message
# instead of a bare StopIteration if the lookup comes back empty.
record = next(records, None)
if record is None:
    raise ValueError("No publication returned for PMID %s" % pmid)

# The "TI" and "AB" fields are printed as the title and the abstract.
print("Title:\n%s" % record["TI"])
print("\nAbstract:\n%s" % record["AB"])

In [None]:
# Split the selected abstract into sentences. The helper takes a list of
# records and yields one result per record, so we pass our single record
# and take the first result.

sentences_generator = get_publication_sentences([record])
sentences = next(sentences_generator)

# Show the sentences as a one-column pandas table, one sentence per row.

table_records = [[sentence] for sentence in sentences]
sentences_table = pd.DataFrame.from_records(
    table_records,
    columns=["Sentences"],
)
sentences_html = sentences_table.to_html(index=False)
display(HTML(sentences_html))

In [None]:
# Take a closer look at the first two sentences of the abstract. They are
# kept in sentence_1 and sentence_2 for the following cells.

sentence_1 = sentences[0]
print("Sentence 1:\n%s" % sentence_1)
sentence_2 = sentences[1]
print("\nSentence 2:\n%s" % sentence_2)

In [None]:
# BERT sees these sentences as a sequence of tokens. We can
# generate the list of tokens as follows.

# First, we put both sentences into a single formatted text that BERT
# can recognize, and print it to inspect the result.
formatted_text = format_text(sentence_1, sentence_2)
print(formatted_text)

In [None]:
# Next, we generate tokens from the formatted text and tabulate them.
#
# Token: a sub-word piece of the text.
#
# Token ID: the index number of that sub-word in the vocabulary.
# There are also special tokens like [CLS], which is found at the
# start of the first sentence, and [SEP], which is found at the end
# of every sentence.
#
# Segment ID: the index of the sentence (segment) a token belongs to,
# 0 for the first sentence and 1 for the second.

tokens, token_ids, segment_ids = get_tokens(formatted_text)

# One column per parallel sequence returned by get_tokens.
token_columns = {
    "Token": tokens,
    "Token ID": token_ids,
    "Segment ID": segment_ids,
}
tokens_table = pd.DataFrame.from_dict(token_columns)
tokens_html = tokens_table.to_html(index=False)
display(HTML(tokens_html))

In [None]:
# We can mask one token by setting its value to [MASK]
# and let BERT try to predict what it should be.

# Presumably the position within `tokens` of the token to mask --
# confirm against get_token_probabilities.
masked_index = 11

(probabilities, top_tokens, top_token_ids, masked_sentence) = get_token_probabilities(
    tokens,
    token_ids,
    segment_ids,
    masked_index,
)
plot_token_probabilities(probabilities, top_tokens, masked_sentence)

In [None]:
# We can use BERT to predict the probability of a second sentence
# following a first one. BERT has to accurately model the similarity
# in the contexts of the two sentences in order to do this.
#
# The same first sentence is paired with three different candidates so
# that the resulting probabilities can be compared side by side.

first_sentence = "A de novo single-nucleotide mutation in the EGFR gene can cause the development of lung cancer."

candidate_sentences = [
    # Candidate 1: an on-topic continuation about EGFR-TKI treatment.
    "EGFR tyrosine kinase inhibitors (EGFR-TKIs) are used for clinical treatment of such lung cancers, but acquired resistance often mitigates their efficacy.",
    # Candidate 2: an obviously wrong sentence (unrelated news text).
    "US President Donald Trump has said he will impose a fresh 10% tariff on another $300bn (£247bn) of Chinese goods, in a sharp escalation of a trade war between the two countries.",
    # Candidate 3: a sentence from another abstract, taken from a
    # non-cancer-related publication.
    "The Aspirin Myocardial Infarction Study (AMIS) was a multicenter, randomized, double-blind, placebo-controlled trial of 1.0 g of aspirin daily in men and women who had had a documented myocardial infarction.",
]

# Candidates are scored in order. The [0, 0] index picks a single value out
# of the 2-D result -- presumably the "is next sentence" probability for the
# one input pair; confirm against get_next_sentence_probability.
probability_1, probability_2, probability_3 = [
    get_next_sentence_probability(first_sentence, candidate)[0, 0]
    for candidate in candidate_sentences
]

# Bar chart comparing the three candidates; labelled so the figure can be
# read on its own when the notebook is skimmed.
plt.bar(range(3), [probability_1, probability_2, probability_3])
plt.xticks(range(3), ["Sentence 1", "Sentence 2", "Sentence 3"])
plt.ylabel("Next-sentence probability")
plt.title("BERT next-sentence prediction")
plt.show()

# Visualizing BERT with bertviz

* [Head View](bertviz/head_view.ipynb)
* [Model View](bertviz/model_view.ipynb)
* [Neuron View](bertviz/neuron_view.ipynb)