In [None]:
import json 
import csv 
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so the cells below can read files from it.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data

The data used here is only a sample of about 200 articles (203 rows) from the whole dataset.

In [None]:
# Read the JSON Lines sample (one JSON document per line) into a DataFrame.
# The file is opened in a `with` block so the handle is closed deterministically,
# and decoded explicitly as UTF-8 because the articles contain non-ASCII text.
data = []
with open('/content/drive/My Drive/dat390_data/sample.jsonl', 'r', encoding='utf-8') as sample_file:
    for line in sample_file:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if line.strip():
            data.append(json.loads(line))
df = pd.DataFrame(data)
print(df.shape)

(203, 6)


## Visualization

In [None]:
df.head(2)

Unnamed: 0,paper_id,_pdf_hash,abstract,body_text,bib_entries,ref_entries
0,77499681,11f281316fe4638843a83cf559ce4f60aade00f8,"[{'section': 'Abstract', 'text': 'The purpose ...","[{'section': '', 'text': 'Values are presented...",{'BIBREF0': {'title': 'Bone health and osteopo...,{'FIGREF0': {'text': '비스포스포네이트를 장기간 복용한 골다공증 환...
1,94550656,42b3e1bd9c4740192f22d8725d470218e86301c8,[],[],{'BIBREF0': {'title': 'Solving ratio-dependent...,{}


In [None]:
# Keep only the rows whose ref_entries value is not an empty dict.
has_ref_entries = df['ref_entries'] != {}
clean_df = df[has_ref_entries]

In [None]:
print(clean_df.shape)

(86, 6)


In [None]:
clean_df.head(2)

Unnamed: 0,paper_id,_pdf_hash,abstract,body_text,bib_entries,ref_entries
0,77499681,11f281316fe4638843a83cf559ce4f60aade00f8,"[{'section': 'Abstract', 'text': 'The purpose ...","[{'section': '', 'text': 'Values are presented...",{'BIBREF0': {'title': 'Bone health and osteopo...,{'FIGREF0': {'text': '비스포스포네이트를 장기간 복용한 골다공증 환...
3,94551546,9bf1cb19041b8ddfca7aeccc9d2f7689c8aa1c7e,"[{'section': 'Abstract', 'text': 'Ethanolamine...","[{'section': 'INTRODUCTION', 'text': 'Gene the...","{'BIBREF0': {'title': 'Cancer statistics', 'au...",{'FIGREF0': {'text': 'General procedures for t...


In [None]:
# Number of body_text entries in each article.
sections = [len(body) for body in clean_df.body_text]
print(sections)

[2, 34, 12, 55, 35, 57, 16, 41, 19, 38, 17, 41, 32, 26, 20, 38, 23, 17, 82, 37, 19, 31, 33, 26, 19, 64, 4, 32, 15, 59, 21, 6, 17, 25, 45, 15, 29, 19, 24, 35, 52, 25, 24, 29, 27, 58, 1, 41, 19, 40, 85, 12, 67, 57, 58, 21, 18, 78, 19, 12, 17, 13, 37, 41, 19, 1, 33, 17, 43, 174, 2, 52, 82, 143, 64, 61, 18, 13, 120, 50, 66, 38, 10, 92, 25, 12]


In [None]:
# First bibliography entry ('BIBREF0') of the article with index label 0.
clean_df.loc[0, 'bib_entries']['BIBREF0']

{'authors': [],
 'link': None,
 'title': 'Bone health and osteoporosis: a report of the surgeon general',
 'venue': '',
 'year': 2004}

In [None]:
# Text of the body_text entry at position 10 (zero-based) for the article with index label 3.
clean_df.loc[3, 'body_text'][10]['text']

'For the preparation of the Ad-PGEA guest, the adamantine-headed ATRP initiator (Ad-Br) with three initiation sites was first synthesized. Pentaerythritol (1.08 g, 7.94 mmol) was thoroughly dissolved in a 50 ml round flask containing 20 ml of anhydrous N,N-dimethylformamide. Then, 1-adamantanecarboxylic acid chloride (1 g, 5.05 mmol) and K 2 CO 3 (1.33 g, 9.64 mmol) were added. The reaction was conducted at 50°C for 24 h under magnetic stirring. The final reaction solution was centrifuged, evaporated and distillated under reduced pressure, producing Ad-OH with three hydroxyl groups. The resultant Ad-Br was prepared using the similar procedures described earlier. 29 Next, 447 mg of Ad-OH (1.5 mmol) was added to a 50 ml round flask containing 7 ml of N,Ndimethylformamide. Then, 0.75 ml of 2-bromoisobutyryl bromide (6 mmol) was dropped into the aforementioned solution under an ice bath condition and stirring for 24 h. The reaction mixture was quenched with water and extracted with CH 2 Cl

## Basic Topic Modeling

### Processing body text

In [None]:
import re

def delete_regex(text):
  """Strip markup, numeric/arithmetic fragments and all-caps abbreviations.

  The following are removed from ``text``, in this order:
    1. anything that looks like an HTML tag (``<...>``),
    2. runs of digits, parentheses and ``- + / *``, plus simple trig calls
       such as ``sin(30)`` or ``cos(Pi)``,
    3. ``#`` and ``/`` characters,
    4. whole words made of two or more consecutive capital letters.

  Removed spans are replaced by the empty string, so surrounding whitespace
  is left as-is (double spaces may remain).

  Args:
    text: the string to clean.

  Returns:
    The cleaned string.
  """
  # All patterns are raw strings so that regex escapes (\d, \b, ...) reach the
  # regex engine unchanged instead of being interpreted as string escapes.
  #remove html
  output = re.sub(r'<[^>]*>', '', text)
  #remove numbers, all maths operations
  output = re.sub(r'((?:[-\d)(+/*]+)?(?:(?:cos|sin|tan)[(](?:\d+?|Pi)[)])?(?:[\-\d\)\(\+/*]+)?)', '', output)
  output = re.sub('#', '', output)
  output = re.sub('/', '', output)
  #remove abbreviations: multiple consecutive capital letters.
  # In a non-raw literal '\b' is the backspace character, not a word boundary,
  # so the previous pattern never matched ordinary text.
  output = re.sub(r'\b[A-Z]{2,}\b', '', output)
  return output

In [None]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer('english')

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the stem of every token.

    Stemming is delegated to the module-level ``porter`` stemmer.
    """
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [None]:
# Build one cleaned string per article: join the 'text' of every body_text
# entry with '. ' and pass the result through delete_regex.
new_df = clean_df.copy()
article_text_column = [
    delete_regex('. '.join(entry['text'] for entry in body))
    for body in clean_df['body_text']
]

new_df['pure_text'] = article_text_column

In [None]:
new_df.head(2)

Unnamed: 0,paper_id,_pdf_hash,abstract,body_text,bib_entries,ref_entries,pure_text
0,77499681,11f281316fe4638843a83cf559ce4f60aade00f8,"[{'section': 'Abstract', 'text': 'The purpose ...","[{'section': '', 'text': 'Values are presented...",{'BIBREF0': {'title': 'Bone health and osteopo...,{'FIGREF0': {'text': '비스포스포네이트를 장기간 복용한 골다공증 환...,Values are presented as number only or median ...
3,94551546,9bf1cb19041b8ddfca7aeccc9d2f7689c8aa1c7e,"[{'section': 'Abstract', 'text': 'Ethanolamine...","[{'section': 'INTRODUCTION', 'text': 'Gene the...","{'BIBREF0': {'title': 'Cancer statistics', 'au...",{'FIGREF0': {'text': 'General procedures for t...,Gene therapy holds potential for treating many...


### Latent Dirichlet Allocation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the cleaned article texts. Both limits can be tuned:
#   max_df=.3         -> words that occur across too many documents are excluded
#   max_features=5000 -> keep the most frequent words, limiting the dimensionality
count = CountVectorizer(
    max_features=5000,
    max_df=.3,
    stop_words='english',
)
X = count.fit_transform(new_df['pure_text'])

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 5-topic LDA model on the count matrix and get per-article topic weights.
# learning_method='batch' uses all data in one go (most accurate), but is slower
# than 'online' (online/mini-batch).
lda = LatentDirichletAllocation(
    learning_method='batch',
    random_state=42,
    n_components=5,
)
X_topics = lda.fit_transform(X)

In [None]:
# Print the n_top_words highest-weighted vocabulary terms of every LDA topic.
n_top_words = 10

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back on old versions.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending, so walk it backwards to get the largest weights first.
    top_word_ids = topic.argsort()[:-n_top_words - 1:-1]
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join(feature_names[i] for i in top_word_ids))

Topic 1:
trust energy mass star power wind magnetic arc equation stellar
Topic 2:
pged age students patients binding gd fitness cancer leptospirosis risk
Topic 3:
cells cell trail noise transformation tsv csf mental rhg patients
Topic 4:
ephb4 zone clay pollen water ec expression deposits prs rh6
Topic 5:
map image pose patients images dengue members care cr wise


The most common words still contain abbreviations like `cr` and terms like `rh6`.

In [None]:
# Medicine / biology: order the articles by their weight in topic column 1
# (highest first) and show the start of the ten top-ranked texts.
medicine_topic = X_topics[:, 1].argsort()[::-1]

for rank, article_idx in enumerate(medicine_topic[:10], start=1):
    snippet = new_df.iloc[article_idx]['pure_text'][:200]
    print('\nMedicine Topic #%d:' % rank)
    print(snippet, '...')


Medicine Topic #1:
transcriptional and metabolic changes that increase production and accumulation of the compatible solute glycerol. Mounting a rapid response to increased osmolarity is essential to yeast survival [3,  ...

Medicine Topic #2:
Gene therapy holds potential for treating many severe diseases, such as cancer and genetic diseases. 1 Successful gene therapy depends on highefficiency gene delivery processes, in which the gene carr ...

Medicine Topic #3:
There is strong interest in nanomaterials, motivated by the development of nanotechnoloy and by their novel properties arising from quantum confinement. In particular, the discovery of various atomic- ...

Medicine Topic #4:
Leptospirosis is a zoonotic bacterial disease that occurs in diverse epidemiological settings but imparts its greatest burden on resource-poor populations [1] [2] [3] [4] [5] [6] . The disease has a b ...

Medicine Topic #5:
In this cohort of healthy middle-aged adults, fitness was significantly associat

In [None]:
# Physics: order the articles by their weight in topic column 0
# (highest first) and show the start of the five top-ranked texts.
physics_topic = X_topics[:, 0].argsort()[::-1]

for rank, article_idx in enumerate(physics_topic[:5], start=1):
    snippet = new_df.iloc[article_idx]['pure_text'][:300]
    print('\nPhysics Topic #%d:' % rank)
    print(snippet, '...')


Physics Topic #1:
In the last decade, spectropolarimetric surveys of OB stars have revealed that about % of these massive stars have large-scale, organized magnetic fields (MiMeS: Wade et al. 2014; BOB: Morel et al. 2015) . Such detectable magnetic fields (B 100 G) have a significant effect on the stellar wind, both  ...

Physics Topic #2:
Training competent physicians is, in many ways, a matter of trust. Patients trust providers to competently address their ailments and assist in helping them achieve good health. Accreditation organizations trust that graduate medical education (GME) programs implement policies and procedures to prep ...

Physics Topic #3:
A tidal disruption event (TDE) occurs when a star is violently ripped apart by a black hole's tidal forces (Hills 1975) . When a star is tidally disrupted, roughly half of the stellar debris remains bound to the black hole while the other half of the debris escapes. The gravitationally bound debris  ...

Physics Topic #4:
name and 

## Named Entity Recognition (NER) Using a BERT Model

In [None]:
#!pip install transformers 
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import time

# Load the pretrained token-classification model and its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("fran-martinez/scibert_scivocab_cased_ner_jnlpba")
model = AutoModelForTokenClassification.from_pretrained("fran-martinez/scibert_scivocab_cased_ner_jnlpba")

# The document is fed to the model in windows of this many characters.
chunk_size = 1000

start = time.time()

document_to_recognise = new_df.iloc[1]['pure_text']

document_to_recognise = delete_regex(document_to_recognise)

token_list = []  # not used below; kept so the cell still defines the same names
entities_dict = {}  # entity label -> list of tokens predicted with that label

# Walk the document from offset 0 in chunk_size steps. Starting the range at 0
# (rather than at chunk_size) also covers the final, possibly shorter, chunk and
# documents shorter than one chunk, which were previously skipped.
for chunk_start in range(0, len(document_to_recognise), chunk_size):
  # take a chunk of the *cleaned* document, i.e. the same string the loop
  # bounds are computed from
  test_sequence = document_to_recognise[chunk_start:chunk_start + chunk_size]

  # pass chunk to BERT (adds a leading batch dimension of size 1)
  input_ids = torch.tensor(tokenizer.encode(test_sequence)).unsqueeze(0)

  # Predict
  with torch.no_grad():
    outputs = model(input_ids)

  # From the output let's take the first element of the tuple.
  # Then, let's get rid of [CLS] and [SEP] tokens (first and last)
  predictions = outputs[0].argmax(axis=-1)[0][1:-1]

  # Map label class indexes to string labels and collect every token whose
  # label is not 'O'.
  for token, pred in zip(tokenizer.tokenize(test_sequence), predictions):
    label = model.config.id2label[pred.item()]
    if label != 'O':
      entities_dict.setdefault(label, []).append(token)

stop = time.time()

In [None]:
print('Time taken to NER for one article is: ', stop-start)

Time taken to NER for one article is:  27.557918310165405


In [None]:
np.unique(entities_dict['B-protein'])

array(['/', 'ab', 'ad', 'cd', 'dt', 'egfp', 'enhanced', 'ep', 'gd', 'ge',
       'gfp', 'luciferase', 'pd', 'pg', 'pge', 'poly', 'polymer', 'prl',
       'ren', 'β'], dtype='<U10')

## References

Oliver Tomic & Kristian Lilland, Applied Machine Learning DAT300, NMBU, 2020

https://github.com/NorskRegnesentral/weak-supervision-for-NER

https://ai.googleblog.com/2020/03/more-efficient-nlp-model-pre-training.html

https://github.com/huggingface/transformers

https://github.com/ZhiGroup/Med-BERT

https://arxiv.org/pdf/1903.10676.pdf

https://huggingface.co/fran-martinez/scibert_scivocab_cased_ner_jnlpba

https://github.com/allenai/scibert

https://link.springer.com/chapter/10.1007/978-3-030-50417-5_23

Thomas Wolf et al., HuggingFace's Transformers: State-of-the-art Natural Language Processing, arXiv, 2019