<a href="https://colab.research.google.com/github/nasa/PeTaL-labeller/blob/SJ/auto-labeler/auto_labeler_prototype.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install tensorboardX
!pip install wikipedia
!pip install swifter



In [2]:
import torch
import tensorflow as tf
import pandas as pd
import wikipedia 
import swifter
import numpy as np

## GPU Detection

In [3]:
# GPU detection 

# Get GPU device name
device_name = tf.test.gpu_device_name()

if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [4]:
# If there is a GPU available
if torch.cuda.is_available():    

    # Tell PyTorch to use GPU
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P4


## Import, Parse, and Store Data

In [5]:
#Creating PyDrive instance to load in data from PeTaL shared drive, follow the steps to authenticate
!pip install -U -q PyDrive 
  
from pydrive.auth import GoogleAuth 
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials 
  
  
# Authenticate and create the PyDrive client. 
auth.authenticate_user() 
gauth = GoogleAuth() 
gauth.credentials = GoogleCredentials.get_application_default() 
drive = GoogleDrive(gauth)

In [None]:


#this is the un-parsed articles
# link = 'https://drive.google.com/file/d/1iIZgKs1swHHJuumCU5xyW8tXSAnKAg18/view?usp=sharing'
# id = link.split("/")[-2] 
  
# downloaded = drive.CreateFile({'id':id})  
# downloaded.GetContentFile('articles.csv')   
#df = pd.read_csv('articles.csv')


In [None]:
#'https://petscan.wmflabs.org/' link to pull wikipedia articles and their page ID's

In [None]:
#Scraping article content by ID
def wiki_content(row):
  id = row['pageid']
  try:
    content = wikipedia.page(pageid=id).content
  except:
    content = 'error'
  return content

df['Content'] = df.swifter.apply(wiki_content, axis=1)

In [None]:
#Scraping article summary by ID

def wiki_summary(row):
  id = row['pageid']
  try:
    summary = wikipedia.page(pageid=id).summary
  except:
    summary = 'error'
  return summary

df['Summary'] = df.swifter.apply(wiki_summary, axis=1)

In [None]:
#Saving parsed articles as csv, can be accessed in the "Files" folder on the left, then download if you want
df.to_csv('parsed_articles.csv')

In [6]:
#Google drive link to the parsed articles
link = 'https://drive.google.com/file/d/1XRWsEsNUHjWOjPavwrfuUpaq3DwGGE4D/view?usp=sharing'
id = link.split("/")[-2] 
 
downloaded = drive.CreateFile({'id':id})  
downloaded.GetContentFile('parsed_articles.csv') 
df = pd.read_csv('parsed_articles.csv')

df = df[(df['Content'] != 'error') & df['Content'].notnull()]

#Df 'Content' column into list
docs = list(df['Content'].values)

In [None]:
#Labels

labels = ['Maintain homeostasis', 'Protect from temperature']

In [None]:
df['Content'].value_counts().to_frame()

In [6]:
labeled_df_link = 'https://drive.google.com/file/d/1MJDIPe1C0dFHIPWu0w18IEJhVk4Xbk2x/view?usp=sharing'
#'https://drive.google.com/file/d/1OZnAk64SPXfnaEzQFfhDJd6AX3dntIy9/view?usp=sharing'
labeled_id = labeled_df_link.split("/")[-2]
labeled_downloaded = drive.CreateFile({'id':labeled_id})  
labeled_downloaded.GetContentFile('single_label.csv') 
#'Biological-Strategies-Export-2020-October-01-1849 (1).csv'
labeled_df = pd.read_csv('single_label.csv')

In [7]:
labeled_df = labeled_df[['id', 'Title', 'Living Systems', 'Sources_source_link', 'Functions', 'Wikipedia', 'pdf_links', 'single_label']]
labeled_df = labeled_df[labeled_df['Functions'].notnull( )]
labeled_df = labeled_df[labeled_df['Sources_source_link'].notnull()]

In [8]:
import urllib.request
!pip install pdfminer
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from io import StringIO



In [9]:
#convert pdf into text corpus
def convert_pdf_to_string(file_path):
  output_string = StringIO()
  with open(file_path, 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
      interpreter.process_page(page)

  return (output_string.getvalue())

#Parsing into text
def parse_text(row):
  link = row['Sources_source_link']
  try:
      response = urllib.request.urlopen(link)
      file = open('doc.pdf', 'wb')
      file.write(response.read())
      file.close()
      corpus = convert_pdf_to_string('doc.pdf')
  except:
      corpus = 'Web error occurred'
  
  return corpus

In [11]:
#pdf_links = labeled_df[labeled_df['Sources_source_link'].str.endswith('.pdf')]
#pdf_links['Text'] = pdf_links.apply(parse_text, axis=1)

In [12]:
# pdf_links
labeled_df

Unnamed: 0,id,Title,Living Systems,Sources_source_link,Functions,Wikipedia,pdf_links,single_label
0,2324,Beak design absorbs high-energy impacts,Ramphastos toco,http://dx.doi.org/10.1016/j.actamat.2005.04.04...,Manage impact,"The toco toucan (Ramphastos toco), also known ...",[],Manage impact
1,2362,Saliva regulates digestion,Heloderma suspectum,http://www.jbc.org/content/267/11/7402.abstract,Maintain homeostasis|Regulate cellular processes,"The Gila monster (Heloderma suspectum, HEE-lə...",[],Maintain homeostasis
2,2367,'Bombs' distract predators,Swima bombiviridis,http://dx.doi.org/10.1126/science.1172488,Transform radiant energy (light)|Send light si...,Swima bombiviridis is a worm species that live...,"['/content/325/5943.toc.pdf', '/content/sci/32...",Transform radiant energy (light)
3,2393,Organ generates electricity,Electrophorus electricus,https://epub.uni-regensburg.de/2108/,Modify electric charge|Transform electrical en...,"The electric eel (Electrophorus electricus, ot...",['http://www.uni-regensburg.de/publikationen/m...,Modify electric charge
4,2400,Wings generate lift,Apis mellifera,http://biomimetic.pbworks.com/f/Short-amplitud...,Move in/through gases,The western honey bee or European honey bee (A...,[],Move in/through gases
...,...,...,...,...,...,...,...,...
341,93260,Pheromones turn nematodes into pest-killing ma...,Nematoda,https://biblio.ugent.be/publication/1269676/fi...,"Capture, absorb, or filter organisms|Cooperate...","The nematodes (UK: NEM-ə-tohdz, US: NEEM- Gr...","['/articles/s41598-020-62817-y.pdf', '/article...","Capture, absorb, or filter organisms"
342,93283,Interaction with adults leads to faster nest b...,Taeniopygia guttata,https://academic.oup.com/beheco/article/31/4/8...,Physically assemble structure|Self-replicate,The zebra finch (Taeniopygia guttata) is the m...,request error,Physically assemble structure
343,93287,Chemicals in oregano act as fungicide,Origanum vulgare,"https://sci-hub.st/10.1111/1750-3841.12700,htt...",Chemically break down organic compounds|Distri...,"Oregano (US: , UK: ; Origanum vulgare) is a fl...",[],Chemically break down organic compounds
344,93338,Brain acts as both teacher and student,Taeniopygia guttata,"https://elifesciences.org/articles/20944,https...",Encode/Decode|Learn|Differentiate signal from ...,The zebra finch (Taeniopygia guttata) is the m...,[],Encode/Decode


In [21]:
import re
import string

labels = []
docs = []
labels_test = []
docs_test = []
labels_dict = ['a', 'b', 'c', 'd', 'e']

single_label = labeled_df["single_label"].tolist()
wikipedia = labeled_df["Wikipedia"].tolist()
title = labeled_df["Title"].tolist()
living_systems = labeled_df["Living Systems"].tolist()
for i in range(len(title)):
  if i < len(title) - 310:
    docs.append(wikipedia[i])
    #labels.append(labels_dict.index(single_label[i]))
  else:
    docs.append(wikipedia[i])
    #labels_test.append(labels_dict.index(single_label[i]))
print (len(labels))
print (len(docs))

0
346


## Helper Functions

In [22]:
# Calculate accuracy of predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [23]:
# Format elapsed times as hh:mm:ss
import time
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

## BERT Tokenizer

In [25]:
from transformers import BertTokenizer, BertModel, BertConfig

# Load BERT tokenizer
print('Loading BERT tokenizer')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [26]:
# Make sure it is tokenizing correctly:

# Print original articles
print(' Original: ', docs[0])

# Print a doc split into tokens
print('Tokenized: ', tokenizer.tokenize(docs[0]))

# Print docs as mapped to ids
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(docs[0])))

 Original:  The toco toucan (Ramphastos toco), also known as the common toucan or giant toucan, is the largest and probably the best known species in the toucan family. It is found in semi-open habitats throughout a large part of central and eastern South America. It is a common attraction in zoos.


== Taxonomy and systematics ==
German zoologist Philipp Ludwig Statius Müller described the toco toucan in 1776.


=== Subspecies ===
Two subspecies are recognized:
R. t. toco  - Statius Müller, 1776: Found in the Guianas, northern and north-eastern Brazil and south-eastern Peru
R. t. albogularis - Cabanis, 1862: Originally described as a separate species. Found in eastern and southern Brazil, northern Bolivia, Paraguay and northern Argentina


== Description ==
The toco toucan has a striking plumage with a mainly black body, a white throat, chest and uppertail-coverts, and red undertail-coverts. What appears to be a blue iris is actually thin blue skin around the eye. This blue skin is su

In [27]:
max_len = 0

for d in docs:
    # tokenize text and add `[CLS]` and `[SEP]` tokens
    input_ids = tokenizer.encode(d, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (1886 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3977 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (969 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2361 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (8733 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for

ValueError: ignored

In [28]:
# Finishing tokenizing all docs and map tokens to thier word IDs
input_ids = []
attention_masks = []

for d in docs:

    encoded_dict = tokenizer.encode_plus(
                        d,                      # Docs to encode.
                        truncation=True,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all docs
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Attention masks
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)


labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', docs[0])
print('Token IDs:', input_ids[0])
print('Reverse:', tokenizer.convert_ids_to_tokens(input_ids[0]))

ValueError: ignored

In [None]:
# Split up training & testing/validation

from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)

# 80:20 split

# Number of docs to include per set
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training docs'.format(train_size))
print('{:>5,} validation docs'.format(val_size))

In [None]:
# Iterator using torch DataLoader class so that entire dataset doesn't need to be stored in memory

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# batch size can be 16 or 32
batch_size = 32

# Sample in random order when training
train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

# Sample sequentially for validation
validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size # Evaluate with this batch size.
        )

## Training the Classification Model w/ Sequence Classification
  (fine-tune BERT)

  [HuggingFace documentation](https://huggingface.co/transformers/v2.2.0/model_doc/bert.html)

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# BertForSequenceClassification -> BERT model w/ added classification layer 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # 12-layer model, uncased vocab
    num_labels = 5, # Number of labels 
    """CHANGE ABOVE"""
    output_attentions = False, 
    output_hidden_states = False, 
)

# this needs to be run on GPU
model.cuda()

## Optimizer for our hypermarameters / Learning Rate Scheduler
AdamW

Possible hyperparamters: 
* batch size: 16, 32
* learning rate: 5e-5, 3e-5, 2e-5
* number of epochs: 2, 3, 4

In [None]:
# Exeprimenting w/ different parameters
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8 # epsilon prevents division by 0??
                )

In [None]:
from transformers import get_linear_schedule_with_warmup

# Training epochs should be betw 2- 4 (reduce if overfitting)
epochs = 4

total_steps = len(train_dataloader) * epochs

# LR scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

## Training Loop

In [None]:
import random

# based on huggingface transformers `run_glue.py` script : https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_vals = []
total_time = time.time()

for epoch_i in range(0, epochs):       
    ## TRAINING
    print("")
    print('-------- Epoch {:} / {:} --------'.format(epoch_i + 1, epochs))
    print('Training...')
    print("")

    t0 = time.time()
    total_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 20 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            # progress from every 20 batches
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # pytorch tensors in batch (gpu usage)
        #   batch[0] -> input ids 
        #   batch[1] -> attention masks
        #   batch[2] -> labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # clear gradients before forward and backward passes
        model.zero_grad()        

        # forward pass
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
        total_loss += loss.item()

        # backward pass
        loss.backward()

        # clip to prevent exploding gradients (??)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # optimizer and lr updare
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)            
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    ## VALIDATION
    
    print("")
    print("Validation...")

    t0 = time.time()
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # pytorch tensors in batch (gpu usage)
        #   batch[0] -> input ids 
        #   batch[1] -> attention masks
        #   batch[2] -> labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        

            # forward pass
            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        total_eval_loss += loss.item()

        # move logits and labels -> CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()  
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # print final validation accuracy
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)   
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # epoch values & stats
    training_vals.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training done")
print("Time to train: {:} (h:mm:ss)".format(format_time(time.time()-total_time)))

In [None]:
# Display metrics of training process in a dataframe

import pandas as pd

pd.set_option('precision', 2)

df_vals = pd.DataFrame(data=training_stats)
df_vals = df_vals.set_index('epoch')

df_vals

In [None]:
input_ids_test = []
attention_masks_test = []
actual_labels_test=[]

for i in range(500):

    encoded_dict = tokenizer.encode_plus(
                        sentences_test[i],                      
                        add_special_tokens = True, 
                        max_length = 256,           
                        pad_to_max_length = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',     
                   )
    
   
    input_ids_test.append(encoded_dict['input_ids'])
    

    attention_masks_test.append(encoded_dict['attention_mask'])
    actual_labels_test.append(labels_test[i])

# lists -> tensors
input_ids_test = torch.cat(input_ids_test, dim=0)
attention_masks_test = torch.cat(attention_masks_test, dim=0)
actual_labels_test = torch.tensor(actual_labels_test)

batch_size = 32  

# build DataLoader
prediction_data = TensorDataset(input_ids_test, attention_masks_test, actual_labels_test)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

## Testing Classification

In [None]:
print('Label predictions for {:,} test publications...'.format(len(input_ids_test)))
model.eval()

predictions, actual_labels = [], []

for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  
  b_input_ids, b_input_mask, b_labels = batch
  # save memory and accelerate predictions w/o storing gradients
    with torch.no_grad():
      # forward pass and logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)
  logits = outputs[0]

  # move logits and labels -> CPU
  logits = logits.detach().cpu().numpy()
  labels_ids_test = b_labels.to('cpu').numpy()
  
  predictions.append(logits)
  actual_labels.append(labels_ids_test)

classification_correct = 0

for i in range(len(predictions)):
  for j in range(len(predictions[i])):
    prediction = np.argmax(predictions[i][j])
    print ('Prediction: ' , prediction , ', actual: ', actual_labels[i][j])
    if prediction == actual_labels[i][j]:
      classification_correct = classification_correct + 1

print ('Classification correctly: ',  classification_correct)

print ('Model accuracy from testing: {0:.2f}'.format(classification_correct / len(input_ids_test)))

In [None]:
## Ignore this cell for now
# Trying out example BERT

# Single training/test example for simple sequence classification
class InputExample(object):

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    """Single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

In [None]:
# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()

# Initializing a model from the bert-base-uncased style configuration
model = BertModel(configuration)

model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)

In [None]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)