<a href="https://colab.research.google.com/github/nasa/PeTaL-labeller/blob/SJ/auto-labeler/auto_labeler_prototype_xlnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install tensorboardX
!pip install wikipedia
!pip install swifter

In [None]:
import torch
import tensorflow as tf
import pandas as pd
import wikipedia 
import swifter
import numpy as np

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from pytorch_transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from pytorch_transformers import AdamW

from tqdm import tqdm, trange
import io
import matplotlib.pyplot as plt
% matplotlib inline

## GPU Detection

In [None]:
# GPU detection 

# Get GPU device name
device_name = tf.test.gpu_device_name()

if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
# If there is a GPU available
if torch.cuda.is_available():    

    # Tell PyTorch to use GPU
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('GPU:', torch.cuda.get_device_name(0))

# If not
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## Import, Parse, and Store Data

In [None]:
#Creating PyDrive instance to load in data from PeTaL shared drive, follow the steps to authenticate
!pip install -U -q PyDrive 
  
from pydrive.auth import GoogleAuth 
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials 
  
  
# Authenticate and create the PyDrive client. 
auth.authenticate_user() 
gauth = GoogleAuth() 
gauth.credentials = GoogleCredentials.get_application_default() 
drive = GoogleDrive(gauth)

In [None]:


#this is the un-parsed articles
# link = 'https://drive.google.com/file/d/1iIZgKs1swHHJuumCU5xyW8tXSAnKAg18/view?usp=sharing'
# id = link.split("/")[-2] 
  
# downloaded = drive.CreateFile({'id':id})  
# downloaded.GetContentFile('articles.csv')   
#df = pd.read_csv('articles.csv')


In [None]:
#'https://petscan.wmflabs.org/' link to pull wikipedia articles and their page ID's

In [None]:
labeled_df_link = 'https://drive.google.com/file/d/1MJDIPe1C0dFHIPWu0w18IEJhVk4Xbk2x/view?usp=sharing'
#'https://drive.google.com/file/d/1OZnAk64SPXfnaEzQFfhDJd6AX3dntIy9/view?usp=sharing'
labeled_id = labeled_df_link.split("/")[-2]
labeled_downloaded = drive.CreateFile({'id':labeled_id})  
labeled_downloaded.GetContentFile('single_label.csv') 
#'Biological-Strategies-Export-2020-October-01-1849 (1).csv'
labeled_df = pd.read_csv('single_label.csv')

In [None]:
labeled_df = labeled_df[['id', 'Title', 'Living Systems', 'Sources_source_link', 'Functions', 'Wikipedia', 'pdf_links', 'single_label']]
labeled_df = labeled_df[labeled_df['Functions'].notnull( )]
labeled_df = labeled_df[labeled_df['Sources_source_link'].notnull()]

In [None]:
import urllib.request
!pip install pdfminer
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from io import StringIO



In [None]:
pdf_links = labeled_df[labeled_df['Sources_source_link'].str.endswith('.pdf')]
pdf_links['Text'] = pdf_links.apply(parse_text, axis=1)

In [None]:
# pdf_links
labeled_df

Unnamed: 0,id,Title,Living Systems,Sources_source_link,Functions,Wikipedia,pdf_links,single_label
0,2324,Beak design absorbs high-energy impacts,Ramphastos toco,http://dx.doi.org/10.1016/j.actamat.2005.04.04...,Manage impact,"The toco toucan (Ramphastos toco), also known ...",[],Manage impact
1,2362,Saliva regulates digestion,Heloderma suspectum,http://www.jbc.org/content/267/11/7402.abstract,Maintain homeostasis|Regulate cellular processes,"The Gila monster (Heloderma suspectum, HEE-lə...",[],Maintain homeostasis
2,2367,'Bombs' distract predators,Swima bombiviridis,http://dx.doi.org/10.1126/science.1172488,Transform radiant energy (light)|Send light si...,Swima bombiviridis is a worm species that live...,"['/content/325/5943.toc.pdf', '/content/sci/32...",Transform radiant energy (light)
3,2393,Organ generates electricity,Electrophorus electricus,https://epub.uni-regensburg.de/2108/,Modify electric charge|Transform electrical en...,"The electric eel (Electrophorus electricus, ot...",['http://www.uni-regensburg.de/publikationen/m...,Modify electric charge
4,2400,Wings generate lift,Apis mellifera,http://biomimetic.pbworks.com/f/Short-amplitud...,Move in/through gases,The western honey bee or European honey bee (A...,[],Move in/through gases
...,...,...,...,...,...,...,...,...
341,93260,Pheromones turn nematodes into pest-killing ma...,Nematoda,https://biblio.ugent.be/publication/1269676/fi...,"Capture, absorb, or filter organisms|Cooperate...","The nematodes (UK: NEM-ə-tohdz, US: NEEM- Gr...","['/articles/s41598-020-62817-y.pdf', '/article...","Capture, absorb, or filter organisms"
342,93283,Interaction with adults leads to faster nest b...,Taeniopygia guttata,https://academic.oup.com/beheco/article/31/4/8...,Physically assemble structure|Self-replicate,The zebra finch (Taeniopygia guttata) is the m...,request error,Physically assemble structure
343,93287,Chemicals in oregano act as fungicide,Origanum vulgare,"https://sci-hub.st/10.1111/1750-3841.12700,htt...",Chemically break down organic compounds|Distri...,"Oregano (US: , UK: ; Origanum vulgare) is a fl...",[],Chemically break down organic compounds
344,93338,Brain acts as both teacher and student,Taeniopygia guttata,"https://elifesciences.org/articles/20944,https...",Encode/Decode|Learn|Differentiate signal from ...,The zebra finch (Taeniopygia guttata) is the m...,[],Encode/Decode


In [None]:
# All labels in dataset sorted by frequency
labeled_df['single_label'] = labeled_df.apply(lambda x: x['single_label'].split('|')[0], axis=1)
labeled_df['single_label'].value_counts().index

In [None]:
import re
import string

labels = []
docs = []
labels_test = []
docs_test = []
labels_dict = ['Capture, absorb, or filter organisms', 'Capture, absorb, or filter chemical entities', 'Protect from microbe', 'Move in/on liquids', 'Attach temporarily']

single_label = labeled_df["single_label"].tolist()
wikipedia = labeled_df["Wikipedia"].tolist()
title = labeled_df["Title"].tolist()
living_systems = labeled_df["Living Systems"].tolist()
for i in range(len(title)):
  if i < len(title) - 73: #310, 36
    if single_label[i] == 'Capture, absorb, or filter organisms':
      docs.append(wikipedia[i])
      labels.append(labels_dict.index(single_label[i]))
    if single_label[i] == 'Capture, absorb, or filter chemical entities':
      docs.append(wikipedia[i])
      labels.append(labels_dict.index(single_label[i]))
    if single_label[i] == 'Protect from microbe':
      docs.append(wikipedia[i])
      labels.append(labels_dict.index(single_label[i]))
    if single_label[i] == 'Move in/on liquids':
      docs.append(wikipedia[i])
      labels.append(labels_dict.index(single_label[i]))
    if single_label[i] == 'Attach temporarily':
      docs.append(wikipedia[i])
      labels.append(labels_dict.index(single_label[i]))
  else:
    if single_label[i] == 'Capture, absorb, or filter organisms':
      docs_test.append(wikipedia[i])
      labels_test.append(labels_dict.index(single_label[i]))
    if single_label[i] == 'Capture, absorb, or filter chemical entities':
      docs_test.append(wikipedia[i])
      labels_test.append(labels_dict.index(single_label[i]))
    if single_label[i] == 'Protect from microbe':
      docs_test.append(wikipedia[i])
      labels_test.append(labels_dict.index(single_label[i]))
    if single_label[i] == 'Move in/on liquids':
      docs_test.append(wikipedia[i])
      labels_test.append(labels_dict.index(single_label[i]))
    if single_label[i] == 'Attach temporarily':
      docs_test.append(wikipedia[i])
      labels_test.append(labels_dict.index(single_label[i]))

print ("Number of training labels: {:}".format(len(labels)))
print ("Number of training docs: {:}".format(len(docs)))
print ("Number of test labels: {:}".format(len(labels_test)))
print ("Number of test docs: {:}".format(len(docs_test)))
# print(labels)
# print(docs)
# print(labels_test)
# print(docs_test)

Number of training labels: 51
Number of training docs: 51
Number of test labels: 9
Number of test docs: 9


## Helper Functions

In [None]:
# Calculate accuracy of predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
# Format elapsed times as hh:mm:ss
import time
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

## XLNet Tokenizer

In [None]:
from transformers import XLNetTokenizer, XLNetModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

tokenized_docs = [tokenizer.tokenize(d) for d in docs]

Loading BERT tokenizer


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Make sure it is tokenizing correctly:

# Print original first article
print(' Original: ', docs[0])

# Tokenized first article
print(tokenized_texts[0])

In [None]:
max_len = 0

for d in docs:
    # tokenize text and add `[CLS]` and `[SEP]` tokens
    input_ids = tokenizer.encode(d, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max length: ', max_len)

In [None]:
# Finishing tokenizing all docs and map tokens to thier word IDs
input_ids = []
attention_masks = []

for d in docs:

    encoded_dict = tokenizer.encode_plus(
                        d,                      # Docs to encode.
                        truncation=True,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all docs
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Attention masks
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)


labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
# print('Original: ', docs[0])
# print('Token IDs:', input_ids[0])
# print('Reverse:', tokenizer.convert_ids_to_tokens(input_ids[0]))



In [None]:
# Split up training & testing/validation

from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)

# 80:20 split

# Number of docs to include per set
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training docs'.format(train_size))
print('{:>5,} validation docs'.format(val_size))

   40 training docs
   11 validation docs


In [None]:
# Iterator using torch DataLoader class so that entire dataset doesn't need to be stored in memory

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# batch size can be 16 or 32
batch_size = 32

# Sample in random order when training
train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

# Sample sequentially for validation
validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size # Evaluate with this batch size.
        )

## Training the Classification Model w/ Sequence Classification
  (fine-tune BERT)

  [HuggingFace documentation](https://huggingface.co/transformers/v2.2.0/model_doc/bert.html)

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# BertForSequenceClassification -> BERT model w/ added classification layer 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # 12-layer model, uncased vocab
    num_labels = 5, # Number of labels 
    output_attentions = False, 
    output_hidden_states = False, 
)

# this needs to be run on GPU
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Optimizer for our hypermarameters / Learning Rate Scheduler
AdamW

Possible hyperparamters: 
* batch size: 16, 32
* learning rate: 5e-5, 3e-5, 2e-5
* number of epochs: 2, 3, 4

In [None]:
# Exeprimenting w/ different parameters
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8 # epsilon prevents division by 0??
                )

In [None]:
from transformers import get_linear_schedule_with_warmup

# Training epochs should be betw 2- 4 (reduce if overfitting)
epochs = 4

total_steps = len(train_dataloader) * epochs

# LR scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

## Training Loop

In [None]:
import random

# based on huggingface transformers `run_glue.py` script : https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_vals = []
total_time = time.time()

for epoch_i in range(0, epochs):       
    ## TRAINING
    print("")
    print('-------- Epoch {:} / {:} --------'.format(epoch_i + 1, epochs))
    print('Training...')
    print("")

    t0 = time.time()
    total_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 20 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            # progress from every 20 batches
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # pytorch tensors in batch (gpu usage)
        #   batch[0] -> input ids 
        #   batch[1] -> attention masks
        #   batch[2] -> labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # clear gradients before forward and backward passes
        model.zero_grad()        

        # forward pass
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
        total_loss += loss.item()

        # backward pass
        loss.backward()

        # clip to prevent exploding gradients (??)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # optimizer and lr updare
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)            
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    ## VALIDATION
    
    print("")
    print("Validation...")

    t0 = time.time()
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # pytorch tensors in batch (gpu usage)
        #   batch[0] -> input ids 
        #   batch[1] -> attention masks
        #   batch[2] -> labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        

            # forward pass
            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        total_eval_loss += loss.item()

        # move logits and labels -> CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()  
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # print final validation accuracy
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)   
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # epoch values & stats
    training_vals.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training done")
print("Time to train: {:} (h:mm:ss)".format(format_time(time.time()-total_time)))


-------- Epoch 1 / 4 --------
Training...


  Average training loss: 1.63
  Training epcoh took: 0:00:02

Validation...
  Accuracy: 0.18
  Validation Loss: 1.59
  Validation took: 0:00:00

-------- Epoch 2 / 4 --------
Training...


  Average training loss: 1.54
  Training epcoh took: 0:00:02

Validation...
  Accuracy: 0.36
  Validation Loss: 1.57
  Validation took: 0:00:00

-------- Epoch 3 / 4 --------
Training...


  Average training loss: 1.41
  Training epcoh took: 0:00:02

Validation...
  Accuracy: 0.36
  Validation Loss: 1.54
  Validation took: 0:00:00

-------- Epoch 4 / 4 --------
Training...


  Average training loss: 1.39
  Training epcoh took: 0:00:02

Validation...
  Accuracy: 0.45
  Validation Loss: 1.54
  Validation took: 0:00:00

Training done
Time to train: 0:00:08 (h:mm:ss)


In [None]:
# Display metrics of training process in a dataframe

import pandas as pd

pd.set_option('precision', 2)

df_vals = pd.DataFrame(data=training_vals)
df_vals = df_vals.set_index('epoch')

df_vals

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.63,1.59,0.18,0:00:02,0:00:00
2,1.54,1.57,0.36,0:00:02,0:00:00
3,1.41,1.54,0.36,0:00:02,0:00:00
4,1.39,1.54,0.45,0:00:02,0:00:00


In [None]:
input_ids_test = []
attention_masks_test = []
actual_labels_test=[]

for i in range(9):

    encoded_dict = tokenizer.encode_plus(
                        docs_test[i],                      
                        add_special_tokens = True, 
                        max_length = 256,           
                        pad_to_max_length = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',     
                   )
    
   
    input_ids_test.append(encoded_dict['input_ids'])
    

    attention_masks_test.append(encoded_dict['attention_mask'])
    actual_labels_test.append(labels_test[i])

# lists -> tensors
input_ids_test = torch.cat(input_ids_test, dim=0)
attention_masks_test = torch.cat(attention_masks_test, dim=0)
actual_labels_test = torch.tensor(actual_labels_test)

batch_size = 32  

# build DataLoader
prediction_data = TensorDataset(input_ids_test, attention_masks_test, actual_labels_test)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (

## Testing Classification

In [None]:
print('Label predictions for {:,} test publications...'.format(len(input_ids_test)))
model.eval()

predictions, actual_labels = [], []

for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  
  b_input_ids, b_input_mask, b_labels = batch
  # save memory and accelerate predictions w/o storing gradients
  with torch.no_grad():
      # forward pass and logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)
  logits = outputs[0]

  # move logits and labels -> CPU
  logits = logits.detach().cpu().numpy()
  labels_ids_test = b_labels.to('cpu').numpy()
  
  predictions.append(logits)
  actual_labels.append(labels_ids_test)

classification_correct = 0

for i in range(len(predictions)):
  for j in range(len(predictions[i])):
    prediction = np.argmax(predictions[i][j])
    print ('Prediction: ' , prediction , ', actual: ', actual_labels[i][j])
    if prediction == actual_labels[i][j]:
      classification_correct = classification_correct + 1

print ('Classification correctly: ',  classification_correct)

print ('Model accuracy from testing: {0:.2f}'.format(classification_correct / len(input_ids_test)))

Label predictions for 9 test publications...
Prediction:  1 , actual:  0
Prediction:  1 , actual:  4
Prediction:  0 , actual:  0
Prediction:  1 , actual:  0
Prediction:  1 , actual:  4
Prediction:  1 , actual:  3
Prediction:  1 , actual:  0
Prediction:  3 , actual:  4
Prediction:  3 , actual:  0
Classification correctly:  1
Model accuracy from testing: 0.11


In [None]:
## Ignore this cell for now
# Trying out example BERT

# Single training/test example for simple sequence classification
class InputExample(object):

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    """Single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

In [None]:
# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()

# Initializing a model from the bert-base-uncased style configuration
model = BertModel(configuration)

model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)

In [None]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)