<a href="https://colab.research.google.com/github/nasa/PeTaL/blob/text-classification-JQ/auto-labeler/auto_labeler_prototype.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install tensorboardX
!pip install wikipedia
!pip install swifter

In [2]:
import torch
import tensorflow as tf
import pandas as pd
import wikipedia 
import swifter


GPU Detection

In [3]:
# GPU detection 

# Get GPU device name
device_name = tf.test.gpu_device_name()

if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [4]:
# If there is a GPU available
if torch.cuda.is_available():    

    # Tell PyTorch to use GPU
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [7]:
#Creating PyDrive instance to load in data from PeTaL shared drive, follow the steps to authenticate
!pip install -U -q PyDrive 
  
from pydrive.auth import GoogleAuth 
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials 
  
  
# Authenticate and create the PyDrive client. 
auth.authenticate_user() 
gauth = GoogleAuth() 
gauth.credentials = GoogleCredentials.get_application_default() 
drive = GoogleDrive(gauth)

In [8]:


#this is the un-parsed articles
# link = 'https://drive.google.com/file/d/1iIZgKs1swHHJuumCU5xyW8tXSAnKAg18/view?usp=sharing'
# id = link.split("/")[-2] 
  
# downloaded = drive.CreateFile({'id':id})  
# downloaded.GetContentFile('articles.csv')   
#df = pd.read_csv('articles.csv')


In [17]:
#'https://petscan.wmflabs.org/' link to pull wikipedia articles

In [None]:
#Scraping article content by ID
def wiki_content(row):
  id = row['pageid']
  try:
    content = wikipedia.page(pageid=id).content
  except:
    content = 'error'
  return content

df['Content'] = df.swifter.apply(wiki_content, axis=1)

In [19]:
#Scraping article summary by ID

def wiki_summary(row):
  id = row['pageid']
  try:
    summary = wikipedia.page(pageid=id).summary
  except:
    summary = 'error'
  return summary

df['Summary'] = df.swifter.apply(wiki_summary, axis=1)

In [9]:
#Saving parsed articles as csv, can be accessed in the "Files" folder on the left, then download if you want
df.to_csv('parsed_articles.csv')

In [23]:
#Google drive link to the parsed articles
link = 'https://drive.google.com/file/d/1XRWsEsNUHjWOjPavwrfuUpaq3DwGGE4D/view?usp=sharing'
id = link.split("/")[-2] 
 
downloaded = drive.CreateFile({'id':id})  
downloaded.GetContentFile('parsed_articles.csv') 
df = pd.read_csv('parsed_articles.csv')

df = df[(df['Content'] != 'error') & df['Content'].notnull()]

#Df 'Content' column into list
docs = list(df['Content'].values)

In [25]:
#Labels

labels = ['Maintain homeostasis', 'Protect from temperature']

In [None]:
df['Content'].value_counts().to_frame()

**BERT**

In [11]:
from transformers import BertTokenizer

# Load BERT tokenizer
print('Loading BERT tokenizer')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [32]:
# Make sure it is tokenizing correctly:

# Print original articles
print(' Original: ', docs[0])

# Print a doc split into tokens
print('Tokenized: ', tokenizer.tokenize(docs[0]))

# Print docs as mapped to ids
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(docs[0])))

 Original:  Algae (; singular alga ) is an informal term for a large and diverse group of photosynthetic eukaryotic organisms. It is a polyphyletic grouping, including species from multiple distinct clades. Included organisms range from unicellular microalgae, such as Chlorella and the diatoms, to multicellular forms, such as the giant kelp, a large brown alga which may grow up to 50 m in length. Most are aquatic and autotrophic and lack many of the distinct cell and tissue types, such as stomata, xylem and phloem, which are found in land plants. The largest and most complex marine algae are called seaweeds, while the most complex freshwater forms are the Charophyta, a division of green algae which includes, for example, Spirogyra and stoneworts.
No definition of algae is generally accepted. One definition is that algae "have chlorophyll as their primary photosynthetic pigment and lack a sterile covering of cells around their reproductive cells". Although cyanobacteria are often referr

In [None]:
max_len = 0

for d in docs:

    # Tokenize text and add `[CLS]` and `[SEP]` tokens
    input_ids = tokenizer.encode(d, add_special_tokens=True)

    # Update max length
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

In [26]:
# Finishing tokenizing all docs and map tokens to thier word IDs
input_ids = []
attention_masks = []

for d in docs:

    encoded_dict = tokenizer.encode_plus(
                        d,                      # Docs to encode.
                        truncation=True,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all docs
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Attention masks
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)


labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', docs[0])
print('Token IDs:', input_ids[0])
print('Reverse:', tokenizer.convert_ids_to_tokens(input_ids[0]))



ValueError: ignored

In [None]:
# Split up training & testing/validation

from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)

# 90:10 split

# Number of samples to include per set
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

#

In [None]:
## Ignore this cell for now
# Trying out example BERT

# Single training/test example for simple sequence classification
class InputExample(object):

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    """Single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id