In [1]:
!pip install transformers



In [2]:
!pip install seqeval



In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np
import torch
import string
import nltk
import transformers
import math

from tqdm import trange
from nltk.stem import WordNetLemmatizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertForTokenClassification, BertTokenizer, BertConfig, AdamW, get_linear_schedule_with_warmup 
from seqeval.metrics import f1_score, accuracy_score

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
TRAIN_RAW = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/lcp_single_train.tsv"
TEST_RAW = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/lcp_single_test.tsv"
TRAIN = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/lcp_single_train_cleaned.tsv"
TEST = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/lcp_single_test_cleaned.tsv"

In [6]:
# read the datasets
# train
with open(TRAIN_RAW, 'r') as f:
  data = f.read()

# need to remove " from the string, otherwise parsing will have problems because some quotas are not closed 
data = data.replace('"', '')

with open(TRAIN, 'w') as f:
  f.write(data)

df = pd.read_csv(TRAIN, sep='\t')

# test
with open(TEST_RAW, 'r') as f:
  data = f.read()

data = data.replace('"', '')

with open(TEST, 'w') as f:
  f.write(data)

test = pd.read_csv(TEST, sep='\t')

In [7]:
# calculate label given complexity
def get_label(complexity):
  diffs = []
  for i in range(5):
    diffs.append(abs(complexity - i*0.25))
  return diffs.index(min(diffs)) + 1
  
# generate a list of labels given the sentence, the word and the complexity
def generate_labels(sentence, word, complexity):
  labels = [0] * len(sentence)
  try:
    word = lemmatizer.lemmatize(word)
    labels[sentence.index(word)] = get_label(complexity)
    return labels
  except:
    # print(sentence)
    # print(word)
    return None

# pre process the sentence, remove punctuations, lemmatize each word
def proc_sent(sentence):
  punc = string.punctuation
  sentence = sentence.replace('-', ' ')
  sentence = sentence.replace('–', ' ')
  sentence = sentence.split()

  for i in range(len(sentence)):
    if '\'s' in sentence[i]:
      sentence[i] = sentence[i].replace('\'s', '')
    for c in punc:
      sentence[i] = sentence[i].replace(c, '')
    sentence[i] = lemmatizer.lemmatize(sentence[i])

  return sentence

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
lemmatizer = WordNetLemmatizer()
sentences = [proc_sent(sent) for sent in df['sentence']]
labels = [generate_labels(sent, word, complexity) for (sent, word, complexity) in zip(sentences, df['token'], df['complexity'])]

test_sentences = [proc_sent(sent) for sent in test['sentence']]

In [11]:
# remove the sentences that couldn't be processed

none_index = [i for i, x in enumerate(labels) if x == None]
sentences = [sent for i, sent in enumerate(sentences) if not i in none_index]
labels = [sent for i, sent in enumerate(labels) if not i in none_index]

In [12]:
# example sentence and its labels
print(sentences[0])
print(labels[0])

['Behold', 'there', 'came', 'up', 'out', 'of', 'the', 'river', 'seven', 'cattle', 'sleek', 'and', 'fat', 'and', 'they', 'fed', 'in', 'the', 'marsh', 'grass']
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [13]:
max_len = 0
for sent in sentences:
  max_len = max(max_len, len(sent))
print(max_len)

225


In [14]:
MAX_LEN = 225
bs = 32

In [15]:
# cut and pad the token sequences to our desired length
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(sent) for sent in sentences],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

test_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(sent) for sent in test_sentences],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [16]:
tags = pad_sequences(labels, maxlen=MAX_LEN, value=0, padding="post",
                     dtype="long", truncating="post")

In [17]:
# create attention masks
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

test_attention_masks = [[float(i != 0.0) for i in ii] for ii in test_input_ids]

In [18]:
# split the dataset to use 10% to validate the model
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [19]:
# convert the dataset to torch tensors
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

ts_inputs = torch.tensor(test_input_ids)
ts_masks = torch.tensor(test_attention_masks)

In [20]:
# define the dataloaders
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

test_data = TensorDataset(ts_inputs, ts_masks)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

In [21]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=6,
    output_attentions = False,
    output_hidden_states = False
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [22]:
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [23]:
# setup the optimizer
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [24]:
# add a scheduler to linearly reduce the learning rate throughout the epochs
epochs = 5
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
def logit2prob(logit):
    odds = math.exp(logit)
    return odds / (1+odds)

def get_loss(logits, true_labels):
    logits = [p_i for p, l in zip(logits, true_labels) for p_i, l_i in zip(p, l) if l_i != 0]
    true_labels = [l_i for l in true_labels for l_i in l if l_i != 0]
    loss = [-math.log(logit2prob(logits[i][true_labels[i]])) for i in range(len(true_labels))]
    return np.sum(loss)

In [27]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for i in range(epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    print("epochs(" + str(i+1) + "/" + str(epochs) + "):")
    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += get_loss(logits, label_ids)
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [p_i for p, l in zip(predictions, true_labels) for p_i, l_i in zip(p, l) if l_i != 0]
    valid_tags = [l_i for l in true_labels for l_i in l if l_i != 0] 
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    # print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()

epochs(1/5):
Average train loss: 0.1647662321381547
Validation loss: 8.611094941165808
Validation Accuracy: 0.00130718954248366

epochs(2/5):
Average train loss: 0.12797020334336492
Validation loss: 8.385222123017074
Validation Accuracy: 0.02483660130718954

epochs(3/5):
Average train loss: 0.11673229946582406
Validation loss: 6.764451286941544
Validation Accuracy: 0.044444444444444446

epochs(4/5):
Average train loss: 0.10456236158670099
Validation loss: 8.272098001563183
Validation Accuracy: 0.06797385620915032

epochs(5/5):
Average train loss: 0.09383730341990788
Validation loss: 8.800996609181842
Validation Accuracy: 0.08104575163398693

