In [1]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score, roc_curve, auc, roc_auc_score
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

Using TensorFlow backend.


In [2]:
print("GPU Available: {}".format(torch.cuda.is_available()))
n_gpu = torch.cuda.device_count()
print("Number of GPU Available: {}".format(n_gpu))
print("GPU: {}".format(torch.cuda.get_device_name(0)))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

GPU Available: True
Number of GPU Available: 4
GPU: Tesla V100-SXM2-32GB


In [3]:
df = pd.read_csv('/nfs/research/regan/src/coling2020-code/data/augmented_with_ccs_rq13_v3.csv')

df.cc = df.cc.apply(literal_eval)

In [4]:
len(df)

2137

In [5]:
all_q_values = ['COS', 'DES', 'MOT', 'OTHER']

In [6]:
def make_df_for_train(df, augmented=True):

    all_items = []
    
    if not augmented:
        dff = df[df['augmented']==0]
    else:
        dff = df.copy(deep=True)

    for idx, row in dff.iterrows(): 

        cc = row.cc

        for c in cc:
            entity = c["text"] 
            sentence = row.sentence
            
            sentence = "<s> " + sentence + " </s> </s> " + entity + " </s>"
            
            prop = c['q_value']
            if prop == 'PROP':
                prop = 'COS'
                
            if prop in all_q_values:
                pass
            else:
                prop='OTHER'
                
            prop_idx = all_q_values.index(prop)

            item = {"sentence":sentence, 'COS':0, 'DES':0, 'MOT':0, 'OTHER': 0, 'property_index': prop_idx}

            q_value = c["q_value"]

            if q_value == "":
                item["MOT"] = 1
            elif "**" in q_value:
                item["OTHER"] = 1
            elif "MPROP" in q_value or "PROP" in q_value:
                item["COS"] = 1
            elif "::" in q_value:
                q_values = c["q_value"].split("::")
                for q in q_values:
                    if q in all_q_values:
                        item[q] = 1
                    else:
                        item['OTHER'] = 1
            else:
                if q_value in all_q_values:
                    item[q_value] = 1
                else:
                    item['OTHER'] = 1
            all_items.append(item)
    
    return pd.DataFrame(all_items)

df_train = make_df_for_train(df, augmented=True)

In [7]:
df_train.tail(25)

Unnamed: 0,sentence,COS,DES,MOT,OTHER,property_index
4543,<s> Dig hole in dirt </s> </s> you </s>,0,0,0,1,3
4544,<s> Dig hole in dirt </s> </s> hole </s>,0,1,0,0,1
4545,<s> Make a hole in the wall </s> </s> you </s>,0,0,0,1,3
4546,<s> Make a hole in the wall </s> </s> hole </s>,0,1,0,0,1
4547,<s> Put seed in hole </s> </s> you </s>,0,0,0,1,3
4548,<s> Put seed in hole </s> </s> seed </s>,0,0,0,1,3
4549,<s> Put seed in hole </s> </s> hole </s>,0,0,0,1,3
4550,<s> Place your things on the table </s> </s> y...,0,0,0,1,3
4551,<s> Place your things on the table </s> </s> t...,0,0,0,1,3
4552,<s> Place your things on the table </s> </s> t...,0,0,0,1,3


In [8]:
df_train.property_index.value_counts()

3    3603
0     621
2     195
1     149
Name: property_index, dtype: int64

In [9]:
def get_test_data(path):

    df_test = pd.read_pickle(path)
    
    return df_test

df_test = get_test_data('../data/propara_q_value_for_test_noisy_rq121.pkl')

df_test['property_index'] = df_test['property'].apply(lambda x: all_q_values.index(x))

In [10]:
df_test.head()

Unnamed: 0,sentence,COS,DES,MOT,OTHER,property,property_index
0,<s> The sediment and plants are at least one m...,0,0,1,0,MOT,2
1,<s> More chemical changes happen and the burie...,0,1,0,0,DES,1
2,<s> The dead algae and plankton end up part of...,1,0,0,0,COS,0
3,<s> The dead algae and plankton end up part of...,1,0,0,0,COS,0
4,<s> The material becomes a liquid. </s> </s> m...,1,0,0,0,COS,0


In [11]:
df_test.property_index.value_counts()

1    701
2    543
0    334
Name: property_index, dtype: int64

In [12]:
cols = df_train.columns
label_cols = list(cols[1:5])
num_labels = len(label_cols)
print('Label columns: ', label_cols)

Label columns:  ['COS', 'DES', 'MOT', 'OTHER']


In [13]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [16]:
def train(df1, all_labels, batch_size, epochs, lr):

    print()
    print('Begin train')
    print()

    labels = list(df1.property_index.values)
    num_labels = len(all_labels)
    label_counts = df1.property_index.value_counts()
    
    sentences = list(df1.sentence.values)

    print("# train_labels:", len(labels))
    print("# train_sentences:", len(sentences))
    print()
    print(label_counts)
    print()

    pretrained_weights = 'roberta-large'

    tokenizer = RobertaTokenizer.from_pretrained(pretrained_weights, do_lower_case=False, add_special_tokens=False)

    train_encodings = tokenizer.batch_encode_plus(sentences,
                                            padding=True,
                                            return_token_type_ids=True)

    print('tokenizer outputs: ', train_encodings.keys())

    input_ids = train_encodings['input_ids'] # tokenized and encoded sentences
    token_type_ids = train_encodings['token_type_ids']
    attention_masks = train_encodings['attention_mask']


    # Use train_test_split to split our data into train and validation sets

    train_inputs, validation_inputs, train_labels, validation_labels, train_token_types, validation_token_types, train_masks, validation_masks = train_test_split(input_ids, labels, token_type_ids,attention_masks,
                                                                random_state=2020, test_size=0.10)

    # Convert all of data into tensors
    # recall that with the augmented data, I want to define my own validation split; at least initially
    # maybe it should not be this way, and use train_test_split()?
    train_inputs = torch.tensor(train_inputs)
    train_labels = torch.tensor(train_labels)
    train_masks = torch.tensor(train_masks)
    train_token_types = torch.tensor(train_token_types)

    validation_inputs = torch.tensor(validation_inputs)
    validation_labels = torch.tensor(validation_labels)
    validation_masks = torch.tensor(validation_masks)
    validation_token_types = torch.tensor(validation_token_types)


    # Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
    # with an iterator the entire dataset does not need to be loaded into memory

    train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    config = RobertaConfig.from_pretrained(pretrained_weights, num_labels=num_labels)
    config.output_hidden_states = True
    config.output_attentions = True

    model = RobertaForSequenceClassification.from_pretrained(pretrained_weights, 
                                                             config=config)

    model.to(device)

    print("num_labels:", model.num_labels)
    print()

    # setting custom optimization parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters,lr=lr,correct_bias=True)
    # optimizer = AdamW(model.parameters(),lr=2e-5)  # Default optimization

    num_warmup_steps = batch_size
    num_total_steps = batch_size * epochs
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_total_steps) 

    # Store our loss and accuracy for plotting
    train_loss_set = []
    train_loss_batch = []
    valid_loss_set = []
    validation_f1 = []
    validation_flat_accuracy = []

    # Tracking predictions and labels for confusion matrix
    y_pred = []
    y_true = []

    # trange is a tqdm wrapper around the normal python range
    for epoch in trange(epochs, desc="Epoch"):

        # Training

        # Set our model to training mode (as opposed to evaluation mode)
        model.train()

        # Tracking variables
        tr_loss = 0 #running loss
        nb_tr_examples, nb_tr_steps = 0, 0

        # Train the data for one epoch
        for step, batch in enumerate(train_dataloader):
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels, b_token_types = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()

            # # Forward pass for multiclass classification
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]

            # logits = outputs[1]

            train_loss_set.append(loss.item())    

            # Backward pass
            loss.backward()
            # Update parameters and take a step using the computed gradient
            optimizer.step()
            scheduler.step()
            # Update tracking variables
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        loss_batch = tr_loss/nb_tr_steps
        print("Train loss: {}".format(loss_batch))
        train_loss_batch.append(loss_batch)

        ###############################################################################

        # Validation

        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        # Tracking variables 
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        val_loss = []

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels, b_token_types = batch
            # Telling the model not to compute or store gradients, saving memory and speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                # including loss for plots (so adding labels to model definition)
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
                loss = outputs[0]
                logits = outputs[1]
            #Compute loss
            #loss_func = CrossEntropyLoss() 
            #loss = loss_func(logits.view(-1,num_labels),b_labels.long().view(-1))
            #loss = loss_func(logits, b_labels)
            val_loss.append(loss.item())

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            eval_accuracy += flat_accuracy(logits, label_ids)

            if epoch == epochs-1:
                y_pred.extend(np.argmax(logits, axis=1).flatten())
                y_true.extend(label_ids.flatten())

        avg_val_accuracy = eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        avg_val_loss = np.mean(val_loss)
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))

        valid_loss_set.append(float(avg_val_loss))

    c_matrix = confusion_matrix(y_true, y_pred, normalize='true')

    print()
    print(c_matrix)
    print()

    print("train_loss:", train_loss_batch)
    print("valid_loss:", valid_loss_set)

    return model, tokenizer, train_loss_batch, c_matrix

In [17]:
batch_size = 48
epochs = 4
lr = 3e-5

model, tokenizer, train_loss_batch, c_matrix = train(df_train, all_q_values, batch_size, epochs, lr)


Begin train

# train_labels: 4568
# train_sentences: 4568

3    3603
0     621
2     195
1     149
Name: property_index, dtype: int64

tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

num_labels: 4

Train loss: 0.7736165291348169


Epoch:  12%|█▎        | 1/8 [00:25<02:58, 25.55s/it]

  Accuracy: 0.80
  Validation Loss: 0.70
Train loss: 0.40262517029809397


Epoch:  25%|██▌       | 2/8 [00:51<02:33, 25.53s/it]

  Accuracy: 0.92
  Validation Loss: 0.24
Train loss: 0.14338219902196594


Epoch:  38%|███▊      | 3/8 [01:16<02:07, 25.52s/it]

  Accuracy: 0.95
  Validation Loss: 0.16


KeyboardInterrupt: 