<a href="https://colab.research.google.com/github/oaarnikoivu/dissertation/blob/master/BERT_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting Emotions from Tweets with BERT 

First, we need to install the `transformers` Python package to get access to the pre-trained BERT models.

In [0]:
!pip install transformers

In [0]:
import os
import pandas as pd
import numpy as np
import torch
import logging 

from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertPreTrainedModel, BertModel 
from transformers.optimization import AdamW 
from sklearn.metrics import f1_score 
from tqdm import tqdm_notebook as tqdm 

# Define model

In [0]:
class MultiLabelBertClassifier(BertPreTrainedModel):
  """BERT encoder with a linear head for multi-label classification.

  The pooled BERT output goes through dropout and a single linear layer that
  emits one logit per label. Labels are treated as independent binary
  targets, so the training loss is sigmoid + binary cross-entropy.

  Args:
    config: BERT configuration; `hidden_size` and `hidden_dropout_prob` are
      read from it.
    num_labels: Number of output logits (one per label).
  """

  def __init__(self, config, num_labels=2):
    super(MultiLabelBertClassifier, self).__init__(config)
    self.num_labels = num_labels
    self.bert = BertModel(config)
    self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
    self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
    # `init_bert_weights` belongs to the older pytorch_pretrained_bert
    # package; the transformers base class exposes `init_weights()` instead.
    self.init_weights()

  def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
    """Run the classifier.

    Args:
      input_ids: Token id tensor fed to the BERT encoder.
      token_type_ids: Optional segment id tensor.
      attention_mask: Optional mask tensor (1 for real tokens, 0 for padding).
      labels: Optional multi-hot targets, reshaped to (-1, num_labels).

    Returns:
      The scalar loss when `labels` is given, otherwise the raw logits.
    """
    # Keyword arguments are required here: transformers' BertModel.forward
    # takes `attention_mask` before `token_type_ids`, so positional passing
    # in this method's order would silently swap the two tensors.
    outputs = self.bert(
        input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask)
    # Index instead of unpacking so both tuple and ModelOutput returns work.
    pooled_output = outputs[1]
    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)

    if labels is None:
      return logits

    # Multi-label targets are multi-hot vectors, so each logit is scored as
    # an independent binary decision rather than with a softmax loss.
    loss_func = torch.nn.BCEWithLogitsLoss()
    targets = labels.view(-1, self.num_labels).type_as(logits)
    return loss_func(logits.view(-1, self.num_labels), targets)

  def _set_encoder_trainable(self, trainable):
    """Set `requires_grad` on every parameter of the BERT encoder."""
    for param in self.bert.parameters():
      param.requires_grad = trainable

  def freeze_bert_encoder(self):
    """Stop gradient updates to the BERT encoder parameters."""
    self._set_encoder_trainable(False)

  def unfreeze_bert_encoder(self):
    """Re-enable gradient updates to the BERT encoder parameters."""
    self._set_encoder_trainable(True)

# Data

Let's import the SemEval dataset and format it so that it can be fed into BERT.

In [5]:
# Mount Google Drive into the Colab runtime at /content/drive. This prompts
# for an authorization code and only works inside Google Colab.
from google.colab import drive 
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def load_dataset(filename):
  """Read the tab-separated file at `filename` into a pandas DataFrame."""
  return pd.read_csv(filename, sep='\t')

In [14]:
# Folder on the mounted Drive that holds the train/dev/test files.
file_path = '/content/drive/My Drive/datasets/'

# Load the three splits in one pass; order is train, dev, test.
train_data, validation_data, test_data = (
    load_dataset(file_path + split_file)
    for split_file in ('2018-E-c-En-train.txt',
                       '2018-E-c-En-dev.txt',
                       '2018-E-c-En-test-gold.txt')
)

# Notebook display: show the column names of the training frame.
train_data.columns

Index(['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy',
       'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

Our input data is the 'Tweet' column, and the label columns are the emotion categories.

In [0]:
# Column holding the raw tweet text.
DATA_COLUMN = 'Tweet'

# One binary column per emotion, in the same order as the dataset header.
LABEL_COLUMNS = [
    'anger',
    'anticipation',
    'disgust',
    'fear',
    'joy',
    'love',
    'optimism',
    'pessimism',
    'sadness',
    'surprise',
    'trust',
]

# Data Preprocessing

Here we transform the data into a format that BERT understands. This involves two steps. First, we modify the *InputExample* class to allow for multiple labels.

- `text_a` is the text we want to classify, which in this case is the `Tweet` column in our DataFrame.
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `labels` are the labels for our example, i.e. anger, disgust, fear, joy, etc.

In [0]:
# Hyperparameters shared by preprocessing and training.
args = dict(
    max_seq_length=128,
    batch_size=32,
    learning_rate=3e-5,
    num_train_epochs=1,
    warmup_steps=2000,
)

In [0]:
class InputExample(object):
  """One raw example: an id, its text, and optionally its labels."""

  def __init__(self, guid, text_a, text_b=None, labels=None):
    """Store the fields of a single example.

    Args:
      guid: Unique id for the example.
      text_a: Untokenized text of the first sequence. For single-sequence
        tasks this is the only text that has to be given.
      text_b: (Optional) untokenized text of the second sequence; only
        needed for sequence-pair tasks.
      labels: (Optional) labels of the example. Expected for train and dev
        examples, but not for test examples.
    """
    self.guid, self.text_a = guid, text_a
    self.text_b, self.labels = text_b, labels

class InputFeatures(object):
  """Model-ready features for a single example."""

  def __init__(self, input_ids, input_mask, segment_ids, label_ids=None):
    """Store the feature lists of one example.

    Args:
      input_ids: Token ids of the example.
      input_mask: Attention mask aligned with `input_ids`.
      segment_ids: Segment (token type) ids aligned with `input_ids`.
      label_ids: (Optional) labels of the example. Defaults to None so that
        unlabeled examples can be represented without a labels argument.
    """
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_ids = label_ids

In [0]:
def convert_examples_to_features(examples, max_seq_length, 
                                 tokenizer, 
                                 labels_available=True):
    """Convert examples into fixed-length BERT input features.

    Each example's text is tokenized, truncated so that [CLS] and [SEP] still
    fit, mapped to token ids and zero-padded to `max_seq_length`.

    Args:
        examples: Iterable of example objects exposing `text_a` (or `text`)
            and, when `labels_available` is true, `labels`.
        max_seq_length: Length of every produced feature list; must be at
            least 2 to hold [CLS] and [SEP].
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        labels_available: When true, each example's labels are copied into
            the feature; otherwise `label_ids` is None.

    Returns:
        A list with one `InputFeatures` per example, in input order.

    Raises:
        ValueError: If `max_seq_length` is smaller than 2.
    """
    if max_seq_length < 2:
        raise ValueError(
            "max_seq_length must be at least 2 to fit [CLS] and [SEP], "
            "got {}".format(max_seq_length))

    features = []
    for example in examples:
        # InputExample stores its text as `text_a`; fall back to `text` for
        # example objects that use that attribute name instead.
        text = example.text_a if hasattr(example, "text_a") else example.text

        # Truncate to leave room for [CLS] and [SEP] ("- 2").
        tokens = tokenizer.tokenize(text)[:max_seq_length - 2]

        # Single-sequence BERT convention:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        # The vector at [CLS] is what the classifier uses as the sentence
        # representation.
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_ids = list(tokenizer.convert_tokens_to_ids(tokens))

        # Zero-pad up to the sequence length. The mask is 1 for real tokens
        # and 0 for padding, so only real tokens are attended to.
        pad_len = max_seq_length - len(input_ids)
        input_mask = [1] * len(input_ids) + [0] * pad_len
        input_ids = input_ids + [0] * pad_len
        # Only one segment is present, so every position gets type id 0.
        segment_ids = [0] * max_seq_length

        # Internal sanity check on the tokenizer's output length.
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length

        # Pass label_ids explicitly (None when unlabeled) so the call is
        # valid whether or not InputFeatures declares a default for it.
        label_ids = list(example.labels) if labels_available else None

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_ids=label_ids))
    return features

## Create a dataloader for training, testing and validation.

In [0]:
def create_dataloader(data, batch_size, labels_available=True):
  features = convert_examples_to_features(data, args['max_seq_length'], 
                                          tokenizer, labels_available)