<a href="https://colab.research.google.com/github/oaarnikoivu/dissertation/blob/master/BERT_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting Emotions from Tweets with BERT 

First we need to install the transformers python package to get access to the pre-trained BERT models.

In [0]:
!pip install transformers

In [0]:
!pip install fastai

In [3]:
import os
import pandas as pd
import numpy as np
import torch
import logging 

from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertPreTrainedModel, BertModel 
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score 
from tqdm import tqdm_notebook as tqdm 

In [0]:
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# GPU

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {} n_gpu: {}".format(device, n_gpu))

01/16/2020 18:41:11 - INFO - __main__ -   device: cuda n_gpu: 1


# Data

Lets import the Sem-Eval dataset and format it such that it can be fed into BERT.

In [6]:
from google.colab import drive 
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def load_dataset(filename):
  dataset = pd.read_csv(filename, sep='\t')
  return dataset

In [8]:
file_path = '/content/drive/My Drive/datasets/'

train_data = load_dataset(file_path + '2018-E-c-En-train.txt')
validation_data = load_dataset(file_path + '2018-E-c-En-dev.txt')
test_data = load_dataset(file_path + '2018-E-c-En-test-gold.txt')

train_data.columns

Index(['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy',
       'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

# Data Preprocessing

Here we transform the data into a format that BERT understands. This involves two steps. First, we modify the *InputExample* class to allow for multiple labels.

- `text_a` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. 
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `labels` are the labels for our example, i.e. anger, disgust, fear, joy, etc.

In [0]:
class MultiLabelBertClassifier(BertPreTrainedModel):
  """BERT model for multi-label classification"""

  num_labels = 2
  num_tasks = 11 

  def __init__(self, config):
    super(MultiLabelBertClassifier, self).__init__(config)
    self.num_labels = MultiLabelBertClassifier.num_labels 
    self.num_tasks = MultiLabelBertClassifier.num_tasks 

    self.bert = BertModel(config)
    self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
    self.classifier = torch.nn.ModuleList([torch.nn.Linear(config.hidden_size, self.num_labels) for _ in range(self.num_tasks)])

    self.init_weights() 

  def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
    outputs = self.bert(
        input_ids, 
        attention_mask=attention_mask, 
        token_type_ids=token_type_ids)
    
    pooled_output = outputs[1]
    pooled_output = self.dropout(pooled_output)
  
    logits = [self.classifier[i](pooled_output) for i in range(self.num_tasks)]

    if labels is not None:
      loss_func = nn.CrossEntropyLoss()
      loss = torch.tensor([0.]).to(device)
      for i in range(self.num_tasks):
        loss += loss_func(logits[i], labels[:, i])
      return loss 
    else:
      logits = [logit.cpu().numpy() for logit in logits]
      return torch.tensor(logits) 
  
  def freeze_bert_encoder(self):
    for param in self.bert.parameters():
      param.requires_grad = False
  
  def unfreeze_bert_encoder(self):
    for param in self.bert.parameters():
      param.requires_grad = True

In [0]:
args = {
    "max_seq_length": 512,
    "batch_size": 32,
    "learning_rate": 3e-5,
    "num_train_epochs": 1,
    "warmup_steps": 5000
}

In [0]:
class InputExample(object):
  """A single training/test example for simple sequence classification."""
  def __init__(self, guid, tweet, labels=None):
    """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            labels: (Optional) [string]. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
    self.guid = guid
    self.tweet = tweet
    self.labels = labels 

class InputFeatures(object):
  """A single set of features of data."""

  def __init__(self, input_ids, input_mask, segment_ids, label_ids=None):
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_ids = label_ids

In [0]:
def create_input_examples(df, labels_available=True):
  """Creates examples for training, test and validation sets"""
  examples = []
  for (i, row) in enumerate(df.values):
    guid = row[0]
    tweet = row[1]
    if labels_available:
      labels = row[2:]
    else:
      labels = []
    examples.append(
        InputExample(guid=guid, tweet=tweet, labels=labels)
    )
    return examples

## Convert train, val and test data to examples that BERT can understand.

In [13]:
train_InputExamples = create_input_examples(train_data)
test_InputExamples = create_input_examples(test_data)
validation_InputExamples = create_input_examples(validation_data)

train_InputExamples[0].tweet

"“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry"

In [0]:
def convert_examples_to_features(examples, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.tweet)

        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        
        labels_ids = []
        for label in example.labels:
            labels_ids.append(float(label))

#         label_id = label_map[example.label]
        if ex_index < 0:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %s)" % (example.labels, labels_ids))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_ids=labels_ids))
    return features

## Create a dataloader for training, testing and validation.

In [0]:
def create_dataloader(data, batch_size, labels_available=True):
  features = convert_examples_to_features(data, 
                                          args['max_seq_length'], 
                                          tokenizer)
  
  all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
  all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)

  if labels_available: 
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
  else:
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
  
  data_loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
  return data_loader

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = MultiLabelBertClassifier.from_pretrained('bert-base-uncased')
model

01/16/2020 18:42:30 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt not found in cache or force_download set to True, downloading to /tmp/tmpx5vxupgx
01/16/2020 18:42:30 - INFO - transformers.file_utils -   copying /tmp/tmpx5vxupgx to cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
01/16/2020 18:42:30 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
01/16/2020 18:42:30 - INFO - transformers.file_utils -   removing temp file /tmp/tmpx5vxupgx
01/16/2020 18:42:30 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.cache/t

MultiLabelBertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [0]:
train_dataloader = create_dataloader(
    train_InputExamples, 
    batch_size=args['batch_size'])
val_dataloader = create_dataloader(
    validation_InputExamples, 
    batch_size=args['batch_size'])
test_dataloader = create_dataloader(
    test_InputExamples, 
    batch_size=args['batch_size'], 
    labels_available=False)

label_list = pd.read_csv(file_path + 'classes.txt')
num_labels = len(label_list)

# Prepare optimizer

In [0]:
def optimizer(model, lr):
  no_decay = ['bias', 'LayerNorm.weight']
  optimizer_grouped_params = [
                              {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                               'weight_decay': 0.01},
                              {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                               'weight_decay': 0.0}
  ]

  return AdamW(optimizer_grouped_params, lr=lr, eps=1e-8)

In [0]:
optimizer = optimizer(model, lr=args['learning_rate'])

In [20]:
EPOCHS = args['num_train_epochs']
TRAINING_STEPS = int(len(train_data) / args['batch_size'] * EPOCHS)
TRAINING_STEPS

213

In [0]:
WARMUP_STEPS = args['warmup_steps']
scheduler = get_linear_schedule_with_warmup(optimizer, WARMUP_STEPS, TRAINING_STEPS)

In [0]:
def train(num_epochs=args['num_train_epochs']):

  model.train()

  for i_ in tqdm(range(int(num_epochs)), desc="Epoch"):
    
    train_loss = 0
    num_train, train_steps = 0, 0

    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
      batch = tuple(t.to(device) for t in batch)
      input_ids, input_mask, segment_ids, label_ids = batch
      loss = model(input_ids, segment_ids, input_mask, label_ids)

      loss.backward()

      train_loss += loss.item()
      num_train += input_ids.size(0)
      train_steps += 1

      optimizer.step()
      scheduler.step()
      optimizer.zero_grad() 
      
      logger.info('Train loss after epoch {}'.format(train_loss / train_steps / args['batch_size']))
      logger.info('Eval after epoch {}'.format(i_+1))

      eval() 

In [0]:
def eval():

  all_logits = None
  all_labels = None
  
  model.eval() 

  eval_loss, eval_accuracy = 0, 0
  eval_steps, num_eval = 0, 0

  for input_ids, input_mask, segment_ids, label_ids in val_dataloader:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
      tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
      logits = model(input_ids, segment_ids, input_mask)

    if all_logits is None:
        all_logits = logits.detach().cpu()
    else:
        all_logits = torch.cat((all_logits, logits.detach().cpu()), 1)
    if all_labels is None:
        all_labels = label_ids.detach().cpu()
    else:    
        all_labels = torch.cat((all_labels, label_ids.detach().cpu()), 0)

    eval_loss += tmp_eval_loss.item()

    num_eval += input_ids.size(0)
    eval_steps += 1
  
  eval_loss = eval_loss / eval_steps 

  # Compute f1 scores
  f1_scores_list = []
  pred_labels = torch.argmax(all_logits, dim=2)
  
  for i in range(len(LABEL_COLUMNS)):
    f1_scores_list.append(f1_score(all_labels[:, i].numpy(), pred_labels[i].numpy(), average='macro'))
  
  f1_scores = np.mean(f1_scores_list)

  logger.info(' Eval loss after epoch {}'.format(eval_loss / args['batch_size']))
  logger.info('f1_score after epoch {}'.format(f1_scores))

In [24]:
train(args['num_train_epochs'])

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=1, style=ProgressStyle(description_width='ini…

01/16/2020 18:43:04 - INFO - numexpr.utils -   NumExpr defaulting to 2 threads.


RuntimeError: ignored