<a href="https://colab.research.google.com/github/oaarnikoivu/dissertation/blob/master/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Emotion Classification using BERT

In [0]:
# Install a pinned TensorFlow; the import cell below reports 2.0.0 for this spec.
!pip install tensorflow==2.00

In [0]:
# NOTE(review): transformers is not version-pinned, so a fresh run may install a
# release whose tokenizer API differs from the one used here — consider pinning.
!pip install transformers

In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import collections

nltk.download('stopwords')
from nltk.corpus import stopwords

import tensorflow as tf

print(tf.__version__)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2.0.0


In [4]:
# Mount Google Drive at /content/drive; the dataset paths used later point into this mount.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Load data

In [0]:
def load_data(filename):
  """Read a tab-separated file into a pandas DataFrame.

  Args:
    filename: Path (or any other source accepted by ``pd.read_csv``).

  Returns:
    The parsed DataFrame.
  """
  return pd.read_csv(filename, sep='\t')

In [0]:
# Directory on the mounted Drive that holds the three dataset splits.
file_path = '/content/drive/My Drive/datasets/'

# Load the train / dev / test-gold files; file order matches the target names.
train, val, test = (
    load_data(file_path + split_file)
    for split_file in ('2018-E-c-En-train.txt',
                       '2018-E-c-En-dev.txt',
                       '2018-E-c-En-test-gold.txt')
)

In [7]:
train.head()

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,2017-En-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


In [0]:
# Emotion label columns of the dataset.
class_names = [
    'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
    'optimism', 'pessimism', 'sadness', 'surprise', 'trust',
]

# 'none' is 1 minus the row-wise maximum over the emotion columns, i.e. 1 only
# when no emotion is set (assuming the labels are 0/1 — TODO confirm).
train['none'] = 1 - train.loc[:, class_names].max(axis=1)
train.describe()

## Replace missing tweets with: "unknown"

In [0]:
# Replace missing tweet text with the placeholder "unknown" in every split.
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the selected column: that form is chained assignment, which pandas warns
# about and which does not update the parent frame under Copy-on-Write.
train['Tweet'] = train['Tweet'].fillna("unknown")
test['Tweet'] = test['Tweet'].fillna("unknown")
val['Tweet'] = val['Tweet'].fillna("unknown")

In [0]:
# Pull the tweet text column out of each split.
train_text, test_text, val_text = (split['Tweet'] for split in (train, test, val))

# Text of all splits stacked in train, test, val order.
all_text = pd.concat([train_text, test_text, val_text])

## Text preprocessing

In [0]:
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

In [0]:
# Name of the pretrained checkpoint passed to BertTokenizer.from_pretrained.
BERT_MODEL_NAME = 'bert-base-uncased'
# Sequence length used below both as the tokenizer's max_length and as the padding length.
MAX_LEN = 140

# Tokenizer for the checkpoint named above; do_lower_case=True is passed explicitly.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME, do_lower_case=True)

def tokenize_tweets(tweets, tokenizer, max_seq_len=140):
  """Encode each tweet with ``tokenizer.encode``.

  Args:
    tweets: Iterable of tweet strings.
    tokenizer: Object exposing
      ``encode(text, add_special_tokens=..., max_length=...)``.
    max_seq_len: Value forwarded as ``max_length`` to every ``encode`` call.

  Returns:
    A list with one ``encode`` result per tweet, in input order.
  """
  encode_options = dict(add_special_tokens=True, max_length=max_seq_len)
  return [tokenizer.encode(tweet, **encode_options) for tweet in tweets]

def create_attention_masks(tokenized_and_padded_tweets):
  """Build 0/1 masks marking which token ids are greater than zero.

  Args:
    tokenized_and_padded_tweets: Iterable of token-id sequences.

  Returns:
    A NumPy array holding, per sequence, 1 where the token id is > 0 and 0
    elsewhere.
  """
  return np.asarray([
      [int(token_id > 0) for token_id in sequence]
      for sequence in tokenized_and_padded_tweets
  ])

In [0]:
# Encode the training tweets, pad/truncate every sequence at the end to
# MAX_LEN ids (padding value 0), then derive the matching attention masks.
input_ids = pad_sequences(
    tokenize_tweets(train_text, tokenizer, MAX_LEN),
    maxlen=MAX_LEN,
    dtype='long',
    value=0,
    truncating='post',
    padding='post',
)
attention_masks = create_attention_masks(input_ids)

In [18]:
input_ids

array([[  101,  1523,  4737, ...,     0,     0,     0],
       [  101,  3649,  2017, ...,     0,     0,     0],
       [  101,  1030,  4098, ...,     0,     0,     0],
       ...,
       [  101,  1030, 21541, ...,     0,     0,     0],
       [  101,  1045,  4687, ...,     0,     0,     0],
       [  101,  1045,  1005, ...,     0,     0,     0]])

In [19]:
attention_masks

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])