<a href="https://colab.research.google.com/github/oaarnikoivu/dissertation/blob/master/BERT_Encodings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install tokenizers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 4.5MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 10.9MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 21.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.38-cp36-none-any.whl size=884629 sha256=1ec697d47e

In [2]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score 
from sklearn.linear_model import LogisticRegression
from pathlib import Path

import torch
import transformers as ppb 
import warnings

# Suppress every warning for the rest of the session. Note this also hides
# deprecation and solver warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [3]:
from google.colab import drive

# Mount point for Google Drive inside the Colab VM; the dataset and model
# paths used later in the notebook live underneath it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Root of the mounted Google Drive. The trailing slash is kept on purpose:
# other cells build file names from `path` by plain string concatenation.
path = '/content/drive/My Drive/'

# Folder holding the SemEval CSV splits (joined with pathlib rather than
# string concatenation, which produced a doubled slash).
DATA_PATH = Path(path) / 'datasets' / 'SemEval'

train_df = pd.read_csv(DATA_PATH/'train.csv', delimiter=',')
test_df = pd.read_csv(DATA_PATH/'test.csv', delimiter=',')
val_df = pd.read_csv(DATA_PATH/'val.csv', delimiter=',')

# ignore_index=True gives the combined frame a single unique 0..n-1 index.
# Without it each split keeps its own row labels and the concatenated index
# contains duplicates, which makes any label-based lookup ambiguous.
all_data = pd.concat([train_df, test_df, val_df], ignore_index=True)

# Emotion label columns; one binary classifier is trained per name.
class_names = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 
              'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# For performance reasons, only the first N_TWEETS tweets are used.
N_TWEETS = 6000
batch_tweets = all_data.iloc[:N_TWEETS]

In [5]:
# Quick look at the first five raw tweets before tokenization.
batch_tweets['Tweet'].head(5)

0    “Worry is a down payment on a problem you may ...
1    Whatever you decide to do make sure it makes y...
2    @Max_Kellerman  it also helps that the majorit...
3    Accept the challenges so that you can literall...
4    My roommate: it's okay that we can't spell bec...
Name: Tweet, dtype: object

In [6]:
# Number of tweets kept in the working subset.
batch_tweets.shape[0]

6000

In [7]:
# Checkpoint name shared by the tokenizer and the encoder so the two always match.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

# Instantiate the tokenizer first, then the encoder, from the same checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# Show the module tree of the loaded encoder.
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (dropout): Dropout(p=0.1, inplace=False)
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_fea

In [0]:
def _encode_with_special_tokens(text):
  """Return the token-id list for one tweet, with the tokenizer's special tokens added."""
  return tokenizer.encode(text, add_special_tokens=True)

# One list of token ids per tweet; lengths differ until the padding step.
tokenized = batch_tweets['Tweet'].apply(_encode_with_special_tokens)

In [0]:
# Length of the longest token-id list (0 when there are no tweets).
max_len = max(map(len, tokenized.values), default=0)

# Right-pad each sequence with id 0 so all rows share the length max_len.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized.values
])

In [10]:
# `padded` is already a NumPy array, so its shape can be read directly.
padded.shape

(6000, 71)

In [11]:
# 1 for every position holding a non-zero token id, 0 for the zero padding
# appended in the padding cell.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(6000, 71)

In [0]:
# Rows sent through the encoder per forward call. Running the whole corpus in
# one call makes peak memory grow with the corpus size; mini-batches bound it.
ENCODE_BATCH_SIZE = 256

input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

hidden_chunks = []
with torch.no_grad():  # inference only, no gradients are needed
  for start in range(0, input_ids.shape[0], ENCODE_BATCH_SIZE):
    stop = start + ENCODE_BATCH_SIZE
    batch_output = model(input_ids[start:stop],
                         attention_mask=attention_mask[start:stop])
    # Element 0 of the model output is the per-token hidden-state tensor.
    hidden_chunks.append(batch_output[0])

# Keep the shape later cells rely on: `last_hidden_states[0]` is the hidden-state
# tensor for all rows, in the original row order.
last_hidden_states = (torch.cat(hidden_chunks, dim=0),)

In [0]:
# Keep only the hidden vector at sequence position 0 of every tweet
# (presumably the [CLS] token, since special tokens were added when encoding)
# and convert it to a NumPy array for scikit-learn.
first_token_states = last_hidden_states[0][:, 0]
features = first_token_states.numpy()

In [14]:
# Display the feature matrix extracted from the encoder output (one row per tweet).
features

array([[ 0.04521827, -0.19778816, -0.14963762, ..., -0.29783145,
         0.00215782,  0.5573782 ],
       [-0.07956323, -0.08615977,  0.25079146, ..., -0.24437697,
         0.12766258,  0.29017454],
       [-0.06133094, -0.0864303 , -0.10019764, ..., -0.04994415,
         0.2807403 ,  0.39241007],
       ...,
       [-0.24905805,  0.00395511, -0.02915181, ..., -0.0561239 ,
         0.20778862,  0.28375342],
       [-0.02059313, -0.11244061, -0.06061284, ..., -0.0480255 ,
         0.4726586 ,  0.23896316],
       [-0.19081639, -0.24672167, -0.07189524, ...,  0.01926764,
         0.52206874,  0.2643463 ]], dtype=float32)

In [15]:
# Number of feature rows; should equal the number of tweets encoded.
features.shape[0]

6000

In [84]:
# (rows, columns) of the feature matrix: one row per tweet, one column per hidden unit.
features.shape

(6000, 768)

In [16]:
# Keep just the emotion indicator columns, in the order given by class_names.
labels = batch_tweets.loc[:, class_names]
labels

Unnamed: 0,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,0,1,0,0,0,0,1,0,0,0,1
1,0,0,0,0,1,1,1,0,0,0,0
2,1,0,1,0,1,0,1,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5995,0,0,0,1,0,0,0,0,0,0,0
5996,1,0,1,0,0,0,0,0,0,0,0
5997,1,0,1,0,0,0,0,0,0,0,0
5998,1,0,1,0,0,0,0,0,0,0,0


In [0]:
# Positional split: the first n_train rows train the classifiers, the rest are
# held out for testing. Features and labels are cut at the same row.
n_train = 4800

train_features, test_features = features[:n_train], features[n_train:]
train_labels, test_labels = labels[:n_train], labels[n_train:]

In [18]:
# Display the held-out label rows (everything from row 4800 onward).
test_labels

Unnamed: 0,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
4800,1,0,1,1,0,0,0,0,0,0,0
4801,1,0,1,0,1,0,0,0,0,0,0
4802,0,0,0,1,0,0,1,0,0,0,0
4803,0,0,0,0,0,0,0,0,1,0,0
4804,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5995,0,0,0,1,0,0,0,0,0,0,0
5996,1,0,1,0,0,0,0,0,0,0,0
5997,1,0,1,0,0,0,0,0,0,0,0
5998,1,0,1,0,0,0,0,0,0,0,0


In [0]:
from sklearn.metrics import accuracy_score
import pickle

In [49]:
# One estimator object is fitted again for each emotion column; the fitted state
# is pickled to Drive before the next emotion overwrites it.
clf = LogisticRegression()

scores = []

for class_name in class_names:
  print('\n... Processing {}'.format(class_name))

  # Binary target vectors for the current emotion.
  y_train = train_labels[class_name]
  y_test = test_labels[class_name]

  # 3-fold cross-validated ROC AUC, computed on the training rows only.
  fold_aucs = cross_val_score(clf, train_features, y_train, cv=3, scoring='roc_auc')
  cv_score = fold_aucs.mean()
  scores.append(cv_score)
  print('CV score for class {} is {}'.format(class_name, cv_score))

  # Fit on all training rows and persist the fitted model.
  clf.fit(train_features, y_train)
  model_file_name = path + 'models/log_bert_{}.pkl'.format(class_name)
  with open(model_file_name, 'wb') as lg_file:
      pickle.dump(clf, lg_file)

  # Accuracy on the rows the model was fitted on.
  train_accuracy = accuracy_score(y_train, clf.predict(train_features))
  print('Training accuracy is {}'.format(train_accuracy))

  # Accuracy on the held-out rows.
  test_accuracy = accuracy_score(y_test, clf.predict(test_features))
  print('Testing accuracy is {}'.format(test_accuracy))

# Mean of the per-emotion cross-validation scores.
print('\nTotal CV score is {}'.format(np.mean(scores)))


... Processing anger
CV score for class anger is 0.8741448498637294
Training accuracy is 0.8416666666666667
Testing accuracy is 0.805

... Processing anticipation
CV score for class anticipation is 0.6888349249458376
Training accuracy is 0.8789583333333333
Testing accuracy is 0.8525

... Processing disgust
CV score for class disgust is 0.8313324725801828
Training accuracy is 0.8085416666666667
Testing accuracy is 0.77

... Processing fear
CV score for class fear is 0.8407506186760134
Training accuracy is 0.8983333333333333
Testing accuracy is 0.8583333333333333

... Processing joy
CV score for class joy is 0.8741551768527754
Training accuracy is 0.84875
Testing accuracy is 0.84

... Processing love
CV score for class love is 0.8704612738470631
Training accuracy is 0.9310416666666667
Testing accuracy is 0.905

... Processing optimism
CV score for class optimism is 0.8365824692996252
Training accuracy is 0.8441666666666666
Testing accuracy is 0.8066666666666666

... Processing pessimism

# Making new predictions

In [0]:
# Encode the new tweet exactly as the training tweets were encoded: special
# tokens are requested explicitly instead of relying on the library default,
# so sequence position 0 holds the same token here as in the training features.
# unsqueeze(0) adds the batch dimension the model expects.
input_id = torch.tensor(
    tokenizer.encode("Passed my exams! Whoop Whoop!", add_special_tokens=True)
).unsqueeze(0)

# Single unpadded sequence, so no attention mask is needed; inference only.
with torch.no_grad():
  last_hidden_state = model(input_id)

In [0]:
# Same extraction as for the training data: take the hidden vector at sequence
# position 0 and hand it to scikit-learn as a NumPy array.
first_token_state = last_hidden_state[0][:, 0]
feature = first_token_state.numpy()

In [120]:
# Sanity check: a single row with the same number of columns as the training features.
feature.shape

(1, 768)

# Load Models

In [0]:
import os

In [0]:
def _load_log_bert_model(emotion):
  """Load the pickled logistic-regression model saved for one emotion.

  Parameters
  ----------
  emotion : str
      Emotion name used in the file name, e.g. 'anger'.

  Returns
  -------
  The object stored in `<path>/models/log_bert_<emotion>.pkl`.
  """
  # NOTE(security): pickle.load can execute arbitrary code. Only load model
  # files that this notebook wrote itself; never load untrusted pickles.
  with open(path + '/models/log_bert_{}.pkl'.format(emotion), 'rb') as model_file:
    return pickle.load(model_file)

# One classifier per emotion, bound to the names the prediction cell uses.
log_anger_model = _load_log_bert_model('anger')
log_anticipation_model = _load_log_bert_model('anticipation')
log_disgust_model = _load_log_bert_model('disgust')
log_fear_model = _load_log_bert_model('fear')
log_joy_model = _load_log_bert_model('joy')
log_love_model = _load_log_bert_model('love')
log_optimism_model = _load_log_bert_model('optimism')
log_pessimism_model = _load_log_bert_model('pessimism')
log_sadness_model = _load_log_bert_model('sadness')
log_surprise_model = _load_log_bert_model('surprise')
log_trust_model = _load_log_bert_model('trust')

In [0]:
# Output key paired with the classifier that produces it; the order here is
# the order of the resulting dictionary.
_keyed_models = [
    ('pred_anger', log_anger_model),
    ('pred_anticipation', log_anticipation_model),
    ('pred_disgust', log_disgust_model),
    ('pred_fear', log_fear_model),
    ('pred_joy', log_joy_model),
    ('pred_love', log_love_model),
    ('pred_optimism', log_optimism_model),
    ('pred_pessimism', log_pessimism_model),
    ('pred_sadness', log_sadness_model),
    ('pred_surprise', log_surprise_model),
    ('pred_trust', log_trust_model),
]

# predict_proba returns one row per sample with a column per class; row 0,
# column 1 is the probability of the positive class for the single tweet.
dict_preds = {
    key: emotion_model.predict_proba(feature)[0, 1]
    for key, emotion_model in _keyed_models
}

In [146]:
# Display the positive-class probability predicted for each emotion.
dict_preds

{'pred_anger': 0.06675318262663289,
 'pred_anticipation': 0.4137930969628998,
 'pred_disgust': 0.06232448057478434,
 'pred_fear': 0.030564008271689154,
 'pred_joy': 0.802032665008891,
 'pred_love': 0.060469246225951206,
 'pred_optimism': 0.3551892201729051,
 'pred_pessimism': 0.032624406530744283,
 'pred_sadness': 0.09376390501077958,
 'pred_surprise': 0.17154308962086087,
 'pred_trust': 0.08149311084632589}