## Import Libraries, Language Model, Settings

In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import random
import time
import datetime

import pandas_datareader as web # For collecting financial data
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm # For displaying progress during training


In [2]:
from spacy.lang.en import English
import en_core_web_md
nlp = en_core_web_md.load()
en = English()


In [3]:
import warnings
warnings.filterwarnings('ignore')


## Upload Text (Comment) / Label (Sentiment) Data

In [4]:
# 0 = Negative Sentiment
# 1 = Positive Sentiment
df = pd.read_pickle('./data.pkl')
df


Unnamed: 0,text,label
0,I hear all these autists asking ‚ÄúdO I HoLd mY ...,0
1,Love mornings like this around here...the shor...,0
2,Trump's relationship with Powell reminds me of...,0
3,I've never seen such a day in my life.\n\n* Ch...,0
4,[deleted],0
...,...,...
10696,Lol Pence cannot answer the mask question,0
10697,The only thing I like about pence is he says ‚Äú...,0
10698,The üê¢ üöÄ,0
10699,BTFD\n\ndisclaimer: does not apply during a de...,0


In [5]:
# Split the dataset into a training and validation dataset
# Adapted from midterm_student_nhanhuynh.ipynb
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size = 0.15,
    random_state = 17,
    stratify = df.label.values
)

df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
df


Unnamed: 0,text,label,data_type
0,I hear all these autists asking ‚ÄúdO I HoLd mY ...,0,train
1,Love mornings like this around here...the shor...,0,val
2,Trump's relationship with Powell reminds me of...,0,train
3,I've never seen such a day in my life.\n\n* Ch...,0,train
4,[deleted],0,val
...,...,...,...
10696,Lol Pence cannot answer the mask question,0,val
10697,The only thing I like about pence is he says ‚Äú...,0,train
10698,The üê¢ üöÄ,0,val
10699,BTFD\n\ndisclaimer: does not apply during a de...,0,train


## Fine-Tune and Train BERT Model

In [6]:
# Adapted from https://www.curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
# Tokenize text using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased', # Account for capitalization, since all CAPS usually means negative sentiment
    do_lower_case=False
)


In [7]:
# Encode the data
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text.values, # Comment text
    add_special_tokens=True, # Add tokens for sentence beginning and end
    return_attention_mask=True, # BERT needs attention mask
    pad_to_max_length=True,
    max_length=256, # From BERT article
    return_tensors='pt' # Return tensors for PyTorch
) # returns dict with ids and attention mask

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)


Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
# Create tensor datasets
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)


In [9]:
# Loading and fine-tuning BERT base model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased', # Account for captitalization
    num_labels = 2, # Positive/negative
    output_attentions=False, # Don't need output attentions
    output_hidden_states=False # Also don't need output hidden states
)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [10]:
# Create dataloaders to iterate through dataset in batches
batch_size = 32 # From BERT article

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=batch_size
)

In [11]:
# Set up optimizer and scheduler
optimizer = AdamW( # Optimize the weights
    model.parameters(),
    lr=1e-5, # Learning rate from BERT article
    eps=1e-8 # Epsilon value from BERT article
)

epochs = 10

scheduler = get_linear_schedule_with_warmup( # Set the learning rate
    optimizer,
    num_warmup_steps=0, # Default
    num_training_steps=len(dataloader_train)*epochs
)


In [12]:
# Functions for performance metrics
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_falt = labels.flatten()
    return f1_score(labels, preds_flat, average='weighted') # or macro

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v:k for k,v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')
        

In [13]:
# Set seed for PyTorch
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Use GPU
device = torch.device('cuda')
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [14]:
# Function for evaluating validation performance
# Adapted from HuggingFace's 'run_glue.py'
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals


In [None]:
# Training loop for BERT
for epoch in tqdm(range(1, epochs+1)): # Iterate through epochs
    model.train() # Set model to training mode
    
    loss_train_total = 0 # Initiate training loss
    
    # Print out epochs during training
    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)
    
    for batch in progress_bar: # Iterate through batches
        model.zero_grad() # Initiate gradient
        
        batch = tuple(b.to(device) for b in batch) # Ensure each batch is on GPU
        
        inputs = { # Input for model
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        
        outputs = model(**inputs) # Unpacked output from running model on inputs
        
        loss = outputs[0]
        loss_train_total += loss.item() # Add up loss
        loss.backward() # Back propagation
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Clip gradient to prevent exploding gradient
        
        optimizer.step() # Step optimizer
        scheduler.step() # Step scheduler
        
        # Print training loss
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
    
    tqdm.write(f'\nEpoch: {epoch}') # Print epoch number
    
    val_loss, predictions, true_vals = evaluate(dataloader_val) # Calculate validation loss
    val_f1 = f1_score_func(predictions, true_vals) # Calculate F1 score
    
    tqdm.write(f'Validation loss: {val_loss}') # Print validation loss
    tqdm.write(f'F1 score (weighted) {val_f1}') # Print F1 score
    
torch.save(model.state_dict(), 'BERT.model') # Save model after last epoch


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=285.0, style=ProgressStyle(description_widt‚Ä¶


Epoch: 1


HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))


Validation loss: 0.6889312793226803
F1 score (weighted) 0.39593265343069217


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=285.0, style=ProgressStyle(description_widt‚Ä¶


Epoch: 2


HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))


Validation loss: 0.6854986291305691
F1 score (weighted) 0.46688949134554275


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=285.0, style=ProgressStyle(description_widt‚Ä¶


Epoch: 3


HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))


Validation loss: 0.6925584264830047
F1 score (weighted) 0.5741029822192624


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=285.0, style=ProgressStyle(description_widt‚Ä¶


Epoch: 4


HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))


Validation loss: 0.7771714230378469
F1 score (weighted) 0.5765902371407684


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=285.0, style=ProgressStyle(description_widt‚Ä¶


Epoch: 5


HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))


Validation loss: 0.9104635984289879
F1 score (weighted) 0.5783549749385054


HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=285.0, style=ProgressStyle(description_widt‚Ä¶


Epoch: 6


HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))


Validation loss: 1.0410101437101178
F1 score (weighted) 0.5703503935411962


HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=285.0, style=ProgressStyle(description_widt‚Ä¶