# Training BERT Classifier for Polarisation

This script contains the training and testing of a BERT model to classify affective polarisation in Reddit comments.

In [303]:
# import own functions written in moralisation classifier notebook (NB II) saved to .py
from finalproject_functions import remove_bad_rows

import gzip
import json
import pickle
import random
import sys
import csv
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import ticker
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import torch
from transformers import Trainer, TrainingArguments
from sklearn.metrics import f1_score

from collections import defaultdict

sns.set(style='ticks', font_scale=1.2)
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.utils import compute_sample_weight

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

## Read Required Data

In [304]:
labelled_comments = pd.read_excel("labs_labelled_comments.xlsx")
unlabelled_comments = pd.read_csv("unlabelled_comments.csv", delimiter = ",")

## Data Preprocessing: Remove Duplicates & NA's

In [305]:
labelled_comments = remove_bad_rows(labelled_comments, "comment")

## Train Test Split

In [306]:
x_list = labelled_comments["comment"].values.tolist()

In [307]:
y_list = labelled_comments["AP_label"].values.tolist()

In [308]:
# Train Test Split using the preprocessed comments column and the overall morality label. 
# X_test_f and y_test_f are set aside to test the final model.
X_train, X_test_f, y_train, y_test_f = train_test_split(
    x_list,
    y_list,
    test_size=0.2,
    random_state=99)

# Split the training data again, this time with test size = .25 to achieve a final split of 
# 60 training data; 20 validation data (this is where baseline is tested on); 20 final testing data (best model testing)
X_train_sec, X_val, y_train_sec, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=99)

In [309]:
print(f"Training data: {len(X_train_sec)}")
print(f"Validation data: {len(X_val)}")
print(f"Test data: {len(X_test_f)}")

Training data: 597
Validation data: 200
Test data: 200


## Loading the English-language Model

In [310]:
bertmodel = 'bert-base-cased'

device_name = 'cuda'

max_length = 512

save_directory = 'polarisation_model'

## Data Preparation 

In [311]:
tokenizer = AutoTokenizer.from_pretrained(bertmodel)

In [312]:
#MINE:
unique_labels = set(label for label in y_train_sec)
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for label, id in label2id.items()}

In [313]:
#check: 
label2id.keys()

dict_keys([0, 1])

In [314]:
#check: 
id2label.keys()

dict_keys([0, 1])

In [315]:
train_encodings = tokenizer(X_train_sec, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=max_length)
test_encodings  = tokenizer(X_test_f, truncation=True, padding=True, max_length=max_length)

train_labels_encoded = [label2id[y] for y in y_train_sec]
val_labels_encoded = [label2id[y] for y in y_val]
test_labels_encoded  = [label2id[y] for y in y_test_f]

## Custom Torch Dataset

In [316]:
#Initiate MyDataset Class
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [317]:
#convert the data 
train_dataset = MyDataset(train_encodings, train_labels_encoded) #put this into funct.
val_dataset = MyDataset(val_encodings, val_labels_encoded) #put this into funct.
test_dataset = MyDataset(test_encodings, test_labels_encoded)

## Pre-Trained Bert Model:

In [318]:
model_a = AutoModelForSequenceClassification.from_pretrained(bertmodel, num_labels=len(id2label)).to(device_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

## Fine-Tuning Bert Model: 

In [320]:
def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average='macro', sample_weight=compute_sample_weight('balanced', labels))
    return {'accuracy': acc, 'macro_f1': macro_f1}

In [321]:
metric_name = 'macro_f1'

In [322]:
training_args = TrainingArguments(
    "test", evaluation_strategy="steps", eval_steps=500, disable_tqdm=True)

In [323]:
# Instantiate an object of the TrainingArguments class with the following parameters:
training_args = TrainingArguments(
    
    # Number of training epochs
    num_train_epochs=5, #with higher epochs, the model begins to overfit -- already between epoch 1 and 2 it starts overfitting
    
    # Batch size for training
    per_device_train_batch_size=8,
    
    # Batch size for evaluation
    per_device_eval_batch_size=8,
    
    # Learning rate for optimization
    learning_rate=5e-5, 
    
    # Load the best model at the end of training
    load_best_model_at_end=True,
    
    # Metric used for selecting the best model
    metric_for_best_model=metric_name,
    
    # Number of warmup steps for the optimizer
    warmup_steps=0,
    
    # L2 regularization weight decay
    weight_decay=0.1, #regularization weight was increased to minimise overfitting, however, this did not work
    
    # Directory to save the fine-tuned model and configuration files
    output_dir='./results',
    
    # Directory to store logs
    logging_dir='./logs',
    
    # Log results every n steps
    logging_steps=20,
    
    # Strategy for evaluating the model during training
    evaluation_strategy='steps',
)

#https://towardsdatascience.com/handling-overfitting-in-deep-learning-models-c760ee047c6e
#https://towardsdatascience.com/handling-overfitting-in-deep-learning-models-c760ee047c6e

In [324]:
trainer = Trainer(
    model=model_a,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics)      # our custom evaluation function 


In [325]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,Macro F1
20,0.4938,0.579262,0.76,0.333333
40,0.5768,0.552687,0.76,0.333333
60,0.5043,0.568991,0.76,0.333333
80,0.5285,0.612369,0.76,0.333333
100,0.4341,0.455225,0.775,0.399249
120,0.4035,0.739063,0.755,0.352866
140,0.3885,0.663712,0.715,0.705496
160,0.2741,0.751172,0.78,0.555989
180,0.2149,1.07722,0.775,0.499197
200,0.4214,0.910365,0.78,0.437614


TrainOutput(global_step=375, training_loss=0.25877766268452007, metrics={'train_runtime': 98.8746, 'train_samples_per_second': 30.19, 'train_steps_per_second': 3.793, 'total_flos': 785386500249600.0, 'train_loss': 0.25877766268452007, 'epoch': 5.0})

## Save fine tuned model:

In [326]:
trainer.save_model(save_directory)

## Testing on Validation Set:

In [327]:
trainer.evaluate()

{'eval_loss': 1.2461990118026733,
 'eval_accuracy': 0.78,
 'eval_macro_f1': 0.5559887664605376,
 'eval_runtime': 1.2721,
 'eval_samples_per_second': 157.22,
 'eval_steps_per_second': 19.653,
 'epoch': 5.0}

## Evaluate on Test Set: 

In [328]:
predicted_results = trainer.predict(test_dataset)

In [329]:
predicted_results.predictions.shape

(200, 2)

In [330]:
predicted_labels = predicted_results.predictions.argmax(-1) 
predicted_labels = predicted_labels.flatten().tolist()      
predicted_labels = [id2label[l] for l in predicted_labels]  

In [331]:
len(predicted_labels)

200

In [332]:
print(classification_report(y_test_f, 
                           predicted_labels))

              precision    recall  f1-score   support

           0       0.86      0.91      0.88       164
           1       0.44      0.31      0.36        36

    accuracy                           0.81       200
   macro avg       0.65      0.61      0.62       200
weighted avg       0.78      0.81      0.79       200



## Evaluation of Final Model:

In [333]:
#print examples of correct predictions
for _true_label, _predicted_label, _text in random.sample(list(zip(y_test_f, predicted_labels, X_test_f)), 20):
  if _true_label == _predicted_label:
    print('LABEL:', _true_label)
    print('REVIEW TEXT:', _text[:100], '...')
    print()

LABEL: 1
REVIEW TEXT: The immigration status of the perpetrators is really the most important part of this story. Thanks f ...

LABEL: 0
REVIEW TEXT: Detain the immigrants, now you got slaves . Sell their work and charge the government for keeping th ...

LABEL: 0
REVIEW TEXT: Sometimes standing up for yourself has a price.   Sometimes it's a price you can't pay. ...

LABEL: 0
REVIEW TEXT: I can understand the great polarization between the two parties in America. I cannot, however, under ...

LABEL: 0
REVIEW TEXT: This happens when people are arrested. Why the surprise here? ...

LABEL: 0
REVIEW TEXT: Advertisers are cowards with skin thinner than tissue paper. News at eleven.

I thought we already l ...

LABEL: 0
REVIEW TEXT: It's comforting seeing social progress being reversed... Not like banning the confederate flag. That ...

LABEL: 0
REVIEW TEXT: ..but not Will Smith's Bel Air ...

LABEL: 0
REVIEW TEXT: 'Papers, please. No papers? Off to the camps with you!' ...

LABEL: 0
REVIEW

In [334]:
# print missclassifications: 
for _true_label, _predicted_label, _text in random.sample(list(zip(y_test_f, predicted_labels, X_test_f)), 80):
  if _true_label != _predicted_label:
    print('TRUE LABEL:', _true_label)
    print('PREDICTED LABEL:', _predicted_label)
    print('REVIEW TEXT:', _text[:100], '...')
    print()

TRUE LABEL: 1
PREDICTED LABEL: 0
REVIEW TEXT: lol...a whole 450.  Wow.  That'll show those damn dirty liberal cities what's up.  

How many of the ...

TRUE LABEL: 0
PREDICTED LABEL: 1
REVIEW TEXT: What's the obsession with illegal immigrants on the DNC side? Don't they understand they are losing  ...

TRUE LABEL: 0
PREDICTED LABEL: 1
REVIEW TEXT: Racists who say they have a black friend be like ...

TRUE LABEL: 1
PREDICTED LABEL: 0
REVIEW TEXT: Sounds like something people of /r/conservative would do. ...

TRUE LABEL: 0
PREDICTED LABEL: 1
REVIEW TEXT: 8 yo Mickey Hicks.  
6 yo Alyssa Thomas.  
Unnamed 2 yo.  
9 yo James Robinson 
  
These kids are on ...

TRUE LABEL: 1
PREDICTED LABEL: 0
REVIEW TEXT: The explanation is that they need cars to get from place to place, but why the fuck are we allowing  ...

TRUE LABEL: 0
PREDICTED LABEL: 1
REVIEW TEXT: I wonder if we had mfa would that qualify. Either way shame on us the richest country. I guess we’re ...

TRUE LABEL: 1
PREDICTED LABEL: 