# Fine Tunning BERT to train a propaganda detector
In this notebook, we will fine tune the pre-trained BERT model to obtain a propaganda detector. 

In [1]:
# load data from CW2 to train and test the model
import pandas as pd
from sklearn.utils import shuffle
df = pd.read_table('../CW2/coursework2_train.tsv')
df = shuffle(df) # randomly shuffle data entries 
df

Unnamed: 0,article_id,article_title,label,sentence_text
1620,774637726,Liberals Agree – Trump Tougher on Putin than O...,non-propaganda,"The other issue that I want to make is that, y..."
9948,762546428,Homeschooling Expands As Parents Seethe Over L...,propaganda,“But when we ask families why do they homescho...
9876,705409419,﻿Vatican Theologian Sacked for Questioning “Me...,non-propaganda,"One of the world’s most renowned theologians, Fr."
9949,762546428,Homeschooling Expands As Parents Seethe Over L...,non-propaganda,It includes all of that.”
941,757843275,EXPERIMENTAL Ebola Vaccine Will Be Administere...,non-propaganda,Read more here about Ebola and how it is trans...
...,...,...,...,...
3270,701225819,South Florida Muslim Leader Sofian Zakkout’s D...,propaganda,"In February 2016, Zakkout circulated on social..."
8551,763260610,The Eerie Silence,non-propaganda,It also reminds us that so-called “meddling” b...
4710,7646642839,Foolish Religion Author Gary Wills: ‘The Relig...,non-propaganda,With What the Qur’an Meant: And Why It Matters...
10653,758386255,Pope Francis vs Contemplative Orders,non-propaganda,"The system, in other words, is stacked against..."


In [2]:
raw_labels = df.label.values.tolist()
docs = df.sentence_text.values.tolist()
titles = df.article_title.values.tolist()

label_dic = {'non-propaganda':0, 'propaganda':1}

assert len(docs) == len(raw_labels) == len(titles)
labels = [label_dic[rl] for rl in raw_labels] # transfer raw labels (strings) to integer numbers
print('total data size: {}, label type num: {}'.format(len(docs), len(label_dic)))

np_num = len([ll for ll in labels if ll == 0])
p_num = len([ll for ll in labels if ll == 1])
print(np_num, p_num)

total data size: 11464, label type num: 2
8227 3237


In [3]:
# split the data into train, dev and test
import random

train_ratio = 0.6
dev_ratio = 0.2
train_idx = []
dev_idx = []
test_idx = []
for i in range(len(docs)):
    rnd = random.random()
    if rnd < train_ratio: train_idx.append(i)
    elif rnd < train_ratio+dev_ratio: dev_idx.append(i)
    else: test_idx.append(i)

print('train size {}, dev size {}, test size {}'.format(len(train_idx), len(dev_idx), len(test_idx)))

train size 6781, dev size 2280, test size 2403


In [4]:
# then we define the RNN-based classifier
import torch
import torch.nn as nn

from transformers import BertModel, BertTokenizer
import torch

class BERT_Clf(nn.Module):
    def __init__(self, cls_num, gpu, bert_type='base'):
        super(BERT_Clf, self).__init__()
        # check which version of bert is used
        if bert_type == 'base':
            self.bert_dim = 768 
        elif bert_type == 'large':
            self.bert_dim = 1024
        else:
            print('INVALID bert_type!')
            return None
        # load the tokenizer customized for the bert model
        self.tokenizer = BertTokenizer.from_pretrained('bert-{}-uncased'.format(bert_type))
        # load the pretrained bert model
        self.model = BertModel.from_pretrained('bert-{}-uncased'.format(bert_type))
        # map the bert output embeddings to class prediction logits
        self.fc = nn.Linear(self.bert_dim, cls_num)
        # use gpu or not
        self.gpu = gpu
        if self.gpu:
            self.to('cuda')
    def forward(self, input_sents, input_titles=None):
        if input_titles is None:
            sents = input_sents
        else:
            assert len(input_titles) == len(input_sents)
            sents = [[input_titles[i], input_sents[i]] for i in range(len(input_titles))]
        input_batch = self.tokenizer.batch_encode_plus(sents, pad_to_max_length=True, return_tensors='pt')['input_ids']
        if self.gpu:
            input_batch = input_batch.to('cuda')
        sent_reprs = self.model(input_batch)[0][:,0,:]
        logits = self.fc(sent_reprs)
        return logits


In [5]:
# because the training data is inbalanced, we use simple down-sampling to balance the data
# used to train the model
import random
import numpy as np
def down_sample():
    np_idx = [i for i in train_idx if labels[i]==0]
    p_idx = [i for i in train_idx if labels[i]==1]
    each_cat_num = min(len(np_idx), len(p_idx))
    random.shuffle(np_idx)
    random.shuffle(p_idx)
    wanted_idx = np_idx[:each_cat_num] + p_idx[:each_cat_num]
    random.shuffle(wanted_idx)
    return wanted_idx
    
wanted_idx = down_sample()

Before we train our BERT-based model, we first check the performance of some simple baseline methods.

In [6]:
# random baseline
from sklearn.metrics import precision_recall_fscore_support
rand_pred = [random.randint(0,1) for i in range(len(test_idx))]
pre, rec, f1, _ = precision_recall_fscore_support(np.array(labels)[test_idx], rand_pred,average='macro')
print('\n---> the macro-F1 of random baseline on dev set is {}'.format(f1))


---> the macro-F1 of random baseline on dev set is 0.4794524349839533


In [7]:
# majority baseline
major_pred = [0]*len(test_idx)
pre, rec, f1, _ = precision_recall_fscore_support(np.array(labels)[test_idx], major_pred, average='macro')
print('\n---> the macro-F1 of majority baseline on dev set is {}'.format(f1))


---> the macro-F1 of majority baseline on dev set is 0.4140453547915143


  _warn_prf(average, modifier, msg_start, len(result))


Now we start to train our BERT-based model. 

In [8]:
# set some hyper parameters
gpu = True
bert_type = 'base'
model = BERT_Clf(len(label_dic), gpu, bert_type)
use_titles = True

loss_fnc = torch.nn.CrossEntropyLoss() # cross entropy loss

# hyper parameters
n_epochs = 20 # number of epoch (i.e. number of iterations)
batch_size = 16
lr = 1e-5 # initial learning rate

# init optimizer and scheduler (lr adjustor)
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5) # after each epoch, the learning rate is discounted to its 95%

In [9]:
# before we train the model, we first look at its initial performance on the test set
# without performing any training

import numpy as np

with torch.no_grad(): # let pytorch know that no gradient should be computed
    model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
    predictions = []
    for i in range(0,len(dev_idx),batch_size):
        idx = dev_idx[i:i+batch_size]
        if len(idx) == 0: break
        dd = np.array(docs)[idx]
        if use_titles:
            tt = np.array(titles)[idx]
        else:
            tt = None
        y_pred = model(dd,tt).cpu().detach().numpy()
        pred_labels = [np.argmax(entry) for entry in y_pred]
        predictions += pred_labels
    pre, rec, f1, _ = precision_recall_fscore_support(np.array(labels)[dev_idx], predictions,average='macro')
    print('\n---> before training, the macro-F1 on dev set is {}'.format(f1))
    print('pred 1 percent', np.sum(predictions)/len(predictions))


---> before training, the macro-F1 on dev set is 0.3916023544800523
pred 1 percent 0.6342105263157894


In [10]:
best_f1 = -1.
best_model = None
import copy
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from nltk.tokenize import word_tokenize
from tqdm import tqdm

for epoch_i in tqdm(range(n_epochs)):
    # the inner loop is over the batches in the dataset
    model.train() # let pytorch know that gradients should be computed, so as to update the model
    ep_loss = []
    sample_idx = down_sample()
    for i in range(0,len(sample_idx),batch_size):
        # Step 0: Get the data
        idx = sample_idx[i:i+batch_size]
        if len(idx)==0: break
        sents = list(np.array(docs)[idx])
        if use_titles: 
            tt = list(np.array(titles)[idx])
        else:
            tt = None
        target_labels = list(np.array(labels)[idx])
        # print(sents[0])
        if len(sents) == 0: break
        y_target = torch.tensor(target_labels, dtype=torch.int64).squeeze()
        if gpu:
            y_target = y_target.to('cuda')
        
        # Step 1: Clear the gradients 
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        y_pred = model(sents, tt)
        # print(y_pred)
        yp = y_pred.cpu().detach().numpy()
        pred_labels = [np.argmax(entry) for entry in yp]
        # print('pred labels', pred_labels)
        # print('true labels', y_target)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)
        # print(loss)
        ep_loss.append(loss.cpu().detach().numpy())

        # Step 4: Propagate the loss signal backward
        loss.backward()

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()
    
    print('\n======epoch {} loss======'.format(epoch_i),np.mean(ep_loss))
    
    # after each epoch, we can test the model's performance on the test set
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
        predictions = []
        for i in range(0,len(dev_idx),batch_size):
            idx = dev_idx[i:i+batch_size]
            if len(idx) == 0: break
            dd = np.array(docs)[idx]
            if use_titles:
                tt = np.array(titles)[idx]
            else:
                tt = None
            y_pred = model(dd,tt).cpu().detach().numpy()
            pred_labels = [np.argmax(entry) for entry in y_pred]
            predictions += pred_labels
        pre, rec, f1, _ = precision_recall_fscore_support(np.array(labels)[dev_idx], predictions,average='macro')
        print('\n---> after {} epochs the macro-F1 on dev set is {}'.format(epoch_i,f1))
        print('pred 1 percent', np.sum(predictions)/len(predictions))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])
        
        # save the best model
        if f1 > best_f1:
            best_f1 = f1
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best macro-F1',f1)
    
    # (optional) adjust learning rate according to the scheduler
    scheduler.step()

  0%|          | 0/20 [00:00<?, ?it/s]




  5%|▌         | 1/20 [00:52<16:35, 52.39s/it]


---> after 0 epochs the macro-F1 on dev set is 0.7070322865988502
pred 1 percent 0.3030701754385965
learning rate 1e-05
best model updated; new best macro-F1 0.7070322865988502



 10%|█         | 2/20 [01:45<15:47, 52.67s/it]


---> after 1 epochs the macro-F1 on dev set is 0.7433219687435645
pred 1 percent 0.3206140350877193
learning rate 1e-05
best model updated; new best macro-F1 0.7433219687435645



 15%|█▌        | 3/20 [02:39<14:59, 52.92s/it]


---> after 2 epochs the macro-F1 on dev set is 0.7202780749202338
pred 1 percent 0.4293859649122807
learning rate 1e-05



 20%|██        | 4/20 [03:32<14:10, 53.14s/it]


---> after 3 epochs the macro-F1 on dev set is 0.7435895217736961
pred 1 percent 0.38728070175438595
learning rate 1e-05
best model updated; new best macro-F1 0.7435895217736961



 25%|██▌       | 5/20 [04:26<13:21, 53.41s/it]


---> after 4 epochs the macro-F1 on dev set is 0.7355750161245262
pred 1 percent 0.39166666666666666
learning rate 1e-05



 30%|███       | 6/20 [05:20<12:30, 53.61s/it]


---> after 5 epochs the macro-F1 on dev set is 0.7441100943984376
pred 1 percent 0.34385964912280703
learning rate 1e-05
best model updated; new best macro-F1 0.7441100943984376



 35%|███▌      | 7/20 [06:15<11:39, 53.83s/it]


---> after 6 epochs the macro-F1 on dev set is 0.7441762468347873
pred 1 percent 0.3307017543859649
learning rate 1e-05
best model updated; new best macro-F1 0.7441762468347873



 40%|████      | 8/20 [07:09<10:48, 54.06s/it]


---> after 7 epochs the macro-F1 on dev set is 0.7496020064880452
pred 1 percent 0.2951754385964912
learning rate 1e-05
best model updated; new best macro-F1 0.7496020064880452



 45%|████▌     | 9/20 [08:04<09:54, 54.07s/it]


---> after 8 epochs the macro-F1 on dev set is 0.7537291077792799
pred 1 percent 0.2741228070175439
learning rate 1e-05
best model updated; new best macro-F1 0.7537291077792799



 50%|█████     | 10/20 [08:58<09:01, 54.11s/it]


---> after 9 epochs the macro-F1 on dev set is 0.7557142857142857
pred 1 percent 0.3337719298245614
learning rate 1e-05
best model updated; new best macro-F1 0.7557142857142857



 55%|█████▌    | 11/20 [09:52<08:06, 54.06s/it]


---> after 10 epochs the macro-F1 on dev set is 0.7587301587301587
pred 1 percent 0.2723684210526316
learning rate 5e-06
best model updated; new best macro-F1 0.7587301587301587



 60%|██████    | 12/20 [10:46<07:12, 54.04s/it]


---> after 11 epochs the macro-F1 on dev set is 0.7493429938539824
pred 1 percent 0.2820175438596491
learning rate 5e-06



 65%|██████▌   | 13/20 [11:39<06:17, 53.94s/it]


---> after 12 epochs the macro-F1 on dev set is 0.7651938550178294
pred 1 percent 0.3043859649122807
learning rate 5e-06
best model updated; new best macro-F1 0.7651938550178294



 70%|███████   | 14/20 [12:34<05:24, 54.05s/it]


---> after 13 epochs the macro-F1 on dev set is 0.7594658281642698
pred 1 percent 0.30043859649122806
learning rate 5e-06



 75%|███████▌  | 15/20 [13:27<04:29, 53.98s/it]


---> after 14 epochs the macro-F1 on dev set is 0.7506910644057957
pred 1 percent 0.25
learning rate 5e-06



 80%|████████  | 16/20 [14:22<03:35, 53.99s/it]


---> after 15 epochs the macro-F1 on dev set is 0.7585088568788936
pred 1 percent 0.28596491228070176
learning rate 5e-06



 85%|████████▌ | 17/20 [15:15<02:41, 53.98s/it]


---> after 16 epochs the macro-F1 on dev set is 0.7541712714628427
pred 1 percent 0.3048245614035088
learning rate 5e-06



 90%|█████████ | 18/20 [16:09<01:47, 53.95s/it]


---> after 17 epochs the macro-F1 on dev set is 0.7528196250083232
pred 1 percent 0.27280701754385966
learning rate 5e-06



 95%|█████████▌| 19/20 [17:03<00:54, 54.00s/it]


---> after 18 epochs the macro-F1 on dev set is 0.7626583613973783
pred 1 percent 0.29605263157894735
learning rate 5e-06



100%|██████████| 20/20 [17:57<00:00, 53.89s/it]


---> after 19 epochs the macro-F1 on dev set is 0.7581762307858491
pred 1 percent 0.31491228070175437
learning rate 5e-06





In [13]:
model.load_state_dict(best_model)
with torch.no_grad(): # let pytorch know that no gradient should be computed
    model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
    predictions = []
    for i in range(0,len(test_idx),batch_size):
        idx = test_idx[i:i+batch_size]
        if len(idx) == 0: break
        dd = np.array(docs)[idx]
        if use_titles:
            tt = np.array(titles)[idx]
        else:
            tt = None
        y_pred = model(dd,tt).cpu().detach().numpy()
        pred_labels = [np.argmax(entry) for entry in y_pred]
        predictions += pred_labels
    pre, rec, f1, _ = precision_recall_fscore_support(np.array(labels)[test_idx], predictions,average='macro')
    print('\n---> the macro-F1 on test set is {}'.format(f1))
    print('pred 1 percent', np.sum(predictions)/len(predictions))


---> the macro-F1 on test set is 0.7560704972259656
pred 1 percent 0.3079483978360383


In [11]:
# if you want to save your trained model, you may uncomment the line below
# torch.save(best_model, 'bert_pgd_base_wTitle.state_dict')