# Train an MLP-based propaganda detector that uses BERT text representations as input
In this notebook we will develop an MLP based propaganda detector, which uses BERT to vectoriz the input title-sentence pair. Note that the BERT model itself will *not* be updated during training.

In [1]:
# load data from CW2 to train and test the model
import pandas as pd
from sklearn.utils import shuffle
df = pd.read_table('../CW2/coursework2_train.tsv')
df = shuffle(df) # randomly shuffle data entries 
df

Unnamed: 0,article_id,article_title,label,sentence_text
7520,711622457,UK: Labour MP Cites Ban of Robert Spencer & Pa...,non-propaganda,— Donald J. Trump (@realDonaldTrump) November ...
6719,765913191,US Conference of Mayors Call For More Gun Conf...,propaganda,"The facts refute their arguments, but beyond t..."
9916,705409419,﻿Vatican Theologian Sacked for Questioning “Me...,non-propaganda,Can we not see in Fr.
517,771879020,DOJ Surrenders: 3D Print Gun Files Are Protect...,propaganda,Cody finally won against the federal beast aft...
9310,758469195,American in China injured in 'sonic attack' si...,non-propaganda,Some have permanent hearing loss or concussion...
...,...,...,...,...
4297,787529309,The Last-Minute Character Assassination of Jud...,non-propaganda,"""Senator Grassley must postpone the vote until..."
9849,731063195,One Trillion Stars,non-propaganda,"Photo credit: Keilana, Roberta Mura"
1981,790266787,Avenatti’s Freak Show,non-propaganda,But I have never done that to her or to anyone.
1994,790266787,Avenatti’s Freak Show,non-propaganda,An abused person wouldn’t do that.


In [2]:
raw_labels = df.label.values.tolist()
docs = df.sentence_text.values.tolist()
titles = df.article_title.values.tolist()

label_dic = {'non-propaganda':0, 'propaganda':1}

assert len(docs) == len(raw_labels) == len(titles)
labels = [label_dic[rl] for rl in raw_labels] # transfer raw labels (strings) to integer numbers
print('total data size: {}, label type num: {}'.format(len(docs), len(label_dic)))

np_num = len([ll for ll in labels if ll == 0])
p_num = len([ll for ll in labels if ll == 1])
print(np_num, p_num)

total data size: 11464, label type num: 2
8227 3237


In [3]:
# split the data into train, dev and test
import random

train_ratio = 0.6
dev_ratio = 0.2
train_idx = []
dev_idx = []
test_idx = []
for i in range(len(docs)):
    rnd = random.random()
    if rnd < train_ratio: train_idx.append(i)
    elif rnd < train_ratio+dev_ratio: dev_idx.append(i)
    else: test_idx.append(i)

print('train size {}, dev size {}, test size {}'.format(len(train_idx), len(dev_idx), len(test_idx)))

train size 6923, dev size 2217, test size 2324


In [4]:
# now we use the BERT model to vectorize all sentence-title pairs
# !pip install transformers 
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
from tqdm import tqdm

gpu = True
bert_type = 'large'
if bert_type == 'base':
    bert_dim = 768
else: 
    bert_dim = 1024
use_titles = True # whether to consider the titles when making predictions
bert_batch_size = 10

all_input_vecs = []
bert_tokenizer = BertTokenizer.from_pretrained('bert-{}-uncased'.format(bert_type))
bert_model = BertModel.from_pretrained('bert-{}-uncased'.format(bert_type))

if gpu:
    bert_model.to('cuda')

for i in tqdm(range(0, len(docs))):
    if use_titles:
        sent = [[titles[i], docs[i]]]
    else:
        sent = [docs[i]]
    input_to_bert = bert_tokenizer.batch_encode_plus(sent)['input_ids']
    input_to_bert = torch.tensor(input_to_bert)
    if gpu: input_to_bert = input_to_bert.to('cuda')
    words_vecs = bert_model(input_to_bert)[0][:,1:,:].squeeze().cpu().detach().numpy()
    sent_vec = np.mean(words_vecs, axis=0)
    # print(sent_vec.shape)
    all_input_vecs.append(sent_vec)

100%|██████████| 11464/11464 [04:37<00:00, 41.32it/s]


In [5]:
all_input_vecs = np.array(all_input_vecs)
print(all_input_vecs.shape)

(11464, 1024)


In [6]:
# then we define the RNN-based classifier
import torch
import torch.nn as nn

class MLP_Clf(nn.Module):
    def __init__(self, input_dim, cls_num, gpu):
        super(MLP_Clf, self).__init__()
        self.bert_dim = input_dim
        # MLP structure: three layers in total, dim of the hidden layer is the same as input layer
        self.fc1 = nn.Linear(self.bert_dim, self.bert_dim) 
        self.atv_fnc = nn.ReLU()
        self.fc2 = nn.Linear(self.bert_dim, cls_num)
        # use gpu or not
        self.gpu = gpu
        if self.gpu:
            self.to('cuda')
    def forward(self, input_batch, input_titles=None):
        if self.gpu:
            input_batch = input_batch.to('cuda')
        logits = self.fc2( self.atv_fnc( self.fc1(input_batch) ) )
        return logits


In [7]:
# because the training data is inbalanced, we use simple down-sampling to balance the data
# used to train the model
import random
import numpy as np
def down_sample():
    np_idx = [i for i in train_idx if labels[i]==0]
    p_idx = [i for i in train_idx if labels[i]==1]
    each_cat_num = min(len(np_idx), len(p_idx))
    random.shuffle(np_idx)
    random.shuffle(p_idx)
    wanted_idx = np_idx[:each_cat_num] + p_idx[:each_cat_num]
    random.shuffle(wanted_idx)
    return wanted_idx
    
wanted_idx = down_sample()

Before we train our MLP model, we first check the performance of some simple baseline methods.

In [8]:
# random baseline
from sklearn.metrics import precision_recall_fscore_support
rand_pred = [random.randint(0,1) for i in range(len(test_idx))]
pre, rec, f1, _ = precision_recall_fscore_support(np.array(labels)[test_idx], rand_pred,average='macro')
print('\n---> the macro-F1 of random baseline on dev set is {}'.format(f1))


---> the macro-F1 of random baseline on dev set is 0.47265816526516913


In [9]:
# majority baseline
major_pred = [0]*len(test_idx)
pre, rec, f1, _ = precision_recall_fscore_support(np.array(labels)[test_idx], major_pred, average='macro')
print('\n---> the macro-F1 of majority baseline on dev set is {}'.format(f1))


---> the macro-F1 of majority baseline on dev set is 0.41608040201005025


  _warn_prf(average, modifier, msg_start, len(result))


Now we start to train our MLP model with BERT vectors as input. 

In [10]:
model = MLP_Clf(bert_dim, len(label_dic), gpu)
loss_fnc = torch.nn.CrossEntropyLoss() # cross entropy loss

# hyper parameters
n_epochs = 100 # number of epoch (i.e. number of iterations)
batch_size = 32
lr = 1e-4 # initial learning rate

# init optimizer and scheduler (lr adjustor)
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8) # after each epoch, the learning rate is discounted to its 95%

In [13]:
# before we train the model, we first look at its initial performance on the test set
# without performing any training

import numpy as np

with torch.no_grad(): # let pytorch know that no gradient should be computed
    model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
    predictions = []
    for i in range(0,len(dev_idx),batch_size):
        idx = dev_idx[i:i+batch_size] 
        vecs = torch.tensor(all_input_vecs[idx])
        if gpu: vecs = vecs.to('cuda')
        y_pred = model(vecs).cpu().detach().numpy()
        pred_labels = [np.argmax(entry) for entry in y_pred]
        predictions += pred_labels
    pre, rec, f1, _ = precision_recall_fscore_support(np.array(labels)[dev_idx], predictions,average='macro')
    print('\n---> before training, the macro-F1 on dev set is {}'.format(f1))
    print('pred 1 percent', np.sum(predictions)/len(predictions))


---> before training, the macro-F1 on dev set is 0.727675023081504
pred 1 percent 0.2372575552548489


In [14]:
best_f1 = -1.
best_model = None
import copy
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from nltk.tokenize import word_tokenize
from tqdm import tqdm

for epoch_i in tqdm(range(n_epochs)):
    # the inner loop is over the batches in the dataset
    model.train() # let pytorch know that gradients should be computed, so as to update the model
    ep_loss = []
    sample_idx = down_sample()
    for i in range(0,len(train_idx),batch_size):
        # Step 0: Get the data
        idx = train_idx[i:i+batch_size] 
        if len(idx) == 0: break
        # print(idx)
        vecs = torch.tensor(all_input_vecs[idx])
        if gpu: vecs = vecs.to('cuda')
        target_labels = list(np.array(labels)[idx])
        # print(sents[0])
        y_target = torch.tensor(target_labels, dtype=torch.int64).squeeze()
        if gpu:
            y_target = y_target.to('cuda')
        
        # Step 1: Clear the gradients 
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        y_pred = model(vecs)
        # print(y_pred)
        yp = y_pred.cpu().detach().numpy()
        pred_labels = [np.argmax(entry) for entry in yp]
        # print('pred labels', pred_labels)
        # print('true labels', y_target)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)
        # print(loss)
        ep_loss.append(loss.cpu().detach().numpy())

        # Step 4: Propagate the loss signal backward
        loss.backward()

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()
    
    print('\n======epoch {} loss======'.format(epoch_i),np.mean(ep_loss))
    
    # after each epoch, we can test the model's performance on the test set
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
        predictions = []
        for i in range(0,len(dev_idx),batch_size):
            idx = dev_idx[i:i+batch_size] 
            vecs = torch.tensor(all_input_vecs[idx])
            if gpu: vecs = vecs.to('cuda')
            y_pred = model(vecs).cpu().detach().numpy()
            pred_labels = [np.argmax(entry) for entry in y_pred]
            predictions += pred_labels
        pre, rec, f1, _ = precision_recall_fscore_support(np.array(labels)[dev_idx], predictions,average='macro')
        print('\n---> after {} epochs, the macro-F1 on dev set is {}'.format(epoch_i,f1))
        print('pred 1 percent', np.sum(predictions)/len(predictions))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])
        
        # save the best model
        if f1 > best_f1:
            best_f1 = f1
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best macro-F1',f1)
    
    # (optional) adjust learning rate according to the scheduler
    scheduler.step()


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:00<00:43,  2.27it/s][A



---> after 0 epochs, the macro-F1 on dev set is 0.6771713994661194
pred 1 percent 0.13757329724853407
learning rate 6.400000000000001e-05
best model updated; new best macro-F1 0.6771713994661194



  2%|▏         | 2/100 [00:00<00:42,  2.29it/s][A



---> after 1 epochs, the macro-F1 on dev set is 0.676318329167801
pred 1 percent 0.13847541723049164
learning rate 6.400000000000001e-05



  3%|▎         | 3/100 [00:01<00:42,  2.26it/s][A



---> after 2 epochs, the macro-F1 on dev set is 0.6768365294818138
pred 1 percent 0.13937753721244925
learning rate 6.400000000000001e-05



  4%|▍         | 4/100 [00:01<00:42,  2.25it/s][A



---> after 3 epochs, the macro-F1 on dev set is 0.6782922677593525
pred 1 percent 0.14073071718538566
learning rate 6.400000000000001e-05
best model updated; new best macro-F1 0.6782922677593525



  5%|▌         | 5/100 [00:02<00:42,  2.24it/s][A



---> after 4 epochs, the macro-F1 on dev set is 0.6788032810938456
pred 1 percent 0.14163283716734326
learning rate 6.400000000000001e-05
best model updated; new best macro-F1 0.6788032810938456



  6%|▌         | 6/100 [00:02<00:42,  2.24it/s][A



---> after 5 epochs, the macro-F1 on dev set is 0.6845820474095476
pred 1 percent 0.14839873703202525
learning rate 5.120000000000001e-05
best model updated; new best macro-F1 0.6845820474095476



  7%|▋         | 7/100 [00:03<00:46,  1.99it/s][A



---> after 6 epochs, the macro-F1 on dev set is 0.6845820474095476
pred 1 percent 0.14839873703202525
learning rate 5.120000000000001e-05



  8%|▊         | 8/100 [00:03<00:44,  2.05it/s][A



---> after 7 epochs, the macro-F1 on dev set is 0.6837240784808898
pred 1 percent 0.14930085701398285
learning rate 5.120000000000001e-05



  9%|▉         | 9/100 [00:04<00:43,  2.11it/s][A



---> after 8 epochs, the macro-F1 on dev set is 0.6837833933793614
pred 1 percent 0.15065403698691926
learning rate 5.120000000000001e-05



 10%|█         | 10/100 [00:04<00:41,  2.14it/s][A



---> after 9 epochs, the macro-F1 on dev set is 0.6851803734364614
pred 1 percent 0.15200721695985567
learning rate 5.120000000000001e-05
best model updated; new best macro-F1 0.6851803734364614



 11%|█         | 11/100 [00:05<00:45,  1.95it/s][A



---> after 10 epochs, the macro-F1 on dev set is 0.683355483416487
pred 1 percent 0.15110509697789806
learning rate 5.120000000000001e-05



 12%|█▏        | 12/100 [00:05<00:44,  1.97it/s][A



---> after 11 epochs, the macro-F1 on dev set is 0.6820137693631669
pred 1 percent 0.15110509697789806
learning rate 5.120000000000001e-05



 13%|█▎        | 13/100 [00:06<00:50,  1.73it/s][A



---> after 12 epochs, the macro-F1 on dev set is 0.6829280387268635
pred 1 percent 0.15155615696887687
learning rate 5.120000000000001e-05



 14%|█▍        | 14/100 [00:06<00:45,  1.87it/s][A



---> after 13 epochs, the macro-F1 on dev set is 0.6838407154510847
pred 1 percent 0.15200721695985567
learning rate 5.120000000000001e-05



 15%|█▌        | 15/100 [00:07<00:49,  1.71it/s][A



---> after 14 epochs, the macro-F1 on dev set is 0.6838407154510847
pred 1 percent 0.15200721695985567
learning rate 5.120000000000001e-05



 16%|█▌        | 16/100 [00:08<00:50,  1.68it/s][A



---> after 15 epochs, the macro-F1 on dev set is 0.6952530775575037
pred 1 percent 0.16914749661705006
learning rate 4.096000000000001e-05
best model updated; new best macro-F1 0.6952530775575037



 17%|█▋        | 17/100 [00:09<00:53,  1.56it/s][A



---> after 16 epochs, the macro-F1 on dev set is 0.6956921127652835
pred 1 percent 0.17004961659900766
learning rate 4.096000000000001e-05
best model updated; new best macro-F1 0.6956921127652835



 18%|█▊        | 18/100 [00:09<00:48,  1.70it/s][A



---> after 17 epochs, the macro-F1 on dev set is 0.6965606520256518
pred 1 percent 0.17050067658998647
learning rate 4.096000000000001e-05
best model updated; new best macro-F1 0.6965606520256518



 19%|█▉        | 19/100 [00:10<00:49,  1.65it/s][A



---> after 18 epochs, the macro-F1 on dev set is 0.6961291728364979
pred 1 percent 0.17095173658096527
learning rate 4.096000000000001e-05



 20%|██        | 20/100 [00:10<00:51,  1.56it/s][A



---> after 19 epochs, the macro-F1 on dev set is 0.695698134544251
pred 1 percent 0.17140279657194407
learning rate 4.096000000000001e-05



 21%|██        | 21/100 [00:11<00:47,  1.67it/s][A



---> after 20 epochs, the macro-F1 on dev set is 0.6969974137387245
pred 1 percent 0.17275597654488048
learning rate 4.096000000000001e-05
best model updated; new best macro-F1 0.6969974137387245



 22%|██▏       | 22/100 [00:11<00:47,  1.63it/s][A



---> after 21 epochs, the macro-F1 on dev set is 0.6978602376813302
pred 1 percent 0.17320703653585928
learning rate 4.096000000000001e-05
best model updated; new best macro-F1 0.6978602376813302



 23%|██▎       | 23/100 [00:12<00:47,  1.64it/s][A



---> after 22 epochs, the macro-F1 on dev set is 0.698292296585995
pred 1 percent 0.17275597654488048
learning rate 4.096000000000001e-05
best model updated; new best macro-F1 0.698292296585995



 24%|██▍       | 24/100 [00:13<00:43,  1.73it/s][A



---> after 23 epochs, the macro-F1 on dev set is 0.6978602376813302
pred 1 percent 0.17320703653585928
learning rate 4.096000000000001e-05



 25%|██▌       | 25/100 [00:13<00:44,  1.69it/s][A



---> after 24 epochs, the macro-F1 on dev set is 0.6969974137387245
pred 1 percent 0.17275597654488048
learning rate 4.096000000000001e-05



 26%|██▌       | 26/100 [00:14<00:42,  1.73it/s][A



---> after 25 epochs, the macro-F1 on dev set is 0.7086516200467504
pred 1 percent 0.20162381596752368
learning rate 3.276800000000001e-05
best model updated; new best macro-F1 0.7086516200467504



 27%|██▋       | 27/100 [00:14<00:40,  1.80it/s][A



---> after 26 epochs, the macro-F1 on dev set is 0.7078426188227888
pred 1 percent 0.20117275597654488
learning rate 3.276800000000001e-05



 28%|██▊       | 28/100 [00:15<00:41,  1.75it/s][A



---> after 27 epochs, the macro-F1 on dev set is 0.7090278196647699
pred 1 percent 0.2025259359494813
learning rate 3.276800000000001e-05
best model updated; new best macro-F1 0.7090278196647699



 29%|██▉       | 29/100 [00:16<00:42,  1.68it/s][A



---> after 28 epochs, the macro-F1 on dev set is 0.7090278196647699
pred 1 percent 0.2025259359494813
learning rate 3.276800000000001e-05



 30%|███       | 30/100 [00:16<00:38,  1.82it/s][A



---> after 29 epochs, the macro-F1 on dev set is 0.7094024006604047
pred 1 percent 0.2034280559314389
learning rate 3.276800000000001e-05
best model updated; new best macro-F1 0.7094024006604047



 31%|███       | 31/100 [00:17<00:40,  1.69it/s][A



---> after 30 epochs, the macro-F1 on dev set is 0.7090278196647699
pred 1 percent 0.2025259359494813
learning rate 3.276800000000001e-05



 32%|███▏      | 32/100 [00:17<00:41,  1.65it/s][A



---> after 31 epochs, the macro-F1 on dev set is 0.7090278196647699
pred 1 percent 0.2025259359494813
learning rate 3.276800000000001e-05



 33%|███▎      | 33/100 [00:18<00:41,  1.62it/s][A



---> after 32 epochs, the macro-F1 on dev set is 0.7094594122589628
pred 1 percent 0.2020748759585025
learning rate 3.276800000000001e-05
best model updated; new best macro-F1 0.7094594122589628



 34%|███▍      | 34/100 [00:18<00:37,  1.78it/s][A



---> after 33 epochs, the macro-F1 on dev set is 0.7074118397065239
pred 1 percent 0.20162381596752368
learning rate 3.276800000000001e-05



 35%|███▌      | 35/100 [00:19<00:34,  1.91it/s][A



---> after 34 epochs, the macro-F1 on dev set is 0.7094594122589628
pred 1 percent 0.2020748759585025
learning rate 3.276800000000001e-05



 36%|███▌      | 36/100 [00:20<00:37,  1.72it/s][A



---> after 35 epochs, the macro-F1 on dev set is 0.7262087065964207
pred 1 percent 0.2313937753721245
learning rate 2.621440000000001e-05
best model updated; new best macro-F1 0.7262087065964207



 37%|███▋      | 37/100 [00:20<00:33,  1.88it/s][A



---> after 36 epochs, the macro-F1 on dev set is 0.7266436990027199
pred 1 percent 0.2309427153811457
learning rate 2.621440000000001e-05
best model updated; new best macro-F1 0.7266436990027199



 38%|███▊      | 38/100 [00:21<00:35,  1.75it/s][A



---> after 37 epochs, the macro-F1 on dev set is 0.72588726064659
pred 1 percent 0.2304916553901669
learning rate 2.621440000000001e-05



 39%|███▉      | 39/100 [00:21<00:35,  1.69it/s][A



---> after 38 epochs, the macro-F1 on dev set is 0.72588726064659
pred 1 percent 0.2304916553901669
learning rate 2.621440000000001e-05



 40%|████      | 40/100 [00:22<00:33,  1.80it/s][A



---> after 39 epochs, the macro-F1 on dev set is 0.7255644682115271
pred 1 percent 0.22958953540820928
learning rate 2.621440000000001e-05



 41%|████      | 41/100 [00:22<00:34,  1.73it/s][A



---> after 40 epochs, the macro-F1 on dev set is 0.7248056139776455
pred 1 percent 0.22913847541723048
learning rate 2.621440000000001e-05



 42%|████▏     | 42/100 [00:23<00:30,  1.88it/s][A



---> after 41 epochs, the macro-F1 on dev set is 0.7229564525980348
pred 1 percent 0.2273342354533153
learning rate 2.621440000000001e-05



 43%|████▎     | 43/100 [00:23<00:33,  1.72it/s][A



---> after 42 epochs, the macro-F1 on dev set is 0.7229564525980348
pred 1 percent 0.2273342354533153
learning rate 2.621440000000001e-05



 44%|████▍     | 44/100 [00:24<00:33,  1.65it/s][A



---> after 43 epochs, the macro-F1 on dev set is 0.7229564525980348
pred 1 percent 0.2273342354533153
learning rate 2.621440000000001e-05



 45%|████▌     | 45/100 [00:25<00:30,  1.79it/s][A



---> after 44 epochs, the macro-F1 on dev set is 0.722193055798609
pred 1 percent 0.2268831754623365
learning rate 2.621440000000001e-05



 46%|████▌     | 46/100 [00:25<00:30,  1.76it/s][A



---> after 45 epochs, the macro-F1 on dev set is 0.7289162426605138
pred 1 percent 0.2408660351826793
learning rate 2.097152000000001e-05
best model updated; new best macro-F1 0.7289162426605138



 47%|████▋     | 47/100 [00:26<00:31,  1.69it/s][A



---> after 46 epochs, the macro-F1 on dev set is 0.7289162426605138
pred 1 percent 0.2408660351826793
learning rate 2.097152000000001e-05



 48%|████▊     | 48/100 [00:27<00:32,  1.61it/s][A



---> after 47 epochs, the macro-F1 on dev set is 0.7284822528687855
pred 1 percent 0.2413170951736581
learning rate 2.097152000000001e-05



 49%|████▉     | 49/100 [00:27<00:29,  1.76it/s][A



---> after 48 epochs, the macro-F1 on dev set is 0.7284822528687855
pred 1 percent 0.2413170951736581
learning rate 2.097152000000001e-05



 50%|█████     | 50/100 [00:28<00:29,  1.68it/s][A



---> after 49 epochs, the macro-F1 on dev set is 0.7289162426605138
pred 1 percent 0.2408660351826793
learning rate 2.097152000000001e-05



 51%|█████     | 51/100 [00:28<00:26,  1.81it/s][A



---> after 50 epochs, the macro-F1 on dev set is 0.7289162426605138
pred 1 percent 0.2408660351826793
learning rate 2.097152000000001e-05



 52%|█████▏    | 52/100 [00:29<00:26,  1.81it/s][A



---> after 51 epochs, the macro-F1 on dev set is 0.7293505815212545
pred 1 percent 0.2404149751917005
learning rate 2.097152000000001e-05
best model updated; new best macro-F1 0.7293505815212545



 53%|█████▎    | 53/100 [00:29<00:24,  1.90it/s][A



---> after 52 epochs, the macro-F1 on dev set is 0.7293505815212545
pred 1 percent 0.2404149751917005
learning rate 2.097152000000001e-05



 54%|█████▍    | 54/100 [00:30<00:23,  1.98it/s][A



---> after 53 epochs, the macro-F1 on dev set is 0.7289162426605138
pred 1 percent 0.2408660351826793
learning rate 2.097152000000001e-05



 55%|█████▌    | 55/100 [00:30<00:24,  1.84it/s][A



---> after 54 epochs, the macro-F1 on dev set is 0.728173844919173
pred 1 percent 0.2404149751917005
learning rate 2.097152000000001e-05



 56%|█████▌    | 56/100 [00:31<00:25,  1.72it/s][A



---> after 55 epochs, the macro-F1 on dev set is 0.7291386682956628
pred 1 percent 0.2467298150654037
learning rate 1.677721600000001e-05



 57%|█████▋    | 57/100 [00:31<00:24,  1.72it/s][A



---> after 56 epochs, the macro-F1 on dev set is 0.7291386682956628
pred 1 percent 0.2467298150654037
learning rate 1.677721600000001e-05



 58%|█████▊    | 58/100 [00:32<00:23,  1.82it/s][A



---> after 57 epochs, the macro-F1 on dev set is 0.7291386682956628
pred 1 percent 0.2467298150654037
learning rate 1.677721600000001e-05



 59%|█████▉    | 59/100 [00:33<00:23,  1.72it/s][A



---> after 58 epochs, the macro-F1 on dev set is 0.7307399772112046
pred 1 percent 0.2462787550744249
learning rate 1.677721600000001e-05
best model updated; new best macro-F1 0.7307399772112046



 60%|██████    | 60/100 [00:33<00:22,  1.74it/s][A



---> after 59 epochs, the macro-F1 on dev set is 0.7311741223658321
pred 1 percent 0.2458276950834461
learning rate 1.677721600000001e-05
best model updated; new best macro-F1 0.7311741223658321



 61%|██████    | 61/100 [00:34<00:20,  1.86it/s][A



---> after 60 epochs, the macro-F1 on dev set is 0.7307399772112046
pred 1 percent 0.2462787550744249
learning rate 1.677721600000001e-05



 62%|██████▏   | 62/100 [00:34<00:19,  1.95it/s][A



---> after 61 epochs, the macro-F1 on dev set is 0.7300053142022054
pred 1 percent 0.2458276950834461
learning rate 1.677721600000001e-05



 63%|██████▎   | 63/100 [00:35<00:19,  1.90it/s][A



---> after 62 epochs, the macro-F1 on dev set is 0.7307399772112046
pred 1 percent 0.2462787550744249
learning rate 1.677721600000001e-05



 64%|██████▍   | 64/100 [00:35<00:20,  1.73it/s][A



---> after 63 epochs, the macro-F1 on dev set is 0.7307399772112046
pred 1 percent 0.2462787550744249
learning rate 1.677721600000001e-05



 65%|██████▌   | 65/100 [00:36<00:21,  1.64it/s][A



---> after 64 epochs, the macro-F1 on dev set is 0.7304391493684934
pred 1 percent 0.2453766350924673
learning rate 1.677721600000001e-05



 66%|██████▌   | 66/100 [00:36<00:19,  1.73it/s][A



---> after 65 epochs, the macro-F1 on dev set is 0.7290422343613834
pred 1 percent 0.2395128552097429
learning rate 1.3421772800000007e-05



 67%|██████▋   | 67/100 [00:37<00:17,  1.86it/s][A



---> after 66 epochs, the macro-F1 on dev set is 0.7290422343613834
pred 1 percent 0.2395128552097429
learning rate 1.3421772800000007e-05



 68%|██████▊   | 68/100 [00:38<00:18,  1.75it/s][A



---> after 67 epochs, the macro-F1 on dev set is 0.7282982061153022
pred 1 percent 0.23906179521876408
learning rate 1.3421772800000007e-05



 69%|██████▉   | 69/100 [00:38<00:18,  1.67it/s][A



---> after 68 epochs, the macro-F1 on dev set is 0.7282982061153022
pred 1 percent 0.23906179521876408
learning rate 1.3421772800000007e-05



 70%|███████   | 70/100 [00:39<00:18,  1.59it/s][A



---> after 69 epochs, the macro-F1 on dev set is 0.7290422343613834
pred 1 percent 0.2395128552097429
learning rate 1.3421772800000007e-05



 71%|███████   | 71/100 [00:40<00:18,  1.56it/s][A



---> after 70 epochs, the macro-F1 on dev set is 0.7282982061153022
pred 1 percent 0.23906179521876408
learning rate 1.3421772800000007e-05



 72%|███████▏  | 72/100 [00:40<00:17,  1.59it/s][A



---> after 71 epochs, the macro-F1 on dev set is 0.7302203116032904
pred 1 percent 0.2395128552097429
learning rate 1.3421772800000007e-05



 73%|███████▎  | 73/100 [00:41<00:17,  1.51it/s][A



---> after 72 epochs, the macro-F1 on dev set is 0.728421691752149
pred 1 percent 0.2377086152458277
learning rate 1.3421772800000007e-05



 74%|███████▍  | 74/100 [00:42<00:16,  1.54it/s][A



---> after 73 epochs, the macro-F1 on dev set is 0.7296024670054004
pred 1 percent 0.2377086152458277
learning rate 1.3421772800000007e-05



 75%|███████▌  | 75/100 [00:42<00:15,  1.57it/s][A



---> after 74 epochs, the macro-F1 on dev set is 0.7288564763436232
pred 1 percent 0.2372575552548489
learning rate 1.3421772800000007e-05



 76%|███████▌  | 76/100 [00:43<00:13,  1.75it/s][A



---> after 75 epochs, the macro-F1 on dev set is 0.7244800828147303
pred 1 percent 0.2282363554352729
learning rate 1.0737418240000007e-05



 77%|███████▋  | 77/100 [00:43<00:13,  1.69it/s][A



---> after 76 epochs, the macro-F1 on dev set is 0.7237187939000154
pred 1 percent 0.2277852954442941
learning rate 1.0737418240000007e-05



 78%|███████▊  | 78/100 [00:44<00:13,  1.62it/s][A



---> after 77 epochs, the macro-F1 on dev set is 0.7241531849842202
pred 1 percent 0.2273342354533153
learning rate 1.0737418240000007e-05



 79%|███████▉  | 79/100 [00:45<00:13,  1.60it/s][A



---> after 78 epochs, the macro-F1 on dev set is 0.7245879432486211
pred 1 percent 0.2268831754623365
learning rate 1.0737418240000007e-05



 80%|████████  | 80/100 [00:45<00:13,  1.51it/s][A



---> after 79 epochs, the macro-F1 on dev set is 0.7245879432486211
pred 1 percent 0.2268831754623365
learning rate 1.0737418240000007e-05



 81%|████████  | 81/100 [00:46<00:13,  1.45it/s][A



---> after 80 epochs, the macro-F1 on dev set is 0.723390499523615
pred 1 percent 0.2268831754623365
learning rate 1.0737418240000007e-05



 82%|████████▏ | 82/100 [00:47<00:12,  1.45it/s][A



---> after 81 epochs, the macro-F1 on dev set is 0.7238249134957446
pred 1 percent 0.2264321154713577
learning rate 1.0737418240000007e-05



 83%|████████▎ | 83/100 [00:47<00:11,  1.44it/s][A



---> after 82 epochs, the macro-F1 on dev set is 0.7238249134957446
pred 1 percent 0.2264321154713577
learning rate 1.0737418240000007e-05



 84%|████████▍ | 84/100 [00:48<00:09,  1.61it/s][A



---> after 83 epochs, the macro-F1 on dev set is 0.7242596958750622
pred 1 percent 0.2259810554803789
learning rate 1.0737418240000007e-05



 85%|████████▌ | 85/100 [00:49<00:09,  1.60it/s][A



---> after 84 epochs, the macro-F1 on dev set is 0.7242596958750622
pred 1 percent 0.2259810554803789
learning rate 1.0737418240000007e-05



 86%|████████▌ | 86/100 [00:49<00:08,  1.67it/s][A



---> after 85 epochs, the macro-F1 on dev set is 0.7219228881496429
pred 1 percent 0.21966621560667568
learning rate 8.589934592000006e-06



 87%|████████▋ | 87/100 [00:50<00:08,  1.62it/s][A



---> after 86 epochs, the macro-F1 on dev set is 0.7219228881496429
pred 1 percent 0.21966621560667568
learning rate 8.589934592000006e-06



 88%|████████▊ | 88/100 [00:50<00:07,  1.58it/s][A



---> after 87 epochs, the macro-F1 on dev set is 0.7219228881496429
pred 1 percent 0.21966621560667568
learning rate 8.589934592000006e-06



 89%|████████▉ | 89/100 [00:51<00:06,  1.70it/s][A



---> after 88 epochs, the macro-F1 on dev set is 0.7211484339722767
pred 1 percent 0.21921515561569688
learning rate 8.589934592000006e-06



 90%|█████████ | 90/100 [00:52<00:06,  1.63it/s][A



---> after 89 epochs, the macro-F1 on dev set is 0.7211484339722767
pred 1 percent 0.21921515561569688
learning rate 8.589934592000006e-06



 91%|█████████ | 91/100 [00:52<00:05,  1.59it/s][A



---> after 90 epochs, the macro-F1 on dev set is 0.7211484339722767
pred 1 percent 0.21921515561569688
learning rate 8.589934592000006e-06



 92%|█████████▏| 92/100 [00:53<00:05,  1.52it/s][A



---> after 91 epochs, the macro-F1 on dev set is 0.7211484339722767
pred 1 percent 0.21921515561569688
learning rate 8.589934592000006e-06



 93%|█████████▎| 93/100 [00:54<00:04,  1.51it/s][A



---> after 92 epochs, the macro-F1 on dev set is 0.7211484339722767
pred 1 percent 0.21921515561569688
learning rate 8.589934592000006e-06



 94%|█████████▍| 94/100 [00:54<00:03,  1.53it/s][A



---> after 93 epochs, the macro-F1 on dev set is 0.720372882022251
pred 1 percent 0.21876409562471807
learning rate 8.589934592000006e-06



 95%|█████████▌| 95/100 [00:55<00:03,  1.39it/s][A



---> after 94 epochs, the macro-F1 on dev set is 0.720372882022251
pred 1 percent 0.21876409562471807
learning rate 8.589934592000006e-06



 96%|█████████▌| 96/100 [00:56<00:02,  1.42it/s][A



---> after 95 epochs, the macro-F1 on dev set is 0.7184731086273523
pred 1 percent 0.2169598556608029
learning rate 6.871947673600004e-06



 97%|█████████▋| 97/100 [00:57<00:02,  1.32it/s][A



---> after 96 epochs, the macro-F1 on dev set is 0.7184731086273523
pred 1 percent 0.2169598556608029
learning rate 6.871947673600004e-06



 98%|█████████▊| 98/100 [00:57<00:01,  1.35it/s][A



---> after 97 epochs, the macro-F1 on dev set is 0.7184731086273523
pred 1 percent 0.2169598556608029
learning rate 6.871947673600004e-06



 99%|█████████▉| 99/100 [00:58<00:00,  1.33it/s][A



---> after 98 epochs, the macro-F1 on dev set is 0.7184731086273523
pred 1 percent 0.2169598556608029
learning rate 6.871947673600004e-06



100%|██████████| 100/100 [00:59<00:00,  1.68it/s][A



---> after 99 epochs, the macro-F1 on dev set is 0.7184731086273523
pred 1 percent 0.2169598556608029
learning rate 6.871947673600004e-06





In [15]:
import numpy as np

model.load_state_dict(best_model)
with torch.no_grad(): # let pytorch know that no gradient should be computed
    model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
    predictions = []
    for i in range(0,len(test_idx),batch_size):
        idx = test_idx[i:i+batch_size] 
        vecs = torch.tensor(all_input_vecs[idx])
        if gpu: vecs = vecs.to('cuda')
        y_pred = model(vecs).cpu().detach().numpy()
        pred_labels = [np.argmax(entry) for entry in y_pred]
        predictions += pred_labels
    pre, rec, f1, _ = precision_recall_fscore_support(np.array(labels)[test_idx], predictions,average='macro')
    print('\n--->  the macro-F1 on test set is {}'.format(f1))
    print('pred 1 percent', np.sum(predictions)/len(predictions))


--->  the macro-F1 on test set is 0.7126365010256823
pred 1 percent 0.2452667814113597


In [None]:
# if you want to save your trained model, you may uncomment the line below
# torch.save(best_model, 'bert_pgd_base_wTitle.state_dict')