In [1]:
import csv
import json
import os
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

warnings.filterwarnings('ignore')

In [2]:

def read_data(path, json_dir='data/jsons/'):
    """Load article texts and bias labels for one dataset split.

    The TSV file at ``path`` is read row by row; the first column of every
    row is treated as an article ID, and the article itself is loaded from
    ``<json_dir>/<ID>.json``.

    Args:
        path: Path to a tab-separated index file. A row whose first column is
            the literal string ``'ID'`` is treated as a header and skipped,
            as are completely blank rows.
        json_dir: Directory holding one ``<ID>.json`` file per article.
            Defaults to the location the notebook has always used.

    Returns:
        A tuple ``(X, Y)`` of two equally long lists: ``X`` holds each
        article's ``'content'`` value and ``Y`` its ``'bias'`` value, in the
        order the IDs appear in the TSV file.

    Raises:
        FileNotFoundError: If the TSV file or a referenced JSON file is missing.
        KeyError: If an article JSON lacks the ``'content'`` or ``'bias'`` key.
    """
    X = []
    Y = []
    print('Reading articles...')

    # Context managers guarantee both files are closed even when a JSON file
    # is missing or malformed. The encoding is pinned so the result does not
    # depend on the platform default (assumes the data files are UTF-8 —
    # TODO confirm). newline='' is what the csv module asks for.
    with open(path, newline='', encoding='utf-8') as tsv_file:
        for row in csv.reader(tsv_file, delimiter="\t"):
            # csv.reader yields [] for blank lines; skip those and the header.
            if not row or row[0] == 'ID':
                continue

            json_path = os.path.join(json_dir, row[0] + '.json')
            with open(json_path, encoding='utf-8') as json_file:
                data = json.load(json_file)

            # Look both fields up before appending so X and Y can never end
            # up with different lengths if one key is missing.
            bias, content = data['bias'], data['content']
            Y.append(bias)
            X.append(content)

    print('done')

    return X, Y


In [3]:
# TSV index files for the three splits; each is handed to read_data().
train_data_path = "data/splits/media/train.tsv"
val_data_path = "data/splits/media/valid.tsv"
test_data_path = "data/splits/media/test.tsv"

# Training split: raw lists -> column dict -> DataFrame.
print('Load training dataset')
X_train, Y_train = read_data(train_data_path)
dict_train = dict(content=X_train, bias=Y_train)
df_train = pd.DataFrame(dict_train)

# Validation split.
print('Load validation dataset')
X_val, Y_val = read_data(val_data_path)
dict_val = dict(content=X_val, bias=Y_val)
df_val = pd.DataFrame(dict_val)

# Test split.
print('Load test dataset')
X_test, Y_test = read_data(test_data_path)
dict_test = dict(content=X_test, bias=Y_test)
df_test = pd.DataFrame(dict_test)

# Optional: any of the frames can be cached next to the split files, e.g.
# df_train.to_csv('data/splits/media/train_data.tsv', index=False)


Load training dataset
Reading articles...
done
Load validation dataset
Reading articles...
done
Load test dataset
Reading articles...
done


In [4]:
# Give all three splits the same generic column names:
# 'content' -> DATA_COLUMN (article text), 'bias' -> LABEL_COLUMN (class id).
# rename() returns a new frame, so df_train / df_val / df_test stay untouched.
train = df_train[['content', 'bias']].rename(
    columns={'content': 'DATA_COLUMN', 'bias': 'LABEL_COLUMN'}
)

val = df_val[['content', 'bias']].rename(
    columns={'content': 'DATA_COLUMN', 'bias': 'LABEL_COLUMN'}
)

test = df_test[['content', 'bias']].rename(
    columns={'content': 'DATA_COLUMN', 'bias': 'LABEL_COLUMN'}
)

In [5]:
# Preview the first rows of the training split (text + integer bias label).
train.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,President Trump and Senate Minority Leader Chu...,2
1,“ The 360 ” shows you diverse perspectives on ...,1
2,LOS ANGELES — Actress Rosario Dawson took the ...,0
3,President Donald Trump said on Friday that he ...,1
4,Washington ( CNN ) Donald Trump became the 45t...,0


In [6]:
# Preview the first rows of the validation split.
val.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,This education gave him a moral vision that em...,2
1,8:54 p.m. | Updated WASHINGTON — John E. Bryso...,0
2,"In a tweet on Wednesday night , Mr. Trump urge...",0
3,The commerce post had been vacant for nearly a...,0
4,Mississippi has the largest percentage of poor...,0


In [7]:
# Preview the first rows of the test split.
test.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,Has the media gotten worse ? Or am I just grou...,2
1,Fort Worth ISD employee Yolanda Cintron assist...,1
2,With Congress heading toward another governmen...,0
3,The White House Commission on Combating Drug A...,0
4,"In 2004 , after the open secularism of John Ke...",2


In [14]:
# Class balance check: number of training articles per bias label.
train.LABEL_COLUMN.value_counts()

2    10241
0     8861
1     7488
Name: LABEL_COLUMN, dtype: int64

In [20]:
# Class name -> integer id. Its size sets the number of output labels of the
# classifier, and its inverse is used to print per-class accuracy.
# NOTE(review): presumably these ids match the 'bias' values stored in the
# article JSON files — confirm against the dataset documentation.
label_dict = dict(right=2, center=1, left=0)
label_dict

{'right': 2, 'center': 1, 'left': 0}

In [8]:
# Load the pretrained tokenizer that belongs to the 'bert-base-uncased'
# checkpoint. do_lower_case=True is consistent with the "uncased" checkpoint
# name, so input text is lower-cased before tokenization.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

In [42]:
# Fixed sequence length passed as `max_length` to every tokenizer call below.
max_length = 128

In [65]:
def _encode_texts(texts):
    """Tokenize a sequence of article strings into fixed-length model inputs.

    Every text is padded to exactly ``max_length`` tokens and explicitly
    truncated when longer. ``padding='max_length'`` is the current spelling
    of the deprecated ``pad_to_max_length=True``, and ``truncation=True``
    states the truncation that the old call only performed implicitly.

    Args:
        texts: Iterable of raw article strings (e.g. a DataFrame column's
            ``.values``); converted to a plain list before tokenization.

    Returns:
        The tokenizer's batch encoding, with ``'input_ids'`` and
        ``'attention_mask'`` as PyTorch tensors (``return_tensors='pt'``).
    """
    return tokenizer.batch_encode_plus(
        list(texts),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )


# Tokenize the three splits with identical settings.
encoded_data_train = _encode_texts(train.DATA_COLUMN.values)
encoded_data_val = _encode_texts(val.DATA_COLUMN.values)
encoded_data_test = _encode_texts(test.DATA_COLUMN.values)

# Pull out the tensors used to build the TensorDatasets: token ids,
# attention masks, and the integer class labels.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train.LABEL_COLUMN.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val.LABEL_COLUMN.values)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test.LABEL_COLUMN.values)

In [66]:
# Bundle each split as (input_ids, attention_mask, label) triples; downstream
# code relies on this positional order when unpacking a batch.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [67]:
# Number of examples in the train, validation and test datasets, one per line.
print(len(dataset_train), len(dataset_val), len(dataset_test), sep='\n')

1300

In [46]:
# Sanity check: show the (input_ids, attention_mask, labels) tensors that
# make up the validation dataset.
dataset_val.tensors

(tensor([[  101,  2023,  2495,  ...,  1012,  2151,   102],
         [  101,  1022,  1024,  ..., 10976, 10727,   102],
         [  101,  1999,  1037,  ...,  2055,  2010,   102],
         ...,
         [  101,   100,  2003,  ..., 13751,  2050,   102],
         [  101,  5205,  2957,  ...,  2001,  6260,   102],
         [  101,  1996,  2877,  ..., 12400,  2018,   102]]),
 tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 tensor([2, 0, 0,  ..., 0, 0, 0]))

In [68]:
# Load the pretrained 'bert-base-uncased' weights into a sequence-classification
# model with one output label per entry in label_dict. Attention maps and
# hidden states are not requested, so the forward pass returns only what the
# training and evaluation code indexes (loss and logits).
model = BertForSequenceClassification.from_pretrained(
                                      'bert-base-uncased', 
                                      num_labels = len(label_dict),
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [69]:
# NOTE(review): batch_size=1 means one optimizer step per article, which makes
# an epoch very long — confirm this is intentional (e.g. a GPU-memory limit).
batch_size = 1

# Training batches are drawn in random order each epoch.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Evaluation loaders iterate in dataset order: shuffling brings no benefit when
# no gradients are taken, and a fixed order keeps the returned predictions
# aligned row-for-row with the `val` / `test` DataFrames.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size
)

dataloader_test = DataLoader(
    dataset_test,
    sampler=SequentialSampler(dataset_test),
    batch_size=batch_size
)

In [70]:
# Use PyTorch's own AdamW: the implementation shipped with `transformers` is
# deprecated in favour of it. weight_decay is pinned to 0.0 because the
# PyTorch default (0.01) differs from the 0.0 default of the old
# implementation, and the regularisation should not change silently.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8,
    weight_decay = 0.0
)

In [71]:
# Number of full passes over the training set.
epochs = 2

# Linear learning-rate schedule with no warm-up phase. The total step count is
# one scheduler step per training batch per epoch, matching the training loop,
# which calls scheduler.step() once per batch.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)

In [72]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of predicted classes against true labels.

    Args:
        preds: 2-D array of per-class scores, one row per example; the
            predicted class is the arg-max along axis 1.
        labels: Array of true class ids; flattened before scoring.

    Returns:
        The value of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

In [73]:
def accuracy_per_class(preds, labels):
    """Print, for every class present in ``labels``, how many were predicted correctly.

    Args:
        preds: 2-D array of per-class scores, one row per example; the
            predicted class is the arg-max along axis 1.
        labels: Array of true class ids; flattened before use.

    Output is written to stdout as ``Class: <name>`` followed by
    ``Accuracy:<correct>/<total>`` and a blank line, with class names looked
    up through the module-level ``label_dict``.
    """
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # Boolean mask selecting the examples whose true label is this class.
        in_class = actual == class_id
        n_correct = int(np.sum(predicted[in_class] == class_id))
        n_total = int(np.sum(in_class))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy:{n_correct}/{n_total}\n')

In [74]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) so training runs are repeatable.
# NOTE(review): the model is instantiated and the samplers are created in
# cells that appear earlier in the notebook, so anything randomly initialised
# there is not covered by this seed — consider moving this cell above them.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [75]:
# Run on the GPU when one is available, otherwise fall back to the CPU, and
# move the model's parameters to that device. Batches are moved to the same
# device inside the training and evaluation loops.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [76]:
# Number of batches per pass for the train, validation and test loaders,
# one per line.
print(len(dataloader_train), len(dataloader_val), len(dataloader_test), sep='\n')

1300

In [77]:
def evaluate(dataloader_val):
    """Run the module-level ``model`` over a data loader without gradient tracking.

    Args:
        dataloader_val: Iterable with a length that yields batches of
            ``(input_ids, attention_mask, labels)`` tensors.

    Returns:
        A tuple ``(loss_avg, predictions, true_vals)``: the mean of the
        per-batch losses, the logits of all batches concatenated along
        axis 0 as a NumPy array, and the matching true labels as a NumPy
        array.
    """
    # Switch the model to evaluation mode before the forward passes.
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in tqdm(dataloader_val):
        on_device = tuple(tensor.to(device) for tensor in batch)
        labels = on_device[2]

        # Forward pass only; no autograd graph is built.
        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=labels)

        # With labels supplied, outputs[0] is the loss and outputs[1] the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    return (mean_loss,
            np.concatenate(logit_chunks, axis=0),
            np.concatenate(label_chunks, axis=0))

In [None]:
# Create the checkpoint directory up front so torch.save() cannot fail on a
# missing folder after a whole epoch of training has already been spent.
os.makedirs('Models', exist_ok=True)

for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        # Clear gradients left over from the previous step.
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        # Passing labels makes the model return the loss as outputs[0].
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the raw per-batch loss. It used to be divided by len(batch),
        # but `batch` is the (input_ids, attention_mask, labels) tuple, so
        # that divided by 3 rather than by a batch size and under-reported
        # the loss relative to the epoch average printed below.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # Checkpoint the weights after every epoch.
    torch.save(model.state_dict(), f'Models/BERT_ft_Epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    # Mean of the per-batch training losses for this epoch.
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    # Validate after each epoch; `predictions` and `true_vals` stay available
    # to later cells with the values from the final epoch.
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/26590 [00:00<?, ?it/s]

In [None]:
# Per-class hit counts on the validation set. `predictions` and `true_vals`
# are the values left by the last evaluate() call inside the training loop,
# i.e. the final epoch's validation results.
accuracy_per_class(predictions, true_vals)

In [None]:
# Final evaluation on the held-out test split: mean loss, weighted F1 and
# per-class hit counts, using the model state left by the training loop.
test_loss, test_predictions, true_tests = evaluate(dataloader_test)
test_f1 = f1_score_func(test_predictions, true_tests)
print(f'Test loss: {test_loss}')
print(f'Test F1 Score (weighted): {test_f1}')
accuracy_per_class(test_predictions, true_tests)