# Import Libraries

In [17]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import trange
import random

# Kaggle Dataset - Media Biasness

In [5]:
news_data = pd.read_excel(r"C:\Users\DesignHub\Documents\Monika\Projects\News_polarity_analysis\annotations.xlsx")

In [7]:
news_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,survey_record_id,sentence_id,sentence_group_id,created_at,label,words,factual,group_id,...,outlet,mturk_id,age,gender,education,native_english_speaker,political_ideology,followed_news_outlets,news_check_frequency,survey_completed
0,0,19227,0045473f40ec42a2bd2ca0ee35df0b75,06e9e57e549d4dd48d8ac649ff81fd2e,67,2020-08-12 06:09:53,Non-biased,,Entirely factual,67,...,breitbart,Jonathan Daniel Garavito,29,Male,Bachelor’s degree,Native speaker,7,"['ABC News', 'MSNBC']",Every day,True
1,1,19228,0045473f40ec42a2bd2ca0ee35df0b75,07f2137fd3ae4dd2b5c990b93e5c2a62,67,2020-08-12 06:10:21,Non-biased,,Expresses writer’s opinion,67,...,alternet,Jonathan Daniel Garavito,29,Male,Bachelor’s degree,Native speaker,7,"['ABC News', 'MSNBC']",Every day,True
2,2,19229,0045473f40ec42a2bd2ca0ee35df0b75,10a5e68a84ab4c1a83d861b87c57def9,67,2020-08-12 06:10:35,Non-biased,,Somewhat factual but also opinionated,67,...,msnbc,Jonathan Daniel Garavito,29,Male,Bachelor’s degree,Native speaker,7,"['ABC News', 'MSNBC']",Every day,True
3,3,19231,0045473f40ec42a2bd2ca0ee35df0b75,17e77a14ecaa478fa81cf80e10f92e50,67,2020-08-12 06:10:51,Non-biased,,Expresses writer’s opinion,67,...,breitbart,Jonathan Daniel Garavito,29,Male,Bachelor’s degree,Native speaker,7,"['ABC News', 'MSNBC']",Every day,True
4,4,19232,0045473f40ec42a2bd2ca0ee35df0b75,251232bf6ddf4bcbac7fb40a8aec1964,67,2020-08-12 06:11:04,Non-biased,,Expresses writer’s opinion,67,...,federalist,Jonathan Daniel Garavito,29,Male,Bachelor’s degree,Native speaker,7,"['ABC News', 'MSNBC']",Every day,True


In [8]:
news_data.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'survey_record_id', 'sentence_id',
       'sentence_group_id', 'created_at', 'label', 'words', 'factual',
       'group_id', 'text', 'link', 'type', 'topic', 'outlet', 'mturk_id',
       'age', 'gender', 'education', 'native_english_speaker',
       'political_ideology', 'followed_news_outlets', 'news_check_frequency',
       'survey_completed'],
      dtype='object')

In [6]:
news_data.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,sentence_group_id,group_id,age,political_ideology
count,17775.0,17775.0,17775.0,17775.0,17775.0,17775.0
mean,11278.431505,12984.615809,42.912236,42.912236,37.73564,-1.474205
std,6528.730534,6773.907595,24.645737,24.645737,10.986286,6.454495
min,0.0,0.0,1.0,1.0,0.0,-10.0
25%,5440.5,8109.5,21.0,21.0,29.0,-7.0
50%,11364.0,13651.0,43.0,43.0,35.0,-2.0
75%,16936.5,18853.5,64.0,64.0,45.0,4.0
max,22470.0,23566.0,85.0,85.0,71.0,10.0


In [9]:
news_data['label'].value_counts()

Biased        10651
Non-biased     7124
Name: label, dtype: int64

In [10]:
news_data['factual'].value_counts()

Entirely factual                         6034
Expresses writer’s opinion               5945
Somewhat factual but also opinionated    5796
Name: factual, dtype: int64

In [11]:
news_data['text']

0        The transgender effort to suppress any recogni...
1        Radical Virginia Citizens Defense League has o...
2        Miller is the architect of President Donald Tr...
3        The House Democrats’ 1,400-page coronavirus re...
4        A specter is haunting the West; our elites see...
                               ...                        
17770    Oregon first graders could attend gun safety c...
17771    But President Donald Trump and his campaign ad...
17772    The normalization effort included softened rhe...
17773    And so, while demonstrations began to spread a...
17774    House and Senate Democrats are responding to t...
Name: text, Length: 17775, dtype: object

In [12]:
news_data['topic'].value_counts()

environment                              1407
white-nationalism                        1372
gender                                   1339
student-debt                             1332
sport                                    1320
abortion                                 1320
gun-control                              1296
immigration                              1276
coronavirus                              1275
vaccines                                 1275
trump-presidency                         1255
middle-class                             1227
elections-2020                           1159
international-politics-and-world-news     922
Name: topic, dtype: int64

In [13]:
news_data['survey_completed'].value_counts()

True    17775
Name: survey_completed, dtype: int64

In [25]:
news_data["new_label"]=np.where(news_data["label"]=="Biased",1,0)

In [50]:
news_data['new_label'] = news_data['new_label'].astype('int64')

In [51]:
news_data["new_label"].value_counts()

1    10651
0     7124
Name: new_label, dtype: int64

# Using BERT model to fine tune

In [52]:
text = news_data.text.values
labels = news_data.new_label.values

In [19]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

Downloading: 100%|██████████████████████████████████████████████████████████████████| 226k/226k [00:00<00:00, 3.46MB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 28.2kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████| 570/570 [00:00<00:00, 286kB/s]


In [21]:
token_id = []
attention_masks = []
def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 32,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) 
    attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)

In [22]:
token_id.shape

torch.Size([17775, 32])

In [23]:
attention_masks.shape

torch.Size([17775, 32])

In [53]:
labels = torch.tensor(labels)

In [29]:
labels.shape

torch.Size([17775])

In [70]:
val_ratio = 0.3
batch_size = 16
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

In [71]:
# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

In [72]:
# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [73]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 3e-5,
                              eps = 1e-08
                              )

# Run on GPU
#model()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [74]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [75]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
    '''
    Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
    '''
    preds = np.argmax(preds, axis = 1).flatten()
    labels = labels.flatten()
    tp = b_tp(preds, labels)
    tn = b_tn(preds, labels)
    fp = b_fp(preds, labels)
    fn = b_fn(preds, labels)
    b_accuracy = (tp + tn) / len(labels)
    b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
    b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
    b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
    return b_accuracy, b_precision, b_recall, b_specificity

# Model Training

In [76]:
def train_model(epochs):
    for _ in trange(epochs, desc = 'Epoch'):

        # ========== Training ==========

        # Set model to training mode
        model.train()

        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            #print(batch)
            optimizer.zero_grad()
            # Forward pass
            train_output = model(b_input_ids, 
                                 token_type_ids = None, 
                                 attention_mask = b_input_mask, 
                                 labels = b_labels)
            # Backward pass
            train_output.loss.backward()
            optimizer.step()
            # Update tracking variables
            tr_loss += train_output.loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        # ========== Validation ==========

        # Set model to evaluation mode
        model.eval()

        # Tracking variables 
        val_accuracy = []
        val_precision = []
        val_recall = []
        val_specificity = []

        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
              # Forward pass
              eval_output = model(b_input_ids, 
                                  token_type_ids = None, 
                                  attention_mask = b_input_mask)
            logits = eval_output.logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            # Calculate validation metrics
            b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
            val_accuracy.append(b_accuracy)
            # Update precision only when (tp + fp) !=0; ignore nan
            if b_precision != 'nan': val_precision.append(b_precision)
            # Update recall only when (tp + fn) !=0; ignore nan
            if b_recall != 'nan': val_recall.append(b_recall)
            # Update specificity only when (tn + fp) !=0; ignore nan
            if b_specificity != 'nan': val_specificity.append(b_specificity)

        print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
        print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
        print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
        print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
        print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

In [77]:
epochs = 2
train_model(epochs)

Epoch:  50%|█████████████████████████████████████▌                                     | 1/2 [32:25<32:25, 1945.87s/it]


	 - Train loss: 0.6264
	 - Validation Accuracy: 0.6683
	 - Validation Precision: 0.6777
	 - Validation Recall: 0.8569
	 - Validation Specificity: 0.3935



Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 2/2 [1:02:45<00:00, 1882.59s/it]


	 - Train loss: 0.5816
	 - Validation Accuracy: 0.6722
	 - Validation Precision: 0.7189
	 - Validation Recall: 0.7431
	 - Validation Specificity: 0.5653






# Model Inference

In [82]:
new_sentence = 'Indian people are smart in thier work'

# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

# Forward pass, calculate logit predictions
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = 'Biased' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Unbiased'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

Input Sentence:  Indian people are smart in thier work
Predicted Class:  Biased
