<a href="https://colab.research.google.com/github/mike-jansen/misinformation-analysis-capstone/blob/main/bert_misinformation_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Install the packages imported in the next cell: pytorch-transformers supplies the BERT
# tokenizer/model/config classes, and keras supplies pad_sequences
!pip install pytorch-transformers
!pip install keras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, AdamW
from pytorch_transformers import BertModel, BertConfig
from keras.utils import pad_sequences

import torch.nn as nn

In [None]:
from google.colab import drive
drive.mount('/content/drive/')
file_path = '/content/drive/My Drive/Colab Notebooks/data/research/'

# Read the three headerless, tab-separated files and stack them into a single frame so they
# can all be preprocessed at the same time.
train_df = pd.read_csv(file_path + 'train.tsv', sep='\t', header=None)
test_df = pd.read_csv(file_path + 'test.tsv', sep='\t', header=None)
valid_df = pd.read_csv(file_path + 'valid.tsv', sep='\t', header=None)
# ignore_index=True renumbers the rows 0..n-1; without it each file keeps its own 0-based
# index and the combined frame ends up with duplicate row labels.
df = pd.concat([train_df, test_df, valid_df], ignore_index=True)

# create the column headers (given by datasets README)
df.columns = ['id', 'label', 'statement', 'subject(s)', 'speaker', 'job title', 'state info', 'affiliation', 'barely true', 'false', 'half true', 'mostly true', 'pants on fire', 'context']

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# sanity check: (rows, columns) of the combined train/test/valid frame
df.shape

(12791, 14)

In [None]:
# preview the first rows to confirm the assigned column names line up with the data
df.head()

Unnamed: 0,id,label,statement,subject(s),speaker,job title,state info,affiliation,barely true,false,half true,mostly true,pants on fire,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


# Preprocessing

For preprocessing, we need to:
1. add special tokens to the text that BERT needs to understand the inputs
2. tokenize the inputs
3. create attention masks for padding (input to BERT)
4. split the data

In [None]:
# Ratings are stored as strings; map each one to an ordinal integer
# (0 = 'pants-fire' ... 5 = 'true') so the label can be used as a numeric regression target.
mapping = {
    'pants-fire': 0,
    'false': 1,
    'barely-true': 2,
    'half-true': 3,
    'mostly-true': 4,
    'true': 5,
}
# Replace every rating according to the mapping. The explicit astype pins the column to int64
# rather than relying on replace() to infer a numeric dtype, and it raises immediately if a
# label that is not covered by the mapping is still a string afterwards.
df['label'] = df['label'].replace(mapping).astype('int64')

In [None]:
# For BERT to work, special tokens must be added at the start and end of each text sample:
#   [CLS] = classification token, placed at the beginning of the sequence
#   [SEP] = separator token, marks the end of a sentence (and the boundary between two
#           sentences when a pair is fed in)
# These notes are written as comments (not a bare string literal) so the cell does not echo
# them back as its output.

# create new arrays holding all the sentences and all the labels
sentences = df.statement.values
labels = df.label.values

# add the special tokens to each sentence
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]

'\n  [CLS] = classification, used at beginning of sequence\n  [SEP] = separator, used to distinguish different tokens, allows you to compare sentences\n'

## Inputs
To use BERT, the input must be converted to tokens that BERT understands. This can be done using the BERT tokenizer.

In [None]:
# load the pretrained, lower-casing tokenizer that belongs to bert-base-uncased
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# run the tokenizer over every sentence, giving one list of tokens per sentence
tokenized_sentences = list(map(tokenizer.tokenize, sentences))

# show the tokens of the first sentence as an example
print(tokenized_sentences[0])

['[CLS]', 'says', 'the', 'annie', '##s', 'list', 'political', 'group', 'supports', 'third', '-', 'trim', '##ester', 'abortion', '##s', 'on', 'demand', '.', '[SEP]']


In [None]:
MAX_TOKEN_LENGTH = 128


def _truncate_keep_last(tokens, max_len=MAX_TOKEN_LENGTH):
    """Shorten `tokens` to at most `max_len` items while keeping the final token.

    Every sentence was built with a trailing [SEP], so plainly cutting off the tail of an
    over-long sequence would drop that closing token. Sequences that already fit are
    returned unchanged.
    """
    if len(tokens) <= max_len:
        return tokens
    return tokens[:max_len - 1] + tokens[-1:]


# convert each (length-limited) token list into the vocabulary ids BERT consumes
input_ids = [
    tokenizer.convert_tokens_to_ids(_truncate_keep_last(tokens))
    for tokens in tokenized_sentences
]
# input_ids = [tokenizer.encode(sentence) for sentence in sentences]  # same thing, skipping initial tokenize() call

# pad the input ids at the end so every row has exactly MAX_TOKEN_LENGTH entries
input_ids = pad_sequences(input_ids, maxlen=MAX_TOKEN_LENGTH, dtype="long", truncating="post", padding="post")





After creating padding for the tokens, we should create an attention mask, which is a sequence of 1s and 0s that BERT takes as an input.

The attention mask maps to the input_ids 1:1. A 1 in the attention mask means the corresponding id in input_ids is a real input token, and a 0 means the corresponding id in input_ids is a padding token.


In [None]:
# Build one mask per padded sequence: 1 where the id is greater than 0 (a real token) and
# 0 elsewhere (the zero-valued positions appended by padding).
# The loop variables are deliberately not called `input` / `id`, which would shadow the
# Python builtins of the same name for every later cell.
attention_masks = [
    [int(token_id > 0) for token_id in sequence]
    for sequence in input_ids
]

print(attention_masks[0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


#### Splitting the data and converting to tensors

In [None]:
# Split the input ids, attention masks and labels in a single call. train_test_split applies
# the same shuffled row selection to every array it is given, so the three stay row-aligned
# by construction instead of depending on two separate calls sharing a random_state.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels, random_state=2018, test_size=0.2
)

In [None]:
# convert all the data into tensors so BERT can use them
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

# The labels are regression targets, so store them as floats: a mean-squared-error loss
# compares them against the model's floating-point output and rejects integer targets.
train_labels = torch.tensor(train_labels, dtype=torch.float)
validation_labels = torch.tensor(validation_labels, dtype=torch.float)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [None]:
# number of samples handed to the model per step
batch_size = 32

# Bundle ids, masks and labels into datasets and expose them through DataLoaders, which
# yield one mini-batch of (input ids, attention mask, label) at a time.

# training set: batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# validation set: batches are read in their stored order
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    dataset=validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)


# Training BERT

In [None]:
# Start from the pretrained bert-base-uncased configuration and request a single output
# value, i.e. a regression score rather than per-class logits.
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = 1  # 1 for regression

# BertForSequenceClassification = pretrained BERT with a linear layer on top
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
# swap the last layer for a new hidden_size -> 1 linear layer used for regression
# NOTE(review): with num_labels = 1 the loaded model presumably already has a layer of this
# shape, so this mainly re-initialises it -- confirm whether that is intended
model.classifier = nn.Linear(in_features=config.hidden_size, out_features=1)

100%|██████████| 433/433 [00:00<00:00, 166816.72B/s]
100%|██████████| 440473133/440473133 [00:15<00:00, 28530562.41B/s]


In [None]:
# get a list of all (name, parameter) pairs that the model has
param_optimizer = list(model.named_parameters())

# Name fragments of the parameters that shouldn't decay: all biases and the LayerNorm
# weights. (The PyTorch BERT modules name these 'bias' / 'LayerNorm.weight'; the
# 'gamma' / 'beta' names come from the original TensorFlow checkpoint and match nothing here.)
no_decay = ['bias', 'LayerNorm.weight']

# Group the parameters into two categories, the first with decay and the second without.
# The per-group key must be 'weight_decay' -- that is the option AdamW reads; any other key
# (such as 'weight_decay_rate') is ignored and leaves the optimizer's default in effect.
optimizer_grouped_parameters = [
    # decay group
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    # no decay group
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]


# Uncompleted...