In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip3 install transformers

In [3]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4
import time
import datetime
import random

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Transformers
from transformers import get_linear_schedule_with_warmup
from transformers import (
    BertModel,
    BertForSequenceClassification,
                          BertTokenizer,
                          RobertaForSequenceClassification,
                          RobertaTokenizer,
                         AdamW)

In [4]:
def encode_dataframe(statement_col, target_col, unpack=False):
    # Tokenize statements
    bert_encoded_dict = statement_col.apply(lambda sent: bert_tokenizer.encode_plus(
                                      sent,                      # Sentence to encode.
                                      add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                      max_length = 120,           # Pad & truncate all sentences.
                                      pad_to_max_length = True,
                                      return_attention_mask = True,   # Construct attn. masks.
                                      return_tensors = 'pt',     # Return pytorch tensors.
                                      truncation = True
                                ))
    bert_input_ids = torch.cat([item['input_ids'] for item in bert_encoded_dict], dim=0)
    bert_attention_masks = torch.cat([item['attention_mask'] for item in bert_encoded_dict], dim=0)

    # Format targets
    labels = torch.tensor(target_col)
    sentence_ids = torch.tensor(range(len(target_col)))

    # Combine the training inputs into a TensorDataset
    bert_dataset = TensorDataset(sentence_ids, bert_input_ids, bert_attention_masks, labels)

    # Remove indices
    trial_dataset = index_remover(bert_dataset)

    if unpack:
        return bert_input_ids, bert_attention_masks, labels
    else:
        return trial_dataset

def index_remover(tensordata):
    input_ids = []
    attention_masks = []
    labels = []
   
    for a,b,c,d in tensordata:
        input_ids.append(b.tolist())
        attention_masks.append(c.tolist())
        labels.append(d.tolist())
        
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)
    
    final_dataset = TensorDataset(input_ids, attention_masks, labels)
    return final_dataset

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [5]:
import torch.nn as nn
from transformers import AutoModel
class FakeBERT(nn.Module):
    def __init__(self):
        super(FakeBERT, self).__init__()
        
        self.base_model = AutoModel.from_pretrained('bert-base-uncased')

        # Layer 1: Conv1D + Maxpool
        self.conv_1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3, stride=1)
        self.sigm_1 = nn.ReLU()
        self.pool_1 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Layer 6: Fully Connected Layer 
        self.full_6 = nn.Linear(153,32)
        self.sigm_6 = nn.Sigmoid()
        
        # Layer 7: Fully Connected Layer 
        self.full_7 = nn.Linear(32,2)
        self.soft_7 = nn.Softmax()

    def forward(self, input_ids, attn_mask):
        bert_output = self.base_model(input_ids, attention_mask=attn_mask)
        outputs = self.pool_1(self.sigm_1(self.conv_1(bert_output['pooler_output'].unsqueeze(1))))
        outputs = self.sigm_6(self.full_6(outputs))
        outputs = self.soft_7(self.full_7(outputs))
        return outputs, bert_output



In [None]:
# Device
device = torch.device("cuda:0")

# BERT
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [14]:
# Load in dataset

# df_train = pd.read_csv("/content/drive/MyDrive/fake-news-explainability/fake_news_train.csv")
# df_train = df_train.dropna(subset=['text']).reset_index(drop=True)
# df_train['target'] = df_train['label']
# df_train_encode = encode_dataframe(df_train['text'], df_train['target'])
# torch.save(df_train_encode,
#            "/content/drive/MyDrive/fake-news-explainability/fake_news_encoded.pt")

df_train_encode = torch.load("/content/drive/MyDrive/fake-news-explainability/fake_news_encoded.pt")

In [15]:
# Load data into dataloader
batch_size = 32
bert_train_dataloader = DataLoader(
    df_train_encode,  # The training samples.
    batch_size = batch_size # Trains with this batch size.
    )

In [16]:
# Load model
bert_model = FakeBERT().to(device)
bert_training_stats = []
epochs = 3
total_steps = len(bert_train_dataloader) * epochs
loss_func = nn.CrossEntropyLoss()

# Optimizer
bert_optimizer = AdamW(bert_model.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Create the learning rate scheduler.
bert_scheduler = get_linear_schedule_with_warmup(bert_optimizer, 
                                                 num_warmup_steps = 0, # Default value in run_glue.py
                                                 num_training_steps = total_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    bert_model.train()

    for step, batch in enumerate(bert_train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(bert_train_dataloader), elapsed))

        # Unpack batch
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Zero grads
        bert_model.zero_grad()        

        # Forward pass
        output, bert_output = bert_model(b_input_ids, b_input_mask)
        
        # Accumulate loss
        loss = loss_func(output.squeeze(1), 
                         b_labels)
        
        # Backpropagate
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The bert_optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        bert_optimizer.step()

        # Update the learning rate.
        # bert_scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(bert_train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # Record all statistics from this epoch.
    bert_training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            # 'Valid. Loss': avg_val_loss,
            # 'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            # 'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [19]:
# Generate predictions
results = []
with torch.no_grad():
    for step, batch in enumerate(bert_train_dataloader):
        # Unpack batch
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)       

        # Forward pass
        output, bert_output = bert_model(b_input_ids, b_input_mask)
        results.append(output)



In [49]:
# Calculate accuracy
results_ = torch.vstack(results).squeeze(1)
sum(torch.argmax(results_.cpu(), axis=1) == df_train_encode.tensors[2])/len(df_train_encode.tensors[2])

tensor(0.9913)

In [51]:
# Save model
torch.save(bert_model.state_dict(),
           "/content/drive/MyDrive/fake-news-explainability/bert_model_fake_news_kaggle")
torch.save(bert_model, 
           "/content/drive/MyDrive/fake-news-explainability/bert_model_fake_news_kaggle_full")


In [None]:
# Load model
model = FakeBERT()
model.load_state_dict(torch.load("/content/drive/MyDrive/fake-news-explainability/bert_model_fake_news_kaggle"))
model.to(device)