In [None]:
# Mount Google Drive inside the Colab runtime; the dataset and checkpoint
# paths used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip3 install transformers

In [None]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4
import time
import datetime
import random

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Transformers
from transformers import get_linear_schedule_with_warmup
from transformers import (
    BertModel,
    BertForSequenceClassification,
    BertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    AdamW
    )

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of samples whose arg-max prediction equals its label.

    Args:
        preds: 2-D array of per-class scores; the arg-max is taken along axis 1.
        labels: array of integer class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second and then formatted via
    ``datetime.timedelta`` (so durations of a day or more get a day prefix).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
import torch.nn as nn
from transformers import AutoModel
class FakeBERT(nn.Module):
    """BERT encoder followed by a small Conv1d + fully connected classifier head.

    The head consumes BERT's pooled output (treated as a single-channel 1-D
    signal) and produces a probability distribution over two classes.

    NOTE(review): ``forward`` returns softmax probabilities, while the training
    cell feeds them to ``nn.CrossEntropyLoss``, which applies log-softmax
    itself and expects raw logits -- confirm whether that is intended.
    """

    def __init__(self):
        super().__init__()

        self.base_model = AutoModel.from_pretrained('bert-base-uncased')

        # Layer 1: Conv1D + ReLU + Maxpool over the pooled embedding.
        self.conv_1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3, stride=1)
        # The attribute is called "sigm" for historical reasons; the activation is ReLU.
        self.sigm_1 = nn.ReLU()
        self.pool_1 = nn.MaxPool1d(kernel_size=5, stride=5)

        # Layer 6: Fully Connected Layer.
        # 153 input features: a 768-wide pooled vector becomes 766 after the
        # kernel-3 convolution and 153 after the kernel-5/stride-5 max pool.
        self.full_6 = nn.Linear(153, 32)
        self.sigm_6 = nn.Sigmoid()

        # Layer 7: Fully Connected Layer.
        self.full_7 = nn.Linear(32, 2)
        # Normalise over the class dimension explicitly. Without ``dim`` PyTorch
        # falls back to dim=0 for the 3-D (batch, 1, 2) tensor produced here,
        # i.e. it would normalise across the batch instead of across classes.
        self.soft_7 = nn.Softmax(dim=-1)

    def forward(self, input_ids, attn_mask):
        """Run the encoder and the classifier head.

        Args:
            input_ids: token ids passed straight to the base model.
            attn_mask: attention mask passed as ``attention_mask``.

        Returns:
            Tuple ``(outputs, bert_output)``: ``outputs`` holds the class
            probabilities with a singleton channel dimension at index 1, and
            ``bert_output`` is the base model's ``pooler_output`` with the same
            singleton dimension inserted.
        """
        bert_output = self.base_model(input_ids, attention_mask=attn_mask)
        # Add a channel dimension so Conv1d sees (batch, 1, hidden).
        bert_output = bert_output['pooler_output'].unsqueeze(1)
        outputs = self.pool_1(self.sigm_1(self.conv_1(bert_output)))
        outputs = self.sigm_6(self.full_6(outputs))
        outputs = self.soft_7(self.full_7(outputs))
        return outputs, bert_output

def train():
    """Fine-tune the global ``bert_model`` on paired positive/negative batches.

    The function takes no arguments and reads its configuration from notebook
    globals: ``epochs``, ``bert_model``, ``bert_train_pos_dataloader``,
    ``bert_train_neg_dataloader``, ``device``, ``loss_func``,
    ``bert_optimizer`` and ``bert_scheduler``. One summary dict per epoch is
    appended to the global ``bert_training_stats`` list.

    Each step draws one batch from each loader (``zip`` stops at the shorter
    loader), sums the classification loss of both batches and adds a term based
    on the L2 distance between the paired outputs.
    """
    # Measure the total training time for the whole run.
    total_t0 = time.time()

    # For each epoch...
    for epoch_i in range(0, epochs):

        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0.0
        bert_model.train()
        step = 0

        for (batch_pos, batch_neg) in zip(bert_train_pos_dataloader,
                                          bert_train_neg_dataloader):

            # Unpack batches
            b_input_ids_pos  = batch_pos[0].to(device)
            b_input_mask_pos = batch_pos[1].to(device)
            b_labels_pos     = batch_pos[2].to(device)
            b_input_ids_neg  = batch_neg[0].to(device)
            b_input_mask_neg = batch_neg[1].to(device)
            b_labels_neg     = batch_neg[2].to(device)

            # Zero grads
            bert_model.zero_grad()

            # Forward pass
            output_pos, _ = bert_model(b_input_ids_pos, b_input_mask_pos)
            output_neg, _ = bert_model(b_input_ids_neg, b_input_mask_neg)

            # Accumulate loss
            output_pos, output_neg = output_pos.squeeze(1), output_neg.squeeze(1)
            loss = loss_func(output_pos, b_labels_pos)
            loss += loss_func(output_neg, b_labels_neg)

            # The final batches of the two loaders may differ in size, so only
            # compare as many rows as both batches actually have.
            n_pairs = min(output_pos.size(0), output_neg.size(0))
            separation = torch.norm(output_pos[:n_pairs] - output_neg[:n_pairs], p=2, dim=1)
            # NOTE(review): the term is detached, so it changes the reported
            # loss value but contributes no gradient -- confirm this is intended.
            loss += (2 - torch.mean(separation).detach())

            # Track the running loss so the epoch average is meaningful.
            total_train_loss += loss.item()

            if step % 40 == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(bert_train_pos_dataloader), elapsed))
                print(loss)

            # Backpropagate
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            bert_optimizer.step()

            # Update the learning rate.
            bert_scheduler.step()

            # Increment step
            step += 1

        # Average over the batches that were actually processed (zip may stop
        # before the positive loader is exhausted); guard against empty loaders.
        avg_train_loss = total_train_loss / step if step else 0.0

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # Record all statistics from this epoch.
        bert_training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Training Time': training_time,
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# Device: use the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")

# BERT tokenizer matching the 'bert-base-uncased' checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

## Training

In [None]:
# Load the encoded Fake-News Kaggle dataset, one file per class.
# NOTE: torch.load unpickles the file contents, so only load trusted files.
df_pos_encode = torch.load("/content/drive/MyDrive/fake-news-explainability/Data/fake_news_pos_encoded.pt")
df_neg_encode = torch.load("/content/drive/MyDrive/fake-news-explainability/Data/fake_news_neg_encoded.pt")

# One DataLoader per class, both using the same batch size.
batch_size = 32
bert_train_pos_dataloader = DataLoader(dataset=df_pos_encode, batch_size=batch_size)
bert_train_neg_dataloader = DataLoader(dataset=df_neg_encode, batch_size=batch_size)

In [None]:
# Load model
bert_model = FakeBERT().to(device)
bert_training_stats = []
epochs = 2
# train() zips the positive and negative loaders, so every epoch performs as
# many optimizer/scheduler steps as the shorter of the two loaders has batches.
steps_per_epoch = min(len(bert_train_pos_dataloader), len(bert_train_neg_dataloader))
total_steps = steps_per_epoch * epochs
loss_func = nn.CrossEntropyLoss()

# Optimizer
bert_optimizer = AdamW(bert_model.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Create the learning rate scheduler: linear decay to zero over total_steps,
# with no warm-up phase.
bert_scheduler = get_linear_schedule_with_warmup(bert_optimizer, 
                                                 num_warmup_steps = 0, # Default value in run_glue.py
                                                 num_training_steps = total_steps)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Train: run the fine-tuning loop defined in train(), which reads the model,
# data loaders, optimizer and scheduler from the notebook's global variables.
train()

In [None]:
# Save model: persist only the state dict (parameters and buffers) to Drive.
trained_weights = bert_model.state_dict()
torch.save(trained_weights, "/content/drive/MyDrive/fake-news-explainability/Models/bert_model_fake_news_test_kaggle")

## Evaluate

In [None]:
# Load model
bert_model = FakeBERT().to(device)
# map_location lets a checkpoint saved on a GPU runtime be restored on whatever
# device this session uses (including CPU-only runtimes).
bert_model.load_state_dict(
    torch.load("/content/drive/MyDrive/fake-news-explainability/Models/bert_model_fake_news_test_kaggle",
               map_location=device))
# A freshly constructed module is in training mode; switch to eval mode so
# dropout inside the encoder is disabled during inference.
bert_model.eval()

# Load data
df_train_pos_encode = torch.load("/content/drive/MyDrive/fake-news-explainability/Data/fake_news_pos_encoded.pt")
df_train_neg_encode = torch.load("/content/drive/MyDrive/fake-news-explainability/Data/fake_news_neg_encoded.pt")

# Load data into dataloader
batch_size = 32
bert_train_pos_dataloader = DataLoader(
    df_train_pos_encode,  # The samples to evaluate on.
    batch_size = batch_size # Evaluates with this batch size.
    )
bert_train_neg_dataloader = DataLoader(
    df_train_neg_encode,  # The samples to evaluate on.
    batch_size = batch_size # Evaluates with this batch size.
    )

In [None]:
# Generate predictions for the batches of bert_train_pos_dataloader.
# no_grad() only disables gradient tracking; eval() is what switches dropout
# off, so make sure the model is in eval mode before predicting.
bert_model.eval()
results = []
with torch.no_grad():
    for batch in bert_train_pos_dataloader:
        # Unpack batch (labels are not needed for the forward pass)
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        # Forward pass
        output, _ = bert_model(b_input_ids, b_input_mask)
        results.append(output)



In [None]:
# Calculate accuracy on the data served by bert_train_pos_dataloader.
# Take the labels from the dataset loaded in the Evaluate section so this cell
# does not depend on variables left over from the Training section.
labels = df_train_pos_encode.tensors[2]
results_ = torch.vstack(results).squeeze(1)
(torch.argmax(results_.cpu(), dim=1) == labels).float().mean()

tensor(0.5020)

In [None]:
# Generate predictions for the batches of bert_train_neg_dataloader.
# no_grad() only disables gradient tracking; eval() is what switches dropout
# off, so make sure the model is in eval mode before predicting.
bert_model.eval()
results2 = []
with torch.no_grad():
    for batch in bert_train_neg_dataloader:
        # Unpack batch (labels are not needed for the forward pass)
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        # Forward pass
        output, _ = bert_model(b_input_ids, b_input_mask)
        results2.append(output)



In [None]:
# Calculate accuracy on the data served by bert_train_neg_dataloader.
# Use results2 (the predictions for the negative loader); stacking `results`
# here would score the positive-loader predictions against negative labels.
labels = df_train_neg_encode.tensors[2]
results_ = torch.vstack(results2).squeeze(1)
(torch.argmax(results_.cpu(), dim=1) == labels).float().mean()

tensor(0.5010)

In [None]:
# Collapse the per-batch prediction lists into single tensors. The names are
# rebound from list to tensor, so only stack while they are still lists; this
# keeps the cell safe to re-run.
if isinstance(results, list):
    results = torch.vstack(results).squeeze(1)
if isinstance(results2, list):
    results2 = torch.vstack(results2).squeeze(1)

## Results Log
* FakeBERT, trained on the positive and negative classes
  * Accuracy on positive/negative data: tensor(0.9990)
  * Accuracy on fake-news dataset: not yet recorded (TODO)

