In [None]:
#import os
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import torch

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

import torch.optim as optim

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

# Run on the first CUDA GPU when one is visible, otherwise on the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda', index=0)

In [None]:
# Load the raw articles, discard the leftover index column written into the CSV,
# and add a combined "headline + body" column used as the model input later on.
news = (
    pd.read_csv("./real-and-fake-news-dataset/news.csv")
    .drop(columns='Unnamed: 0')
    .assign(titletext=lambda frame: frame['title'] + " " + frame['text'])
)
news.head()

Unnamed: 0,title,text,label,titletext
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,You Can Smell Hillary’s Fear Daniel Greenfield...
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,Watch The Exact Moment Paul Ryan Committed Pol...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,Kerry to go to Paris in gesture of sympathy U....
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,Bernie supporters on Twitter erupt in anger ag...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,The Battle of New York: Why This Primary Matte...


In [None]:
# slice: keep only the first 500 articles so the rest of the notebook runs quickly.
# .copy() detaches the subset from the full frame; the cells below assign new
# values into its columns, and doing that on a bare head() slice makes pandas
# emit SettingWithCopyWarning.
news = news.head(500).copy()

In [None]:
# Clip every text column to its first 128 characters.
# NOTE(review): this is a character limit, not a token limit — confirm that is the intended cap.
for text_column in ('titletext', 'title', 'text'):
    news[text_column] = news[text_column].str.slice(0, 128)

In [None]:
# Replace the string labels with integer category codes
# (in the frames displayed in this notebook FAKE becomes 0 and REAL becomes 1).
label_categories = news['label'].astype('category')
news['label'] = label_categories.cat.codes

In [None]:
news.head()

Unnamed: 0,title,text,label,titletext
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,You Can Smell Hillary’s Fear Daniel Greenfield...
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,Kerry to go to Paris in gesture of sympathy U....
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,Bernie supporters on Twitter erupt in anger ag...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,The Battle of New York: Why This Primary Matte...


In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test/valid membership (and the CSVs written from it)
# is identical on every Restart & Run All.
SPLIT_SEED = 42

# 70% train / 30% held out, stratified on the label to preserve the class balance.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    news[['title', 'text', 'titletext']],
    news['label'],
    stratify=news['label'],
    test_size=0.3,
    random_state=SPLIT_SEED,
)

# Halve the held-out 30% into equally sized test and validation sets.
# A separate hold-out name (instead of reusing X_test) keeps this cell idempotent.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout,
    y_holdout,
    stratify=y_holdout,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [None]:
# Report the feature and target shapes of each split.
for split_name, split_features, split_target in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
    ("Valid", X_valid, y_valid),
):
    print(f"{split_name} Size: {split_features.shape} {split_target.shape}")

Train Size: (350, 3) (350,)
Test Size: (75, 3) (75,)
Valid Size: (75, 3) (75,)


In [None]:
# Re-attach the targets as a `label` column so each split can be written as one CSV.
# .assign builds a new frame instead of writing into the objects returned by
# train_test_split (which pandas flags with SettingWithCopyWarning), and
# .values drops the index so the labels are matched by position.
X_train = X_train.assign(label=y_train.values)
X_test = X_test.assign(label=y_test.values)
X_valid = X_valid.assign(label=y_valid.values)

In [None]:
# Persist each split next to the raw data, without the pandas index column,
# so TabularDataset can read them back.
for split_frame, csv_path in (
    (X_train, "./real-and-fake-news-dataset/train.csv"),
    (X_test, "./real-and-fake-news-dataset/test.csv"),
    (X_valid, "./real-and-fake-news-dataset/valid.csv"),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Model parameter
MAX_SEQ_LEN = 128
# Vocabulary ids of BERT's padding and unknown tokens, looked up from the tokenizer.
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Labels are already integers, so no vocabulary is built for them.
label_field = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    dtype=torch.int,
)

# Text is converted straight to BERT token ids by the tokenizer and padded with
# PAD_INDEX to a fixed length of MAX_SEQ_LEN.
text_field = Field(
    use_vocab=False,
    tokenize=tokenizer.encode,
    lower=False,
    include_lengths=False,
    batch_first=True,
    fix_length=MAX_SEQ_LEN,
    pad_token=PAD_INDEX,
    unk_token=UNK_INDEX,
)

# Column order must match the CSV files: title, text, titletext, label.
fields = [
    ('title', text_field),
    ('text', text_field),
    ('titletext', text_field),
    ('label', label_field),
]

Note:
<span style='color:Red'>To use the BERT tokenizer with TorchText, set use_vocab=False and tokenize=tokenizer.encode. This tells TorchText not to build a vocabulary from our dataset, and to use the pre-trained BERT tokenizer and its word-to-index mapping instead.</span>

In [None]:
%%time

# TabularDataset
# Read the three CSV splits written earlier, skipping their header rows and
# parsing the columns with the `fields` definitions.
train, valid, test = TabularDataset.splits(
    path="./real-and-fake-news-dataset",
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    format='CSV',
    fields=fields,
    skip_header=True,
)

CPU times: user 428 ms, sys: 0 ns, total: 428 ms
Wall time: 427 ms


In [None]:
# Iterators
#
# Training iterator: sort=False is deliberate. With sort=True the legacy torchtext
# iterator orders the whole dataset by sort_key and never shuffles, so every epoch
# replays the same length-ordered batches. With sort=False the BucketIterator still
# groups examples of similar length but shuffles the batch order each epoch.
train_iter = BucketIterator(train, batch_size=16, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=False, sort_within_batch=True)
# Validation iterator: evaluation data, so train=False; a fixed sorted order is fine
# because only the average loss over all batches is used.
valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.text),
                            device=device, train=False, sort=True, sort_within_batch=True)
# Test iterator: plain sequential pass in file order.
test_iter = Iterator(test, batch_size=16, device=device, train=False, shuffle=False, sort=False)

In [None]:
# Pull a single batch from the training iterator so its structure can be inspected below.
x = next(iter(train_iter))

In [None]:
x


[torchtext.data.batch.Batch of size 16]
	[.title]:[torch.cuda.LongTensor of size 16x128 (GPU 0)]
	[.text]:[torch.cuda.LongTensor of size 16x128 (GPU 0)]
	[.titletext]:[torch.cuda.LongTensor of size 16x128 (GPU 0)]
	[.label]:[torch.cuda.IntTensor of size 16 (GPU 0)]

In [None]:
# Look at each field of the sampled batch: the token-id tensors of the three
# text fields, followed by the integer labels.
x.text
x.titletext
x.title
x.label

tensor([[  101,  2076,  1996,  ...,     0,     0,     0],
        [  101,  1996,  4883,  ...,     0,     0,     0],
        [  101,  2343, 13857,  ...,     0,     0,     0],
        ...,
        [  101, 18520,  7207,  ...,     0,     0,     0],
        [  101,  1996,  3784,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0]], device='cuda:0')

tensor([[  101,  8096, 24731,  ...,     0,     0,     0],
        [  101, 15802,  2584,  ...,     0,     0,     0],
        [  101,  8112, 14616,  ...,     0,     0,     0],
        ...,
        [  101, 18520, 17727,  ...,     0,     0,     0],
        [  101,  8112,  1010,  ...,     0,     0,     0],
        [  101,  1996,  8115,  ...,     0,     0,     0]], device='cuda:0')

tensor([[  101,  8096, 24731,  ...,     0,     0,     0],
        [  101, 15802,  2584,  ...,     0,     0,     0],
        [  101,  8112, 14616,  ...,     0,     0,     0],
        ...,
        [  101, 18520, 17727,  ...,     0,     0,     0],
        [  101,  8112,  1010,  ...,     0,     0,     0],
        [  101,  1996,  8115,  ...,     0,     0,     0]], device='cuda:0')

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0], device='cuda:0',
       dtype=torch.int32)

In [None]:
class BERT(nn.Module):
    """Thin wrapper around a pre-trained `bert-base-uncased` sequence classifier.

    The wrapped `BertForSequenceClassification` computes the classification loss
    itself when labels are supplied, so `forward` returns that loss together
    with the second element of the encoder output.
    """

    def __init__(self):
        super().__init__()
        options_name = 'bert-base-uncased'
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label):
        """Run the classifier on a batch of token ids.

        Args:
            text: tensor of BERT token ids, padded with the model's pad token id.
            label: tensor of integer class labels, one per row of `text`.

        Returns:
            Tuple `(loss, text_fea)`: the first two elements of the encoder output.
        """
        # Inputs are padded to a fixed length; without a mask BERT would attend to
        # the padding positions as if they were real tokens.
        pad_token_id = self.encoder.config.pad_token_id
        attention_mask = None
        if pad_token_id is not None:
            attention_mask = (text != pad_token_id).long()

        loss, text_fea = self.encoder(text, attention_mask=attention_mask, labels=label)[:2]

        return loss, text_fea

## Functions to Save, Load Checkpoint and Metrics

In [None]:
# Save and Load Functions

def save_checkpoint(save_path, model, valid_loss):
    """Write the model weights and their validation loss to `save_path`.

    Args:
        save_path: destination file; when None nothing is saved.
        model: module whose `state_dict()` is stored under 'model_state_dict'.
        valid_loss: validation loss stored under 'valid_loss'.

    Returns:
        None.
    """
    if save_path is None:
        return
    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    """Restore model weights saved by `save_checkpoint` and return the stored loss.

    Args:
        load_path: checkpoint file; when None nothing is loaded and None is returned.
        model: module that receives the stored 'model_state_dict' (updated in place).

    Returns:
        The 'valid_loss' value stored in the checkpoint, or None when `load_path` is None.
    """
    if load_path is None:
        return

    # NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
    # map_location places the tensors on the notebook's active device.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']

In [None]:
def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Write the recorded loss curves to `save_path`.

    Args:
        save_path: destination file; when None nothing is saved.
        train_loss_list: average training losses, one per evaluation point.
        valid_loss_list: average validation losses, one per evaluation point.
        global_steps_list: global step at which each evaluation happened.

    Returns:
        None.
    """
    if save_path is None:
        return
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    torch.save(state_dict, save_path)
    # This file holds metrics, not model weights, so the log line says so.
    print(f'Metrics saved to ==> {save_path}')


def load_metrics(load_path):
    """Read back the loss curves written by `save_metrics`.

    Args:
        load_path: metrics file; when None nothing is loaded and None is returned.

    Returns:
        Tuple `(train_loss_list, valid_loss_list, global_steps_list)`, or None
        when `load_path` is None.
    """
    if load_path is None:
        return
    # NOTE: torch.load unpickles the file — only load metrics files from a trusted source.
    state_dict = torch.load(load_path, map_location=device)
    # This file holds metrics, not model weights, so the log line says so.
    print(f'Metrics loaded from <== {load_path}')
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']


In [None]:
def _evaluate(model, loader):
    """Return the mean per-batch loss of `model` over `loader`, computed without gradients."""
    total_loss = 0.0
    with torch.no_grad():
        for (title, text, titletext, labels), _ in loader:
            labels = labels.to(device=device, dtype=torch.long)
            titletext = titletext.to(device=device, dtype=torch.long)
            loss, _ = model(titletext, labels)
            total_loss += loss.item()
    return total_loss / len(loader)


def train(model, optimizer, critertion=nn.BCELoss(), train_loader=train_iter, valid_loader=valid_iter,
          num_epochs=5, eval_every=len(train_iter) // 2, file_path="", best_valid_loss=float("Inf")):
    """Fine-tune `model` on the `titletext` field and record the loss curves.

    Every `eval_every` optimizer steps the model is evaluated on `valid_loader`;
    the average training loss since the previous evaluation and the average
    validation loss are printed and recorded. When training finishes the
    recorded curves are written with `save_metrics` to `file_path + 'metrics.pt'`.

    Args:
        model: module called as `model(token_ids, labels)` and returning `(loss, output)`.
        optimizer: optimizer that updates the parameters of `model`.
        critertion: unused — the loss comes from the model's own forward pass.
            The parameter (including its spelling) is kept so existing callers still work.
        train_loader: iterable of training batches, each unpacking as
            `((title, text, titletext, labels), _)`.
        valid_loader: iterable of validation batches with the same layout.
        num_epochs: number of passes over `train_loader`.
        eval_every: number of training steps between evaluations; values below 1
            are treated as 1.
        file_path: prefix prepended to the metrics file name.
        best_valid_loss: starting value for the best validation loss seen so far.

    Returns:
        None.
    """
    # A loader with fewer than two batches makes the default eval_every equal to 0,
    # which would otherwise raise ZeroDivisionError in the modulo below.
    eval_every = max(1, eval_every)

    # initialize running values
    running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    model.train()
    for epoch in range(num_epochs):
        for (title, text, titletext, labels), _ in train_loader:
            # Cast and place in one call; `.type(torch.LongTensor)` would first
            # copy the tensor to the CPU and then back to the device.
            labels = labels.to(device=device, dtype=torch.long)
            titletext = titletext.to(device=device, dtype=torch.long)

            loss, _ = model(titletext, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            global_step += 1

            if global_step % eval_every != 0:
                continue

            # evaluation
            model.eval()
            average_valid_loss = _evaluate(model, valid_loader)
            average_train_loss = running_loss / eval_every
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # resetting running values
            running_loss = 0.0
            model.train()

            # print progress
            print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                  .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                          average_train_loss, average_valid_loss))

            # Track the best validation loss seen so far.
            # NOTE(review): checkpointing on improvement is disabled in this notebook;
            # to enable it, call save_checkpoint(file_path + 'model.pt', model, best_valid_loss) here.
            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss

    save_metrics(file_path + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

In [None]:
# Optimizer step size used for fine-tuning.
LEARNING_RATE = 2e-5

# Build the classifier, place it on the active device, and fine-tune it with
# the default loaders and epoch count of train().
model = BERT()
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

train(model=model, optimizer=optimizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch [1/5], Step [11/110], Train Loss: 0.6670, Valid Loss: 0.7580
Epoch [1/5], Step [22/110], Train Loss: 0.7145, Valid Loss: 0.7330
Epoch [2/5], Step [33/110], Train Loss: 0.7198, Valid Loss: 0.7107
Epoch [2/5], Step [44/110], Train Loss: 0.7288, Valid Loss: 0.6965
Epoch [3/5], Step [55/110], Train Loss: 0.6772, Valid Loss: 0.7183
Epoch [3/5], Step [66/110], Train Loss: 0.7014, Valid Loss: 0.7165
Epoch [4/5], Step [77/110], Train Loss: 0.7007, Valid Loss: 0.6947
Epoch [4/5], Step [88/110], Train Loss: 0.7400, Valid Loss: 0.6920
Epoch [5/5], Step [99/110], Train Loss: 0.7089, Valid Loss: 0.6913
Epoch [5/5], Step [110/110], Train Loss: 0.7222, Valid Loss: 0.6897
Model saved to ==> metrics.pt
Finished Training!
