In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sklearn
import torch

from google.colab import drive
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch import nn
from torch.functional import Tensor
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from torchsummary import summary

!pip install transformers
import transformers
from transformers import AdamW, AutoModel, BertTokenizerFast

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
if torch.cuda.is_available():
    print("Using the GPU. You are good to go!")
    device = 'cuda'
else:
    print("Using the CPU. Overall speed will be slowed down")
    device = 'cpu'

Using the GPU. You are good to go!


In [None]:
drive.mount('/content/drive', force_remount=True)
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'eecs498-ml'
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH))

Mounted at /content/drive
['Fake.csv', 'True.csv', 'bert.ipynb']


In [None]:
true_data = pd.read_csv(os.path.join(GOOGLE_DRIVE_PATH, "True.csv"))
fake_data = pd.read_csv(os.path.join(GOOGLE_DRIVE_PATH, "Fake.csv"))

print("--- TRUE DATA ---")
print(true_data.head())
print()
print("--- FAKE DATA ---")
print(fake_data.head())

--- TRUE DATA ---
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   

--- FAKE DATA ---
                                               title  \
0   D

In [None]:
true_data['target'] = ['True'] * len(true_data)
fake_data['target'] = ['Fake'] * len(fake_data)
data=true_data.append(fake_data).sample(frac=1).reset_index().drop(columns=['index'])
data['label'] = pd.get_dummies(data.target)['Fake']

In [None]:
def print_datasets(output=True):
  print("--- TRUE DATA ---")
  print(true_data.head())
  print()
  print("--- FAKE DATA ---")
  print(fake_data.head())
  print()
  print("--- COMBINED DATA ---")
  print(data.head())
  print()
  print(data.tail())

In [None]:
def truncate_article(dataset, num_slices):
  for index, row in dataset.iterrows():
    sent = nltk.tokenize.sent_tokenize(row["text"])
    num_sent = len(sent)
    if num_sent > 4:
      lower = int(num_sent/num_slices)
      upper = int((num_slices - 1)*num_sent/num_slices)
      sent = sent[lower:upper]
    elif (num_sent <= 4) and (num_sent > 2):
      sent = sent[1:-1]
    dataset.at[index, "short_text"] = " ".join(sent)

In [None]:
truncate_article(data, 4)

print_datasets(False)

--- TRUE DATA ---
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date target  \
0  December 31, 2017    True   
1  December 29, 2017    True   
2  December 31, 2017    True   
3  December 30, 2017    True   
4  December 29, 2017    True   

                               

In [None]:
data['text'] = data['text'].apply(lambda x: (x.split('(Reuters)', 1))[-1])
data['text'] = data['text'].apply(lambda x: (x.split('21st Century Wire', 1))[-1])
data['text'] = data['text'].apply(lambda x: x.replace('-', ''))
data['text'] = data['text'].apply(lambda x: x.replace('  ', ''))

In [None]:
def data_splitter(data, delete_percent, train_percent, valid_percent, test_percent):
	total = delete_percent + train_percent + valid_percent + test_percent
	percent = delete_percent / total
	texts, _, labels, _ = train_test_split(
		data['text'],
		data['label'],
		random_state=498,
		test_size=percent,
    stratify=data['target']
	)

	total -= delete_percent
	percent = train_percent / total
	temp_texts, train_texts, temp_labels, train_labels = train_test_split(
		texts,
		labels,
		random_state=498,
		test_size=percent,
    stratify=labels
	)

	total -= train_percent
	percent = valid_percent / total
	test_texts, valid_texts, test_labels, valid_labels = train_test_split(
		temp_texts,
		temp_labels,
		random_state=498,
		test_size=percent,
    stratify=temp_labels
	)
 
	return train_texts, valid_texts, test_texts, train_labels, valid_labels, test_labels

In [None]:
train_texts, valid_texts, test_texts, train_labels, valid_labels, test_labels = data_splitter(
    data, 
    delete_percent=0.75, 
    train_percent=0.20, 
    valid_percent=0.04, 
    test_percent=0.01
)

print(data.shape[0])
print(train_texts.shape[0])

bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

44898
8980


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def tokenize_dataset(texts, labels, length):
  tokens = tokenizer.batch_encode_plus(
    texts.tolist(),
    max_length = length,
    padding=True,
    truncation=True
  )

  token_sequence = torch.tensor(tokens['input_ids'])
  token_mask = torch.tensor(tokens['attention_mask'])
  y = torch.tensor(labels.tolist())

  return token_sequence, token_mask, y

In [None]:
MAX_LENGTH = 512
train_sequence, train_mask, train_y = tokenize_dataset(train_texts, train_labels, MAX_LENGTH)
valid_sequence, valid_mask, valid_y = tokenize_dataset(valid_texts, valid_labels, MAX_LENGTH)
test_sequence, test_mask, test_y = tokenize_dataset(test_texts, test_labels, MAX_LENGTH)

#batch size
batch_size = 1

# wrap and set datasets
train_data = TensorDataset(train_sequence, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
valid_data = TensorDataset(valid_sequence, valid_mask, valid_y)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

for param in bert.parameters():
    param.requires_grad = False

In [None]:
class BERT_Arch(nn.Module):

    def __init__(self, bert):
      super(BERT_Arch, self).__init__()
      self.bert = bert 
      self.dropout = nn.Dropout(0.1)
      self.relu =  nn.ReLU()
      self.fc1 = nn.Linear(768,512)
      self.fc2 = nn.Linear(512,2)
      self.softmax = nn.LogSoftmax(dim=1)

    #forward pass
    def forward(self, sent_id, mask):
      cls_hs = self.bert(sent_id, attention_mask=mask)['pooler_output']
      x = self.fc1(cls_hs)
      x = self.relu(x)
      x = self.dropout(x)
      x = self.fc2(x)
      x = self.softmax(x)
      x = x.to(device)
      return x

model = BERT_Arch(bert).to(device)

#GET WORKING
# print(summary(model, (1,28,28), device=device))
# print(summary(model, device=device))
#compute the class weights
class_weights = compute_class_weight(
        class_weight='balanced', 
        classes=np.unique(train_labels),
        y=train_labels
)
print("Class Weights:",class_weights)
weights = torch.tensor(class_weights, dtype=torch.float)
weights = weights.to(device)

##HYPERPARAMETERS
# loss function
cross_entropy = nn.NLLLoss(weight=weights) 
# optimizer
optimizer = AdamW(model.parameters(), lr = 1e-6) # learning rate
# number of training epochs
epochs = 5

Class Weights: [1.0480859  0.95613288]




In [None]:
def train():  
  model.train()

  total_loss, total_accuracy = 0, 0
  total_preds=[]
  for step,batch in enumerate(train_dataloader):
    
    if step % 4000 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

    batch = [r for r in batch]
    sent_id, mask, labels = batch
    sent_id = sent_id.to(device)
    mask = mask.to(device)
    labels = labels.to(device)
    model.zero_grad()        
   
    preds = model(sent_id, mask)
    preds = preds.to(device)
    
    loss = cross_entropy(preds, labels)
    total_loss += loss.item()
    loss.backward()

    accuracy = torch.sum(torch.argmax(preds, dim=1) == labels) / len(labels)
    total_accuracy += accuracy.item()

    # clip grads to 1 
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    
    total_preds.append(preds)

  avg_loss = total_loss / len(train_dataloader)
  avg_accuracy = total_accuracy / len(train_dataloader)
  total_preds = torch.concat(total_preds, dim=0)

  #returnloss and preds
  return avg_loss, avg_accuracy, total_preds


In [None]:
def evaluate():
  model.eval()

  total_loss, total_accuracy = 0, 0
  total_preds = []
  for step, batch in enumerate(valid_dataloader):
    if step % 4000 == 0 and not step == 0:
      
      # Calculate elapsed time in minutes.
      #elapsed = format_time(time.time() - t0)
            
      # output progress
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(valid_dataloader)))

    # push the batch to gpu
    batch = [t for t in batch]
    sent_id, mask, labels = batch
    sent_id = sent_id.to(device)
    mask = mask.to(device)
    labels = labels.to(device)
    # turn off autograd
    with torch.no_grad():
      preds = model(sent_id, mask)

      loss = cross_entropy(preds,labels)
      total_loss = total_loss + loss.item()
      total_preds.append(preds)

      accuracy = torch.sum(torch.argmax(preds, dim=1) == labels) / len(labels)
      total_accuracy += accuracy.item()

  # calculate the validation loss of the epoch
  avg_loss = total_loss / len(valid_dataloader) 
  avg_accuracy = total_accuracy / len(train_dataloader)
  total_preds = torch.concat(total_preds, dim=0)
  total_preds = total_preds.to(device)
  return avg_loss, avg_accuracy, total_preds




In [None]:
best_valid_loss = float('inf')

train_losses = []
valid_losses = []
train_accuracies = []
valid_accuracies = []
for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    train_loss, train_accuracy, train_preds = train()
    valid_loss, valid_accuracy, valid_preds = evaluate()
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    print('################################################################')
    print('Train Loss:\t', train_loss)
    print('Valid Loss:\t', valid_loss)
    print('Train Accuracy:\t', train_accuracy)
    print('Valid Accuracy:\t', valid_accuracy)
    print('################################################################')

plt.figure(figsize=(8, 4))
plt.subplot(1, 2, 1)
plt.plot(train_accuracies)
plt.plot(valid_accuracies, '')
plt.xlabel("Epochs")
plt.ylabel('Accuracy')
plt.legend(['Training Accuracy', 'Validation Accuracy'])

plt.ylim(0, 1)
plt.subplot(1, 2, 2)
plt.plot(train_losses)
plt.plot(valid_losses, '')
plt.xlabel("Epochs")
plt.ylabel('loss'.capitalize())
plt.legend(['Training Loss', 'Validation Loss'])

plt.ylim(0, None)
plt.savefig('plot.png')


 Epoch 1 / 5
  Batch 4,000  of  8,980.
  Batch 8,000  of  8,980.
################################################################
Train Loss:	 0.6575032767563859
Valid Loss:	 0.6754917513894411
Train Accuracy:	 0.5751670378619154
Valid Accuracy:	 0.11169265033407573
################################################################

 Epoch 2 / 5


In [None]:
# from sklearn.model_selection import KFold

# results = []
# kf = KFold(n_splits=5, random_state=498, shuffle=True)

# for train_index, test_index in kf.split(X=actual_texts, y=actual_labels):
#   train_texts = actual_texts.iloc[train_index]
#   test_texts = actual_texts.iloc[test_index]
#   train_labels = actual_labels.iloc[train_index]
#   test_labels = actual_labels.iloc[test_index]

#   train_sequence, train_mask, train_y = tokenize_dataset(train_texts, train_labels, 512)
#   test_sequence, test_mask, test_y = tokenize_dataset(test_texts, test_labels, 512)


#   train_data = TensorDataset(train_sequence, train_mask, train_y)
#   train_sampler = RandomSampler(train_data)
#   train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

#   test_data = TensorDataset(test_sequence, test_mask, test_y)
#   test_sampler = RandomSampler(test_data)
#   test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

#   train_losses=[]
#   valid_losses=[]

#   for epoch in range(epochs):
#       print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

#       #train model
#       train_loss, _ = train()
#       valid_loss, _ = evaluate()
#       if valid_loss < best_valid_loss:
#           best_valid_loss = valid_loss
#           torch.save(model.state_dict(), 'saved_weights.pt')
      
#       train_losses.append(train_loss)
#       valid_losses.append(valid_loss)
      
#       print('\n################################################################')
#       print(f'Training Loss: {train_loss:.3f}')
#       print(f'Validation Loss: {valid_loss:.3f}')
#       with torch.no_grad():
#         test_sequence = test_sequence.to(device)
#         test_mask = test_mask.to(device)
#         preds = model(test_sequence, test_mask)

#       preds = torch.argmax(preds, dim=1)
#       print("Accuracy:", classification_report(test_y, preds.cpu().numpy(), output_dict=True)['accuracy'])
#       print('################################################################')




In [14]:
import plotly.graph_objects as go

pred = 0.62

fig = go.Figure(go.Indicator(
    mode = "gauge+number+delta",
    value = pred,
    domain = {'x': [0, 1], 'y': [0, 1]},
    title = {'text': "Accuracy", 'font': {'size': 24}},
    delta = {'reference': 1, 'increasing': {'color': "Blue"}},
    gauge = {
        'axis': {'range': [0, 1], 'tickwidth': 0.1, 'tickcolor': "darkblue"},
        'bar': {'color': "darkblue"},
        'bgcolor': "white",
        'borderwidth': 2,
        'bordercolor': "gray",
        # 'steps': [
        #     {'range': [0, 0.6], 'color': 'red'},
        #     {'range': [0.9, 1], 'color': 'green'}],
        # 'threshold': {
        #     'line': {'color': "black", 'width': 4},
        #     'thickness': 0.75,
        #     'value': .9}
             }))

fig.update_layout(paper_bgcolor = "lavender", font = {'color': "darkblue", 'family': "Arial"})

fig.show()