In [None]:
#@title Installation
!pip install transformers

In [None]:
#@title Module Imports
import io,os
import pandas as pd
import csv
import json
import torch
from sklearn.model_selection import train_test_split
from torch.nn import Module,\
                     ModuleList,\
                     Linear,\
                     Sigmoid,\
                     BCELoss
import torch.nn.functional as F
from torch.optim import Adam
from transformers import BertTokenizer,\
                         BertModel
                        #  LongformerModel,\
                        #  LongformerTokenizer

In [None]:
#@title Dataset Download { form-width: "15%" }
kaggle_json = {"username":"masoudmousavi","key":"c94c66490147612c6fc696e81c98d771"}
os.makedirs('/content/.kaggle/',exist_ok=True)
os.makedirs('/root/.kaggle/',exist_ok=True)
with open('/content/.kaggle/kaggle.json', 'w') as file:
  json.dump(kaggle_json, file)
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json
! kaggle competitions download -c fake-news
! unzip /content/train.csv.zip 
! rm -rf /content/train.csv.zip
! unzip /content/test.csv.zip 
! rm -rf /content/test.csv.zip

In [None]:
#@title Device Config
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(f'Device is {device}')

Device is cuda


In [None]:
#@title Dataset Load and Preprocessing { form-width: "15%" }
DEV_SET_SIZE = 6400
# Fixed seed so the train/dev split (and therefore every reported metric) is
# reproducible across kernel restarts.
SPLIT_SEED = 42
_train_df = pd.read_csv("train.csv")
# train_test_split shuffles the rows itself, so no separate
# DataFrame.sample(frac=1) pass is needed before splitting.
train_df, dev_df = train_test_split(
    _train_df, test_size=DEV_SET_SIZE, shuffle=True, random_state=SPLIT_SEED)
# Drop the reference to the unsplit frame; only the two splits are used below.
_train_df = None
test_df = pd.read_csv("test.csv")
submit_df = pd.read_csv("submit.csv")

In [None]:
# Preview only the first rows instead of rendering the whole training frame.
train_df.head()

In [None]:
#@title Encoder Models and Tokenizers { form-width: "15%" }
# Pretrained BERT tokenizer and encoder. output_hidden_states=True makes the
# encoder return the hidden states of every layer, which the batch-maker
# functions below combine (sum of the last four layers).
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_encoder = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
# Assign the result of .to() rather than ending the cell on a bare expression,
# which would print the entire module tree as the cell output.
bert_encoder = bert_encoder.to(device)

# Longformer alternative (disabled):
# lf_encoder = LongformerModel.from_pretrained('allenai/longformer-base-4096', output_hidden_states=True)
# lf_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
# lf_encoder.to(device)

In [None]:
#@title Neural Network { form-width: "15%" }
class NeuralNet(Module):
  """Feed-forward binary classifier applied to fixed-size feature vectors.

  The network is a stack of Linear layers: `input_size` -> each width in
  `fully_connected_dims` -> 1, followed by a sigmoid so the output is a
  probability in [0, 1].

  Args:
    fully_connected_dims: sequence of hidden-layer widths. May be empty, in
      which case the model is a single Linear(input_size, 1) + sigmoid.
    dropout: dropout probability applied after every hidden activation
      (training mode only).
    input_size: size of the last dimension of the input features.
  """

  def __init__(self, fully_connected_dims, dropout, input_size=768):
    super().__init__()
    self.dropout = dropout
    self.input_size = input_size
    # Consecutive pairs of this list are the (in, out) sizes of each layer.
    layer_dims = [self.input_size, *fully_connected_dims, 1]
    fc_list = [Linear(n_in, n_out)
               for n_in, n_out in zip(layer_dims[:-1], layer_dims[1:])]
    self.fc_layers = ModuleList(fc_list)
    self.classifier = Sigmoid()

  def forward(self, x):
    """Return sigmoid probabilities with the same leading dims as `x` and a
    trailing dimension of size 1."""
    h = x
    for fc in self.fc_layers[:-1]:
      h = F.relu(fc(h))
      # `training=self.training` makes dropout a no-op in eval mode.
      h = F.dropout(h, p=self.dropout, training=self.training)

    # No dropout on the output logit: zeroing it forces the prediction to
    # exactly 0.5 (and rescales the surviving logits), which corrupts both the
    # loss and the training-accuracy count.
    h = self.fc_layers[-1](h)
    return self.classifier(h)

In [None]:
#@title Model and Hyperparameters
# Optimisation schedule.
mini_batch_size = 128
n_epochs = 250

# Classifier: two hidden layers of width 128, dropout probability 0.4.
model = NeuralNet(fully_connected_dims=(128, 128), dropout=0.4)
model.to(device)
print(model)

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = BCELoss()
optimizer = Adam(model.parameters(), lr=1e-3)

In [None]:
#@title bert_batch_maker(df) { form-width: "15%" }
def bert_batch_maker(df, batch_name):
  """Encode every row of `df` with BERT and return features plus labels.

  Each row is turned into the token sequence
  `[CLS] title [SEP] author [SEP] text [SEP]`, capped at 512 tokens, and fed
  through the module-level `bert_encoder` one row at a time. The feature of a
  row is the first-token ([CLS]) vector of the sum of the last four hidden
  layers.

  Args:
    df: DataFrame whose rows unpack, in column order, into
      (id, title, author, text, label).
    batch_name: human-readable name of the split; only used in the progress
      message.

  Returns:
    (batch_encodings, batch_labels): CPU tensors of shape (N, 1, hidden) and
    (N,) respectively, where N is the number of rows in `df`.
  """
  max_len = 512  # sequence-length cap applied before encoding
  # Loop-invariant: capitalised split name for the progress message.
  pretty_name = " ".join(list(map(str.capitalize, batch_name.split(" "))))

  encodings = list()
  labels = list()

  bert_encoder.eval()
  with torch.no_grad():
    for i, (_, row) in enumerate(df.iterrows(), start=1):
      _id, title, author, text, label = row

      part1_tokens = bert_tokenizer.tokenize(f'[CLS] {title} [SEP] {author} [SEP] ')
      part2_tokens = bert_tokenizer.tokenize(f'{text} [SEP]')

      tokens = part1_tokens + part2_tokens
      if len(tokens) > max_len:
        # Keep the first max_len - 1 tokens and re-append the final token so
        # the sequence still ends with the closing separator. Truncating the
        # combined sequence (rather than only the text part) also stays within
        # the cap when title + author alone are longer than the limit.
        tokens = tokens[:max_len - 1] + tokens[-1:]

      indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokens)
      tokens_tensor = torch.tensor([indexed_tokens], device=device)

      outputs = bert_encoder(tokens_tensor)
      bert_hidden_states = outputs[2]
      z_encoding = torch.stack(bert_hidden_states[-4:]).sum(dim=0) # sum of last four layers

      # clone() materialises just the [CLS] vector (so the full per-token
      # activation tensor is not kept alive), then it is moved to the CPU.
      # This replaces torch.tensor(<tensor>), which emits a UserWarning.
      cls_encoding = z_encoding[0, 0].reshape(1, -1).detach().clone().cpu()

      print(f'Sentence {i} of {pretty_name}, Bert Encoding...')
      encodings.append(cls_encoding)
      labels.append(torch.tensor(label, dtype=torch.float, device='cpu'))

  batch_encodings = torch.stack(encodings)
  batch_labels = torch.stack(labels)

  return batch_encodings, batch_labels

In [None]:
#@title batch_maker_method2(df) INCOMPLETE { form-width: "15%" }
def batch_maker_method2(df, max_batch_size=32):
  """Encode `df` with BERT in length-bucketed batches.

  Rows are tokenised as `[CLS] title [SEP] author [SEP] text [SEP]` (capped at
  512 tokens) and grouped by token count, so every group can be stacked into a
  rectangular tensor without padding and encoded several rows at a time.
  This completes the previously unfinished implementation, which appended
  every label twice and never returned a result.

  Note: results are grouped by sequence length, so their order differs from
  the row order of `df`.

  Args:
    df: DataFrame whose rows unpack, in column order, into
      (id, title, author, text, label).
    max_batch_size: maximum number of equal-length rows sent through the
      encoder in one forward pass (bounds memory use for large groups).

  Returns:
    (encodings, labels): two parallel lists with one entry per distinct token
    length. `encodings[k]` is a CPU tensor of shape (n_k, hidden) holding the
    [CLS] vector of the sum of the last four hidden layers; `labels[k]` is a
    float CPU tensor of shape (n_k,).
  """
  from collections import defaultdict
  max_len = 512  # sequence-length cap applied before encoding
  tokens_by_len = defaultdict(list)
  labels_by_len = defaultdict(list)

  for _, row in df.iterrows():
    _id, title, author, text, label = row

    part1_tokens = bert_tokenizer.tokenize(f'[CLS] {title} [SEP] {author} [SEP] ')
    part2_tokens = bert_tokenizer.tokenize(f'{text} [SEP]')

    tokens = part1_tokens + part2_tokens
    if len(tokens) > max_len:
      # Keep the first max_len - 1 tokens and re-append the final token so the
      # sequence still ends with the closing separator.
      tokens = tokens[:max_len - 1] + tokens[-1:]

    tokens_by_len[len(tokens)].append(bert_tokenizer.convert_tokens_to_ids(tokens))
    labels_by_len[len(tokens)].append(label)

  encodings = list()
  labels = list()

  bert_encoder.eval()
  with torch.no_grad():
    for batch_id, (length, ids_list) in enumerate(tokens_by_len.items(), start=1):
      chunks = list()
      for start in range(0, len(ids_list), max_batch_size):
        indices_tensors = torch.tensor(ids_list[start:start + max_batch_size], device=device)
        outputs = bert_encoder(indices_tensors)
        bert_hidden_states = outputs[2]
        z_encodings = torch.stack(bert_hidden_states[-4:]).sum(dim=0) # sum of last four layers
        # Keep only the [CLS] vectors, copied off the encoder's device.
        chunks.append(z_encodings[:, 0].detach().clone().cpu())

      cls_encodings = torch.cat(chunks)
      print(f'Batch {batch_id}, Bert Encoding Size {cls_encodings.shape[0]}...')
      encodings.append(cls_encodings)
      labels.append(torch.tensor(labels_by_len[length], dtype=torch.float, device='cpu'))

  return encodings, labels

In [None]:
#@title train(model, x, y, optimizer, loss_fn)
def train(model, x, y, optimizer, loss_fn):
  """Run one optimisation step on a single mini-batch.

  Args:
    model: module mapping `x` to probabilities; its output is flattened
      before being compared with `y`.
    x: input features for the batch.
    y: 1-D tensor of target labels (0.0 / 1.0), on the same device as `x`.
    optimizer: optimizer that owns `model`'s parameters.
    loss_fn: callable `(predictions, targets) -> scalar loss tensor`.

  Returns:
    (batch_loss, correct): the loss as a Python float and the number of
    samples whose rounded prediction equals the label.
  """
  model.train()

  optimizer.zero_grad()
  out = model(x)
  loss = loss_fn(out.reshape(-1), y)
  loss.backward()
  optimizer.step()

  # Count matches directly on the boolean comparison. The previous
  # torch.tensor(<tensor>, device=device) copy raised a UserWarning and tied
  # this function to a module-level `device` variable.
  prediction = torch.round(out.detach())
  correct = (prediction.reshape(-1) == y).sum().item()

  return loss.item(), correct

In [None]:
#@title evaluate(model, x, y)
@torch.no_grad()
def evaluate(model, x, y):
  """Count correct predictions of `model` on (x, y) without tracking gradients.

  The model is switched to eval mode for the forward pass and afterwards put
  back into whichever mode (train/eval) it was in when the function was
  called, instead of always being left in training mode.

  Args:
    model: module mapping `x` to probabilities; its output is flattened
      before being compared with `y`.
    x: input features.
    y: 1-D tensor of target labels (0.0 / 1.0), on the same device as `x`.

  Returns:
    Number of samples whose rounded prediction equals the label (Python int).
  """
  was_training = model.training
  model.eval()
  try:
    out = model(x)
    prediction = torch.round(out)
    # Sum the boolean matches directly; wrapping the comparison in
    # torch.tensor(..., device=device) raised a UserWarning and relied on a
    # module-level `device` variable.
    correct = (prediction.reshape(-1) == y).sum().item()
  finally:
    model.train(was_training)
  return correct


In [None]:
#@title BERT Batch Maker
# Encode the train and dev splits with BERT once, up front; the fitting cell
# then trains only the small classifier on these cached feature tensors.
# The two calls are kept as separate statements so that h_train / y_train are
# already bound if encoding the dev split fails.
h_train, y_train = bert_batch_maker(train_df, 'train set')
h_dev, y_dev = bert_batch_maker(dev_df, 'dev set')

In [None]:
#@title NN Fitting { form-width: "15%" }
n_train = h_train.shape[0]

# The dev tensors do not change between epochs, so move them to the device once.
X_dev = h_dev.to(device)
Y_dev = y_dev.to(device)

for epoch in range(n_epochs):
  loss = 0
  correct = 0
  # Iterate over batch *start* offsets so that the final (possibly smaller)
  # batch is included. Iterating over end offsets with
  # range(mini_batch_size, n_train, mini_batch_size) skipped the last batch
  # every epoch while the accuracy below still divided by the full set size.
  for start in range(0, n_train, mini_batch_size):
    stop = start + mini_batch_size
    X = h_train[start:stop, :, :].to(device)
    y = y_train[start:stop].to(device)

    # train() switches the model to training mode itself.
    batch_loss, batch_correct = train(model, X, y, optimizer, criterion)
    loss += batch_loss
    correct += batch_correct

  # evaluate() switches the model to eval mode for its forward pass.
  dev_correct = evaluate(model, X_dev, Y_dev)
  print(f'{"=" * 32} Epoch {epoch + 1} {"=" * 32}')
  print(f'Epoch Loss: {loss}')
  print(f'Train Accuracy: {correct / n_train * 100:.3f} %')
  print(f'Dev Accuracy:  {dev_correct / h_dev.shape[0] * 100:.3f} %')

  # Remove the CWD from sys.path while we load stuff.


Epoch Loss: 37.37760230898857
Train Accuracy: 77.507 %
Dev Accuracy:  98.172 %
Epoch Loss: 36.039905443787575
Train Accuracy: 78.069 %
Dev Accuracy:  98.297 %
Epoch Loss: 36.14202120900154
Train Accuracy: 78.021 %
Dev Accuracy:  97.578 %
Epoch Loss: 36.58903504908085
Train Accuracy: 78.194 %
Dev Accuracy:  97.859 %
Epoch Loss: 36.414849892258644
Train Accuracy: 77.694 %
Dev Accuracy:  98.109 %
Epoch Loss: 35.94998079538345
Train Accuracy: 78.146 %
Dev Accuracy:  97.594 %
Epoch Loss: 36.1414158642292
Train Accuracy: 77.875 %
Dev Accuracy:  97.938 %
Epoch Loss: 35.882567688822746
Train Accuracy: 78.243 %
Dev Accuracy:  98.031 %
Epoch Loss: 35.944834530353546
Train Accuracy: 78.049 %
Dev Accuracy:  96.922 %
Epoch Loss: 36.55700930953026
Train Accuracy: 78.007 %
Dev Accuracy:  98.250 %
Epoch Loss: 35.979854732751846
Train Accuracy: 77.875 %
Dev Accuracy:  98.000 %
Epoch Loss: 35.53332667052746
Train Accuracy: 78.368 %
Dev Accuracy:  98.047 %
Epoch Loss: 36.87261524796486
Train Accuracy: 77

In [None]:
# Persist only the trained classifier's parameters (its state_dict) to
# 'BERT-FND.pth'. The BERT encoder is a separate object and is not part of
# this file.
torch.save(model.state_dict(), 'BERT-FND.pth')