In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
!pip install simpletransformers

In [None]:
!pip install stanza

In [None]:
import pandas as pd
import numpy as np
import json, re
import time
import os
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import itertools

from torch.utils.data import (
    Dataset, 
    DataLoader,
    TensorDataset, 
    random_split, 
    RandomSampler, 
    SequentialSampler)

from transformers import (
    BertModel,
    BertForSequenceClassification,
    BertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    AdamW,
    get_linear_schedule_with_warmup)


In [None]:
def encode_dataframe(statement_col, target_col, unpack=False, max_length=120, tokenizer=None):
    """Tokenize statements and bundle them with their labels as tensors.

    Args:
        statement_col: Iterable of sentences (e.g. a pandas Series of str).
        target_col: Iterable of numeric labels, one per sentence.
        unpack: If True, return the three tensors instead of a dataset.
        max_length: Every sequence is padded/truncated to this many tokens.
        tokenizer: Optional callable tokenizer to reuse. When omitted, the
            "bert-base-uncased" ``BertTokenizer`` is loaded.

    Returns:
        ``(input_ids, attention_masks, labels)`` when ``unpack`` is True,
        otherwise a ``TensorDataset`` holding those three tensors in that
        order.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # One batched call replaces the per-sentence encode_plus + torch.cat;
    # padding="max_length" supersedes the deprecated pad_to_max_length=True.
    encoded = tokenizer(
        list(statement_col),
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    bert_input_ids = encoded["input_ids"]
    bert_attention_masks = encoded["attention_mask"]

    # Materialise the labels positionally so a Series with a non-default
    # index (e.g. after a train/dev split) is handled like a plain list.
    labels = torch.tensor(list(target_col))

    if unpack:
        return bert_input_ids, bert_attention_masks, labels

    # Build the final dataset directly; no sentence-id column is created,
    # so there is nothing to strip afterwards.
    return TensorDataset(bert_input_ids, bert_attention_masks, labels)

In [None]:
def index_remover(tensordata):
    """Return a copy of a 4-column dataset without its first column.

    Each item of ``tensordata`` is expected to unpack into four tensors; the
    first one is discarded and the remaining three (input ids, attention
    mask, label) are rebuilt into a new ``TensorDataset``.
    """
    # One bucket per kept column: input ids, attention masks, labels.
    kept_columns = ([], [], [])

    for _, ids_row, mask_row, label in tensordata:
        for bucket, value in zip(kept_columns, (ids_row, mask_row, label)):
            bucket.append(value.tolist())

    # Rebuild fresh tensors from the plain Python lists.
    return TensorDataset(*(torch.tensor(column) for column in kept_columns))

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class matches the label.

    ``preds`` is a 2-D array of per-class scores (arg-max is taken along
    axis 1); ``labels`` is an array that is flattened before comparison.
    """
    true_classes = labels.flatten()
    predicted_classes = np.argmax(preds, axis=1).flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
# Hyper-parameters shared by the training and evaluation cells.
batch_size = 32
epochs = 10

# NOTE(review): torch.load unpickles the file, so only load files you trust.
# Despite the df_ prefix this is fed straight to a DataLoader, so it is
# presumably a pre-built TensorDataset rather than a DataFrame - confirm.
df_train = torch.load("/content/drive/MyDrive/18662/Project/Data/climate_train.pt")

# Reshuffle the training examples every epoch; without shuffle=True the
# loader replays the stored order, so each epoch sees identical batches.
bert_train_dataloader = DataLoader(
    df_train,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Two-class sequence classifier on top of bert-base-uncased; attention maps
# and hidden states are not returned.
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
bert_model = bert_model.to(device)

bert_optimizer = AdamW(bert_model.parameters(), lr=5e-5, eps=1e-8)

# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch; no warm-up steps are used.
total_steps = epochs * len(bert_train_dataloader)
bert_scheduler = get_linear_schedule_with_warmup(
    bert_optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Filled by train() with one entry per epoch.
bert_training_stats = []

In [None]:
def train():
    """Fine-tune the module-level ``bert_model`` on ``bert_train_dataloader``.

    Runs ``epochs`` passes over the loader. For every batch it computes the
    loss, back-propagates, clips the gradient norm to 1.0 and steps both
    ``bert_optimizer`` and ``bert_scheduler``. After each epoch the mean
    batch loss is printed and appended to ``bert_training_stats``.
    """
    for epoch_number in range(1, epochs + 1):
        print('Epoch {:} / {:}'.format(epoch_number, epochs))
        bert_model.train()
        running_loss = 0

        for batch in bert_train_dataloader:
            # Batch layout: (input ids, attention mask, labels).
            input_ids, input_mask, labels = (batch[i].to(device) for i in range(3))

            bert_model.zero_grad()

            output = bert_model(
                input_ids,
                token_type_ids=None,
                attention_mask=input_mask,
                labels=labels,
            )
            # With labels supplied, the first output element is the loss.
            loss = output[0]
            running_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)

            bert_optimizer.step()
            bert_scheduler.step()

        avg_train_loss = running_loss / len(bert_train_dataloader)

        print("Average training loss: {0:.2f}".format(avg_train_loss))
        epoch_record = {
            'epoch': epoch_number,
            'Training Loss': avg_train_loss,
        }
        bert_training_stats.append(epoch_record)

    print("Training complete")

In [None]:
# Output directory for the fine-tuned checkpoint (presumably a Colab Google
# Drive mount, judging by the /content/drive prefix - confirm).
save_path = "/content/drive/MyDrive/18662/Project/checkpoints/bert_orig/"
# Run the full fine-tuning loop; it mutates the module-level bert_model.
train()
# Persist the fine-tuned model to save_path via save_pretrained.
bert_model.save_pretrained(save_path)

In [None]:
def evaluate(bert_dev_dataloader, bert_model):
    """Run the model over a dataloader and collect logits and true labels.

    The model is switched to evaluation mode for the duration of the call,
    so dropout is disabled, and its previous train/eval mode is restored
    afterwards. Inputs are moved to the device of the model's parameters.

    Args:
        bert_dev_dataloader: Iterable of batches laid out as
            ``(input_ids, attention_mask, labels)``.
        bert_model: ``torch.nn.Module`` called as
            ``model(input_ids, attention_mask=mask)`` whose first output
            element is the logits tensor.

    Returns:
        ``(predictions, gt)``: ``predictions`` is a tensor of the stacked raw
        logits (one row per example, on the model's device) and ``gt`` is a
        NumPy array of the labels in the same order.
    """
    # Follow the model's own device instead of relying on a global.
    first_param = next(bert_model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    batch_logits = []
    batch_labels = []

    was_training = bert_model.training
    bert_model.eval()
    try:
        with torch.no_grad():
            for batch in bert_dev_dataloader:
                input_ids = batch[0].to(model_device)
                input_mask = batch[1].to(model_device)

                output = bert_model(input_ids, attention_mask=input_mask)
                batch_logits.append(output[0].detach())
                batch_labels.append(batch[2])
    finally:
        # Leave the model in whatever mode the caller had it in.
        bert_model.train(was_training)

    predictions = torch.vstack(batch_logits)
    gt = torch.cat(batch_labels).cpu().numpy()

    return predictions, gt

In [None]:
# NOTE(review): torch.load unpickles the file, so only load files you trust.
df_dev = torch.load("/content/drive/MyDrive/18662/Project/Data/climate_dev.pt")
bert_dev_dataloader = DataLoader(df_dev,  batch_size = batch_size)

prediction, gt = evaluate(bert_dev_dataloader, bert_model)

# evaluate() returns raw per-class logits (possibly on the GPU); f1_score
# needs hard class labels as a CPU array, so take the arg-max over classes.
predicted_labels = torch.argmax(prediction, dim=1).cpu().numpy()

# average=None yields one F1 value per class.
f1 = f1_score(gt, predicted_labels, average=None)

print("F1 score for BERT-base fine-tuned on CLIMATE-FEVER:", f1)