In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
!pip install simpletransformers

In [None]:
!pip install stanza

In [None]:
import pandas as pd
import numpy as np
import json, re
import time
import os
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import itertools

from torch.utils.data import (
    Dataset, 
    DataLoader,
    TensorDataset, 
    random_split, 
    RandomSampler, 
    SequentialSampler)

from transformers import (
    BertModel,
    BertForSequenceClassification,
    BertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    AdamW,
    get_linear_schedule_with_warmup)


In [None]:
# Training hyper-parameters shared by the cells below.
batch_size = 32
epochs = 10

# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load datasets that come from a trusted source.
# Presumably a map-style dataset whose items are (input_ids, attention_mask, label)
# -- confirm against the script that produced the .pt file.
df_train = torch.load("/content/drive/MyDrive/18662/Project/Data/climate_generated_train.pt")

# Shuffle every epoch: without it the batches are visited in the stored order on
# every pass, which hurts SGD-style training (especially if the file is sorted by label).
train_dataloader = DataLoader(
    df_train,
    batch_size = batch_size,
    shuffle = True,
)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device("cpu")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Pretrained roberta-base encoder with a 2-way classification head, moved to `device`.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
).to(device)

# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# weight_decay is set to 0.0 explicitly because transformers.AdamW defaulted to
# no weight decay, whereas torch.optim.AdamW defaults to 0.01.
optimizer = optim.AdamW(model.parameters(), lr = 5e-5, eps = 1e-8, weight_decay = 0.0)

# One dict per finished epoch is appended here by the training loop.
training_stats = []

# The scheduler is stepped once per batch, so its horizon is batches-per-epoch * epochs.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps,
)

In [None]:
def train():
    """Fine-tune the notebook-level `model` for `epochs` passes over `train_dataloader`.

    Reads the module-level `model`, `optimizer`, `scheduler`, `device`,
    `train_dataloader` and `epochs`. After every epoch the mean per-batch loss
    is printed and appended to `training_stats`. Returns nothing.
    """
    for epoch_number in range(1, epochs + 1):
        print('Epoch {:} / {:}'.format(epoch_number, epochs))
        model.train()
        running_loss = 0

        for batch in train_dataloader:
            # Each batch is indexed as (input ids, attention mask, labels).
            input_ids, input_mask, labels = (batch[i].to(device) for i in range(3))

            model.zero_grad()

            # Element 0 of the model output is used as the loss.
            loss = model(
                input_ids,
                token_type_ids=None,
                attention_mask=input_mask,
                labels=labels,
            )[0]
            running_loss += loss.item()

            loss.backward()

            # Cap the global gradient norm at 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        avg_train_loss = running_loss / len(train_dataloader)
        print("Average training loss: {0:.2f}".format(avg_train_loss))

        training_stats.append({'epoch': epoch_number, 'Training Loss': avg_train_loss})

    print("Training complete")

In [None]:
# Fine-tune first, then write the resulting weights and config to the checkpoint directory.
train()

save_path = "/content/drive/MyDrive/18662/Project/checkpoints/roberta_generated/"
model.save_pretrained(save_path)

In [None]:
def evaluate(dev_dataloader, model):
    """Run `model` over every batch of `dev_dataloader` in inference mode.

    The model is switched to eval mode for the duration of the call (so dropout
    and similar layers are disabled) and restored to its previous mode afterwards.
    Inputs are moved to the device that holds the model's parameters, so the
    function does not depend on any notebook-level device variable.

    Args:
        dev_dataloader: Iterable of batches. Each batch is indexed as
            ``batch[0]`` (input ids), ``batch[1]`` (mask passed as the model's
            second positional argument) and ``batch[2]`` (labels).
        model: ``torch.nn.Module`` called as ``model(input_ids, input_mask)``;
            element 0 of its output is taken as the per-example scores.

    Returns:
        predictions: Tensor with element 0 of every batch output stacked along
            dim 0, on the model's device. These are raw scores, not class ids.
        gt: NumPy array with the labels of all batches concatenated in order.
    """
    # Follow the model's own placement; a parameter-less module is treated as CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    scores_per_batch = []
    labels_per_batch = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in dev_dataloader:
                input_ids = batch[0].to(target_device)
                input_mask = batch[1].to(target_device)

                output = model(input_ids, input_mask)

                # Keep only the scores; labels never need to leave the CPU.
                scores_per_batch.append(output[0].detach())
                labels_per_batch.append(batch[2].detach().cpu())
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    predictions = torch.vstack(scores_per_batch)
    gt = torch.cat(labels_per_batch).numpy()

    return predictions, gt

In [None]:
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load datasets that come from a trusted source.
df_dev = torch.load("/content/drive/MyDrive/18662/Project/Data/climate_generated_dev.pt")
dev_dataloader = DataLoader(df_dev,  batch_size = batch_size)

# evaluate() hands back one row of raw class scores per example, on the model's
# device. f1_score needs hard class ids on the CPU, so reduce with arg-max first.
prediction, gt = evaluate(dev_dataloader, model)
pred_labels = prediction.argmax(dim=1).cpu().numpy()

# average=None yields one F1 value per class instead of a single aggregate.
f1 = f1_score(gt, pred_labels, average=None)

# The checkpoint loaded in this notebook is roberta-base, so report it as such.
print("F1 score for RoBERTa-base fine-tuned on CLIMATE-FEVER:", f1)