In [1]:
# Mount Google Drive inside the Colab runtime so files stored there are reachable.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

# Base folder for all inputs/outputs; keep the trailing slash because later
# cells build file names by plain string concatenation (path + 'file').
path = "/content/drive/MyDrive/Colab Notebooks/"

Mounted at /content/drive


In [2]:
# import libraries here
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from peft import LoraConfig, get_peft_model, TaskType

from transformers import logging
logging.set_verbosity_error()
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='huggingface_hub.utils._auth')

# Import data

In [42]:
# Read the evaluation examples from the Drive folder configured in `path`.
test = pd.read_csv(filepath_or_buffer=path + 'test.csv')

In [9]:
# Preview the first five rows to sanity-check the loaded columns.
test.head(n=5)

Unnamed: 0,Claim,Evidence
0,We should further exploit geothermal energy,Taxpayer funding of research and development o...
1,We should prohibit corporal punishment,"Regarding discipline, Sukhmani writes that cor..."
2,We should ban male infant circumcision,"Benatar and Benatar (2003) argue that ""it is f..."
3,We should ban trans fats usage in food,"Each KIND bar is gluten free, dairy free, non ..."
4,We should ban boxing,About Feng Keshan and Meihuaquan: In the 1800s...


# Load Model

In [10]:
# Inference settings shared by the cells below.
hp = dict(
    use_lora=True,   # wrap the base model with LoRA adapters before loading weights
    maxlen=128,      # max_length passed to the tokenizer (longer inputs are truncated)
    batchsize=8,     # batch size used by the DataLoader
)

In [11]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [48]:
# initialize model
# BERT-base with a 2-class sequence-classification head. load_state_dict below
# is strict by default, so this structure must match the saved checkpoint's keys.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
if hp['use_lora']:
  # Wrap the base model with LoRA adapters on the attention query/value
  # projections; inference_mode=True because the model is only used for prediction here.
  lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["query", "value"], lora_dropout=0.05, bias="none", task_type=TaskType.SEQ_CLS, inference_mode=True)
  model = get_peft_model(model, lora_config)

# load model weights
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved from a GPU session also loads when `device` is 'cpu'.
model.load_state_dict(torch.load(path + 'model.pt', map_location=device))

model.to(device)
print(f'Model loaded to {device}')

Model loaded to cuda


# Preprocessing

In [43]:
# Tokenize each (Claim, Evidence) row as a sentence pair. All rows are encoded
# in one call, padded to a common length and truncated to hp['maxlen'] tokens,
# and returned as PyTorch tensors.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encodings = tokenizer(
    test['Claim'].tolist(),
    test['Evidence'].tolist(),
    max_length=hp['maxlen'],
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [44]:
class InputDataset(Dataset):
    """Map-style dataset that serves pre-tokenized model inputs one example at a time.

    `encodings` is any mapping with "input_ids", "token_type_ids" and
    "attention_mask" entries, each indexable by example position.
    """

    # Fields returned for every example, in output order.
    _FIELDS = ("input_ids", "token_type_ids", "attention_mask")

    def __init__(self, encodings):
        """Keep a reference to the tokenizer output; nothing is copied."""
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, i.e. the length of the "input_ids" entry."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return a dict with the three input fields of the example at `idx`."""
        return {field: self.encodings[field][idx] for field in self._FIELDS}

In [45]:
# Wrap the tokenizer output in a Dataset and batch it for inference.
dataset = InputDataset(encodings)
# shuffle=False keeps batches in input order, so the collected predictions
# follow the row order of the encoded examples.
dataloader = DataLoader(
    dataset,
    shuffle=False,
    batch_size=hp['batchsize'],
)

# Predict

In [46]:
# Put the model in evaluation mode before predicting.
model.eval()
predictions = []

# NOTE(review): each batch also carries 'token_type_ids', but only input_ids and
# attention_mask are passed to the model. Confirm this matches how the model was
# called during training before changing it either way.
with torch.no_grad():  # inference only, no gradients needed
    for batch in dataloader:
        # Move the batch tensors to the model's device and run the forward pass.
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )

        # The index of the largest logit in each row is the predicted class.
        batch_preds = outputs.logits.argmax(dim=1)

        # Collect as plain Python ints, in dataloader order.
        predictions.extend(batch_preds.cpu().tolist())

In [17]:
# Write the predicted class ids to Drive as a single-column CSV (no index column).
# Run this only after the prediction cell so `predictions` is up to date.
df = pd.DataFrame(data={'prediction': predictions})
df.to_csv(path + 'predictions.csv', index=False)