<a href="https://colab.research.google.com/github/pfuhr/InfoRet/blob/main/TC_one_to_one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Transformer Classifier on one category.ipynb Description:**
Transformer classifier classifying presence or absence of one category. We mainly used the model on category 9, though the code allows modification via the parameter cat.

1st option: augment training dataset with samples from paraphrased dataset, s.t the ratio of positive to negative labels in the training dataset is exactly 1:1.

2nd option: non augmented dataset arguments.tsv training and test-sets analogous to other models (current option, as code for option 1 is commented out, see #Option 1)

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m28.6 MB/s[0m eta [36m0:00:0

In [3]:
!pip install torchvision



In [4]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.model_selection import train_test_split
import numpy as np
from transformers import BertTokenizer
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoModel
from torchvision import models
from google.colab import files
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import random

In [5]:
#Definition of functions and classes handling Data Preprocessing and Data Organization in Dataset

def preprocess(textdata):

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def encode_text(text):
        #Tokenization
        tokens = tokenizer.tokenize(text)

        #Add special tokens
        tokens = ['[CLS]'] + tokens + ['[SEP]']

        #convert tokens to ids
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        encoded_text = torch.tensor(token_ids)

        return encoded_text

    processed_data = []

    for text in textdata:
        text_rep = encode_text(text)
        processed_data = processed_data + [text_rep]

    max_seq_length = max(len(t) for t in processed_data)

    print("This is the max seq_length:", max_seq_length)

    #fill up sequences with zeros
    #create attention masks
    #attention masks are tensors which indicate which tokens should be attended to and which not, it helps the model ignore padding tokens
    attention_masks = []
    for sequence in processed_data:
        seq_length = len(sequence)
        attention_masks = attention_masks + [torch.cat((torch.ones(seq_length), torch.zeros(max_seq_length - seq_length)))]

    padded_sequences = pad_sequence(processed_data, batch_first = True) #padding sequences means adding zeros s.t each input tensor is of dimension max_seq_length

    return padded_sequences, attention_masks

class MyDataset(Dataset):

    def __init__(self, sequence_reps, attention_masks, labels):
        self.sequence_reps = sequence_reps
        self.labels = labels
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sequence_rep = self.sequence_reps[idx]
        attention_mask = torch.tensor(self.attention_masks[idx])
        label = torch.tensor(self.labels[idx]).float()

        return sequence_rep, attention_mask, label


In [6]:
#class handling network architecture

class TransformerClassifier(nn.Module):
    def __init__(self, transformer_model, hidden_size):
        super(TransformerClassifier, self).__init__()
        self.transformer = transformer_model #the transformer computes a rich enough representation and the linear layer adds the info together
        self.hidden_layer = nn.Linear(transformer_model.config.hidden_size, hidden_size)
        self.activation = nn.ReLU()
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, inputs, attention_masks):
        transformer_output = self.transformer(inputs, attention_mask=attention_masks)
        #make sure to pass floats

        hidden_output = self.hidden_layer(transformer_output.pooler_output)
        activated_output = self.activation(hidden_output)
        logits = self.classifier(activated_output)

        return logits

In [7]:
uploaded = files.upload() #arguments
uploaded2 = files.upload() #paraphrases
uploaded3 = files.upload() #labels-level2

Saving arguments.tsv to arguments.tsv


Saving paraphrases.tsv to paraphrases.tsv


Saving labels-level2.tsv to labels-level2.tsv


In [8]:
#Load argument data and paraphrases
arguments_df = pd.read_csv('arguments.tsv', delimiter='\t')
paraphrases_df = pd.read_csv('paraphrases.tsv', delimiter='\t')

#Load label data
labels_df = pd.read_csv('labels-level2.tsv', delimiter='\t')

In [9]:
cat = 9 #category
arguments_train_df = arguments_df.iloc[:4517]
arguments_test_df = arguments_df.iloc[4517:]

paraphrases_df = paraphrases_df.iloc[:4517] #Option 1

labels_train_df = labels_df.iloc[:4517, cat]
labels_test_df = labels_df.iloc[4517:, cat]

In [10]:
#convert labels to list
labels_list = labels_train_df.values.tolist()

"""
#Option 2

#compute number of missing labels
one_labels = np.sum(labels_train_df.values)
zero_labels = 4517 - one_labels
missing_labels = zero_labels - one_labels

#find indices corresponding to positive labels


indices = []
for i in range(len(labels_list)):
  if labels_list[i] == 1:
    indices.append(i)

#choose missing_labels elements from this list

random.shuffle(indices)
selected_indices = indices[:missing_labels]
"""

'\n#Option 2\n\n#compute number of missing labels\none_labels = np.sum(labels_train_df.values)\nzero_labels = 4517 - one_labels\nmissing_labels = zero_labels - one_labels\n\n#find indices corresponding to positive labels\n\n\nindices = []\nfor i in range(len(labels_list)):\n  if labels_list[i] == 1:\n    indices.append(i)\n\n#choose missing_labels elements from this list \n\nrandom.shuffle(indices)\nselected_indices = indices[:missing_labels]\n'

In [11]:
#Extract premises convert to lists
#for simplicity reasons we'll only work with the premises

premises_list = arguments_train_df['Premise'].tolist()

"""
#Option 1

#extend premises_list and labels_list
para_premises_list = paraphrases_df['Premise'].tolist()
selected_para_premises_list = selected_elements = [para_premises_list[i] for i in selected_indices]
para_labels_list = [labels_list[i] for i in selected_indices]


premises_list = premises_list + selected_para_premises_list
labels_list = labels_list + para_labels_list
"""

#test premises and labels
labels_test_list = labels_test_df.values.tolist()
premises_test_list = arguments_test_df['Premise'].tolist()

In [12]:
#creating customized dataset
sequence_reps, attention_masks = preprocess(premises_list)
test_sequence_reps, test_attention_masks = preprocess(premises_test_list)
dataset = MyDataset(sequence_reps, attention_masks, labels_list)
test_data = MyDataset(test_sequence_reps, test_attention_masks, labels_test_list)


#splitting
train_data, val_data = train_test_split(dataset, test_size=0.06, random_state=42)

#Define batch_size and create a DataLoader for training and validation
batch_size = 32
dataloader = DataLoader(train_data, batch_size, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size, shuffle=True)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

This is the max seq_length: 59
This is the max seq_length: 159


  attention_mask = torch.tensor(self.attention_masks[idx])


In [13]:
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = "bert-base-uncased"
model1 = AutoModel.from_pretrained(model_name) #pushing it to device later

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:
#Training

#Initialisation of the model
hidden_size = 128
model = TransformerClassifier(model1, hidden_size).to(torch_device) #model1 is Bert

#Define Loss Function and Optimizer
criterion = nn.CrossEntropyLoss() #receives logits
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

# Define early stopping parameters
best_val_loss = float('inf')  # Initialize with a large value
patience = 3  # Number of epochs without improvement before stopping
counter = 0  # Counter to track epochs without improvement

#Training Loop
num_epochs = 20
for epoch in range(num_epochs):
    print("epoch: ", epoch)
    for inputs, attention_masks, labels in dataloader:

        #pushing data to GPU to perform calculations there
        inputs = inputs.to(torch_device)
        attention_masks = attention_masks.to(torch_device)
        labels = labels.to(torch_device)

        #forward and backward pass
        optimizer.zero_grad()
        outputs = model(inputs, attention_masks)
        labels = labels.long()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    #validation loop
    with torch.no_grad():
        model.eval()  # Set model to evaluation mode
        val_loss = 0.0
        for inputs, attention_masks, labels in val_dataloader:
            #pushing data to GPU to perform calculations there
            inputs = inputs.to(torch_device)
            attention_masks = attention_masks.to(torch_device)
            labels = labels.to(torch_device)

            outputs = model(inputs, attention_masks)
            labels = labels.long()

            val_loss += criterion(outputs, labels).item()
        val_loss /= len(val_dataloader)

        # Check for early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
        else: #error is increasing on the validation set
            counter += 1
            if counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break


epoch:  0
epoch:  1
epoch:  2
epoch:  3
epoch:  4
epoch:  5
epoch:  6
Early stopping at epoch 6


In [15]:
#Evaluation

# Create a DataLoader for your test dataset
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()  # Set the model to evaluation mode

all_labels = []
all_predictions = []

with torch.no_grad():
    for inputs, attention_masks, labels in test_dataloader:

        #pushing data to GPU to perform calculations there
        inputs = inputs.to(torch_device)
        attention_masks = attention_masks.to(torch_device)

        # Compute logits
        logits = model(inputs, attention_masks)

        #get predictions applying argmax on the logits
        predictions = torch.argmax(logits, dim=1)
        labels.long()

        # Append labels and predictions for aggregation
        all_labels.append(labels)
        all_predictions.append(predictions)

# Concatenate true labels and predictions across all mini-batches
all_labels = torch.cat(all_labels, dim=0)
all_predictions = torch.cat(all_predictions, dim=0).cpu()

# Calculate overall evaluation metrics
precision = precision_score(all_labels, all_predictions)
recall = recall_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)
accuracy = accuracy_score(all_labels, all_predictions) #this time actual accuracy

print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)
print("Accuracy:", accuracy)

  attention_mask = torch.tensor(self.attention_masks[idx])


Precision: 0.6971830985915493
Recall: 0.6644295302013423
F1: 0.6804123711340205
Accuracy: 0.7529880478087649
