<a href="https://colab.research.google.com/github/pfuhr/InfoRet/blob/main/TC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TC.ipynb Description:**
Model utilizing the bert-base-uncased Transformer trained on arguments.tsv and evaluated using the same subset of arguments.tsv as the other models.

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m102.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.1 MB/s[0m eta [36m0:00:

In [2]:
!pip install torchvision



In [3]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.model_selection import train_test_split
import numpy as np
from transformers import BertTokenizer
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoModel
from torchvision import models
from google.colab import files
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


In [4]:
#Definition of functions and classes handling Data Preprocessing and Data Organization in Dataset

def preprocess(textdata):

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def encode_text(text):
        #Tokenization
        tokens = tokenizer.tokenize(text)

        #Add special tokens
        tokens = ['[CLS]'] + tokens + ['[SEP]']

        #convert tokens to ids
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        encoded_text = torch.tensor(token_ids)

        return encoded_text

    processed_data = []

    for text in textdata:
        text_rep = encode_text(text)
        processed_data = processed_data + [text_rep]

    max_seq_length = max(len(t) for t in processed_data)

    #fill up sequences with zeros
    #create attention masks
    #attention masks are tensors which indicate which tokens should be attended to and which not, it helps the model ignore padding tokens
    attention_masks = []
    for sequence in processed_data:
        seq_length = len(sequence)
        attention_masks = attention_masks + [torch.cat((torch.ones(seq_length), torch.zeros(max_seq_length - seq_length)))]

    padded_sequences = pad_sequence(processed_data, batch_first = True) #padding sequences means adding zeros s.t each input tensor is of dimension max_seq_length

    return padded_sequences, attention_masks

class MyDataset(Dataset):

    def __init__(self, sequence_reps, attention_masks, labels):
        self.sequence_reps = sequence_reps
        self.labels = labels
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sequence_rep = self.sequence_reps[idx]
        attention_mask = torch.tensor(self.attention_masks[idx])
        label = torch.tensor(self.labels[idx]).float()

        return sequence_rep, attention_mask, label


In [5]:
#class handling network architecture

class TransformerClassifier(nn.Module):
    def __init__(self, transformer_model):
        super(TransformerClassifier, self).__init__()
        self.transformer = transformer_model #the transformer computes a rich enough representation and the linear layer adds the info together
        self.classifier = nn.Linear(transformer_model.config.hidden_size, 20)

    def forward(self, inputs, attention_masks):
        transformer_output = self.transformer(inputs, attention_mask=attention_masks)
        #make sure to pass floats
        logits = self.classifier(transformer_output.pooler_output)
        return logits

In [6]:
uploaded = files.upload() #arguments
uploaded2 = files.upload() #labels

Saving arguments.tsv to arguments.tsv


Saving labels-level2.tsv to labels-level2.tsv


In [7]:
#Load argumenet data
arguments_df = pd.read_csv('arguments.tsv', delimiter='\t')

#Load label data
labels_df = pd.read_csv('labels-level2.tsv', delimiter='\t')

In [8]:
#splitting data in train and test sets
arguments_train_df = arguments_df.iloc[:4517]
arguments_test_df = arguments_df.iloc[4517:]
labels_train_df = labels_df.iloc[:4517]
labels_test_df = labels_df.iloc[4517:]


In [9]:
#Extract premises and binary labels and convert to lists
#for simplicity reasons we'll only work with the premises

premises_list = arguments_train_df['Premise'].tolist()
premises_test_list = arguments_test_df['Premise'].tolist()

labels_list = labels_train_df.iloc[:, 1:].values.tolist()
labels_test_list = labels_test_df.iloc[:, 1:].values.tolist()


In [10]:
#creating customized dataset
sequence_reps, attention_masks = preprocess(premises_list)
test_sequence_reps, test_attention_masks = preprocess(premises_test_list)
dataset = MyDataset(sequence_reps, attention_masks, labels_list)
test_data = MyDataset(test_sequence_reps, test_attention_masks, labels_test_list)

#splitting
train_data, val_data = train_test_split(dataset, test_size=0.06, random_state=42) # 0.06 corresponds to approximately 5% of the whole dataset

#Define batch_size and create a DataLoader for training and validation
batch_size = 32
dataloader = DataLoader(train_data, batch_size, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size, shuffle=True)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

  attention_mask = torch.tensor(self.attention_masks[idx])


In [11]:
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

#accessing bert-base-uncased
model_name = "bert-base-uncased"
model1 = AutoModel.from_pretrained(model_name) #pushing it to device later

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [12]:
#Training

#Initialisation of the model

model = TransformerClassifier(model1).to(torch_device) #model1 is Bert

#Define Loss Function and Optimizer
criterion = nn.BCEWithLogitsLoss() #log(sig(x))y + log(1-sig(x))(1-y)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

# Define early stopping parameters
best_val_loss = float('inf')  # Initialize with a large value
patience = 3  # Number of epochs without improvement before stopping
counter = 0  # Counter to track epochs without improvement

#Training Loop
num_epochs = 20
for epoch in range(num_epochs):
    print("epoch: ", epoch)
    for inputs, attention_masks, labels in dataloader:
        #pushing data to GPU to perform calculations there
        inputs = inputs.to(torch_device)
        attention_masks = attention_masks.to(torch_device)
        labels = labels.to(torch_device)
        #print('new batch')
        #forward and backward pass
        optimizer.zero_grad()
        outputs = model(inputs, attention_masks)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    #validation loop
    with torch.no_grad():
        model.eval()  # Set model to evaluation mode
        val_loss = 0.0
        for inputs, attention_masks, labels in val_dataloader:
            #pushing data to GPU to perform calculations there
            inputs = inputs.to(torch_device)
            attention_masks = attention_masks.to(torch_device)
            labels = labels.to(torch_device)
            outputs = model(inputs, attention_masks)

            val_loss += criterion(outputs, labels).item()
        val_loss /= len(val_dataloader)

        # Check for early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
        else: #error is increasing on the validation set
            counter += 1
            if counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break


epoch:  0
epoch:  1
epoch:  2
epoch:  3
epoch:  4
epoch:  5
epoch:  6
epoch:  7
epoch:  8
epoch:  9
epoch:  10
epoch:  11
epoch:  12
Early stopping at epoch 12


In [14]:
#Evaluation

# Create a DataLoader for your test dataset
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

model.eval()  # Set the model to evaluation mode

all_labels = []
all_predictions = []

with torch.no_grad():
    for inputs, attention_masks, labels in test_dataloader:

        #pushing data to GPU to perform calculations there
        inputs = inputs.to(torch_device)
        attention_masks = attention_masks.to(torch_device)

        # Compute logits
        logits = model(inputs, attention_masks)

        # Apply sigmoid activation function to get probabilities
        probs = torch.sigmoid(logits)

        # Apply thresholding to get binary predictions
        predictions = (probs > 0.5).int()

        all_labels.append(labels)
        all_predictions.append(predictions)

# Concatenate true labels and predictions across all mini-batches
all_labels = torch.cat(all_labels, dim=0)
all_predictions = torch.cat(all_predictions, dim=0).cpu()

# Calculate overall evaluation metrics
precision = precision_score(all_labels, all_predictions, average=None)
recall = recall_score(all_labels, all_predictions, average=None)
f1 = f1_score(all_labels, all_predictions, average=None)

#accuracy
# Initialize a list to store accuracy scores for each category
category_accuracies = []
category_ppp = [] #category positive predictions
# Calculate accuracy for each category
for category_index in range(20):
    true_category = all_labels[:, category_index]
    predicted_category = all_predictions[:, category_index]

    category_accuracy = accuracy_score(true_category, predicted_category)
    category_accuracies.append(category_accuracy)

    #percentage of positive predictions
    category_ppp.append((np.sum(np.array(predicted_category)))/len(predicted_category))



# Calculate overall metrics
overall_precision = precision_score(all_labels, all_predictions, average='micro')
overall_recall = recall_score(all_labels, all_predictions, average='micro')
overall_f1 = f1_score(all_labels, all_predictions, average='micro')
overall_accuracy = np.mean(np.array(category_accuracies))

print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)
print("Accuracy:", category_accuracies)

print("ov Precision:", overall_precision)
print("ov Recall:", overall_recall)
print("ov F1:", overall_f1)
print("ov Accuracy:", overall_accuracy)

#accuracy = accuracy_score(all_labels, all_predictions)
#calculates the percentage of elements in the test dataset where we have an exact match of all the 20 true labels with the models output
#currently not used because value is always very close to 0 as it is difficult to classify all labels across all categories correctly at the same time


  attention_mask = torch.tensor(self.attention_masks[idx])


Precision: [0.37837838 0.57407407 0.2        0.         0.57589286 0.14285714
 0.37362637 0.         0.59665871 0.62589928 0.63157895 0.4137931
 0.         0.         0.42435424 0.5        0.57964602 0.61111111
 0.21311475 0.38356164]
Recall: [0.38181818 0.33695652 0.03333333 0.         0.55364807 0.01538462
 0.59649123 0.         0.83892617 0.41232227 0.13483146 0.15789474
 0.         0.         0.59278351 0.06024096 0.5        0.23913043
 0.13       0.18543046]
F1: [0.3800905  0.42465753 0.05714286 0.         0.56455142 0.02777778
 0.45945946 0.         0.69735007 0.49714286 0.22222222 0.22857143
 0.         0.         0.49462366 0.10752688 0.53688525 0.34375
 0.16149068 0.25      ]
Accuracy: [0.8180610889774237, 0.7768924302788844, 0.9561752988047809, 0.9867197875166003, 0.7357237715803453, 0.9070385126162018, 0.8937583001328021, 0.9548472775564409, 0.7118193891102258, 0.7662682602921647, 0.8884462151394422, 0.7848605577689243, 0.9774236387782205, 0.9216467463479415, 0.6879150066401

  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
print('percentages of positive predictions:', category_ppp)

percentages of positive predictions: [0.14741035856573706, 0.14342629482071714, 0.006640106241699867, 0.005312084993359893, 0.29747675962815406, 0.009296148738379814, 0.12084993359893759, 0.0, 0.5564409030544488, 0.1845949535192563, 0.025232403718459494, 0.07702523240371846, 0.0026560424966799467, 0.0013280212483399733, 0.3598937583001328, 0.013280212483399735, 0.300132802124834, 0.02390438247011952, 0.08100929614873838, 0.09694555112881806]


In [20]:
"""
#preparing and downloading evaluation results, aval metrics still missing
array= np.array([precision, recall, f1, category_accuracies])
reshaped_array = np.transpose(array)

df = pd.DataFrame(category_ppp)
df2 = pd.DataFrame(reshaped_array)

# Save the DataFrame to a CSV file
df.to_csv('PPP_TC.csv', index=False)  # You can change the file name and format if needed
df2.to_csv('Evaluation_TC.csv', index=False)
"""


"\n#preparing and downloading evaluation results, aval metrics still missing\narray= np.array([precision, recall, f1, category_accuracies])\nreshaped_array = np.transpose(array)\n\ndf = pd.DataFrame(category_ppp)\ndf2 = pd.DataFrame(reshaped_array)\n\n# Save the DataFrame to a CSV file\ndf.to_csv('PPP_TC.csv', index=False)  # You can change the file name and format if needed\ndf2.to_csv('Evaluation_TC.csv', index=False)\n"

In [21]:
"""
# Download the CSV file to your local device
from google.colab import files
files.download('Evaluation_TC.csv')
files.download('PPP_TC.csv')
"""

"\n# Download the CSV file to your local device\nfrom google.colab import files\nfiles.download('Evaluation_TC.csv')\nfiles.download('PPP_TC.csv')\n"