<a href="https://colab.research.google.com/github/pfuhr/InfoRet/blob/main/20BCoad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#TODO:
#cleaning: Done!
#runs: Done!

**20 binary classifiers oad.ipynb Description:**
20 binary classifiers using a simple bow model trained on the initial dataset arguments.tsv and the paraphrased dataset paraphrases.tsv.

In [1]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torchvision import models
from google.colab import files

In [2]:
#class handling data

class MyDataset(Dataset):
    def __init__(self, texts, labels, vocab=None):
        self.texts = texts
        self.labels = labels

        #create vocabulary
        if vocab is not None:
            self.vocab = vocab
        else:
            self.vocab = self.build_vocab(texts)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        #handle numpy.int
        #idx = self.convert_to_int(idx)
        #handle single index
        if isinstance(idx, (int, np.int64)):
            return self.texts[idx], self.labels[idx] #check if matches ...
        #handle slicing #returns dataset which uses the same vocab
        elif isinstance(idx, slice):
            return MyDataset(self.texts[idx], self.labels[idx], self.vocab)

        else:
            raise TypeError('invalid index type: must be an int or a slice', idx, type(idx))

    def build_vocab(self, texts):
        # create a set of unique words from the texts
        word_set = set()

        for text in texts:
            words = text.split()
            word_set.update(words)

        # sort the words and create a vocabulary mapping
        vocab = {word: i for i, word in enumerate(sorted(word_set))}

        return vocab

class UsedDataset(Dataset):

    def __init__(self, dataset, index):
        self.dataset = dataset
        self.index = index

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text, label = self.dataset[idx]
        encoded_text = self.encode_text(text)
        return encoded_text, label[self.index]

    def encode_text(self, text):
        #initialize a binary vector with zeros
        vector=torch.zeros(len(self.dataset.vocab))

        words = text.split()
        for word in words:
            if word in self.dataset.vocab:
                vector[self.dataset.vocab[word]] = 1

        return vector




In [3]:
#class handling network architecture

class BagOfWordsModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BagOfWordsModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


In [4]:
uploaded = files.upload() #arguments
uploaded1 = files.upload()
uploaded2 = files.upload()

Saving arguments.tsv to arguments.tsv


Saving paraphrases.tsv to paraphrases.tsv


Saving labels-level2.tsv to labels-level2.tsv


In [5]:
#Load argument data
arguments_df = pd.read_csv('arguments.tsv', delimiter='\t')

#Load paraphrases
paraphrases_df = pd.read_csv('paraphrases.tsv', delimiter='\t') #to be changed to paraphrases.tsv

#Load label data
labels_df = pd.read_csv('labels-level2.tsv', delimiter='\t')


In [6]:
#Extract premises, stances and conclusions, concatenate them and convert to list
premises_list = arguments_df['Premise'].tolist()
conclusions_list = arguments_df['Conclusion'].tolist()
stances_list = arguments_df['Stance'].tolist()
inputs_list = [conclusions_list[i] + " " + stances_list[i] + " " + premises_list[i] for i in range(len(conclusions_list))]

#same for paraphrases
para_premises_list = paraphrases_df['Premise'].tolist()
para_conclusions_list = paraphrases_df['Conclusion'].tolist()
para_stances_list = paraphrases_df['Stance'].tolist()
para_inputs_list = [para_conclusions_list[i] + " " + para_stances_list[i] + " " + para_premises_list[i] for i in range(len(para_conclusions_list))]

#extract inputs and convert to list
labels_list = labels_df.iloc[:, 1:].values.tolist()

In [7]:
#preparing Data for test and validation

aug_inputs_list = inputs_list + para_inputs_list
aug_labels_list = labels_list + labels_list
dataset = MyDataset(aug_inputs_list, aug_labels_list)
train_dataset1 = dataset[:4240]
train_dataset2 = dataset[len(inputs_list):(len(inputs_list)+4240)]
val_dataset = dataset[4240:4517]  #leave the same to more accurately see the effect of the augmented data
eval_dataset = dataset[4517:5270] #same as in 20bc

train_dataset = MyDataset(train_dataset1.texts + train_dataset2.texts, train_dataset1.labels + train_dataset2.labels, dataset.vocab) #giving the right vocab


In [10]:
#creation of 20 models and training them using a loop through all the categories

categories = ['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism', 'Achievement', 'Power: dominance', 'Power: resources', 'Face', 'Security: personal', 'Security: societal', 'Tradition', 'Conformity: rules', 'Conformity: interpersonal', 'Humility', 'Benevolence: caring', 'Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity']

input_size = len(dataset.vocab)
hidden_size = 128
output_size = 2

Evaluation = []
Percentage_of_positive_predictions = []

for i in range(20):

  print('Model for category', i, ':', categories[i])

  #creating category dataset
  used_dataset = UsedDataset(train_dataset, i)
  val_used_dataset = UsedDataset(val_dataset, i)

  #Define batch_size and create a DataLoader
  batch_size = 32
  dataloader = DataLoader(used_dataset, batch_size, shuffle=True)
  val_dataloader = DataLoader(val_used_dataset, batch_size, shuffle=True)

  #Training

  #Initialisation of the model

  model = BagOfWordsModel(input_size, hidden_size, output_size)

  #Define Loss Function and Optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

  # Define early stopping parameters
  best_val_loss = float('inf')  # Initialize with a large value
  patience = 3  # Number of epochs without improvement before stopping
  counter = 0  # Counter to track epochs without improvement

  #Training Loop
  num_epochs = 10
  for epoch in range(num_epochs):
      for inputs, labels in dataloader:
          optimizer.zero_grad()

          outputs = model(inputs)
          loss = criterion(outputs, labels)
          loss.backward()
          optimizer.step()

      #Validation Loop leading to eventual early stopping
      with torch.no_grad():
        model.eval()  # Set model to evaluation mode
        val_loss = 0.0
        for val_inputs, val_labels in val_dataloader:
            val_outputs = model(val_inputs)
            val_loss += criterion(val_outputs, val_labels).item()
        val_loss /= len(val_dataloader)

        # Check for early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
        else: #error is increasing on the validation set
            counter += 1
            if counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

  #Evaluation Loop
  eval_used_dataset = UsedDataset(eval_dataset, i)
  test_dataloader = DataLoader(eval_used_dataset, batch_size=batch_size, shuffle=False)

  model.eval()

  TP = 0
  FP = 0
  TN = 0
  FN = 0
  pred_pos = 0
  with torch.no_grad():
    for inputs, labels in test_dataloader:

        outputs = model(inputs)

        _, predicted = torch.max(outputs, 1) #torch.max returns (max, argmax) along the first dimension as outputs is two-dim tensor

        TP += torch.logical_and((predicted == labels), (predicted == torch.ones_like(predicted))).sum().item()
        FP += torch.logical_and((predicted == torch.ones_like(predicted)), (predicted != labels)).sum().item()
        TN += torch.logical_and((predicted == labels), (predicted == torch.zeros_like(predicted))).sum().item()
        FN += torch.logical_and((predicted == torch.zeros_like(predicted)), (predicted != labels)).sum().item()

        pred_pos += predicted.sum().item()

  control = False

  if TP+FP == 0:
    print("Precision undefined")
    Precision = None
    control = True
  else:
    Precision = TP/(TP + FP)
    print("Precision:", Precision)

  if TP+FN == 0:
    print("Recall undefined")
    Recall = None
    control = True
  else:
    Recall = TP/(TP + FN) #= TP/P
    print("Recall:", Recall)

  if (control == True) or (Precision + Recall == 0):
    print("F1 undefined")
    F1 = None
  else:
    F1 = 2*(Precision*Recall)/(Precision + Recall)
    print("F1:", F1)

  Accuracy = (TP + TN)/(TP + FP + TN + FN)
  print("Accuracy:", Accuracy)

  perc_pred_pos = pred_pos/(TP + FP + TN +FN)
  print("percentage of positive predictions", perc_pred_pos)

  Evaluation = Evaluation + [[Precision, Recall, F1, Accuracy]] #order in csv doc
  Percentage_of_positive_predictions.append(perc_pred_pos)

# Convert the Evaluation lists into Pandas DataFrames
df = pd.DataFrame(Evaluation)
df2 = pd.DataFrame(Percentage_of_positive_predictions)

# Save the DataFrame to a CSV file
df.to_csv('Evaluation_of_20bc_oad.csv', index=False)  # You can change the file name and format if needed
df2.to_csv('PPP_20bc_oad.csv', index=False)


Model for category 0 : Self-direction: thought
Early stopping at epoch 3
Precision: 0.40625
Recall: 0.11818181818181818
F1: 0.18309859154929578
Accuracy: 0.8459495351925631
percentage of positive predictions 0.04249667994687915
Model for category 1 : Self-direction: action
Early stopping at epoch 3
Precision: 0.5967741935483871
Recall: 0.20108695652173914
F1: 0.3008130081300813
Accuracy: 0.7715803452855245
percentage of positive predictions 0.08233731739707835
Model for category 2 : Stimulation
Early stopping at epoch 5
Precision undefined
Recall: 0.0
F1 undefined
Accuracy: 0.9601593625498008
percentage of positive predictions 0.0
Model for category 3 : Hedonism
Early stopping at epoch 4
Precision undefined
Recall: 0.0
F1 undefined
Accuracy: 0.9920318725099602
percentage of positive predictions 0.0
Model for category 4 : Achievement
Early stopping at epoch 5
Precision: 0.6727272727272727
Recall: 0.15879828326180256
F1: 0.2569444444444444
Accuracy: 0.7158034528552457
percentage of posit

In [9]:
# saving and downloading evaluation data #currently not used
"""
# Convert the Evaluation lists into Pandas DataFrames
df = pd.DataFrame(Evaluation)
df2 = pd.DataFrame(Percentage_of_positive_predictions)

# Save the DataFrame to a CSV file
df.to_csv('Evaluation_of_20bc_oad.csv', index=False)  # You can change the file name and format if needed
df2.to_csv('PPP_20bc_oad.csv', index=False)

# Download the CSV file to your local device
from google.colab import files
files.download('Evaluation_of_20bc_oad.csv')
files.download('PPP_20bc_oad.csv')
"""

"\n# Convert the Evaluation lists into Pandas DataFrames\ndf = pd.DataFrame(Evaluation)\ndf2 = pd.DataFrame(Percentage_of_positive_predictions)\n\n# Save the DataFrame to a CSV file\ndf.to_csv('Evaluation_of_20bc_oad.csv', index=False)  # You can change the file name and format if needed\ndf2.to_csv('PPP_20bc_oad.csv', index=False)\n\n# Download the CSV file to your local device\nfrom google.colab import files\nfiles.download('Evaluation_of_20bc_oad.csv')\nfiles.download('PPP_20bc_oad.csv')\n"