# Finetune BERT

- Use the clean dataset (v2)

https://huggingface.co/transformers/custom_datasets.html#sequence-classification-with-imdb-reviews

## Librairies

In [None]:
# pytorch version '1.9.0a0+df837d0'

In [21]:
# Tasks:

# 1. Finetune for linked/not_linked using text (Done)
# 2. Finetune for linked/not_linked using strct fts as text (Done)
# 3. Idea: try giving the component class in both 1 and 2 and see if it improves! It should cause Claims
# are the most linked component type. (Done! Good results!) (Not nice.)
# 3.1 Check where you got the MC/CL/PREM information. If you got it from a pervious classifier,
# then it's ok. otherwise, it's cheating.
# Idea: Task 6 is the equivalent of this task. Should be good results also!
# 4. Finetune of joined label using text (Done) (Semi nice results)
# 5. Finetune for joined label using strct fts as text (Error!)
# 6. Finetune for MC/CL/PREM using strct w linked (Error!) (Not nice.)

In [1]:
!pip install pandas==1.3.4
!pip install transformers==4.12.5
!pip install datasets==1.15.1
!pip install ipywidgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import os
import pickle

from collections import Counter

# import pandas as pd
from sklearn.metrics import classification_report

import numpy as np
import torch
import torch.nn as nn

import transformers
from transformers import Trainer
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers.data.data_collator import DataCollatorWithPadding

import datasets
from datasets import Dataset
from datasets import ClassLabel
from datasets import load_metric

## Global variables

In [3]:
# DATA_FOLDER = '/notebooks/Data/bert_sequence_classification'
DATA_FILE = '/notebooks/link_identification_task/data/pe_dataset_w_strct_comp_type.pt'
RESULTS_FOLDER = '/notebooks/Results/bert_sequence_classification'

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
device

device(type='cuda')

## Load data

In [6]:
dataset = torch.load(DATA_FILE)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['essay_nr', 'starting_idx', 'component_label', 'ending_idx', 'text', 'labels', 'split', 'essay', 'argument_bound_1', 'argument_bound_2', 'argument_id', 'label', 'structural_fts_as_text', 'joined_label', 'strct_fts_w_linked', 'strct_fts_w_comp_type'],
        num_rows: 4712
    })
    test: Dataset({
        features: ['essay_nr', 'starting_idx', 'component_label', 'ending_idx', 'text', 'labels', 'split', 'essay', 'argument_bound_1', 'argument_bound_2', 'argument_id', 'label', 'structural_fts_as_text', 'joined_label', 'strct_fts_w_linked', 'strct_fts_w_comp_type'],
        num_rows: 1138
    })
})

In [8]:
dataset['train']['text'][230]

'There will not be such worries when young adults live in their own home'

## Tokenize data

In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [10]:
label_names = set(dataset['train']['joined_label'])
label_nb = len(label_names)
labels = ClassLabel(num_classes=label_nb, names=label_names)

In [11]:
labels

ClassLabel(num_classes=5, names={'CL-NL', 'PREM-NL', 'MC-NL', 'PREM-L', 'CL-L'}, names_file=None, id=None)

In [12]:
def tokenize(batch):
    tokens = tokenizer(batch['structural_fts_as_text'], truncation=True, padding=True, max_length=512)
    tokens['labels'] = labels.str2int(batch['joined_label'])
    return tokens


In [13]:
dataset = dataset.map(tokenize, batched=True)



  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [14]:
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['argument_bound_1', 'argument_bound_2', 'argument_id', 'attention_mask', 'component_label', 'ending_idx', 'essay', 'essay_nr', 'input_ids', 'joined_label', 'label', 'labels', 'split', 'starting_idx', 'strct_fts_w_comp_type', 'strct_fts_w_linked', 'structural_fts_as_text', 'text', 'token_type_ids'],
        num_rows: 4712
    })
    test: Dataset({
        features: ['argument_bound_1', 'argument_bound_2', 'argument_id', 'attention_mask', 'component_label', 'ending_idx', 'essay', 'essay_nr', 'input_ids', 'joined_label', 'label', 'labels', 'split', 'starting_idx', 'strct_fts_w_comp_type', 'strct_fts_w_linked', 'structural_fts_as_text', 'text', 'token_type_ids'],
        num_rows: 1138
    })
})

In [16]:
set(dataset['train']['labels'])

{tensor(3),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(4),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(0),
 tensor(1),
 tensor(4),
 tensor(1),
 tensor(4),
 tensor(1),
 tensor(0),
 tensor(1),
 tensor(1),
 tensor(4),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(1),
 tensor(3),
 tensor(0),
 tensor(4),
 tensor(1),
 tensor(1),
 tensor(4),
 tensor(1),
 tensor(4),
 tensor(1),
 tensor(2),
 tensor(4),
 tensor(4),
 tensor(2),
 tensor(1),
 tensor(1),
 tensor(3),
 tensor(0),
 tensor(3),
 tensor(0),
 tensor(1),
 tensor(1),
 tensor(0),
 tensor(4),
 tensor(4),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(3),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(0),
 tensor(1),
 tensor(0),
 tensor(0),
 tensor(1),
 tensor(0),
 tensor(1),
 tensor(4),
 tensor(4),
 tensor(1),
 tensor(0),
 ten

## Split data

In [17]:
train_dataset = dataset['train']#.shuffle(seed=42)
test_dataset = dataset['test']#.shuffle(seed=42)

train_val_datasets = dataset['train'].train_test_split(train_size=0.8)
train_dataset = train_val_datasets['train']
val_dataset = train_val_datasets['test']

In [18]:
dataset_d = {}
dataset_d['train'] = train_dataset
dataset_d['test'] = test_dataset
dataset_d['val'] = val_dataset

In [19]:
tokenizer.decode(dataset['test'][945]['input_ids'])

'[CLS] topic : attending boys and girls in a same school would bring many advantages, sentence : in my opinion, boys and girls should study together in the sense that they have a chance to know each other and learn how to behave with an opposite sex, para number : 1, first in para : yes, last in para : no, is in introduction : yes, is in conclusion : no [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [20]:
# sanity check
set(dataset_d['train']['split'])

{'TRAIN'}

In [21]:
# sanity check
set(dataset_d['val']['split'])

{'TRAIN'}

In [22]:
# sanity check
set(dataset_d['test']['split'])

{'TEST'}

In [23]:
labels

ClassLabel(num_classes=5, names={'CL-NL', 'PREM-NL', 'MC-NL', 'PREM-L', 'CL-L'}, names_file=None, id=None)

## Model

In [24]:
# global variables
NUM_LABELS = labels.num_classes
BATCH_SIZE = 48
NB_EPOCHS = 6

In [25]:
NUM_LABELS

5

In [26]:
del model

NameError: name 'model' is not defined

In [29]:
del trainer

NameError: name 'trainer' is not defined

In [30]:
del test_trainer

NameError: name 'test_trainer' is not defined

In [31]:
torch.cuda.empty_cache()

In [32]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [33]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Training

In [34]:
counter = Counter(dataset_d['train']['labels'].tolist())
counter

Counter({1: 2093, 0: 313, 2: 459, 4: 634, 3: 270})

In [35]:
class_weights = [max(counter.values()) / counter[k] for k in sorted(counter.keys())]
class_weights = torch.FloatTensor(class_weights).to(device)
class_weights

tensor([6.6869, 1.0000, 4.5599, 7.7519, 3.3013], device='cuda:0')

In [36]:
# https://huggingface.co/transformers/main_classes/trainer.html
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss_fct = nn.CrossEntropyLoss()#(weight=class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [37]:
metric = load_metric('f1')

def compute_metrics(eval_pred):
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    return metric.compute(predictions=predictions, references=labels, average='macro')

In [38]:
training_args = TrainingArguments(
    
    # output
    output_dir=RESULTS_FOLDER,          
    
    # params
    num_train_epochs=NB_EPOCHS,               # nb of epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # cf. paper Sun et al.
    learning_rate=1e-5,#2e-5,                 # cf. paper Sun et al.
#     warmup_steps=500,                         # number of warmup steps for learning rate scheduler
    warmup_ratio=0.1,                         # cf. paper Sun et al.
    weight_decay=0.01,                        # strength of weight decay
    
    # eval
    evaluation_strategy="steps",              # cf. paper Sun et al.
    eval_steps=20,                            # cf. paper Sun et al.
    
    # log
    logging_dir="/notebooks/Results/bert_sequence_classification/tb_logs",  
    logging_strategy='steps',
    logging_steps=20,
    
    # save
    save_strategy='steps',
    save_total_limit=2,
    # save_steps=20, # default 500
    load_best_model_at_end=True,              # cf. paper Sun et al.
    # metric_for_best_model='eval_loss' 
    metric_for_best_model='f1'
)

In [39]:
trainer = CustomTrainer( # Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [40]:
results = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: argument_bound_2, component_label, argument_bound_1, strct_fts_w_comp_type, strct_fts_w_linked, starting_idx, structural_fts_as_text, ending_idx, essay, text, joined_label, essay_nr, split, argument_id.
***** Running training *****
  Num examples = 3769
  Num Epochs = 6
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 474


Step,Training Loss,Validation Loss,F1
20,1.5671,1.382484,0.14375
40,1.3352,1.245005,0.14375
60,1.2183,1.126298,0.14375
80,1.1096,1.021288,0.315974
100,1.0166,0.967576,0.307276
120,1.0028,0.922521,0.311068
140,0.9664,0.893415,0.392047
160,0.9041,0.879559,0.421332
180,0.9249,0.8649,0.413706
200,0.8669,0.855834,0.416372


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: argument_bound_2, component_label, argument_bound_1, strct_fts_w_comp_type, strct_fts_w_linked, starting_idx, structural_fts_as_text, ending_idx, essay, text, joined_label, essay_nr, split, argument_id.
***** Running Evaluation *****
  Num examples = 943
  Batch size = 48
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: argument_bound_2, component_label, argument_bound_1, strct_fts_w_comp_type, strct_fts_w_linked, starting_idx, structural_fts_as_text, ending_idx, essay, text, joined_label, essay_nr, split, argument_id.
***** Running Evaluation *****
  Num examples = 943
  Batch size = 48
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: argument_boun

In [36]:
# save best model
trainer.save_model(os.path.join("/notebooks/Results/bert_sequence_classification", 'checkpoint-best-pe-w-probs_1'))

Saving model checkpoint to /notebooks/Results/bert_sequence_classification/checkpoint-best-pe-w-probs_1
Configuration saved in /notebooks/Results/bert_sequence_classification/checkpoint-best-pe-w-probs_1/config.json
Model weights saved in /notebooks/Results/bert_sequence_classification/checkpoint-best-pe-w-probs_1/pytorch_model.bin
tokenizer config file saved in /notebooks/Results/bert_sequence_classification/checkpoint-best-pe-w-probs_1/tokenizer_config.json
Special tokens file saved in /notebooks/Results/bert_sequence_classification/checkpoint-best-pe-w-probs_1/special_tokens_map.json


## Results

In [41]:
# # load model
#model_file = os.path.join("/notebooks/Results/bert_sequence_classification", 'checkpoint-best-pe-w-probs_1')
# model_file = os.path.join(RESULTS_FOLDER, 'checkpoint-1500')

#model = BertForSequenceClassification.from_pretrained(model_file, num_labels=NUM_LABELS)
#model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

- Test results

In [42]:
test_trainer = Trainer(model, data_collator=DataCollatorWithPadding(tokenizer))
test_raw_preds, test_labels, _ = test_trainer.predict(test_dataset)
test_preds = np.argmax(test_raw_preds, axis=1)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: argument_bound_2, component_label, argument_bound_1, strct_fts_w_comp_type, strct_fts_w_linked, starting_idx, structural_fts_as_text, ending_idx, essay, text, joined_label, essay_nr, split, argument_id.
***** Running Prediction *****
  Num examples = 1138
  Batch size = 8


In [43]:
sum(test_preds)

2027

In [44]:
target_name = labels.int2str([0,1,2,3,4])
print(classification_report(test_labels, test_preds, target_names=target_name, digits=3))

              precision    recall  f1-score   support

       CL-NL      0.684     0.121     0.206       107
     PREM-NL      0.807     0.883     0.843       624
       MC-NL      0.716     0.971     0.824       140
      PREM-L      0.350     0.071     0.118        99
        CL-L      0.580     0.780     0.665       168

    accuracy                          0.736      1138
   macro avg      0.627     0.565     0.531      1138
weighted avg      0.711     0.736     0.692      1138



In [77]:
test_labels

array([0, 1, 2, ..., 2, 2, 1])

In [78]:
labels.int2str(0).split("-")[0]

'CL'

In [45]:
def get_component_type(x):
    # print(x)
    result = x.split("-")[0]
    # print(result)
    return result
    

In [46]:
def get_link_type(x):
    
    return x.split("-")[1]

In [99]:
labels.int2str(test_labels)

['CL-NL',
 'MC-NL',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'CL-L',
 'PREM-NL',
 'PREM-NL',
 'PREM-L',
 'CL-NL',
 'MC-NL',
 'MC-NL',
 'CL-L',
 'PREM-NL',
 'CL-NL',
 'PREM-L',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'CL-L',
 'CL-NL',
 'MC-NL',
 'MC-NL',
 'CL-NL',
 'CL-L',
 'PREM-NL',
 'PREM-NL',
 'PREM-L',
 'PREM-NL',
 'CL-NL',
 'PREM-NL',
 'PREM-NL',
 'CL-L',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'PREM-L',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'MC-NL',
 'CL-L',
 'PREM-NL',
 'PREM-NL',
 'MC-NL',
 'PREM-NL',
 'CL-L',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'PREM-L',
 'PREM-NL',
 'CL-NL',
 'PREM-NL',
 'PREM-L',
 'PREM-NL',
 'PREM-NL',
 'CL-L',
 'MC-NL',
 'CL-L',
 'PREM-NL',
 'PREM-L',
 'PREM-NL',
 'PREM-NL',
 'CL-NL',
 'PREM-NL',
 'PREM-NL',
 'MC-NL',
 'MC-NL',
 'CL-NL',
 'CL-L',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'CL-NL',
 'PREM-L',
 'PREM-NL',
 'PREM-NL',
 'PREM-L',
 'MC-NL',
 'CL-NL',
 'MC-NL',
 'PREM-NL',
 'PREM-NL',
 'PREM-NL',
 'PREM-L',
 'PREM

In [47]:
test_labels_components = list(map(get_component_type, labels.int2str(test_labels)))

In [48]:
test_labels_components

['CL',
 'MC',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'MC',
 'MC',
 'CL',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'CL',
 'MC',
 'MC',
 'CL',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'MC',
 'CL',
 'PREM',
 'PREM',
 'MC',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'MC',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'MC',
 'MC',
 'CL',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'MC',
 'CL',
 'MC',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'MC',
 'MC',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'CL',
 'MC',
 'MC',
 'CL',
 'PREM',
 'CL',
 'P

In [49]:
test_preds_components = list(map(get_component_type, labels.int2str(test_preds)))

In [50]:
test_preds_components

['MC',
 'MC',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'MC',
 'MC',
 'CL',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'CL',
 'MC',
 'CL',
 'MC',
 'MC',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'MC',
 'MC',
 'PREM',
 'PREM',
 'MC',
 'CL',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'MC',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'MC',
 'PREM',
 'CL',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'MC',
 'MC',
 'MC',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'MC',
 'MC',
 'MC',
 'CL',
 'PREM',
 'CL',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'PREM',
 'MC',
 'MC',
 'MC',
 'CL',
 'PREM',
 'CL',
 'P

In [51]:
print(classification_report(test_labels_components, test_preds_components, digits=3))

              precision    recall  f1-score   support

          CL      0.698     0.622     0.658       275
          MC      0.716     0.971     0.824       140
        PREM      0.923     0.898     0.910       723

    accuracy                          0.840      1138
   macro avg      0.779     0.830     0.797      1138
weighted avg      0.843     0.840     0.839      1138



In [52]:
test_labels_links = list(map(get_link_type, labels.int2str(test_labels)))

In [53]:
test_preds_links = list(map(get_link_type, labels.int2str(test_preds)))

In [54]:
print(classification_report(test_labels_links, test_preds_links, digits=3))

              precision    recall  f1-score   support

           L      0.610     0.562     0.585       267
          NL      0.869     0.890     0.879       871

    accuracy                          0.813      1138
   macro avg      0.739     0.726     0.732      1138
weighted avg      0.808     0.813     0.810      1138



- Train results

In [None]:
train_trainer = Trainer(model, data_collator=DataCollatorWithPadding(tokenizer))
train_raw_preds, train_labels, _ = train_trainer.predict(train_dataset)
train_preds = np.argmax(train_raw_preds, axis=1)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: aux_count, prec_tokens_sentence_count, modal_verb_present, forward_indic, share_count_concl, share_count, coord_conjunction_count, token_count, prec_comps_in_para, first_in_para, succ_comps_in_para, noun_verb_share, thesis_indic_context, determiner_count, split, adverb_count, rebuttal_indic_context, rebuttal_indic, thesis_indic, noun_count, pronoun_count, argument_id, adjective_count, cl_prob, essay, subord_conjunction_count, claim_word_context, mc_prob, succ_tokens_sentence_count, argument_boun

In [None]:
sum(train_preds)

2312

In [None]:
target_name = labels.int2str([0,1,2])
print(classification_report(train_labels, train_preds, target_names=target_name))

              precision    recall  f1-score   support

     Premise       0.96      0.98      0.97      2416
  MajorClaim       0.88      0.97      0.93       472
       Claim       0.92      0.85      0.88       970

    accuracy                           0.94      3858
   macro avg       0.92      0.93      0.93      3858
weighted avg       0.94      0.94      0.94      3858



In [None]:
torch.cuda.empty_cache()