# Finetune BERT

- Use the clean dataset (v2)

https://huggingface.co/transformers/custom_datasets.html#sequence-classification-with-imdb-reviews

## Librairies

In [1]:
# pytorch version '1.9.0a0+df837d0'

In [2]:
!pip install pandas==1.3.4
!pip install transformers==4.12.5
!pip install datasets==1.15.1

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
!pip install ipywidgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [4]:
import os
import pickle

from collections import Counter

# import pandas as pd
from sklearn.metrics import classification_report

import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

import transformers
from transformers import Trainer
from transformers import BertTokenizer
from transformers import BertModel, BertPreTrainedModel
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers.data.data_collator import DataCollatorWithPadding

import datasets
from datasets import Dataset
from datasets import ClassLabel
from datasets import load_metric

## Global variables

In [5]:
DATA_FOLDER = '/notebooks/Data/bert_sequence_classification'
DATA_FILE = '/notebooks/cascade_bert/pe_dataset_w_real_bert_probs_as_fts.pt'
RESULTS_FOLDER = '/notebooks/Results/bert_sequence_classification'

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
device

device(type='cuda')

## Load data

In [8]:
dataset = torch.load(DATA_FILE)

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'attention_mask', 'feature_tensor', 'input_ids', 'labels', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts', 'probs_tensor'],
        num_rows: 4709
    })
    test: Dataset({
        features: ['Unnamed: 0', 'attention_mask', 'feature_tensor', 'input_ids', 'labels', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts', 'probs_tensor'],
        num_rows: 1258
    })
})

In [10]:
dataset['train']['topic_full_sentence_structural_fts_combined'][4548]

'Topic: Do guns can really increase the level of violence? Sentence: Firstly, the number of unemployment rate has risen to 15 percent in several globalized and non-globalized countries. Structural features: Three. No. No. No. No.'

In [11]:
dataset['train']['probs_tensor'][1]

[0.692, 0.294, 0.014]

## Tokenize data

In [12]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [13]:
label_names = set(dataset['train']['labels'])
label_nb = len(label_names)
labels = ClassLabel(num_classes=label_nb, names=label_names)

In [14]:
labels

ClassLabel(num_classes=3, names={0, 1, 2}, names_file=None, id=None)

In [15]:
def tokenize(batch):
    tokens = tokenizer(batch['topic_full_sentence_structural_fts_combined'], truncation=True, padding=True, max_length=512)
    tokens['labels'] = labels.str2int(batch['labels'])
    return tokens

# # DUMMY EXPERIMENT 2
# def tokenize(batch):
#     tokens = tokenizer(batch['complete_text'], truncation=True, padding=True, max_length=512)
#     tokens['labels'] = labels.str2int(batch['labels'])
#     return tokens

In [16]:
dataset = dataset.map(tokenize, batched=True)



  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [17]:
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels', 'probs_tensor'])

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'attention_mask', 'feature_tensor', 'input_ids', 'labels', 'probs_tensor', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts'],
        num_rows: 4709
    })
    test: Dataset({
        features: ['Unnamed: 0', 'attention_mask', 'feature_tensor', 'input_ids', 'labels', 'probs_tensor', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts'],
        num_rows: 1258
    })
})

In [19]:
dataset['train']['probs_tensor'].shape

torch.Size([4709, 3])

In [20]:
dataset['train']['labels'].shape

torch.Size([4709])

## Split data

In [21]:
train_dataset = dataset['train']#.shuffle(seed=42)
test_dataset = dataset['test']#.shuffle(seed=42)

train_val_datasets = dataset['train'].train_test_split(train_size=0.8)
train_dataset = train_val_datasets['train']
val_dataset = train_val_datasets['test']

In [22]:
dataset_d = {}
dataset_d['train'] = train_dataset
dataset_d['test'] = test_dataset
dataset_d['val'] = val_dataset

In [23]:
dataset_d

{'train': Dataset({
     features: ['Unnamed: 0', 'attention_mask', 'feature_tensor', 'input_ids', 'labels', 'probs_tensor', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts'],
     num_rows: 3767
 }),
 'test': Dataset({
     features: ['Unnamed: 0', 'attention_mask', 'feature_tensor', 'input_ids', 'labels', 'probs_tensor', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts'],
     num_rows: 1258
 }),
 'val': Dataset({
     features: ['Unnamed: 0', 'attention_mask', 'feature_tensor', 'input_ids', 'labels', 'probs_tensor', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts'],
     num_rows: 942
 })}

In [24]:
# sanity check
set(dataset_d['train']['split']), len(dataset_d['train'])

({'TRAIN'}, 3767)

In [25]:
# sanity check
set(dataset_d['val']['split']), len(dataset_d['val'])

({'TRAIN'}, 942)

In [26]:
# sanity check
set(dataset_d['test']['split']), len(dataset_d['test'])

({'TEST'}, 1258)

## Model

In [27]:
# global variables
NUM_LABELS = labels.num_classes
NUM_FEATURES = dataset_d['train']['probs_tensor'].shape[1] # o for DUMMY EXPERIMENT
BATCH_SIZE = 48
NB_EPOCHS = 6

In [28]:
NUM_LABELS, NUM_FEATURES

(3, 3)

In [29]:
class CustomBertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, num_features):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.num_features = num_features # *** MODIF ***
        
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size + num_features, config.num_labels) # *** MODIF ***

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        probs_tensor=None # *** MODIF ***
    ):
        
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import BertTokenizer, BertForSequenceClassification
        import torch

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, logits = outputs[:2]

        """

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        pooled_output = outputs[1]
        #print(pooled_output)
        #print(pooled_output.shape)
        #print(probs_tensor.shape)
        pooled_output = torch.cat([pooled_output, probs_tensor], dim=1) # *** MODIF *** COMMENT FOR DUMMY EXPERIMENT
        pooled_output = self.dropout(pooled_output)
        #print(pooled_output.shape)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [30]:
model = CustomBertForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                            num_labels=NUM_LABELS, 
                                                            num_features=NUM_FEATURES)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing CustomBertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing CustomBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CustomBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CustomBertForSequenceClassification were not initialized from

CustomBertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [31]:
# just the last layer / freezing the bert model / training only last.

# for param in model.bert.parameters():
#     param.requires_grad = False

In [32]:
# for param in model.bert.parameters():
    
#     print(param)

In [33]:
# # testing
# from torch.utils.data import DataLoader

# batch = dataset_d['train'][10:14]
# batch = DataLoader(dataset_d['train'], batch_size=2, shuffle=True, collate_fn=DataCollatorWithPadding(tokenizer))

# for b in batch:
#     break

# b.to(device)

# input_ids = b['input_ids']
# attention_mask = b['attention_mask']
# labels = b['labels']
# features = b['features']

# outputs = model(input_ids=input_ids, attention_mask=attention_mask, features=features, labels=labels)
# outputs

## Training

In [34]:
counter = Counter(dataset_d['train']['labels'].tolist())
counter

Counter({1: 2371, 2: 448, 0: 948})

In [35]:
class_weights = [max(counter.values()) / counter[k] for k in sorted(counter.keys())]
class_weights = torch.FloatTensor(class_weights).to(device)
class_weights

tensor([2.5011, 1.0000, 5.2924], device='cuda:0')

In [36]:
# https://huggingface.co/transformers/main_classes/trainer.html
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs[1]#.get('logits')
        loss_fct = nn.CrossEntropyLoss()#(weight=class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [37]:
metric = load_metric('f1')

def compute_metrics(eval_pred):
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    return metric.compute(predictions=predictions, references=labels, average='macro')

In [38]:
training_args = TrainingArguments(
    
    # output
    output_dir=RESULTS_FOLDER,          
    
    # params
    num_train_epochs=NB_EPOCHS,               # nb of epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # cf. paper Sun et al.
    learning_rate=1e-5,#2e-5,                 # cf. paper Sun et al.
#     warmup_steps=500,                         # number of warmup steps for learning rate scheduler
    warmup_ratio=0.1,                         # cf. paper Sun et al.
    weight_decay=0.01,                        # strength of weight decay
    
    # eval
    evaluation_strategy="steps",              # cf. paper Sun et al.
    eval_steps=20,                            # cf. paper Sun et al.
    
    # log
    logging_dir=RESULTS_FOLDER+'logs',  
    logging_strategy='steps',
    logging_steps=20,
    
    # save
    save_strategy='steps',
    save_total_limit=2,
    # save_steps=20, # default 500
    load_best_model_at_end=True,              # cf. paper Sun et al.
    # metric_for_best_model='eval_loss'
    metric_for_best_model='f1'
)

In [39]:
trainer = CustomTrainer( # Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [40]:
results = trainer.train()

The following columns in the training set  don't have a corresponding argument in `CustomBertForSequenceClassification.forward` and have been ignored: topic_and_full_sentence, split, topic_full_sentence_stuctural_fts, topic_full_sentence_structural_fts_combined, feature_tensor, Unnamed: 0, text, sentence.
***** Running training *****
  Num examples = 3767
  Num Epochs = 6
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 474


Step,Training Loss,Validation Loss,F1
20,1.1781,1.043391,0.255672
40,0.9167,0.835112,0.255672
60,0.7557,0.678903,0.372474
80,0.6228,0.599988,0.43948
100,0.5757,0.570946,0.476227
120,0.5821,0.550665,0.719701
140,0.5539,0.516058,0.784067
160,0.5143,0.51174,0.73543
180,0.5313,0.472106,0.755232
200,0.4716,0.455535,0.780805


The following columns in the evaluation set  don't have a corresponding argument in `CustomBertForSequenceClassification.forward` and have been ignored: topic_and_full_sentence, split, topic_full_sentence_stuctural_fts, topic_full_sentence_structural_fts_combined, feature_tensor, Unnamed: 0, text, sentence.
***** Running Evaluation *****
  Num examples = 942
  Batch size = 48
The following columns in the evaluation set  don't have a corresponding argument in `CustomBertForSequenceClassification.forward` and have been ignored: topic_and_full_sentence, split, topic_full_sentence_stuctural_fts, topic_full_sentence_structural_fts_combined, feature_tensor, Unnamed: 0, text, sentence.
***** Running Evaluation *****
  Num examples = 942
  Batch size = 48
The following columns in the evaluation set  don't have a corresponding argument in `CustomBertForSequenceClassification.forward` and have been ignored: topic_and_full_sentence, split, topic_full_sentence_stuctural_fts, topic_full_sentence_st

In [None]:
# save best model
trainer.save_model(os.path.join(RESULTS_FOLDER, 'checkpoint-best'))

Saving model checkpoint to /raid/home/jeremiec/Data/persuasive_essays/BERT_finetuning/checkpoint-best
Configuration saved in /raid/home/jeremiec/Data/persuasive_essays/BERT_finetuning/checkpoint-best/config.json
Model weights saved in /raid/home/jeremiec/Data/persuasive_essays/BERT_finetuning/checkpoint-best/pytorch_model.bin
tokenizer config file saved in /raid/home/jeremiec/Data/persuasive_essays/BERT_finetuning/checkpoint-best/tokenizer_config.json
Special tokens file saved in /raid/home/jeremiec/Data/persuasive_essays/BERT_finetuning/checkpoint-best/special_tokens_map.json


## Results

In [None]:
# load model
# model_file = os.path.join(RESULTS_FOLDER, 'checkpoint-best')
# model_file = os.path.join(RESULTS_FOLDER, 'checkpoint-1500')

# model = CustomBertForSequenceClassification.from_pretrained(model_file,
#                                                             num_labels=NUM_LABELS,
#                                                             num_features=NUM_FEATURES)
# model.to(device)

loading configuration file /raid/home/jeremiec/Data/persuasive_essays/BERT_finetuning/checkpoint-best/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "CustomBertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file /r

CustomBertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [41]:
model.eval()

CustomBertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

- Test results

In [42]:
test_trainer = Trainer(model, data_collator=DataCollatorWithPadding(tokenizer))
test_raw_preds, test_labels, _ = test_trainer.predict(test_dataset)
test_preds = np.argmax(test_raw_preds, axis=1)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `CustomBertForSequenceClassification.forward` and have been ignored: topic_and_full_sentence, split, topic_full_sentence_stuctural_fts, topic_full_sentence_structural_fts_combined, feature_tensor, Unnamed: 0, text, sentence.
***** Running Prediction *****
  Num examples = 1258
  Batch size = 8


In [43]:
sum(test_preds)

1158

In [44]:
target_name = labels.int2str([0,1,2])
print(classification_report(test_labels, test_preds, target_names=target_name))

              precision    recall  f1-score   support

           0       0.69      0.66      0.68       301
           1       0.92      0.89      0.91       805
           2       0.78      0.96      0.86       152

    accuracy                           0.85      1258
   macro avg       0.80      0.84      0.81      1258
weighted avg       0.85      0.85      0.85      1258



precision    recall  f1-score   support

     Premise       0.90      0.90      0.90       805
  MajorClaim       0.70      0.99      0.82       152
       Claim       0.67      0.52      0.59       301

    accuracy                           0.82      1258
   macro avg       0.75      0.80      0.77      1258
weighted avg       0.82      0.82      0.81      1258

BERT + all 5 features concated. w learning rate: 5e-3 w param freezing but the already trained

- Train results

In [None]:
train_trainer = Trainer(model, data_collator=DataCollatorWithPadding(tokenizer))
train_raw_preds, train_labels, _ = train_trainer.predict(train_dataset)
train_preds = np.argmax(train_raw_preds, axis=1)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `CustomBertForSequenceClassification.forward` and have been ignored: premise_indicator, text, complete_text, argument_bound_1, argument_id, claim_indicator, split, argument_bound_2, essay.
***** Running Prediction *****
  Num examples = 3858
  Batch size = 8


In [None]:
sum(train_preds)

4545

In [None]:
target_name = labels.int2str([0,1,2])
print(classification_report(train_labels, train_preds, target_names=target_name))

              precision    recall  f1-score   support

  MajorClaim       0.82      0.98      0.89       472
     Premise       0.98      0.82      0.89      2416
       Claim       0.69      0.89      0.78       970

    accuracy                           0.86      3858
   macro avg       0.83      0.90      0.85      3858
weighted avg       0.88      0.86      0.86      3858



In [None]:
torch.cuda.empty_cache()