In [1]:
!pip install pandas==1.3.4
!pip install transformers==4.12.5
!pip install datasets==1.15.1
!pip install ipywidgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pandas==1.3.4
  Downloading pandas-1.3.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 13.3 MB/s eta 0:00:01
Installing collected packages: pandas
Successfully installed pandas-1.3.4
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers==4.12.5
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 21.6 MB/s eta 0:00:01
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 37.0 MB/s ta 0:00:01
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 30.7 MB/s eta 0:00:01
I

In [2]:
import os
import pickle

from collections import Counter

# import pandas as pd
from sklearn.metrics import classification_report

import numpy as np
import torch
import torch.nn as nn

import transformers
from transformers import Trainer
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers.data.data_collator import DataCollatorWithPadding

import datasets
from datasets import Dataset
from datasets import ClassLabel
from datasets import load_metric

## Global variables

In [3]:
DATA_FOLDER = '/notebooks/Data/bert_sequence_classification'
DATA_FILE = '/notebooks/linguistic_features/data/hf_datasets/pe_dataset_linguistic_features.pt'
RESULTS_FOLDER = '/notebooks/Results/bert_sequence_classification'

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
device

device(type='cuda')

## Load data

In [6]:
dataset = torch.load(DATA_FILE)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['essay_nr', 'component_id', 'label_and_comp_idxs', 'text', 'label_x', 'label_ComponentType', 'relation_SupportAttack', 'label_RelationType', 'label_LinkedNotLinked', 'split', 'essay', 'argument_bound_1', 'argument_bound_2', 'argument_id', 'sentence', 'paragraph', 'para_nr', 'total_paras', 'token_count', 'token_count_covering_para', 'tokens_count_covering_sentence', 'preceeding_tokens_in_sentence_count', 'succeeding_tokens_in_sentence_count', 'token_ratio', 'relative_position_in_para_char', 'is_in_intro', 'relative_position_in_para_token', 'is_in_conclusion', 'is_first_in_para', 'is_last_in_para', 'nr_preceeding_comps_in_para', 'nr_following_comps_in_para', 'structural_fts_as_text', 'structural_fts_as_text_combined', 'component_POS', 'strct_fts_and_component_pos', 'sentence_POS', 'strct_fts_and_sentence_pos', 'component_syn_deps', 'strct_fts_and_component_syn_deps', 'sentence_syn_deps', 'strct_fts_and_sentence_syn_deps', 'strct_pos_s

In [8]:
dataset['train']['strct_pos_syn_deps_component'][0]

'Topic: Gender Equality at university admission, Sentence: Therefore, universities follow the requirement of job providers and decide subject suitable for particular gender., Para Number: 3, First in Para: No, Last in Para: Yes, Is in Introduction: No, Is in Conclusion: No. Part Of Speech tags: NOUN, VERB, DET, NOUN, ADP, NOUN, NOUN, CCONJ, VERB, NOUN, ADJ, ADP, ADJ, NOUN. Syntactic dependencies tags: nsubj, ROOT, det, dobj, prep, compound, pobj, cc, conj, dobj, oprd, prep, amod, pobj.'

In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
label_names = set(dataset['train']['label_ComponentType'])
label_nb = len(label_names)
labels = ClassLabel(num_classes=label_nb, names=label_names)

In [11]:
labels

ClassLabel(num_classes=3, names={'MajorClaim', 'Premise', 'Claim'}, names_file=None, id=None)

In [12]:
def tokenize(batch):
    tokens = tokenizer(batch['strct_pos_syn_deps_component'], truncation=True, padding=True, max_length=512)
    tokens['labels'] = labels.str2int(batch['label_ComponentType'])
    return tokens

# this is just the text. if the results are nice, check transfer with text + topic 

In [13]:
dataset = dataset.map(tokenize, batched=True)



  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['argument_bound_1', 'argument_bound_2', 'argument_id', 'attention_mask', 'component_POS', 'component_id', 'component_syn_deps', 'essay', 'essay_nr', 'input_ids', 'is_first_in_para', 'is_in_conclusion', 'is_in_intro', 'is_last_in_para', 'label_ComponentType', 'label_LinkedNotLinked', 'label_RelationType', 'label_and_comp_idxs', 'label_x', 'labels', 'nr_following_comps_in_para', 'nr_preceeding_comps_in_para', 'para_nr', 'paragraph', 'preceeding_tokens_in_sentence_count', 'relation_SupportAttack', 'relative_position_in_para_char', 'relative_position_in_para_token', 'sentence', 'sentence_POS', 'sentence_syn_deps', 'split', 'strct_fts_and_component_pos', 'strct_fts_and_component_syn_deps', 'strct_fts_and_sentence_pos', 'strct_fts_and_sentence_syn_deps', 'strct_pos_syn_deps_component', 'strct_pos_syn_deps_sentence', 'structural_fts_as_text', 'structural_fts_as_text_combined', 'succeeding_tokens_in_sentence_count', 'text', 'token_count', '

In [16]:
train_dataset = dataset['train'].shuffle(seed=42)
test_dataset = dataset['test'].shuffle(seed=42)

train_val_datasets = dataset['train'].train_test_split(train_size=0.8, seed=42)
train_dataset = train_val_datasets['train']
val_dataset = train_val_datasets['test']

In [17]:
dataset_d = {}
dataset_d['train'] = train_dataset
dataset_d['test'] = test_dataset
dataset_d['val'] = val_dataset

In [18]:
tokenizer.decode(dataset['train'][230]['input_ids'])

"[CLS] topic : young people should go to university or not, sentence : although higher education does not guarantee young students'success, the benefits of learning in universities are the vital factor in creating more possibilities for the development of society through advancing academic fulfillment and the young generation who have experiences and challenges., para number : 4, first in para : no, last in para : yes, is in introduction : no, is in conclusion : yes. part of speech tags : det, noun, adp, verb, adp, noun, aux, det, adj, noun, adp, verb, adj, noun, adp, det, noun, adp, noun, adp, verb, adj, noun, cconj, det, adj, noun, pron, aux, noun, cconj, noun. syntactic dependencies tags : det, nsubj, prep, pcomp, prep, pobj, root, det, amod, attr, prep, pcomp, amod, dobj, prep, det, pobj, prep, pobj, prep, amod, amod, pobj, cc, det, amod, conj, nsubj, relcl, dobj, cc, conj. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [19]:
# sanity check
set(dataset_d['train']['split'])

{'TRAIN'}

In [20]:
# sanity check
set(dataset_d['val']['split'])

{'TRAIN'}

In [21]:
# sanity check
set(dataset_d['test']['split'])

{'TEST'}

In [22]:
# global variables
NUM_LABELS = labels.num_classes
BATCH_SIZE = 16
NB_EPOCHS = 6

In [23]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)
model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [24]:
# https://huggingface.co/transformers/main_classes/trainer.html
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss_fct = nn.CrossEntropyLoss()#(weight=class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [25]:
metric = load_metric('f1')

def compute_metrics(eval_pred):
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    return metric.compute(predictions=predictions, references=labels, average='macro')

Downloading:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

In [26]:
training_args = TrainingArguments(
    
    # output
    output_dir=RESULTS_FOLDER,          
    
    # params
    num_train_epochs=NB_EPOCHS,               # nb of epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # cf. paper Sun et al.
    learning_rate=1e-5,#2e-5,                 # cf. paper Sun et al.
#     warmup_steps=500,                         # number of warmup steps for learning rate scheduler
    warmup_ratio=0.1,                         # cf. paper Sun et al.
    weight_decay=0.01,                        # strength of weight decay
    
    # eval
    evaluation_strategy="steps",              # cf. paper Sun et al.
    eval_steps=20,                            # cf. paper Sun et al.
    
    # log
    logging_dir="/notebooks/Results/bert_sequence_classification/tb_logs",  
    logging_strategy='steps',
    logging_steps=20,
    
    # save
    save_strategy='steps',
    save_total_limit=2,
    # save_steps=20, # default 500
    load_best_model_at_end=True,              # cf. paper Sun et al.
    # metric_for_best_model='eval_loss' 
    metric_for_best_model='f1'
)

In [27]:
trainer = CustomTrainer( # Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [28]:
results = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: token_ratio, component_POS, strct_fts_and_component_syn_deps, argument_id, is_in_intro, argument_bound_1, text, total_paras, preceeding_tokens_in_sentence_count, is_in_conclusion, relation_SupportAttack, relative_position_in_para_char, argument_bound_2, essay_nr, strct_pos_syn_deps_sentence, paragraph, essay, structural_fts_as_text, label_x, tokens_count_covering_sentence, is_first_in_para, sentence, para_nr, nr_preceeding_comps_in_para, token_count_covering_para, component_syn_deps, sentence_POS, strct_fts_and_component_pos, label_LinkedNotLinked, relative_position_in_para_token, split, component_id, strct_fts_and_sentence_syn_deps, sentence_syn_deps, nr_following_comps_in_para, strct_pos_syn_deps_component, label_and_comp_idxs, strct_fts_and_sentence_pos, structural_fts_as_text_combined, token_count, label_RelationType, succeeding_tokens_in_

Step,Training Loss,Validation Loss,F1
20,1.1838,1.021577,0.308664
40,0.9388,0.901607,0.257662
60,0.8992,0.886751,0.257662
80,0.8736,0.889564,0.257662
100,0.9029,0.948134,0.257662
120,0.9048,0.898164,0.268597
140,0.8178,0.786897,0.449002
160,0.7405,0.777147,0.466374
180,0.782,0.775161,0.470875
200,0.7989,0.741264,0.465257


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: token_ratio, component_POS, strct_fts_and_component_syn_deps, argument_id, is_in_intro, argument_bound_1, text, total_paras, preceeding_tokens_in_sentence_count, is_in_conclusion, relation_SupportAttack, relative_position_in_para_char, argument_bound_2, essay_nr, strct_pos_syn_deps_sentence, paragraph, essay, structural_fts_as_text, label_x, tokens_count_covering_sentence, is_first_in_para, sentence, para_nr, nr_preceeding_comps_in_para, token_count_covering_para, component_syn_deps, sentence_POS, strct_fts_and_component_pos, label_LinkedNotLinked, relative_position_in_para_token, split, component_id, strct_fts_and_sentence_syn_deps, sentence_syn_deps, nr_following_comps_in_para, strct_pos_syn_deps_component, label_and_comp_idxs, strct_fts_and_sentence_pos, structural_fts_as_text_combined, token_count, label_RelationType, succeeding_tokens_i

In [29]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [30]:
test_trainer = Trainer(model, data_collator=DataCollatorWithPadding(tokenizer))
test_raw_preds, test_labels, _ = test_trainer.predict(test_dataset)
test_preds = np.argmax(test_raw_preds, axis=1)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: token_ratio, component_POS, strct_fts_and_component_syn_deps, argument_id, is_in_intro, argument_bound_1, text, total_paras, preceeding_tokens_in_sentence_count, is_in_conclusion, relation_SupportAttack, relative_position_in_para_char, argument_bound_2, essay_nr, strct_pos_syn_deps_sentence, paragraph, essay, structural_fts_as_text, label_x, tokens_count_covering_sentence, is_first_in_para, sentence, para_nr, nr_preceeding_comps_in_para, token_count_covering_para, component_syn_deps, sentence_PO

In [31]:
len(test_preds)

1260

In [32]:
target_name = labels.int2str([0,1,2])
print(classification_report(test_labels, test_preds, target_names=target_name, digits=3))

              precision    recall  f1-score   support

  MajorClaim      0.761     0.915     0.831       153
     Premise      0.916     0.898     0.907       805
       Claim      0.676     0.642     0.659       302

    accuracy                          0.839      1260
   macro avg      0.784     0.819     0.799      1260
weighted avg      0.840     0.839     0.838      1260



In [29]:
torch.cuda.empty_cache()