In [1]:
!pip install pandas==1.3.4
!pip install transformers==4.12.5
!pip install datasets==1.15.1
#!pip install datasets
!pip install ipywidgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import os
import pickle

from collections import Counter

# import pandas as pd
from sklearn.metrics import classification_report

import numpy as np
import torch
import torch.nn as nn

import transformers
from transformers import Trainer
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers.data.data_collator import DataCollatorWithPadding

import datasets
from datasets import Dataset
from datasets import ClassLabel
from datasets import load_metric

In [3]:
torch.cuda.empty_cache()

## Global variables

In [4]:
DATA_FOLDER = '/notebooks/Data/bert_sequence_classification'
DATA_FILE = '/notebooks/ICANN/Datasets/dataset_persuasive_essays_icann.pt'
RESULTS_FOLDER = '/notebooks/cascade_bert/saved_models'

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
device

device(type='cuda')

## Load data

In [7]:
# Load the prepared dataset object from disk.
# SECURITY: torch.load unpickles the file, and unpickling can execute arbitrary
# code — only point DATA_FILE at a file from a trusted source.
dataset = torch.load(DATA_FILE)

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['split', 'text', 'labels', 'sentence', 'topic_and_full_sentence', 'topic_full_sentence_stuctural_fts', 'topic_full_sentence_structural_fts_combined', 'feature_tensor'],
        num_rows: 4709
    })
    test: Dataset({
        features: ['split', 'text', 'labels', 'sentence', 'topic_and_full_sentence', 'topic_full_sentence_stuctural_fts', 'topic_full_sentence_structural_fts_combined', 'feature_tensor'],
        num_rows: 1258
    })
})

In [9]:
dataset['train']['topic_full_sentence_structural_fts_combined'][230]

'Topic: Some young adults want independence from their parents quickly. Sentence: There will not be such worries when young adults live in their own home, because parents will take care for them. Structural features: Two. No. No. No. No.'

In [10]:
dataset['train'] = dataset['train'].flatten_indices()

Flattening the indices:   0%|          | 0/5 [00:00<?, ?ba/s]

In [11]:
dataset['test'] = dataset['test'].flatten_indices()

Flattening the indices:   0%|          | 0/2 [00:00<?, ?ba/s]

In [12]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [13]:
label_names = ['Claim', 'Premise', 'MajorClaim']
label_nb = len(label_names)
labels = ClassLabel(num_classes=label_nb, names=label_names)

In [14]:
labels

ClassLabel(num_classes=3, names=['Claim', 'Premise', 'MajorClaim'], names_file=None, id=None)

In [15]:
def tokenize(batch):
    """Tokenise a batch of examples and attach integer class labels.

    Reads the 'topic_full_sentence_structural_fts_combined' text column,
    encodes it with the module-level `tokenizer` (truncated to 512 tokens,
    with padding enabled) and converts the string 'labels' column to class
    ids through the module-level `labels` ClassLabel.

    Returns the tokenizer output with an added 'labels' entry.
    """
    encoded = tokenizer(
        batch['topic_full_sentence_structural_fts_combined'],
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded['labels'] = labels.str2int(batch['labels'])
    return encoded

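# NOTE(review): this comment used to say the input is just the text, but `tokenize` reads the combined topic + sentence + structural-features column. If the results are nice, check transfer with text + topic.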

In [16]:
dataset = dataset.map(tokenize, batched=True)



  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [17]:
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'feature_tensor', 'input_ids', 'labels', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts'],
        num_rows: 4709
    })
    test: Dataset({
        features: ['attention_mask', 'feature_tensor', 'input_ids', 'labels', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts'],
        num_rows: 1258
    })
})

In [19]:
train_dataset = dataset['train']#.shuffle(seed=42)
test_dataset = dataset['test']#.shuffle(seed=42)

# train_val_datasets = dataset['train'].train_test_split(train_size=0.8)
# train_dataset = train_val_datasets['train']
# val_dataset = train_val_datasets['test']

In [20]:
dataset_d = {}
dataset_d['train'] = train_dataset
dataset_d['test'] = test_dataset
# dataset_d['val'] = val_dataset

In [21]:
test_dataset

Dataset({
    features: ['attention_mask', 'feature_tensor', 'input_ids', 'labels', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts'],
    num_rows: 1258
})

In [22]:
# 4709, 1258

In [23]:
tokenizer.decode(dataset['train'][2945]['input_ids'])

"[CLS] topic : what's more important : hard work or luck? sentence : it is not for which ronaldo is more fortune than me. structural features : two. no. no. no. no. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

In [24]:
# sanity check
set(dataset_d['train']['split'])

{'TRAIN'}

In [25]:
# sanity check
set(dataset_d['test']['split'])

{'TEST'}

## load model

In [26]:
# Load the fine-tuned classifier whose raw outputs are extracted below.
# The folder and the number of labels come from the notebook-level constants
# instead of being repeated as literals.
# NOTE(review): the training section of this notebook saves its model as
# 'best-model-for-probs', while this cell loads 'best-model-probs' — confirm
# that this is the intended checkpoint.
model_file = os.path.join(RESULTS_FOLDER, 'best-model-probs')
# model_file = os.path.join(RESULTS_FOLDER, 'checkpoint-1500')

model = BertForSequenceClassification.from_pretrained(model_file, num_labels=label_nb)
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [27]:
trainer = Trainer(model, data_collator=DataCollatorWithPadding(tokenizer))

In [28]:
test_raw_preds, test_labels, _ = trainer.predict(test_dataset)
test_preds = np.argmax(test_raw_preds, axis=1)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: topic_and_full_sentence, split, sentence, topic_full_sentence_stuctural_fts, topic_full_sentence_structural_fts_combined, feature_tensor, text.
***** Running Prediction *****
  Num examples = 1258
  Batch size = 8


In [29]:
# Label the report rows with the class names instead of bare integer ids.
# Passing `labels=` keeps rows and names aligned even if a class is missing
# from the predictions or references.
print(classification_report(
    test_labels,
    test_preds,
    labels=list(range(label_nb)),
    target_names=label_names,
))

              precision    recall  f1-score   support

           0       0.67      0.71      0.69       301
           1       0.93      0.88      0.90       805
           2       0.79      0.92      0.85       152

    accuracy                           0.84      1258
   macro avg       0.80      0.84      0.82      1258
weighted avg       0.85      0.84      0.85      1258



In [30]:
train_raw_preds, train_labels, _ = trainer.predict(train_dataset)
train_preds = np.argmax(train_raw_preds, axis=1)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: topic_and_full_sentence, split, sentence, topic_full_sentence_stuctural_fts, topic_full_sentence_structural_fts_combined, feature_tensor, text.
***** Running Prediction *****
  Num examples = 4709
  Batch size = 8


In [31]:
# Label the report rows with the class names instead of bare integer ids.
# Passing `labels=` keeps rows and names aligned even if a class is missing
# from the predictions or references.
print(classification_report(
    train_labels,
    train_preds,
    labels=list(range(label_nb)),
    target_names=label_names,
))

              precision    recall  f1-score   support

           0       0.64      0.69      0.66      1173
           1       0.92      0.87      0.89      2957
           2       0.77      0.90      0.83       579

    accuracy                           0.83      4709
   macro avg       0.78      0.82      0.80      4709
weighted avg       0.83      0.83      0.83      4709



In [32]:
# this is the correct softmax thing

# x = torch.softmax(torch.tensor(train_raw_preds[0]), 0)

In [33]:
# Sanity check

list(train_labels) == list(dataset['train']['labels']) 

True

In [34]:
# Sanity check

list(test_labels) == list(dataset['test']['labels'])

True

### dataset work

In [39]:
train_raw_preds.shape

(4709, 3)

In [41]:
# this is the correct softmax thing

x = torch.softmax(torch.tensor(train_raw_preds[0]), 0)

In [42]:
x

tensor([0.0393, 0.0224, 0.9383])

In [35]:
# NOTE: despite the name, this holds the raw prediction outputs returned by
# `trainer.predict` for the train split — no softmax is applied here.
train_probs = np.array(train_raw_preds)

In [36]:
# NOTE: despite the name, this holds the raw prediction outputs returned by
# `trainer.predict` for the test split — no softmax is applied here.
test_probs = np.array(test_raw_preds)

In [115]:
train_probs[0]

array([-0.82, -1.38,  2.36], dtype=float32)

In [107]:
train_probs = np.round(train_probs, 2)

In [110]:
test_probs = np.round(test_probs, 2)

In [111]:
test_probs

array([[ 0.38, -1.06,  0.34],
       [-0.05, -2.19,  1.46],
       [ 0.59,  3.03, -2.52],
       ...,
       [ 0.99,  2.18, -2.6 ],
       [ 0.92,  3.2 , -2.66],
       [-0.74, -2.08,  2.2 ]], dtype=float32)

In [118]:
# One string per training example: NumPy's text form of the score row,
# e.g. '[-0.82 -1.38  2.36]'. Iterating over the array itself avoids
# hard-coding the number of rows.
new_list_train = [str(row) for row in train_probs]

In [120]:
new_list_train

'[-0.82 -1.38  2.36]'

In [121]:
import pandas as pd

In [122]:
df_train_probs = pd.DataFrame(new_list_train) 

In [123]:
df_train_probs

Unnamed: 0,0
0,[-0.82 -1.38 2.36]
1,[ 1.54 0.69 -2.34]
2,[ 0.32 3.74 -2.35]
3,[ 0.5 3.78 -2.47]
4,[ 0.81 3.53 -2.65]
...,...
4704,[ 0.71 3.29 -2.78]
4705,[ 0.84 2.61 -2.78]
4706,[ 0.84 2.61 -2.78]
4707,[ 1.29 1.68 -2.75]


In [124]:
# One string per test example: NumPy's text form of the score row,
# e.g. '[ 0.38 -1.06  0.34]'. Iterating over the array itself avoids
# hard-coding the number of rows.
new_list_test = [str(row) for row in test_probs]

In [125]:
new_list_test

['[ 0.38 -1.06  0.34]',
 '[-0.05 -2.19  1.46]',
 '[ 0.59  3.03 -2.52]',
 '[ 0.09  3.73 -2.09]',
 '[-0.02  3.64 -2.  ]',
 '[ 1.45  0.05 -1.77]',
 '[ 0.56  3.38 -2.62]',
 '[-0.26  3.56 -1.69]',
 '[ 0.14  3.8  -2.19]',
 '[ 1.42  0.34 -2.  ]',
 '[-0.32 -2.34  1.73]',
 '[-0.78 -1.45  2.35]',
 '[ 1.53  0.65 -2.34]',
 '[ 0.82  3.02 -2.61]',
 '[ 1.56  0.55 -2.29]',
 '[ 0.57  3.71 -2.53]',
 '[ 0.57  3.71 -2.53]',
 '[ 1.01  2.55 -2.73]',
 '[ 1.41  0.95 -2.42]',
 '[ 0.57  3.78 -2.54]',
 '[ 1.4   0.49 -2.03]',
 '[ 0.46 -2.05  0.92]',
 '[ 0.76 -1.74  0.29]',
 '[-0.16 -2.31  1.57]',
 '[ 1.12 -0.93 -0.86]',
 '[ 1.49  0.6  -2.28]',
 '[ 0.58  3.72 -2.53]',
 '[ 0.11  3.64 -2.13]',
 '[ 1.32  0.58 -2.07]',
 '[ 0.9   3.36 -2.76]',
 '[ 1.43  0.81 -2.36]',
 '[-0.03  3.49 -2.07]',
 '[ 0.36  3.55 -2.32]',
 '[ 1.52  0.49 -2.25]',
 '[ 0.31  3.82 -2.37]',
 '[ 0.25  3.69 -2.32]',
 '[ 0.05  3.71 -2.2 ]',
 '[ 0.43  3.75 -2.52]',
 '[ 0.14  3.67 -2.25]',
 '[ 0.36  3.77 -2.44]',
 '[ 0.56  3.46 -2.44]',
 '[-0.71 -2.19  

In [126]:
df_test_probs = pd.DataFrame(new_list_test)

In [127]:
df_test_probs

Unnamed: 0,0
0,[ 0.38 -1.06 0.34]
1,[-0.05 -2.19 1.46]
2,[ 0.59 3.03 -2.52]
3,[ 0.09 3.73 -2.09]
4,[-0.02 3.64 -2. ]
...,...
1253,[-0.07 3.36 -1.88]
1254,[ 1.51 0.57 -2.19]
1255,[ 0.99 2.18 -2.6 ]
1256,[ 0.92 3.2 -2.66]


In [128]:
4709+1258

5967

In [129]:
train_dataset

Dataset({
    features: ['attention_mask', 'feature_tensor', 'input_ids', 'labels', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts'],
    num_rows: 4709
})

In [130]:
train_dataset.to_csv("train_dataset.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

9520416

In [131]:
df_train = pd.read_csv("train_dataset.csv")

In [132]:
df_train['topic_full_sentence_structural_fts_combined'][0]

'Topic: Should students be taught to compete or to cooperate? Sentence: From this point of view, I firmly believe that we should attach more importance to cooperation during primary education. Structural features: One. Yes. No. Yes. Yes.'

In [133]:
df_train.shape

(4709, 12)

In [134]:
test_dataset.to_csv('test_dataset.csv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

2403992

In [135]:
df_test = pd.read_csv("test_dataset.csv")

In [136]:
df_test.shape

(1258, 12)

In [137]:
df_train['class_probs'] = df_train_probs

In [138]:
df_train['class_probs'][0]

'[-0.82 -1.38  2.36]'

In [191]:
df_test['class_probs'] = df_test_probs

In [192]:
df_test['class_probs'][771]

'[ 0.99  3.05 -2.72]'

In [141]:
try_s = df_test['class_probs'][0]

In [144]:
try_s = try_s.replace('[', '')

In [146]:
try_s = try_s.replace(']','')

In [148]:
try_s = try_s.lstrip()

In [149]:
try_s

'0.38 -1.06  0.34'

In [153]:
try_l = try_s.split()

In [154]:
try_l

['0.38', '-1.06', '0.34']

In [155]:
new_str = try_l[0] + ", " + try_l[1] + ', ' + try_l[2]

In [156]:
new_str

'0.38, -1.06, 0.34'

In [None]:
# now make the concat function (done)
# round the float (done)
# run it on train dataset (done)
# run it on test dataset (done)
# make dataset object out of it

In [103]:
df_train.columns

Index(['Unnamed: 0', 'attention_mask', 'feature_tensor', 'input_ids', 'labels',
       'sentence', 'split', 'text', 'token_type_ids',
       'topic_and_full_sentence',
       'topic_full_sentence_structural_fts_combined',
       'topic_full_sentence_stuctural_fts', 'class_probs'],
      dtype='object')

In [104]:
df_test.columns

Index(['Unnamed: 0', 'attention_mask', 'feature_tensor', 'input_ids', 'labels',
       'sentence', 'split', 'text', 'token_type_ids',
       'topic_and_full_sentence',
       'topic_full_sentence_structural_fts_combined',
       'topic_full_sentence_stuctural_fts', 'class_probs'],
      dtype='object')

In [161]:
def concat_probs_w_fts(x):
    """Append a row's class scores to its combined feature text.

    Parameters
    ----------
    x : row-like object
        Must expose two attributes:
        ``topic_full_sentence_structural_fts_combined`` (the text) and
        ``class_probs`` (the string form of a 1-D score array, e.g.
        ``'[ 0.38 -1.06  0.34]'``).

    Returns
    -------
    str
        ``'<text> Class probabilities: v1, v2, ...'`` with every value found
        in ``class_probs``, in order, separated by ``', '``.

    Raises
    ------
    ValueError
        If ``class_probs`` contains no values at all.
    """
    text_s = x.topic_full_sentence_structural_fts_combined

    # Drop the array brackets; split() then discards the irregular spacing
    # NumPy uses to align columns, leaving only the numeric tokens.
    values = x.class_probs.replace('[', '').replace(']', '').split()
    if not values:
        raise ValueError('class_probs contains no values: %r' % (x.class_probs,))

    # Join however many scores there are instead of assuming exactly three.
    return text_s + ' Class probabilities: ' + ', '.join(values)
    
    
    

In [162]:
concat_probs_w_fts(df_train.iloc[0])

'Topic: Should students be taught to compete or to cooperate? Sentence: From this point of view, I firmly believe that we should attach more importance to cooperation during primary education. Structural features: One. Yes. No. Yes. Yes. Class probabilities: -0.82, -1.38, 2.36'

In [163]:
# Build the text-plus-scores feature for every training row.
df_train['strct_fts_w_probs'] = df_train.apply(concat_probs_w_fts, axis=1)

In [166]:
train_probs[190]

array([ 1.5 ,  0.54, -2.27], dtype=float32)

In [168]:
df_train['strct_fts_w_probs'][190], df_train['topic_full_sentence_structural_fts_combined'][190]

('Topic: Television is the culprit for destroying communication between friends & family. Sentence: In spite of enjoying watching television shows, it is really time consuming task. Structural features: Two. No. No. Yes. No. Class probabilities: 1.5, 0.54, -2.27',
 'Topic: Television is the culprit for destroying communication between friends & family. Sentence: In spite of enjoying watching television shows, it is really time consuming task. Structural features: Two. No. No. Yes. No.')

In [None]:
# df_train is correct!

In [169]:
# Build the text-plus-scores feature for every test row.
df_test['strct_fts_w_probs'] = df_test.apply(concat_probs_w_fts, axis=1)

In [196]:
train_probs[230]

array([ 0.83,  2.81, -2.82], dtype=float32)

In [197]:
df_train['strct_fts_w_probs'][230]

'Topic: Some young adults want independence from their parents quickly. Sentence: There will not be such worries when young adults live in their own home, because parents will take care for them. Structural features: Two. No. No. No. No. Class probabilities: 0.83, 2.81, -2.82'

In [None]:
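# df_test correct! (NOTE(review): the spot check directly above inspects df_train row 230, not df_test — confirm df_test was checked separately.)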

In [175]:
#clean dfs

In [177]:
df_train.columns

Index(['Unnamed: 0', 'attention_mask', 'feature_tensor', 'input_ids', 'labels',
       'sentence', 'split', 'text', 'token_type_ids',
       'topic_and_full_sentence',
       'topic_full_sentence_structural_fts_combined',
       'topic_full_sentence_stuctural_fts', 'class_probs',
       'strct_fts_w_probs'],
      dtype='object')

In [180]:
df_train = df_train[['split', 'text', 'labels', 'sentence', 'topic_and_full_sentence', 'topic_full_sentence_stuctural_fts', 'topic_full_sentence_structural_fts_combined', 'feature_tensor', 'strct_fts_w_probs']]

In [181]:
df_train.columns

Index(['split', 'text', 'labels', 'sentence', 'topic_and_full_sentence',
       'topic_full_sentence_stuctural_fts',
       'topic_full_sentence_structural_fts_combined', 'feature_tensor',
       'strct_fts_w_probs'],
      dtype='object')

In [182]:
df_test = df_test[['split', 'text', 'labels', 'sentence', 'topic_and_full_sentence', 'topic_full_sentence_stuctural_fts', 'topic_full_sentence_structural_fts_combined', 'feature_tensor', 'strct_fts_w_probs']]

In [184]:
df_test.columns

Index(['split', 'text', 'labels', 'sentence', 'topic_and_full_sentence',
       'topic_full_sentence_stuctural_fts',
       'topic_full_sentence_structural_fts_combined', 'feature_tensor',
       'strct_fts_w_probs'],
      dtype='object')

In [185]:
from datasets import DatasetDict

In [186]:
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)

In [187]:
dataset = DatasetDict({"train": dataset_train, "test": dataset_test})

In [188]:
dataset

DatasetDict({
    train: Dataset({
        features: ['split', 'text', 'labels', 'sentence', 'topic_and_full_sentence', 'topic_full_sentence_stuctural_fts', 'topic_full_sentence_structural_fts_combined', 'feature_tensor', 'strct_fts_w_probs'],
        num_rows: 4709
    })
    test: Dataset({
        features: ['split', 'text', 'labels', 'sentence', 'topic_and_full_sentence', 'topic_full_sentence_stuctural_fts', 'topic_full_sentence_structural_fts_combined', 'feature_tensor', 'strct_fts_w_probs'],
        num_rows: 1258
    })
})

In [189]:
torch.save(dataset, os.path.join('/notebooks/cascade_bert', 'pe_dataset_w_bert_probs.pt'))

In [86]:
dataset['train'] = dataset['train'].add_column('bert_probs_z', new_list_train)

In [87]:
dataset['train']['bert_probs_z']

TypeError: can't convert np.ndarray of type numpy.str_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [82]:
len(nd)

235

In [80]:
dataset['train'] = dataset['train'].add_column('bert_probs_4', nd)

ArrowInvalid: Added column's length must match table's length. Expected length 4709 but got length 235

In [38]:
dataset['train']

Dataset({
    features: ['attention_mask', 'feature_tensor', 'input_ids', 'labels', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts'],
    num_rows: 4709
})

In [39]:
# Store each of the three score columns as its own dataset column
# ('bert_probs1', 'bert_probs2', 'bert_probs3') in both splits.
for col_no in (1, 2, 3):
    col_name = 'bert_probs' + str(col_no)
    col_idx = col_no - 1  # column names are 1-based, array columns 0-based
    dataset['train'] = dataset['train'].add_column(col_name, train_probs[:, col_idx])
    dataset['test'] = dataset['test'].add_column(col_name, test_probs[:, col_idx])

In [40]:
dataset['train'] 

Dataset({
    features: ['attention_mask', 'feature_tensor', 'input_ids', 'labels', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts', 'bert_probs1', 'bert_probs2', 'bert_probs3'],
    num_rows: 4709
})

In [41]:
dataset['train']['bert_probs1']

tensor([-0.8163,  1.5404,  0.3230,  ...,  0.8422,  1.2905, -0.6934])

In [42]:
# concatenate to get new feature

In [43]:
# Build one 'p1 p2 p3' string per training example from the three score columns.
# Fix: each position now reads its own column ('bert_probs1', 'bert_probs2',
# 'bert_probs3'); previously 'bert_probs1' was repeated three times.
# Each column is fetched from the dataset once (not once per row), and the
# row count follows the data instead of being hard-coded.
# Relies on the columns being returned as tensors, hence `.tolist()`.
train_score_columns = [
    dataset['train']['bert_probs' + str(col_no)].tolist() for col_no in (1, 2, 3)
]
train_probs_list = [
    ' '.join(str(round(score, 2)) for score in row_scores)
    for row_scores in zip(*train_score_columns)
]

In [44]:
len(train_probs_list)

4709

In [45]:
# Build one 'p1 p2 p3' string per test example from the three score columns.
# Fix: each position now reads its own column ('bert_probs1', 'bert_probs2',
# 'bert_probs3'); previously 'bert_probs1' was repeated three times.
# Each column is fetched from the dataset once (not once per row), and the
# row count follows the data instead of being hard-coded.
# Relies on the columns being returned as tensors, hence `.tolist()`.
test_score_columns = [
    dataset['test']['bert_probs' + str(col_no)].tolist() for col_no in (1, 2, 3)
]
test_probs_list = [
    ' '.join(str(round(score, 2)) for score in row_scores)
    for row_scores in zip(*test_score_columns)
]

In [46]:
type(test_probs_list)

list

In [47]:
dataset['train'] = dataset['train'].add_column('full_bert_probs_2', train_probs_list)

In [48]:
dataset['train']['full_bert_probs_2']

TypeError: can't convert np.ndarray of type numpy.str_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [421]:
dataset['train'] = dataset['train'].remove_columns(['bert_probs_str'])

In [422]:
dataset['test'] = dataset['test'].remove_columns(['bert_probs_str'])

In [423]:
dataset['train']

Dataset({
    features: ['attention_mask', 'feature_tensor', 'input_ids', 'labels', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts', 'bert_probs1', 'bert_probs2', 'bert_probs3'],
    num_rows: 4709
})

In [424]:
# Add a 'bert_probs_str' column ('p1 p2 p3', each rounded to two decimals)
# to both splits.
# NOTE(review): in the recorded run, reading this string column back raised a
# TypeError about converting numpy.str_ — presumably the dataset's output
# format has to be reset before string columns can be read; confirm.
for split in ['train', 'test']:

    # Round each score column and convert it to an array of strings.
    a = np.array(dataset[split]['bert_probs1']).round(2).astype(str)
    b = np.array(dataset[split]['bert_probs2']).round(2).astype(str)
    c = np.array(dataset[split]['bert_probs3']).round(2).astype(str)

    # Element-wise string concatenation: a + ' ' + b + ' ' + c.
    # np.char.add is the public name for np.core.defchararray.add.
    joined = np.char.add(np.char.add(np.char.add(np.char.add(a, ' '), b), ' '), c)

    # .tolist() yields plain Python str objects rather than numpy.str_ scalars.
    dataset[split] = dataset[split].add_column('bert_probs_str', joined.tolist())

<class 'list'>
<class 'list'>


In [425]:
dataset['train']

Dataset({
    features: ['attention_mask', 'feature_tensor', 'input_ids', 'labels', 'sentence', 'split', 'text', 'token_type_ids', 'topic_and_full_sentence', 'topic_full_sentence_structural_fts_combined', 'topic_full_sentence_stuctural_fts', 'bert_probs1', 'bert_probs2', 'bert_probs3', 'bert_probs_str'],
    num_rows: 4709
})

In [426]:
type(dataset['train']['bert_probs_str'])

TypeError: can't convert np.ndarray of type numpy.str_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [490]:
tmp_1 = np.array(['a']*len(dataset['train']))

In [491]:
tmp_1

array(['a', 'a', 'a', ..., 'a', 'a', 'a'], dtype='<U1')

In [492]:
dataset['train'] = dataset['train'].add_column('bert_probs_str_dummy_4', tmp_1)

In [494]:
dataset['train']['bert_probs_str_dummy_4']

TypeError: can't convert np.ndarray of type numpy.str_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [22]:
# global variables
NUM_LABELS = labels.num_classes
BATCH_SIZE = 48
NB_EPOCHS = 6

In [23]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)
model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [24]:
# https://huggingface.co/transformers/main_classes/trainer.html
class CustomTrainer(Trainer):
    """Trainer subclass whose loss is a plain cross-entropy on the model logits."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Run the model on `inputs` and return the cross-entropy loss.

        Returns `(loss, outputs)` when `return_outputs` is true, otherwise
        only `loss`.
        """
        outputs = model(**inputs)
        # Unweighted for now; class weights could be passed here via `weight=`.
        criterion = nn.CrossEntropyLoss()
        loss = criterion(outputs.get('logits'), inputs.get("labels"))
        if return_outputs:
            return (loss, outputs)
        return loss

In [25]:
metric = load_metric('f1')

def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for an evaluation pass.

    `eval_pred` unpacks into `(logits, labels)`; the predicted class is the
    argmax over the last axis of `logits`. The score is computed with the
    module-level `metric` object.
    """
    logits, labels = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
        average='macro',
    )

Downloading:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

In [26]:
training_args = TrainingArguments(

    # output
    output_dir=RESULTS_FOLDER,

    # params
    num_train_epochs=NB_EPOCHS,               # nb of epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # cf. paper Sun et al.
    learning_rate=1e-5,#2e-5,                 # cf. paper Sun et al.
#     warmup_steps=500,                         # number of warmup steps for learning rate scheduler
    warmup_ratio=0.1,                         # cf. paper Sun et al.
    weight_decay=0.01,                        # strength of weight decay

    # eval
    evaluation_strategy="steps",              # cf. paper Sun et al.
    eval_steps=20,                            # cf. paper Sun et al.

    # log
    logging_dir="/notebooks/Results/bert_sequence_classification/tb_logs",
    logging_strategy='steps',
    logging_steps=20,

    # save
    save_strategy='steps',
    save_total_limit=2,
    # Checkpoint at every evaluation. With the default of 500 no checkpoint is
    # ever written during this run (it has fewer than 500 optimisation steps),
    # so `load_best_model_at_end` would have no best checkpoint to restore.
    save_steps=20,
    load_best_model_at_end=True,              # cf. paper Sun et al.
    # metric_for_best_model='eval_loss'
    metric_for_best_model='f1'
)

In [27]:
# Build the trainer used for fine-tuning: custom loss (CustomTrainer) and
# macro-F1 evaluation (compute_metrics).
# NOTE(review): `val_dataset` is only assigned in commented-out code earlier in
# this notebook (the train_test_split lines) — presumably that split has to be
# re-enabled before this cell can run on a fresh kernel; confirm.
trainer = CustomTrainer( # Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [28]:
results = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: feature_tensor, topic_full_sentence_structural_fts_combined, topic_full_sentence_stuctural_fts, sentence, text, split, topic_and_full_sentence.
***** Running training *****
  Num examples = 3767
  Num Epochs = 6
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 474


Step,Training Loss,Validation Loss,F1
20,0.9633,0.899323,0.257012
40,0.8659,0.787196,0.257279
60,0.754,0.61082,0.569856
80,0.5927,0.546675,0.739689
100,0.5533,0.528088,0.746311
120,0.5318,0.496619,0.752567
140,0.4963,0.498779,0.75892
160,0.5118,0.475554,0.731007
180,0.4692,0.477034,0.703705
200,0.4631,0.456184,0.784807


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: feature_tensor, topic_full_sentence_structural_fts_combined, topic_full_sentence_stuctural_fts, sentence, text, split, topic_and_full_sentence.
***** Running Evaluation *****
  Num examples = 942
  Batch size = 48
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: feature_tensor, topic_full_sentence_structural_fts_combined, topic_full_sentence_stuctural_fts, sentence, text, split, topic_and_full_sentence.
***** Running Evaluation *****
  Num examples = 942
  Batch size = 48
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: feature_tensor, topic_full_sentence_structural_fts_combined, topic_full_sentence_stuctural_fts, sentence, text, split, topic_and_fu

In [29]:
# Save the trainer's current model under the shared results folder
# (RESULTS_FOLDER is the same directory that was previously spelled out here).
trainer.save_model(os.path.join(RESULTS_FOLDER, 'best-model-for-probs'))

Saving model checkpoint to /notebooks/cascade_bert/saved_models/best-model-for-probs
Configuration saved in /notebooks/cascade_bert/saved_models/best-model-for-probs/config.json
Model weights saved in /notebooks/cascade_bert/saved_models/best-model-for-probs/pytorch_model.bin
tokenizer config file saved in /notebooks/cascade_bert/saved_models/best-model-for-probs/tokenizer_config.json
Special tokens file saved in /notebooks/cascade_bert/saved_models/best-model-for-probs/special_tokens_map.json


In [30]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [31]:
test_trainer = Trainer(model, data_collator=DataCollatorWithPadding(tokenizer))
test_raw_preds, test_labels, _ = test_trainer.predict(test_dataset)
test_preds = np.argmax(test_raw_preds, axis=1)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: feature_tensor, topic_full_sentence_structural_fts_combined, topic_full_sentence_stuctural_fts, sentence, text, split, topic_and_full_sentence.
***** Running Prediction *****
  Num examples = 1258
  Batch size = 8


In [32]:
sum(test_preds)

671

In [33]:
target_name = labels.int2str([0,1,2])
print(classification_report(test_labels, test_preds, target_names=target_name))

              precision    recall  f1-score   support

     Premise       0.93      0.89      0.91       805
       Claim       0.69      0.67      0.68       301
  MajorClaim       0.77      0.95      0.85       152

    accuracy                           0.85      1258
   macro avg       0.80      0.84      0.81      1258
weighted avg       0.85      0.85      0.85      1258

