In [None]:
import torch

from datasets import (Dataset, DatasetDict, load_dataset)
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          PreTrainedModel, BertModel, BertForSequenceClassification,
                          TrainingArguments, Trainer)
from sklearn.metrics import f1_score

import numpy as np

from aspects.datasets import (RedditAnnotatedDataset, ValueEvalDataset, ValueNetDataset)
from aspects.datasets.utils import cast_dataset_to_hf, hf_dataset_tokenize
import copy
import pandas as pd


In [None]:
# Inference-only setup: the notebook never trains, it only calls
# `Trainer.predict` with an already fine-tuned checkpoint.
proj_dir = "."
batch_size = 2

args = TrainingArguments(
    output_dir=".",
    do_train=False,
    do_eval=False,
    do_predict=True,
    per_device_eval_batch_size=batch_size
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Checkpoint stored under `<proj_dir>/kiesel2`, loaded with a 20-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(f"{proj_dir}/kiesel2", num_labels=20)
# Move to the GPU only when one is present so the notebook also runs on
# CPU-only machines instead of raising inside `.cuda()`.
if torch.cuda.is_available():
    model = model.cuda()

multi_trainer = Trainer(
    model,
    args,
    tokenizer=tokenizer
)

In [None]:
# Load the value names of every taxonomy level and keep the level-2 names as `labels`


import json

def load_json_file(filepath):
    """Load and return the parsed content of the JSON file at `filepath`.

    The file is decoded as UTF-8 rather than with the platform's default
    encoding, so non-ASCII text is read the same way on every machine.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized document, exactly as produced by `json.load`.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def load_values_from_json(filepath=None):
    """Collect the distinct value names of every taxonomy level from a values JSON file.

    Args:
        filepath: Path of the JSON file to read. When omitted, the ValueEval
            `values.json` below the notebook-level `proj_dir` is used.

    Returns:
        dict with the keys "1", "2", "3", "4a" and "4b"; each maps to the
        sorted list of distinct names found at that level ("name", "level2",
        "level3", "level4a" and "level4b" of every entry under "values").
    """
    if filepath is None:
        filepath = f"{proj_dir}/data/valueeval/dataset-identifying-the-human-values-behind-arguments/values.json"
    json_values = load_json_file(filepath)

    # Sets de-duplicate names that occur under more than one entry.
    values = {"1": set(), "2": set(), "3": set(), "4a": set(), "4b": set()}
    for value in json_values["values"]:
        # "name" and "level2" hold a single name each ...
        values["1"].add(value["name"])
        values["2"].add(value["level2"])
        # ... while the deeper levels hold collections of names.
        values["3"].update(value["level3"])
        values["4a"].update(value["level4a"])
        values["4b"].update(value["level4b"])

    return {level: sorted(names) for level, names in values.items()}

# Value names per taxonomy level, keyed "1", "2", "3", "4a" and "4b".
values = load_values_from_json()
# Sorted level-2 names. NOTE(review): `labels` is not referenced again in the
# cells visible here — confirm whether it is still needed.
labels = values["2"]


In [4]:
# Build the ValueArg evaluation data from the rows of the ValueEval corpus
# whose `split` field is 'test'. (An earlier variant obtained the indices via
# `dataset_eval.get_splits()` instead.)

dataset_eval = ValueEvalDataset(
    f"{proj_dir}/data/valueeval/dataset-identifying-the-human-values-behind-arguments/",
    cast_to_valuenet=True,
    return_predefined_splits=True,
)

# Positions of every example that belongs to the test split.
test_set_idx = [i for i, elem in enumerate(dataset_eval) if elem['split'] == 'test']

test_set = dataset_eval[test_set_idx]
print(type(test_set))
test_set.reset_index(drop=True, inplace=True)
print(test_set[1:10])

# Gather, per text, every value whose row carries `orig_label == 1`.
# A text only gets an entry once its first such row is seen.
new_records = {}
for _, row in test_set.iterrows():
    if row.orig_label != 1:
        continue
    new_records.setdefault(row.text, {'labels': []})['labels'].append(row.value)

# The texts are the dict keys, so they arrive as the index; move them into a
# regular `text` column and renumber the rows.
new_df = pd.DataFrame.from_dict(new_records, orient='index')
new_df['text'] = new_df.index
new_df.reset_index(drop=True, inplace=True)
test_set = new_df
print(test_set)

# Keep the label lists aside and hand only the text column to the HF dataset.
TRUE_LABELS = test_set['labels'].copy()
print(TRUE_LABELS)
del test_set['labels']
test_dataset = Dataset.from_dict(test_set.to_dict(orient='list'))


def tokenize(example):
    """Tokenize the `text` field of a batch with the notebook-level `tokenizer`."""
    return tokenizer(example["text"], truncation=True, padding=True)


test_dataset_valuearg = copy.copy(
    test_dataset.map(tokenize, batched=True, batch_size=batch_size)
)
true_labels_valuearg = copy.copy(TRUE_LABELS)


<class 'pandas.core.frame.DataFrame'>
                                                text  orig_label  \
1  student loans set children up to be valuable c...           1   
2  student loans set children up to be valuable c...           1   
3  student loans set children up to be valuable c...           0   
4  student loans set children up to be valuable c...           0   
5  student loans set children up to be valuable c...           0   
6  student loans set children up to be valuable c...           0   
7  student loans set children up to be valuable c...           0   
8  student loans set children up to be valuable c...           0   
9  student loans set children up to be valuable c...           0   

            value split  
1  self-direction  test  
2        security  test  
3           power  test  
4       tradition  test  
5    universalism  test  
6     stimulation  test  
7        hedonism  test  
8      conformity  test  
9     benevolence  test  
                     

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 374/374 [00:00<00:00, 414.44ba/s]


In [5]:
# Build the ValueNet evaluation data from the dataset's predefined test split.
dataset_eval = ValueNetDataset(
    f"{proj_dir}/data/valuenet/",
    return_predefined_splits=True
)
print(dataset_eval[0])
_, _, test_set_idx = dataset_eval.get_splits()
test_set = dataset_eval[test_set_idx]
test_set.reset_index(drop=True, inplace=True)

# Gather, per text, every value whose row carries an `orig_label` of 1 or -1.
# A text only gets an entry once its first such row is seen.
new_records = {}
for _, row in test_set.iterrows():
    if row.orig_label == 1 or row.orig_label == -1:
        new_records.setdefault(row.text, {'labels': []})['labels'].append(row.value)

# The texts are the dict keys, so they arrive as the index; move them into a
# regular `text` column and renumber the rows.
new_df = pd.DataFrame.from_dict(new_records, orient='index')
new_df['text'] = new_df.index
new_df.reset_index(drop=True, inplace=True)
test_set = new_df
print(test_set)

# Keep the label lists aside and hand only the text column to the HF dataset.
TRUE_LABELS = test_set.labels.copy()
print(TRUE_LABELS)
test_set.drop(columns=['labels'], inplace=True)
test_dataset = Dataset.from_dict((test_set).to_dict('list'))


def tokenize(example):
    """Tokenize the `text` field of a batch with the notebook-level `tokenizer`."""
    return tokenizer(example["text"], truncation=True, padding=True)


# Tokenize exactly once and reuse the result; mapping `tokenize` over the
# already tokenized dataset a second time only repeats the same work.
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=batch_size)

test_dataset_valuenet = copy.copy(test_dataset)
true_labels_valuenet = copy.copy(TRUE_LABELS)


scenario           [POWER] After accomplishing every task I cross...
orig_label                                                         1
text                After accomplishing every task I cross each i...
new_class_label                                               power1
value                                                          power
Name: 0, dtype: object
             labels                                               text
0     [benevolence]   'ruining' a project because Koreans find it o...
1     [benevolence]                            **Help: I cut myself...
2        [security]   A friend let her stay with him until she got ...
3        [hedonism]               A girl in my class is very beautiful
4     [benevolence]                       A message for the one I love
...             ...                                                ...
1424  [benevolence]   yelling at my father to flush the goddamn toilet
1425   [conformity]                               yelling at my 

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 715/715 [00:01<00:00, 396.70ba/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 715/715 [00:01<00:00, 387.39ba/s]


In [6]:
def f1_score_per_label(y_pred, y_true, value_classes):
    """Compute the F1-score of every label column and their average.

    Args:
        y_pred: 2-D array-like of predictions, one column per entry of
            `value_classes`.
        y_true: 2-D array-like of ground-truth labels in the same layout.
        value_classes: Iterable of label names; the name at position `i`
            belongs to column `i`.

    Returns:
        dict mapping every label name to its F1-score rounded to two
        decimals, plus the key 'avg-f1-score' holding the mean of the
        per-label scores rounded to two decimals.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # `zero_division=0` is forwarded to `f1_score` for every column.
    raw_scores = {
        v: f1_score(y_true[:, i], y_pred[:, i], zero_division=0)
        for i, v in enumerate(value_classes)
    }

    f1_scores = {v: round(score, 2) for v, score in raw_scores.items()}
    # Average the unrounded scores and round once at the end; averaging the
    # already rounded values would let the per-label rounding errors add up.
    f1_scores['avg-f1-score'] = round(np.mean(list(raw_scores.values())), 2)

    return f1_scores


def get_score(test_dataset, test_labels, threshold=0.5, trainer=None):
    """Predict on `test_dataset` and print per-value F1-scores.

    The 20 model output columns are binarized, merged into ten coarse values
    (a value counts as predicted when any of its columns is active) and
    compared against `test_labels` with `f1_score_per_label`.

    Args:
        test_dataset: Dataset handed to `trainer.predict`; must support `len`.
        test_labels: Iterable with one collection of value names per example,
            in the same order as `test_dataset`.
        threshold: An output column is active when its score is strictly
            greater than this. NOTE(review): the scores are used exactly as
            returned by `predict` — presumably raw logits, in which case 0.5
            is not a probability cut-off; confirm the intended value.
        trainer: Object whose `predict(dataset)` result exposes `.predictions`.
            Defaults to the notebook-level `multi_trainer`.

    Returns:
        None. The dataset size, the shape of the binarized predictions and the
        score dict are printed.
    """
    if trainer is None:
        trainer = multi_trainer

    print(len(test_dataset))
    predictions = trainer.predict(test_dataset)
    preds = 1 * (np.asarray(predictions.predictions) > threshold)
    print(preds.shape)

    # Output columns that feed each coarse value. Columns 5 and 7 are not
    # mapped to any value.
    # NOTE(review): the indices presumably follow the alphabetically sorted
    # level-2 label list — confirm against the checkpoint's label order.
    value2column = {
        "achievement": [0],
        "benevolence": [1, 2],
        "conformity": [3, 4],
        "hedonism": [6],
        "power": [8, 9],
        "security": [10, 11],
        "self-direction": [12, 13],
        "stimulation": [14],
        "tradition": [15],
        "universalism": [16, 17, 18, 19]
    }

    # One 0/1 column per coarse value, computed for all rows at once instead
    # of iterating over (and decoding) every dataset example.
    predicted_schwartz_labels = np.stack(
        [preds[:, columns].any(axis=1) for columns in value2column.values()],
        axis=1,
    ).astype(int).tolist()

    # Multi-hot encode the ground truth in the same value order.
    true_labels_to_multilabel = [
        [1 if value in sample else 0 for value in value2column]
        for sample in test_labels
    ]

    print(f1_score_per_label(predicted_schwartz_labels, true_labels_to_multilabel, value2column.keys()))

In [7]:
# Score the model on the ValueEval ("ValueArg") test split and its label lists.
get_score(test_dataset_valuearg, true_labels_valuearg)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 748
  Batch size = 2


748


(748, 20)
{'achievement': 0.6, 'benevolence': 0.5, 'conformity': 0.33, 'hedonism': 0.22, 'power': 0.27, 'security': 0.43, 'self-direction': 0.36, 'stimulation': 0.0, 'tradition': 0.37, 'universalism': 0.62, 'avg-f1-score': 0.37}


In [8]:
# Score the same model on the ValueNet test split and its label lists.
get_score(test_dataset_valuenet, true_labels_valuenet)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1429
  Batch size = 2


1429
(1429, 20)
{'achievement': 0.26, 'benevolence': 0.38, 'conformity': 0.23, 'hedonism': 0.05, 'power': 0.04, 'security': 0.21, 'self-direction': 0.03, 'stimulation': 0.05, 'tradition': 0.12, 'universalism': 0.13, 'avg-f1-score': 0.15}


In [15]:
from datasets import concatenate_datasets
# Evaluate on the union of both test sets. Examples and labels are
# concatenated in the same order (ValueEval first, then ValueNet) so that row
# i of `concat_all` still belongs to entry i of `labels_all`.
concat_all = concatenate_datasets([test_dataset_valuearg, test_dataset_valuenet])
# `ignore_index=True` gives the combined labels a fresh 0..n-1 index instead of
# repeating the index labels of the two inputs.
labels_all = pd.concat([true_labels_valuearg, true_labels_valuenet], ignore_index=True)
assert len(concat_all) == len(labels_all), "examples and labels are misaligned"

# For a quick smoke test, evaluate `.select(range(20))` slices of the two
# datasets together with the first 20 labels of each instead.
get_score(concat_all, labels_all)


The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 2177
  Batch size = 2


2177
(2177, 20)
{'achievement': 0.52, 'benevolence': 0.42, 'conformity': 0.25, 'hedonism': 0.06, 'power': 0.2, 'security': 0.32, 'self-direction': 0.29, 'stimulation': 0.04, 'tradition': 0.25, 'universalism': 0.49, 'avg-f1-score': 0.28}
