In [13]:
import numpy as np
import pandas as pd
from small_text import TransformersDataset, TransformerModelArguments, LeastConfidence, SEALS,  TransformerBasedClassificationFactory as TransformerFactory, PoolBasedActiveLearner, random_initialization_balanced
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from synergy_dataset import Dataset, iter_datasets

from imblearn.over_sampling import RandomOverSampler, SMOTEN
from sklearn.model_selection import train_test_split

In [2]:
d = Dataset("Nelson_2002")
dataset = d.to_frame()
dataset = dataset.fillna(' ')

In [3]:
X = np.array(dataset['abstract'])
y = np.array(dataset['label_included'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
sampler = SMOTEN(random_state=42)
X_train_os, y_train_os = sampler.fit_resample(X_train.reshape(-1, 1), y_train)

In [None]:
# ros = RandomOverSampler(random_state=42)
# X_train_os, y_train_os = ros.fit_resample(X_train.reshape(-1, 1), y_train)
# X_train_os = np.concatenate(X_train_os, axis=0)

In [4]:
# transformer_model = 'bert-base-uncased'
transformer_model = 'allenai/specter2_base'
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
train = TransformersDataset.from_arrays(X_train, y_train, tokenizer, target_labels=np.array([0, 1]), max_length=128)
test = TransformersDataset.from_arrays(X_test, y_test, tokenizer, target_labels=np.array([0, 1]), max_length=128)



In [None]:
print('Treino:')
print(0, len(train.y[train.y==0])/len(train.y))
print(1, 1-len(train.y[train.y==0])/len(train.y))
print(f'Tamanho X sem oversampling: {len(X_train)}')
print(f'Tamanho X com oversampling: {len(train)}')

print('\nTeste:')
print(0, len(test.y[test.y==0])/len(test.y))
print(1, 1-len(test.y[test.y==0])/len(test.y))
print(f'Tamanho X sem oversampling: {len(X_test)}')
print(f'Tamanho X com oversampling: {len(test)}')

0 0.5
1 0.5
217
332


In [11]:
num_classes = 2
model_args = TransformerModelArguments(transformer_model)
clf_factory = TransformerFactory(model_args, num_classes, kwargs={'device': 'cuda'})
query_strategy = SEALS(LeastConfidence(), k=2)

In [33]:
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)

indices_initial = random_initialization_balanced(train.y, n_samples=4)

labels_1_idx = np.array([x for x in indices_initial if train.y[x] == 1])
print(indices_initial)
print(train.y[indices_initial])
print(labels_1_idx)
print(train.y[labels_1_idx])

# active_learner.initialize_data(indices_initial, train.y[indices_initial])

[117  74  60 158]
[1 0 0 1]
[117 158]
[1 1]


In [34]:
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)

indices_initial = random_initialization_balanced(train.y, n_samples=4)

labels_1_idx = np.array([x for x in indices_initial if train.y[x] == 1])

active_learner.initialize_data(labels_1_idx, train.y[labels_1_idx])

RuntimeError: one_hot is only applicable to index tensor.

In [None]:
num_queries = 10

for i in range(num_queries):
    indices_queried = active_learner.query(num_samples=20)
    y = train.y[indices_queried]
    active_learner.update(y)

    y_pred_train = active_learner.classifier.predict(train)
    y_pred_test = active_learner.classifier.predict(test)
    print(f'\nIteration {i+1}')
    print('Train accuracy: {:.2f}'.format(accuracy_score(train.y, y_pred_train)))
    print('Train precision: {:.2f}'.format(precision_score(train.y, y_pred_train, zero_division=0)))
    print('Train recall: {:.2f}'.format(recall_score(train.y, y_pred_train, zero_division=0)))
    print('Train F1 score: {:.2f}'.format(f1_score(train.y, y_pred_train)))
    print('\n')
    print('Test accuracy: {:.2f}'.format(accuracy_score(test.y, y_pred_test)))
    print('Test precision: {:.2f}'.format(precision_score(test.y, y_pred_test, zero_division=0)))
    print('Test recall: {:.2f}'.format(recall_score(test.y, y_pred_test, zero_division=0)))
    print('Test F1 score: {:.2f}'.format(f1_score(test.y, y_pred_test)))


Iteration 1
Train accuracy: 0.49
Train precision: 0.20
Train recall: 0.01
Train F1 score: 0.01


Test accuracy: 0.74
Test precision: 0.00
Test recall: 0.00
Test F1 score: 0.00

Iteration 2
Train accuracy: 0.85
Train precision: 1.00
Train recall: 0.70
Train F1 score: 0.82


Test accuracy: 0.76
Test precision: 0.00
Test recall: 0.00
Test F1 score: 0.00

Iteration 3
Train accuracy: 0.50
Train precision: 0.00
Train recall: 0.00
Train F1 score: 0.00


Test accuracy: 0.76
Test precision: 0.00
Test recall: 0.00
Test F1 score: 0.00

Iteration 4
Train accuracy: 0.90
Train precision: 0.90
Train recall: 0.90
Train F1 score: 0.90


Test accuracy: 0.72
Test precision: 0.41
Test recall: 0.35
Test F1 score: 0.38

Iteration 5
Train accuracy: 0.92
Train precision: 0.95
Train recall: 0.89
Train F1 score: 0.92


Test accuracy: 0.76
Test precision: 0.50
Test recall: 0.15
Test F1 score: 0.24

Iteration 6
Train accuracy: 0.92
Train precision: 0.94
Train recall: 0.90
Train F1 score: 0.92


Test accuracy: 0.

In [None]:
transformer_model = 'allenai/specter2_base'
# transformer_model = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

train = TransformersDataset.from_arrays(X_train_os.flatten(), y_train_os, tokenizer, target_labels=np.array([0, 1]), max_length=256)
test = TransformersDataset.from_arrays(X_test, y_test, tokenizer, target_labels=np.array([0, 1]), max_length=256)

num_classes = 2
model_args = TransformerModelArguments(transformer_model)
clf_factory = TransformerFactory(model_args, num_classes, kwargs={'device': 'cuda'})
# clf_factory = TransformerFactory(model_args, num_classes)
query_strategy = LeastConfidence()

active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)

indices_initial = init(train.y, n_samples=10)

active_learner.initialize_data(indices_initial, train.y[indices_initial])

num_queries = 10

for i in range(num_queries):
    indices_queried = active_learner.query(num_samples=20)
    y = train.y[indices_queried]
    active_learner.update(y)

    y_pred_train = active_learner.classifier.predict(train)
    y_pred_test = active_learner.classifier.predict(test)
    print(f'\nIteration {i+1}')
    print('Train accuracy: {:.2f}'.format(accuracy_score(train.y, y_pred_train)))
    print('Train precision: {:.2f}'.format(precision_score(train.y, y_pred_train, zero_division=0)))
    print('Train recall: {:.2f}'.format(recall_score(train.y, y_pred_train, zero_division=0)))
    print('Train F1 score: {:.2f}'.format(f1_score(train.y, y_pred_train)))
    print('\n')
    print('Test accuracy: {:.2f}'.format(accuracy_score(test.y, y_pred_test)))
    print('Test precision: {:.2f}'.format(precision_score(test.y, y_pred_test, zero_division=0)))
    print('Test recall: {:.2f}'.format(recall_score(test.y, y_pred_test, zero_division=0)))
    print('Test F1 score: {:.2f}'.format(f1_score(test.y, y_pred_test)))

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/717k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/717k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]


Iteration 1
Train accuracy: 0.50
Train precision: 0.00
Train recall: 0.00
Train F1 score: 0.00


Test accuracy: 0.76
Test precision: 0.00
Test recall: 0.00
Test F1 score: 0.00

Iteration 2
Train accuracy: 0.87
Train precision: 0.87
Train recall: 0.87
Train F1 score: 0.87


Test accuracy: 0.79
Test precision: 0.58
Test recall: 0.42
Test F1 score: 0.49

Iteration 3
Train accuracy: 0.86
Train precision: 0.83
Train recall: 0.91
Train F1 score: 0.87


Test accuracy: 0.72
Test precision: 0.43
Test recall: 0.50
Test F1 score: 0.46

Iteration 4
Train accuracy: 0.85
Train precision: 1.00
Train recall: 0.70
Train F1 score: 0.82


Test accuracy: 0.76
Test precision: 0.00
Test recall: 0.00
Test F1 score: 0.00

Iteration 5
Train accuracy: 0.91
Train precision: 0.95
Train recall: 0.87
Train F1 score: 0.91


Test accuracy: 0.76
Test precision: 0.50
Test recall: 0.23
Test F1 score: 0.32

Iteration 6
Train accuracy: 0.94
Train precision: 0.95
Train recall: 0.94
Train F1 score: 0.94


Test accuracy: 0.