In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from small_text import TransformersDataset, TransformerModelArguments, LeastConfidence, TransformerBasedClassificationFactory as TransformerFactory, PoolBasedActiveLearner, random_initialization_balanced as init
from synergy_dataset import Dataset, iter_datasets
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the 'Nelson_2002' SYNERGY dataset and convert it to a DataFrame.
d = Dataset('Nelson_2002')
dataset_raw = d.to_frame()
# Alternative source (local CSV export):
# dataset_raw = pd.read_csv('../datasets/synergy_dataset/Radjenovic_2013.csv')

# Only 'abstract' and 'label_included' are read by the following cells, so
# only rows missing one of those are discarded. A blanket dropna() would also
# throw away records whose unrelated metadata columns happen to be empty.
# The raw frame is kept under its own name so this cell can be re-run safely.
dataset = dataset_raw.dropna(subset=['abstract', 'label_included'])

In [3]:
# Pull the two columns used downstream out of the frame and convert each one
# to a NumPy array: the abstracts are the inputs, `label_included` the targets.
abstract_column = dataset['abstract']
label_column = dataset['label_included']

text = np.array(abstract_column)
labels = np.array(label_column)

In [4]:
# Name of the pretrained checkpoint. The same string is reused further down
# to build the model arguments, so tokenizer and model stay in sync.
# Alternative checkpoint: 'bert-base-uncased'
transformer_model = 'allenai/specter2_base'

# Tokenizer matching the checkpoint selected above.
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

In [5]:
# Build the pool dataset from the abstracts and their labels using the
# tokenizer loaded above; the two target labels are 0 and 1.
# Note: max_length > 256 usually exceeds the GPU's memory (8 GB).
train = TransformersDataset.from_arrays(
    text,
    labels,
    tokenizer,
    target_labels=np.array([0, 1]),
    max_length=128,
)



In [6]:
# Two classes, matching the target labels 0 and 1 of the dataset.
num_classes = 2
model_args = TransformerModelArguments(transformer_model)

# Run on the GPU when one is available and fall back to the CPU otherwise,
# so the notebook also works on machines without CUDA instead of failing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
clf_factory = TransformerFactory(model_args, num_classes, kwargs={'device': device})

# Query strategy handed to the active learner.
query_strategy = LeastConfidence()

In [7]:
# Wire the classifier factory, the query strategy and the pool together.
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)

# Draw the 4 starting samples with the balanced random initialisation helper
# and hand them, together with their labels, to the learner.
pool_labels_at_start = train.y
indices_initial = init(pool_labels_at_start, n_samples=4)
labels_initial = pool_labels_at_start[indices_initial]

active_learner.initialize_data(indices_initial, labels_initial)

In [8]:
num_queries = 10        # number of active-learning rounds
query_batch_size = 10   # samples requested from the pool in every round

# The pool is not modified inside the loop, so its labels are read once here
# instead of being fetched from `train.y` several times per iteration.
pool_labels = train.y

for i in range(num_queries):
    # Ask the query strategy which pool samples should be labelled next.
    indices_queried = active_learner.query(num_samples=query_batch_size)
    # Simulated annotator: the labels are looked up from the already known
    # labels of the pool.
    y = pool_labels[indices_queried]
    active_learner.update(y)

    # NOTE: the metrics below are computed on the whole pool (`train`), which
    # includes the samples the model has already been given labels for; they
    # are not held-out scores.
    y_pred = active_learner.classifier.predict(train)
    print(f'\nIteration {i+1}')
    print('Train accuracy: {:.2f}'.format(accuracy_score(pool_labels, y_pred)))
    # zero_division=np.nan reports an undefined precision/recall as nan.
    print('Train precision: {:.2f}'.format(precision_score(pool_labels, y_pred, zero_division=np.nan)))
    print('Train recall: {:.2f}'.format(recall_score(pool_labels, y_pred, zero_division=np.nan)))
    print('Train F1 score: {:.2f}'.format(f1_score(pool_labels, y_pred)))


Iteration 1
Train accuracy: 0.76
Train precision: nan
Train recall: 0.00
Train F1 score: 0.00

Iteration 2
Train accuracy: 0.79
Train precision: 0.90
Train recall: 0.12
Train F1 score: 0.21

Iteration 3
Train accuracy: 0.76
Train precision: nan
Train recall: 0.00
Train F1 score: 0.00

Iteration 4
Train accuracy: 0.79
Train precision: 0.65
Train recall: 0.29
Train F1 score: 0.40

Iteration 5
Train accuracy: 0.82
Train precision: 0.91
Train recall: 0.26
Train F1 score: 0.40

Iteration 6
Train accuracy: 0.83
Train precision: 0.82
Train recall: 0.35
Train F1 score: 0.49

Iteration 7
Train accuracy: 0.84
Train precision: 0.86
Train recall: 0.40
Train F1 score: 0.55

Iteration 8
Train accuracy: 0.86
Train precision: 0.86
Train recall: 0.48
Train F1 score: 0.62

Iteration 9
Train accuracy: 0.87
Train precision: 0.88
Train recall: 0.55
Train F1 score: 0.67

Iteration 10
Train accuracy: 0.89
Train precision: 0.98
Train recall: 0.53
Train F1 score: 0.69
