In [3]:
import numpy as np
import pandas as pd
from small_text import TransformersDataset, TransformerModelArguments, LeastConfidence, TransformerBasedClassificationFactory as TransformerFactory, PoolBasedActiveLearner, random_initialization_balanced as init
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from synergy_dataset import Dataset, iter_datasets

from imblearn.over_sampling import RandomOverSampler
import torch

from sklearn.model_selection import train_test_split

In [5]:
dataset = Dataset('Radjenovic_2013')
dataset = dataset.to_frame()
# dataset = pd.read_csv('../datasets/synergy_dataset/Radjenovic_2013.csv')
dataset = dataset.dropna()

In [6]:
X = np.array(dataset['abstract'])
y = np.array(dataset['label_included'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
ros = RandomOverSampler(random_state=42)
X_train_os, y_train_os = ros.fit_resample(X_train.reshape(-1, 1), y_train)
X_train_os = np.concatenate(X_train_os, axis=0)

In [8]:
# transformer_model = 'allenai/specter2_base'
transformer_model = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

In [9]:
train = TransformersDataset.from_arrays(X_train_os, y_train_os, tokenizer, target_labels=np.array([0, 1]), max_length=256)
test = TransformersDataset.from_arrays(X_test, y_test, tokenizer, target_labels=np.array([0, 1]), max_length=256)
#max_length > 256 geralmente ultrapassa a RAM da GPU (8GB)



In [10]:
num_classes = 2
model_args = TransformerModelArguments(transformer_model)
clf_factory = TransformerFactory(model_args, num_classes, kwargs={'device': 'cuda'})
# clf_factory = TransformerFactory(model_args, num_classes)
query_strategy = LeastConfidence()

In [11]:
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)

indices_initial = init(train.y, n_samples=4)

active_learner.initialize_data(indices_initial, train.y[indices_initial])

<class 'numpy.int64'> <class 'numpy.int64'>


In [12]:
num_queries = 10

for i in range(num_queries):
    indices_queried = active_learner.query(num_samples=10)
    y = train.y[indices_queried]
    active_learner.update(y)

    y_pred_train = active_learner.classifier.predict(train)
    y_pred_test = active_learner.classifier.predict(test)
    print(f'\nIteration {i+1}')
    print('Train accuracy: {:.2f}'.format(accuracy_score(train.y, y_pred_train)))
    print('Train precision: {:.2f}'.format(precision_score(train.y, y_pred_train, zero_division=np.nan)))
    print('Train recall: {:.2f}'.format(recall_score(train.y, y_pred_train, zero_division=np.nan)))
    print('Train F1 score: {:.2f}'.format(f1_score(train.y, y_pred_train)))
    print('\n')
    print('Test accuracy: {:.2f}'.format(accuracy_score(test.y, y_pred_test)))
    print('Test precision: {:.2f}'.format(precision_score(test.y, y_pred_test, zero_division=np.nan)))
    print('Test recall: {:.2f}'.format(recall_score(test.y, y_pred_test, zero_division=np.nan)))
    print('Test F1 score: {:.2f}'.format(f1_score(test.y, y_pred_test)))


Iteration 1
Train accuracy: 0.50
Train precision: 0.00
Train recall: 0.00
Train F1 score: 0.00


Test accuracy: 0.99
Test precision: 0.00
Test recall: 0.00
Test F1 score: 0.00

Iteration 2
Train accuracy: 0.50
Train precision: nan
Train recall: 0.00
Train F1 score: 0.00


Test accuracy: 0.99
Test precision: nan
Test recall: 0.00
Test F1 score: 0.00

Iteration 3
Train accuracy: 0.58
Train precision: 0.78
Train recall: 0.21
Train F1 score: 0.34


Test accuracy: 0.92
Test precision: 0.03
Test recall: 0.20
Test F1 score: 0.05

Iteration 4
Train accuracy: 0.54
Train precision: 1.00
Train recall: 0.07
Train F1 score: 0.14


Test accuracy: 0.99
Test precision: nan
Test recall: 0.00
Test F1 score: 0.00

Iteration 5
Train accuracy: 0.55
Train precision: 0.88
Train recall: 0.11
Train F1 score: 0.19


Test accuracy: 0.97
Test precision: 0.00
Test recall: 0.00
Test F1 score: 0.00

Iteration 6
Train accuracy: 0.55
Train precision: 1.00
Train recall: 0.11
Train F1 score: 0.20


Test accuracy: 0.99


In [13]:
transformer_model = 'allenai/specter2_base'
# transformer_model = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

train = TransformersDataset.from_arrays(X_train_os, y_train_os, tokenizer, target_labels=np.array([0, 1]), max_length=256)
test = TransformersDataset.from_arrays(X_test, y_test, tokenizer, target_labels=np.array([0, 1]), max_length=256)

num_classes = 2
model_args = TransformerModelArguments(transformer_model)
clf_factory = TransformerFactory(model_args, num_classes, kwargs={'device': 'cuda'})
# clf_factory = TransformerFactory(model_args, num_classes)
query_strategy = LeastConfidence()

active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)

indices_initial = init(train.y, n_samples=4)

active_learner.initialize_data(indices_initial, train.y[indices_initial])

num_queries = 10

for i in range(num_queries):
    indices_queried = active_learner.query(num_samples=10)
    y = train.y[indices_queried]
    active_learner.update(y)

    y_pred_train = active_learner.classifier.predict(train)
    y_pred_test = active_learner.classifier.predict(test)
    print(f'\nIteration {i+1}')
    print('Train accuracy: {:.2f}'.format(accuracy_score(train.y, y_pred_train)))
    print('Train precision: {:.2f}'.format(precision_score(train.y, y_pred_train, zero_division=np.nan)))
    print('Train recall: {:.2f}'.format(recall_score(train.y, y_pred_train, zero_division=np.nan)))
    print('Train F1 score: {:.2f}'.format(f1_score(train.y, y_pred_train)))
    print('\n')
    print('Test accuracy: {:.2f}'.format(accuracy_score(test.y, y_pred_test)))
    print('Test precision: {:.2f}'.format(precision_score(test.y, y_pred_test, zero_division=np.nan)))
    print('Test recall: {:.2f}'.format(recall_score(test.y, y_pred_test, zero_division=np.nan)))
    print('Test F1 score: {:.2f}'.format(f1_score(test.y, y_pred_test)))




Iteration 1
Train accuracy: 0.50
Train precision: nan
Train recall: 0.00
Train F1 score: 0.00


Test accuracy: 0.99
Test precision: nan
Test recall: 0.00
Test F1 score: 0.00

Iteration 2
Train accuracy: 0.61
Train precision: 0.97
Train recall: 0.22
Train F1 score: 0.36


Test accuracy: 0.99
Test precision: 0.26
Test recall: 0.25
Test F1 score: 0.26

Iteration 3
Train accuracy: 0.62
Train precision: 0.98
Train recall: 0.25
Train F1 score: 0.40


Test accuracy: 0.98
Test precision: 0.19
Test recall: 0.15
Test F1 score: 0.17

Iteration 4
Train accuracy: 0.80
Train precision: 0.94
Train recall: 0.64
Train F1 score: 0.76


Test accuracy: 0.94
Test precision: 0.09
Test recall: 0.50
Test F1 score: 0.15

Iteration 5
Train accuracy: 0.61
Train precision: 1.00
Train recall: 0.22
Train F1 score: 0.36


Test accuracy: 0.99
Test precision: 0.50
Test recall: 0.05
Test F1 score: 0.09

Iteration 6
Train accuracy: 0.61
Train precision: 1.00
Train recall: 0.21
Train F1 score: 0.35


Test accuracy: 0.99