In [22]:
import numpy as np
import pandas as pd
from small_text import LeastConfidence, PoolBasedActiveLearner, random_initialization_balanced as init
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from synergy_dataset import Dataset, iter_datasets

from imblearn.over_sampling import SMOTEN

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from small_text.classifiers.classification import SklearnClassifier
from small_text.classifiers.factories import SklearnClassifierFactory
from sklearn.naive_bayes import MultinomialNB
from small_text.data.datasets import SklearnDataset
from sklearn.feature_extraction.text import TfidfVectorizer

from small_text import TransformersDataset, TransformerModelArguments, TransformerBasedClassificationFactory as TransformerFactory
from small_text.data.datasets import TextDataset
from transformers import AutoTokenizer
import torch

In [23]:
# Load the SYNERGY 'Leenaars_2020' dataset as a DataFrame.
# Alternative local source that was used before:
#   pd.read_csv('../datasets/synergy_dataset/Radjenovic_2013.csv')
dataset = Dataset('Leenaars_2020').to_frame()

# Only 'abstract' and 'label_included' are used below, so only those columns
# need to be complete. A bare dropna() would also discard rows that are
# missing a value in any other column.
dataset = dataset.dropna(subset=['abstract', 'label_included'])
X = np.array(dataset['abstract'])
y = np.array(dataset['label_included'])

# Fixed random_state keeps the 67/33 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [24]:
# Oversample the training split with SMOTEN (seeded for reproducibility).
# fit_resample receives the abstracts as a single-column 2-D array, so the
# 1-D array gets a trailing axis here; the resampled features are flattened
# back to 1-D wherever they are consumed.
sampler = SMOTEN(random_state=42)
X_train_os, y_train_os = sampler.fit_resample(X_train[:, np.newaxis], y_train)

In [25]:
# Two target labels (0 and 1).
num_classes = 2

# TF-IDF features feeding a Multinomial Naive Bayes classifier.
vectorizer = TfidfVectorizer()
factory_nb = SklearnClassifierFactory(MultinomialNB(), num_classes)

# Training pool: the oversampled abstracts, flattened back to a 1-D array.
train_nb = SklearnDataset.from_arrays(
    X_train_os.flatten(),
    y_train_os,
    vectorizer,
    target_labels=np.array([0, 1]),
)
# Held-out set, built with train=False and the same vectorizer instance
# (presumably so the vocabulary fitted on the training texts is reused —
# confirm against the small-text documentation).
test_nb = SklearnDataset.from_arrays(
    X_test.flatten(),
    y_test,
    vectorizer,
    target_labels=np.array([0, 1]),
    train=False,
)

  encountered_labels = get_flattened_unique_labels(self)


In [26]:
transformer_model = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

# max_length > 256 usually exceeds the GPU memory (8GB).
train_transformer = TransformersDataset.from_arrays(X_train_os.flatten(), y_train_os, tokenizer, target_labels=np.array([0, 1]), max_length=256)
test_transformer = TransformersDataset.from_arrays(X_test, y_test, tokenizer, target_labels=np.array([0, 1]), max_length=256)

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_args = TransformerModelArguments(transformer_model)
factory_transformer = TransformerFactory(model_args, num_classes, kwargs={'device': device})

In [48]:
# Both learners share the same least-confidence query strategy instance.
query_strategy = LeastConfidence()

# Naive Bayes learner over the TF-IDF pool and transformer learner over the
# tokenized pool. Only the Naive Bayes learner is initialized here; the
# transformer learner is initialized later, inside the query loop.
active_learner_nb = PoolBasedActiveLearner(factory_nb, query_strategy, train_nb)
active_learner_transformer = PoolBasedActiveLearner(factory_transformer, query_strategy, train_transformer)

# Seed the Naive Bayes learner with 10 initial samples and their labels
# taken from the pool.
indices_initial = init(train_nb.y, n_samples=10)
labels_initial = train_nb.y[indices_initial]
active_learner_nb.initialize_data(indices_initial, labels_initial)

  encountered_labels = get_flattened_unique_labels(self)


In [None]:
num_queries = 10      # total number of query rounds
num_nb_queries = 3    # the first rounds are run with the Naive Bayes learner
query_size = 20       # samples requested from the pool per round

results = []
# Integer dtype so the accumulated indices stay usable for indexing
# (concatenating with a plain empty list would yield a float array).
indices_labeled = np.array([], dtype=int)


def report_metrics(split_name, y_true, y_pred):
    """Print and return accuracy, precision, recall and F1 for one split.

    Args:
        split_name: Prefix used in the printed lines ('Train' or 'Test').
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision and recall are
        computed with ``zero_division=np.nan``, so they are NaN when undefined.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=np.nan)
    recall = recall_score(y_true, y_pred, zero_division=np.nan)
    f1 = f1_score(y_true, y_pred)
    print(f'{split_name} accuracy: {accuracy:.2f}')
    print(f'{split_name} precision: {precision:.2f}')
    print(f'{split_name} recall: {recall:.2f}')
    print(f'{split_name} F1 score: {f1:.2f}')
    return accuracy, precision, recall, f1


for i in range(num_queries):
    # Pick the learner (and its matching datasets) for this round.
    if i < num_nb_queries:
        learner, train_set, test_set = active_learner_nb, train_nb, test_nb
    else:
        learner, train_set, test_set = active_learner_transformer, train_transformer, test_transformer

    # Query the pool and answer with the known labels. A dedicated name is
    # used so the global `y` (the full label array) is not overwritten.
    indices_queried = learner.query(num_samples=query_size)
    y_queried = train_set.y[indices_queried]
    learner.update(y_queried)
    indices_labeled = np.concatenate([indices_queried, indices_labeled])

    y_pred_train = learner.classifier.predict(train_set)
    y_pred_test = learner.classifier.predict(test_set)

    # The printed count covers queried samples only, not the initial seed set.
    print(f'\nIteration {i+1} ({len(indices_labeled)} samples)')
    acc_train, prec_train, rec_train, f1_train = report_metrics('Train', train_set.y, y_pred_train)
    print('\n')
    acc_test, prec_test, rec_test, f1_test = report_metrics('Test', test_set.y, y_pred_test)

    # Row layout: accuracy, F1, recall, precision (train/test pairs).
    # Positions 0-5 are what the plotting cell reads as accuracy, F1 and
    # recall; previously F1 was stored twice, so recall plots showed F1.
    results.append([acc_train, acc_test, f1_train, f1_test, rec_train, rec_test, prec_train, prec_test])

    # After the last Naive Bayes round, hand every label collected so far
    # over to the transformer learner.
    if i == num_nb_queries - 1:
        active_learner_transformer.initialize_data(active_learner_nb.indices_labeled, active_learner_nb.y)

In [None]:
# Plotting the results: one train/test pair of per-iteration series per metric.
acc_treino, acc_teste = [], []
f1_treino, f1_teste = [], []
recall_treino, recall_teste = [], []

def plota_resultados(treino, teste, metrica):
    """Plot a metric's train and test curves per iteration and save the figure.

    Args:
        treino: Sequence of train-set values, one per iteration. Its length
            defines the x axis (1..len(treino)).
        teste: Sequence of test-set values, plotted against the same x axis.
        metrica: Metric name; used in the legend, the y-axis label, the title
            and the output file name ``{metrica}.png``.
    """
    iteracoes = np.arange(1, len(treino) + 1)

    fig, ax = plt.subplots(figsize=(12, 8))
    for valores, conjunto in ((treino, 'treino'), (teste, 'teste')):
        ax.plot(iteracoes, valores, label=f'{metrica} {conjunto}')

    ax.legend(loc='lower right')
    ax.set_xticks(iteracoes)
    # Fixed 0..1 range so figures of different metrics are comparable.
    ax.set_ylim((0.0, 1.0))
    ax.set_ylabel(metrica)
    ax.set_xlabel('Número de iterações')
    ax.set_title(f'{metrica} treino x teste')
    fig.savefig(f'{metrica}.png')


# Split every results row into the six per-metric series by position:
# 0/1 -> accuracy, 2/3 -> F1, 4/5 -> plotted under the 'Recall' label.
# NOTE(review): confirm the loop that fills `results` really stores recall
# at positions 4 and 5.
series_por_posicao = (
    acc_treino, acc_teste,
    f1_treino, f1_teste,
    recall_treino, recall_teste,
)
for linha in results:
    for posicao, serie in enumerate(series_por_posicao):
        serie.append(linha[posicao])

# One figure (and one PNG file) per metric.
for serie_treino, serie_teste, nome_metrica in (
    (acc_treino, acc_teste, 'Accuracy_score'),
    (f1_treino, f1_teste, 'F1_score'),
    (recall_treino, recall_teste, 'Recall'),
):
    plota_resultados(serie_treino, serie_teste, nome_metrica)