In [28]:
from modAL.models import ActiveLearner
from modAL.uncertainty import classifier_entropy
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from synergy_dataset import Dataset, iter_datasets

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from sentence_transformers import SentenceTransformer, util

In [85]:
def cria_dataset(dataset):
    ds = Dataset(dataset)
    ds = ds.to_frame()
    ds = ds.dropna()
    title = ds['title']
    abstract = ds['abstract']
    X = np.array([x[0] + ' ' + x[1] for x in zip(title, abstract)])
    y = np.array(ds['label_included'])

    return X, y

def query_function(classifier, X_pool, n_samples=10):
    n = len(X_pool)
    query_idx = np.random.choice(range(n), n_samples, replace=False)
    return query_idx, X_pool[query_idx]

In [3]:
X, y = cria_dataset('Muthu_2021')

#vectorizer = TfidfVectorizer()
model = SentenceTransformer('all-distilroberta-v1')

X_enc = model.encode(X)



In [86]:
labels_1_idx = [i for i, _ in enumerate(y) if y[i] == 1]

# indices_initial = np.random.choice(labels_1_idx, 4, replace=False)
indices_initial = np.random.choice(labels_1_idx, 4, replace=False)
X_train = X_enc[indices_initial]
y_train = y[indices_initial]

X_pool = np.delete(X_enc, indices_initial, axis=0)
y_pool = np.delete(y, indices_initial)

In [87]:
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_train, y_training=y_train, query_strategy=query_function)
predictions = learner.predict(X_pool)
print(learner.y_training)
print(f'F1-score: {f1_score(y_pool, predictions)}')
print(f'Acurácia: {learner.score(X_enc, y, )}')

[1 1 1 1]
F1-score: 0.219586580820061
Acurácia: 0.12466742683390346


In [90]:
n_queries = 20
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool)
    while not any(y_pool[query_idx] == 1):
        query_idx, query_instance = learner.query(X_pool)
    train_idx = [a for a in query_idx if y_pool[a] == 1]
    print(y[query_idx])
    print(y[train_idx])
    learner.teach(
        X=X_pool[train_idx],
        y=y_pool[train_idx]
    )
    # remove queried instance from pool
    X_pool = np.delete(X_pool, train_idx, axis=0)
    y_pool = np.delete(y_pool, train_idx)
    predictions = learner.predict(X_pool)
    print(f'Iteração nº {idx+1}')
    print(f'Acurácia: {learner.score(X_enc, y)}')
    print(f'F1-score: {f1_score(y_pool, predictions)}\n')

[0 0 0 0 0 0 0 0 0 0]
[0]
Iteração nº 1
Acurácia: 0.12466742683390346
F1-score: 0.18535550053059782

[0 0 0 1 0 0 0 0 0 0]
[1 0]
Iteração nº 2
Acurácia: 0.12466742683390346
F1-score: 0.18420120439249024

[1 0 0 0 0 0 0 0 0 0]
[0 0]
Iteração nº 3
Acurácia: 0.12466742683390346
F1-score: 0.18304363249379213

[0 1 1 1 0 0 0 1 0 0]
[1]
Iteração nº 4
Acurácia: 0.12466742683390346
F1-score: 0.18246361377351794

[0 0 0 0 0 0 1 0 0 0]
[0]
Iteração nº 5
Acurácia: 0.12466742683390346
F1-score: 0.18188277087033747

[1 1 0 0 1 0 0 0 0 0]
[0]
Iteração nº 6
Acurácia: 0.12466742683390346
F1-score: 0.18130110202630642

[0 0 0 0 0 0 0 1 0 0]
[0]
Iteração nº 7
Acurácia: 0.12466742683390346
F1-score: 0.18071860547847743

[0 0 0 0 0 0 0 0 0 1]
[0]
Iteração nº 8
Acurácia: 0.12466742683390346
F1-score: 0.18013527945888216

[0 0 0 0 0 0 0 0 0 0]
[0]
Iteração nº 9
Acurácia: 0.12466742683390346
F1-score: 0.17955112219451372

[1 0 0 0 0 0 0 0 0 0]
[0]
Iteração nº 10
Acurácia: 0.12466742683390346
F1-score: 0.1789