In [116]:
import numpy as np
import pandas as pd
from small_text import (
    LeastConfidence,
    PoolBasedActiveLearner, 
    random_initialization_balanced,
    QueryStrategy,
    TransformersDataset,
    TransformerModelArguments,
    TransformerBasedClassificationFactory as TransformerFactory)
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from synergy_dataset import Dataset, iter_datasets

from small_text.classifiers.factories import SklearnClassifierFactory
from sklearn.naive_bayes import MultinomialNB
from small_text.data.datasets import SklearnDataset
from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.over_sampling import SMOTEN

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [117]:
class QueryCosineSimilarity(QueryStrategy):
    def query(self, clf, dataset, indices_unlabeled, indices_labeled, y, n=10):
        results = []
        for index in indices_unlabeled:
            similarity = cosine_distances([dataset.x[i] for i in indices_labeled], dataset.x[index])
            # similarity = cosine_similarity(dataset.x[0], dataset.x[index])
            media = sum(item[0] for item in similarity.tolist())/len(similarity)
            results.append((index, media))
        results.sort(key=lambda x: x[1], reverse=True)
        return [index for index, _ in results[:n]]

In [118]:
def cria_dataset(dataset):
    ds = Dataset(dataset)
    ds = ds.to_frame()
    ds = ds.fillna('')
    title = ds['title']
    abstract = ds['abstract']
    X = np.array([x[0] + ' ' + x[1] for x in zip(title, abstract)])
    y = np.array(ds['label_included'])

    return X, y

In [119]:
dataset = 'Muthu_2021'
model_checkpoint = 'all-distilroberta-v1'
transformer_model = 'allenai/specter2_base'
model = SentenceTransformer(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

In [120]:
X, y = cria_dataset(dataset)
ds = TransformersDataset.from_arrays(X, y, tokenizer, target_labels=np.array([0, 1]), max_length=128)



In [121]:
num_classes = 2
model_args = TransformerModelArguments(transformer_model)
clf_factory = TransformerFactory(model_args, num_classes, kwargs={'device': 'cuda'})
# clf_factory = TransformerFactory(model_args, num_classes)
query_strategy = QueryCosineSimilarity()

In [122]:
cosine_similarity(ds.x[0], ds.x[1])

array([[0.48408911]])

In [123]:
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, ds)
indices_initial = random_initialization_balanced(ds.y, n_samples=10)
indices_initial_1 = [idx for idx in indices_initial if ds.y[idx] == 1]
active_learner.initialize_data(indices_initial, ds.y[indices_initial])
print(indices_initial)
print(ds.y[indices_initial])
print(f'Indices iniciais: {indices_initial_1} Labels iniciais: {ds.y[indices_initial_1]}')

[1242  467  506 1439 1672 2492 1023  520 2255  340]
[0 0 0 1 0 1 1 1 0 1]
Indices iniciais: [1439, 2492, 1023, 520, 340] Labels iniciais: [1 1 1 1 1]


In [124]:
num_queries = 10
results = []
indices_labeled = []

for i in range(num_queries):
    indices_queried = active_learner.query(num_samples=2)
    y = ds.y[indices_queried]
    active_learner.update(y)

    y_pred_test = active_learner.classifier.predict(ds)
    print(f'\nIteration {i+1} ({len(indices_labeled)} samples)')
    print('Test accuracy: {:.2f}'.format(accuracy_score(ds.y, y_pred_test)))
    print('Test precision: {:.2f}'.format(precision_score(ds.y, y_pred_test, zero_division=np.nan)))
    print('Test recall: {:.2f}'.format(recall_score(ds.y, y_pred_test, zero_division=np.nan)))
    print('Test F1 score: {:.2f}'.format(f1_score(ds.y, y_pred_test)))

  X = np.asarray(X)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.