In [42]:
import numpy as np
import pandas as pd
import os
import csv
import statistics

from synergy_dataset import Dataset, iter_datasets
from sentence_transformers import SentenceTransformer, util

In [2]:
def cria_dataset(dataset):
    ds = Dataset(dataset)
    ds = ds.to_frame()
    ds = ds.fillna('')
    title = ds['title']
    abstract = ds['abstract']
    X = np.array([x[0] + ' ' + x[1] for x in zip(title, abstract)])
    y = np.array(ds['label_included'])

    return X, y

In [144]:
model_checkpoint = 'all-distilroberta-v1'
dataset = 'Muthu_2021'

In [145]:
X, y = cria_dataset(dataset)
model = SentenceTransformer(model_checkpoint)



In [None]:
n_iter = 10
n_items_inicial = 4

labels_1_idx = [i for i, _ in enumerate(y) if y[i] == 1]
indices_escolhidos = np.random.choice(labels_1_idx, n_items_inicial, replace=False)
abstract_emb = model.encode(' '.join(X[indices_escolhidos]))
print(X[indices_escolhidos])
print()

for iter in range(n_iter):
    results=[]
    X_emb = model.encode([a for i, a in enumerate(X) if i not in indices_escolhidos])
    for index, (label, abstract) in enumerate(zip([l for i, l in enumerate(y) if i not in indices_escolhidos], [a for i,a in enumerate(X) if i not in indices_escolhidos])):
        print(index, (label, abstract))

###Selecionando 1 dos items que tiver a maior similaridade a cada iteração e adicionando-o no pool

In [152]:
n_iter = 336
n_items_inicial = 4
# np.random.seed(42)

X_emb = model.encode(X)

labels_1_idx = [i for i, _ in enumerate(y) if y[i] == 1]
indices_escolhidos = np.random.choice(labels_1_idx, n_items_inicial, replace=False)
abstract_emb = X_emb[indices_escolhidos]

X_pool = np.delete(X_emb, indices_escolhidos, axis=0)
y_pool = np.delete(y, indices_escolhidos)

quant = 0

for iter in range(n_iter):
    results=[]
    for index, (label, abstract) in enumerate(zip(y_pool, X_pool)):
        similarity = util.cos_sim(abstract_emb, abstract)
        media = sum([item[0] for item in similarity.tolist()])/len(similarity.tolist())
        results.append((index, label, media, abstract))
    results.sort(key=lambda x: x[2], reverse=True)
    for result in results[:5]:
        print(result[:3])
    print('\n')
    abstract_emb = np.append(abstract_emb, results[0][3].reshape(1, 768), axis=0)
    X_pool = np.delete(X_pool, results[0][0], axis=0)
    y_pool = np.delete(y_pool, results[0][0])
    if results[0][1] == 1:
        quant += 1
print(f'Total 1: {quant}')



(2568, 0, 0.6969761252403259)
(2308, 0, 0.6925850063562393)
(236, 1, 0.685457393527031)
(312, 0, 0.6816498637199402)
(2666, 0, 0.6784901320934296)


(2308, 0, 0.7182405948638916)
(312, 0, 0.7169811725616455)
(136, 0, 0.7137613892555237)
(2071, 0, 0.7115535497665405)
(922, 0, 0.7114155054092407)


(136, 0, 0.7344534695148468)
(312, 0, 0.7329229513804117)
(922, 0, 0.7321808735529581)
(2664, 0, 0.7298232217629751)
(2071, 0, 0.7278930445512136)


(2070, 0, 0.7500317096710205)
(311, 0, 0.7475379960877555)
(2663, 0, 0.744024932384491)
(235, 1, 0.7404747179576329)
(1234, 0, 0.7386661427361625)


(311, 0, 0.7612574249505997)
(2662, 0, 0.7544744610786438)
(1817, 0, 0.7518917061388493)
(235, 1, 0.7507723495364189)
(1234, 0, 0.7484545707702637)


(2661, 0, 0.7749066551526388)
(1233, 0, 0.7669255402353075)
(235, 1, 0.7644921806123521)
(440, 0, 0.7606132626533508)
(1816, 0, 0.7568590342998505)


(1233, 0, 0.7828294456005096)
(235, 1, 0.7760293364524842)
(440, 0, 0.7754770636558532)
(920, 0, 0.76244

In [153]:
quant/336

0.22916666666666666

###Selecionando 3 dos items que tiverem a maior similaridade a cada iteração

In [136]:
n_iter = 10
n_items_inicial = 4
np.random.seed(42)

X_emb = model.encode(X)

labels_1_idx = [i for i, _ in enumerate(y) if y[i] == 1]
indices_escolhidos = np.random.choice(labels_1_idx, n_items_inicial, replace=False)
abstract_emb = X_emb[indices_escolhidos]
X_pool = np.delete(X_emb, indices_escolhidos, axis=0)
y_pool = np.delete(y, indices_escolhidos)

for iter in range(n_iter):
    results=[]
    for index, (label, abstract) in enumerate(zip(y_pool, X_pool)):
        similarity = util.cos_sim(abstract_emb, abstract)
        media = sum([item[0] for item in similarity.tolist()])/len(similarity.tolist())
        results.append((index, label, media, abstract))
    results.sort(key=lambda x: x[2], reverse=True)
    for result in results[:5]:
        print(result[:3])
    print('\n')
    for result in results[:3]:
        abstract_emb = np.append(abstract_emb, result[3].reshape(1, 768), axis=0)
        X_pool = np.delete(X_pool, result[0], axis=0)
        y_pool = np.delete(y_pool, result[0])



(227, 1, 0.6878334283828735)
(354, 1, 0.6861750036478043)
(283, 1, 0.6854076385498047)
(81, 1, 0.6798913180828094)
(279, 1, 0.67168328166008)


(282, 1, 0.7616781081472125)
(352, 1, 0.7585028069359916)
(81, 1, 0.7500306282724652)
(278, 1, 0.734009427683694)
(26, 1, 0.7322960325649807)


(350, 1, 0.7950103759765625)
(277, 1, 0.7591192722320557)
(58, 1, 0.7566482663154602)
(26, 1, 0.7520329833030701)
(306, 1, 0.7509094417095185)


(304, 1, 0.7682555272028997)
(26, 1, 0.7667842140564551)
(112, 1, 0.7601132140709803)
(181, 1, 0.7571098345976609)
(296, 0, 0.7569977182608384)


(111, 1, 0.7875750083476305)
(294, 0, 0.774870004504919)
(260, 0, 0.7668376918882132)
(300, 0, 0.7646535858511925)
(270, 1, 0.7633426729589701)


(292, 0, 0.7965556445874666)
(259, 0, 0.7883146452276331)
(268, 1, 0.7780089990088814)
(36, 1, 0.7735081534636649)
(297, 0, 0.7702107931438246)


(267, 1, 0.7948054766113107)
(36, 1, 0.7801214944232594)
(200, 1, 0.7795383862473748)
(228, 0, 0.7736261636018753)
(239, 1, 0.772