In [1]:
import numpy as np
import pandas as pd
import os
import csv

from synergy_dataset import Dataset, iter_datasets
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [37]:
def acuracia(lista, n):
    return len([i for i,_ in enumerate(lista[:n]) if lista[:n][i][0] == 1])/len(lista[:n])

def chunks(l, n):
    for i in range(0, len(l)):
        if len(l[i:i+n]) == n:
            yield l[i:i+n]
    # for item in zip(l[0::n], l[1::n]):
    #     yield(item)

In [39]:
a = [1, 2, 3, 4, 5, 6]
b = [6, 7, 8, 9, 10]
list(chunks(b, 2))

[[6, 7], [7, 8], [8, 9], [9, 10]]

In [4]:
def cria_dataset(dataset):
    ds = Dataset(dataset)
    ds = ds.to_frame()
    ds = ds.dropna()
    title = ds['title']
    abstract = ds['abstract']
    X = np.array([x[0] + ' ' + x[1] for x in zip(title, abstract)])
    y = np.array(ds['label_included'])

    return X, y

In [5]:
model_checkpoints = ['sentence-transformers/all-MiniLM-L6-v2', 
                     'google-bert/bert-base-uncased', 
                     'allenai/scibert_scivocab_uncased']
datasets = ['Donners_2021', 'Jeyaraman_2020', 
            'Muthu_2021', 'van_der_Valk_2021']

In [6]:
model = SentenceTransformer('jordyvl/scibert_scivocab_uncased_sentence_transformer')
X, y = cria_dataset('Nelson_2002')

No sentence-transformers model found with name allenai/scibert_scivocab_uncased. Creating a new one with MEAN pooling.


In [8]:
n = 1
labels_1_idx = [i for i, _ in enumerate(y) if y[i] == 1]
for index, example_ids in enumerate(chunks(labels_1_idx, n)):
    abstract_emb = model.encode(' '.join(X[example_ids]))
    X_embedded = model.encode([x for i, x in enumerate(X) if i not in example_ids])
    list_other_labels = [l for i, l in enumerate(y) if i not in example_ids]
    results = []
    for label, abstract in zip(list_other_labels, X_embedded):
        similarity = util.cos_sim(abstract_emb, abstract)
        results.append((label, similarity.item()))
    results.sort(key=lambda x: x[1], reverse=True)
    print(f'\nItem {index+1}: ')
    print(f'Acc1: {acuracia(results, 1)}')
    print(f'Acc2: {acuracia(results, 2)}')
    print(f'Acc3: {acuracia(results, 3)}')
    print(f'Acc5: {acuracia(results, 5)}')
    print(f'Acc10: {acuracia(results, 10)}')

    print(results)


Item 1: 
Acc1: 1.0
Acc2: 1.0
Acc3: 0.6666666666666666
Acc5: 0.6
Acc10: 0.8
[(1, 0.9681690335273743), (1, 0.9602785706520081), (0, 0.9575057029724121), (0, 0.9558804035186768), (1, 0.9540833234786987), (1, 0.9536093473434448), (1, 0.9535796642303467), (1, 0.9529985785484314), (1, 0.9522531628608704), (1, 0.9520325064659119), (0, 0.9512702226638794), (0, 0.9508208632469177), (1, 0.9504982233047485), (1, 0.9499329924583435), (1, 0.9496709108352661), (0, 0.9485480189323425), (1, 0.9485013484954834), (1, 0.9480311274528503), (1, 0.9478423595428467), (0, 0.9464579224586487), (0, 0.9462575316429138), (1, 0.9457685947418213), (1, 0.945672869682312), (1, 0.9454816579818726), (0, 0.9447012543678284), (1, 0.9443739652633667), (0, 0.9441110491752625), (1, 0.9441050887107849), (1, 0.9440381526947021), (0, 0.944037675857544), (0, 0.9435267448425293), (0, 0.9432528018951416), (1, 0.9428751468658447), (1, 0.9427585005760193), (0, 0.9425719976425171), (0, 0.9425344467163086), (1, 0.9425145387649536), 

KeyboardInterrupt: 

In [31]:
for model_checkpoint in model_checkpoints:
    model = SentenceTransformer(model_checkpoint)
    folder_model = model_checkpoint.split("/")[-1]
    for dataset in datasets:
        X, y = cria_dataset(dataset)

        X_embedded = model.encode(X)
        labels_1_idx = [i for i, _ in enumerate(y) if y[i] == 1]

        for index, example_id in enumerate(labels_1_idx):
            label = y[example_id]
            abstract_emb = X_embedded[example_id]
            list_other_labels = [l for i, l in enumerate(y) if i != example_id]
            list_other_abstracts = [x for i, x in enumerate(X_embedded) if i != example_id]
            results = []
            for other_label, other_abstract_emb in zip(list_other_labels, list_other_abstracts):
                similarity = util.cos_sim(abstract_emb, other_abstract_emb)
                results.append((other_label, similarity.item()))
            results.sort(key=lambda x: x[1], reverse=True)

            if not os.path.exists(folder_model):
                os.makedirs(folder_model)
            if not os.path.exists(f'{folder_model}/{dataset}'):
                os.makedirs(f'{folder_model}/{dataset}')
            with open(f'{folder_model}/{dataset}/{folder_model} - {dataset} - {str(index).rjust(3, "0")}.csv', 'w+') as f:
                write = csv.writer(f)
                write.writerow(['label', 'similarity'])
                write.writerows(results)
                write.writerow(['acc3', acuracia(results, 3)])
                write.writerow(['acc5', acuracia(results, 5)])
                write.writerow(['acc7', acuracia(results, 7)])
                write.writerow(['acc10', acuracia(results, 10)])
                write.writerow(['acc total', acuracia(results, len(results))])

