# In [26]:
import numpy as np
import pandas as pd
import os
import csv

from synergy_dataset import Dataset, iter_datasets
from sentence_transformers import SentenceTransformer, util

# In [27]:
def acuracia(lista, n):
    """Return the fraction of the first ``n`` entries whose label equals 1.

    Args:
        lista: Sequence of items whose first element is the label
            (e.g. ``(label, similarity)`` tuples).
        n: Number of leading entries to consider; applied as the slice
            ``lista[:n]``, so fewer entries are used when ``lista`` is shorter.

    Returns:
        The proportion of considered entries with label 1, or ``0.0`` when
        the slice is empty (instead of raising ``ZeroDivisionError``).
    """
    top = lista[:n]
    # An empty slice (empty input or n == 0) has no hits; avoid dividing by zero.
    if len(top) == 0:
        return 0.0
    hits = sum(1 for item in top if item[0] == 1)
    return hits / len(top)

# In [28]:
def cria_dataset(dataset):
    """Build the text/label arrays for one named dataset.

    The dataset is loaded through ``Dataset(dataset)``, converted to a frame,
    and every row containing a missing value is dropped.

    Args:
        dataset: Dataset name passed to ``Dataset``.

    Returns:
        A pair ``(X, y)`` of NumPy arrays: ``X`` holds the title and abstract
        of each row joined by a single space, ``y`` holds the
        ``label_included`` column.
    """
    frame = Dataset(dataset).to_frame().dropna()

    # One text per record: "<title> <abstract>".
    textos = [
        titulo + ' ' + resumo
        for titulo, resumo in zip(frame['title'], frame['abstract'])
    ]
    rotulos = frame['label_included']

    return np.array(textos), np.array(rotulos)

# In [29]:
# Model checkpoint identifiers to evaluate, one run per entry.
model_checkpoints = [
    'sentence-transformers/all-MiniLM-L6-v2',
    'google-bert/bert-base-uncased',
    'allenai/scibert_scivocab_uncased',
]

# Names of the datasets to evaluate against every checkpoint.
datasets = [
    'Nelson_2002',
    'Donners_2021',
    'Jeyaraman_2020',
    'Muthu_2021',
    'van_der_Valk_2021',
]

# In [31]:
# For every (model, dataset) pair: embed all records, then use each record
# labelled 1 as a query, rank all other records by cosine similarity to it,
# and write the ranking plus accuracy-at-k figures to one CSV per query.
for model_checkpoint in model_checkpoints:
    model = SentenceTransformer(model_checkpoint)
    # Output folder is the last path component of the checkpoint identifier.
    folder_model = model_checkpoint.split("/")[-1]
    for dataset in datasets:
        X, y = cria_dataset(dataset)

        X_embedded = model.encode(X)
        labels_1_idx = [i for i, rotulo in enumerate(y) if rotulo == 1]
        if not labels_1_idx:
            # No record labelled 1 means no query and nothing to write.
            continue

        # Create the output directory once per dataset; exist_ok avoids the
        # check-then-create race of os.path.exists + os.makedirs.
        out_dir = os.path.join(folder_model, dataset)
        os.makedirs(out_dir, exist_ok=True)

        for index, example_id in enumerate(labels_1_idx):
            # One call scores the query against every embedding (a 1 x N
            # matrix) instead of one cos_sim call per pair.
            scores = util.cos_sim(X_embedded[example_id], X_embedded)[0].tolist()

            # Keep (label, similarity) for every record except the query itself.
            results = [
                (other_label, score)
                for i, (other_label, score) in enumerate(zip(y, scores))
                if i != example_id
            ]
            results.sort(key=lambda x: x[1], reverse=True)

            file_name = f'{folder_model} - {dataset} - {str(index).rjust(3, "0")}.csv'
            # newline='' is required by the csv module so that the writer
            # controls line endings (otherwise blank rows appear on Windows).
            with open(os.path.join(out_dir, file_name), 'w', newline='', encoding='utf-8') as f:
                write = csv.writer(f)
                write.writerow(['label', 'similarity'])
                write.writerows(results)
                # Accuracy rows are appended after the ranking rows in the same file.
                for k in (3, 5, 7, 10):
                    write.writerow([f'acc{k}', acuracia(results, k)])
                write.writerow(['acc total', acuracia(results, len(results))])

