<a href="https://colab.research.google.com/github/nildodnjunior/mestrado_comp_ifes_dissertacao/blob/master/cosine_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the SYNERGY dataset package and the embedding libraries used below
# (-q keeps pip's output short).
!pip install synergy-dataset -q
!pip install sentence-transformers -q
!pip install transformers -q
# Run the synergy_dataset "get" command. The recorded output of this cell shows it
# printing a legal notice and asking whether to convert the inverted abstracts to
# plaintext, so the cell expects an interactive answer.
!python -m synergy_dataset get

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25h
Due to legal constraints, paper abstracts in SYNERGY cannot be published in
plaintext. Abstracts are instead stored as an inverted index. Inverted
indexes store information about each word in a body of text, including
the number of occurrences and the position of each occurrence. Read
more:
- https://learn.microsoft.com/en-us/academic-services/graph/resources-faq
- https://docs.openalex.org/api-entities/works/work-object

For machine learning purposes, it can be helpful to convert the inverted
abstract back into plaintext locally. Keep in mind that paper abstracts
in SYNERGY cannot be published as plaintext again. Therefore you can refer
to the version of the SYNERGY dataset.

Would you like to convert the inverted abstract to plaintext? ([

In [2]:
import numpy as np
import pandas as pd
import os
import csv

from synergy_dataset import Dataset, iter_datasets
from sentence_transformers import SentenceTransformer, util

In [3]:
def acuracia(lista, n):
    """Return the fraction of the first ``n`` entries of ``lista`` labelled 1.

    This is a precision-at-n style score: the caller is expected to pass the
    list already ordered (most relevant first).

    Args:
        lista: sequence of ``(label, score)`` pairs; only the label is read.
        n: number of leading entries to inspect. If ``lista`` has fewer than
            ``n`` entries, the denominator is the number actually available.

    Returns:
        A float between 0 and 1. Returns 0.0 when there is nothing to inspect
        (empty ``lista`` or ``n == 0``) instead of raising ZeroDivisionError.
    """
    top = lista[:n]
    # Guard the empty slice: without it the division below fails.
    if len(top) == 0:
        return 0.0
    hits = sum(1 for label, _ in top if label == 1)
    return hits / len(top)

def chunks(l, n):
    """Yield every run of ``n`` consecutive items of ``l`` (a sliding window).

    For example, ``[1, 2, 3, 4]`` with ``n=2`` yields ``[1, 2]``, ``[2, 3]``
    and ``[3, 4]``. Windows overlap; trailing positions that cannot supply a
    full window of ``n`` items are skipped.

    Args:
        l: sliceable sequence to walk over.
        n: window length.

    Yields:
        Slices of ``l`` of length exactly ``n``, in order of start position.
    """
    for start in range(len(l)):
        # Slice once and reuse it for both the length check and the yield.
        window = l[start:start + n]
        if len(window) == n:
            yield window

In [4]:
def cria_dataset(dataset):
    """Load a SYNERGY dataset and return its texts and inclusion labels.

    Args:
        dataset: dataset name passed to ``Dataset``.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` holds one string per record,
        built as ``title + ' ' + abstract``; ``y`` holds the values of the
        ``label_included`` column.
    """
    # Missing values become empty strings so the concatenation below never sees NaN.
    frame = Dataset(dataset).to_frame().fillna('')

    textos = [
        titulo + ' ' + resumo
        for titulo, resumo in zip(frame['title'], frame['abstract'])
    ]

    return np.array(textos), np.array(frame['label_included'])

In [5]:
# Sentence-embedding checkpoints to compare; each entry is passed to SentenceTransformer.
model_checkpoints = [
    'sentence-transformers/all-MiniLM-L6-v2',
    'all-distilroberta-v1',
    'sentence-transformers/allenai-specter',
]

# Names of the SYNERGY datasets evaluated with every checkpoint.
datasets = [
    'Nelson_2002',
    'Donners_2021',
    'Oud_2018',
    'van_der_Valk_2021',
]

In [6]:
# Mount Google Drive at /content/drive (Google Colab only); the experiment loop
# writes its result files below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder (on the mounted Drive) that receives one sub-folder per model and dataset.
output_root = '/content/drive/MyDrive/cosine_similarity/'

for model_checkpoint in model_checkpoints:
    model = SentenceTransformer(model_checkpoint)
    # Keep only the part after the last "/" so the checkpoint can be used as a folder name.
    folder_model = model_checkpoint.split("/")[-1]
    print(f'\nModelo: {model_checkpoint}')
    for dataset in datasets:
        print(f'\nDataset: {dataset}')
        X, y = cria_dataset(dataset)

        # exist_ok=True creates the whole model/dataset path in one call and is
        # a no-op when the folders are already there (safe to re-run).
        folder = os.path.join(output_root, folder_model, dataset)
        os.makedirs(folder, exist_ok=True)

        # Encode every record ONCE per (model, dataset). The embeddings do not
        # depend on n or on the chosen example group, so re-encoding the whole
        # corpus for each group only multiplied the run time.
        X_embedded_all = model.encode(list(X))

        # Indices of all records with label == 1 (independent of n).
        labels_1_idx = [i for i, label in enumerate(y) if label == 1]

        # Use groups of n positive examples as the query, for n from 1 to 5.
        for n in range(1, 6):
            with open(f'{folder}/{folder_model} - {dataset} - {str(n).rjust(2, "0")} exemplos.txt', 'w+') as f:

                print(f'\nUsando grupos de {n} exemplo(s):')

                # Slide over the positive indices in windows of size n.
                for index, example_ids in enumerate(chunks(labels_1_idx, n)):

                    # Encode the n selected samples joined into a single text.
                    # NOTE(review): for larger n the joined text may exceed the
                    # model's maximum input length and be truncated — confirm.
                    abstract_emb = model.encode(' '.join(X[example_ids]))

                    # Boolean mask selecting every record that is NOT part of the query group.
                    keep = np.ones(len(X), dtype=bool)
                    keep[example_ids] = False

                    # One vectorised call: cos_sim returns a (1, m) matrix for a
                    # single query against m candidates; row 0 holds the scores.
                    similarities = util.cos_sim(abstract_emb, X_embedded_all[keep])[0].tolist()

                    # Pair each remaining label with its score and rank by similarity (highest first).
                    results = list(zip(y[keep], similarities))
                    results.sort(key=lambda x: x[1], reverse=True)

                    f.write(f'\nGrupo {index+1}: \n')
                    f.write(f'Acc1: {acuracia(results, 1)}\n')
                    f.write(f'Acc2: {acuracia(results, 2)}\n')
                    f.write(f'Acc3: {acuracia(results, 3)}\n')
                    f.write(f'Acc5: {acuracia(results, 5)}\n')
                    f.write(f'Acc10: {acuracia(results, 10)}\n')
                    # Also record the ten most similar (label, similarity) pairs.
                    for result in results[:10]:
                        f.write(str(result)+'\n')
                    f.write('\n')




Modelo: sentence-transformers/all-MiniLM-L6-v2

Dataset: Nelson_2002

Usando grupos de 1 exemplo(s):

Usando grupos de 2 exemplo(s):

Usando grupos de 3 exemplo(s):

Usando grupos de 4 exemplo(s):

Usando grupos de 5 exemplo(s):

Dataset: Donners_2021

Usando grupos de 1 exemplo(s):

Usando grupos de 2 exemplo(s):

Usando grupos de 3 exemplo(s):

Usando grupos de 4 exemplo(s):

Usando grupos de 5 exemplo(s):

Dataset: Oud_2018

Usando grupos de 1 exemplo(s):

Usando grupos de 2 exemplo(s):

Usando grupos de 3 exemplo(s):

Usando grupos de 4 exemplo(s):

Usando grupos de 5 exemplo(s):

Dataset: van_der_Valk_2021

Usando grupos de 1 exemplo(s):

Usando grupos de 2 exemplo(s):

Usando grupos de 3 exemplo(s):

Usando grupos de 4 exemplo(s):

Usando grupos de 5 exemplo(s):


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Modelo: all-distilroberta-v1

Dataset: Nelson_2002

Usando grupos de 1 exemplo(s):

Usando grupos de 2 exemplo(s):

Usando grupos de 3 exemplo(s):

Usando grupos de 4 exemplo(s):

Usando grupos de 5 exemplo(s):

Dataset: Donners_2021

Usando grupos de 1 exemplo(s):

Usando grupos de 2 exemplo(s):

Usando grupos de 3 exemplo(s):

Usando grupos de 4 exemplo(s):

Usando grupos de 5 exemplo(s):

Dataset: Oud_2018

Usando grupos de 1 exemplo(s):

Usando grupos de 2 exemplo(s):

Usando grupos de 3 exemplo(s):

Usando grupos de 4 exemplo(s):

Usando grupos de 5 exemplo(s):

Dataset: van_der_Valk_2021

Usando grupos de 1 exemplo(s):

Usando grupos de 2 exemplo(s):

Usando grupos de 3 exemplo(s):


In [None]:
# for model_checkpoint in model_checkpoints:
#     model = SentenceTransformer(model_checkpoint)
#     folder_model = model_checkpoint.split("/")[-1]
#     for dataset in datasets:
#         X, y = cria_dataset(dataset)

#         X_embedded = model.encode(X)
#         labels_1_idx = [i for i, _ in enumerate(y) if y[i] == 1]

#         for index, example_id in enumerate(labels_1_idx):
#             label = y[example_id]
#             abstract_emb = X_embedded[example_id]
#             list_other_labels = [l for i, l in enumerate(y) if i != example_id]
#             list_other_abstracts = [x for i, x in enumerate(X_embedded) if i != example_id]
#             results = []
#             for other_label, other_abstract_emb in zip(list_other_labels, list_other_abstracts):
#                 similarity = util.cos_sim(abstract_emb, other_abstract_emb)
#                 results.append((other_label, similarity.item()))
#             results.sort(key=lambda x: x[1], reverse=True)

#             if not os.path.exists(os.path.join('/content/drive/MyDrive/cosine_similarity/', folder_model)):
#                 os.makedirs(os.path.join('/content/drive/MyDrive/cosine_similarity/', folder_model))
#             if not os.path.exists(os.path.join('/content/drive/MyDrive/cosine_similarity/', folder_model, dataset)):
#                 os.makedirs(os.path.join('/content/drive/MyDrive/cosine_similarity/', folder_model, dataset))
#             folder = os.path.join('/content/drive/MyDrive/cosine_similarity/', folder_model, dataset)
#             with open(f'{folder}/{folder_model} - {dataset} - {str(index).rjust(3, "0")}.csv', 'w+') as f:
#                 write = csv.writer(f)
#                 write.writerow(['acc3', acuracia(results, 3)])
#                 write.writerow(['acc5', acuracia(results, 5)])
#                 write.writerow(['acc7', acuracia(results, 7)])
#                 write.writerow(['acc10', acuracia(results, 10)])
#                 write.writerow(['acc total', acuracia(results, len(results))])
#                 write.writerow(['label', 'similarity'])
#                 write.writerows(results)

In [None]:
# dataset = Dataset('Nelson_2002')

# dataset = dataset.to_frame()
# dataset = dataset.dropna()

# title = np.array(dataset['title'])
# abstract = np.array(dataset['abstract'])
# X = np.array([x[0] + ' ' + x[1] for x in zip(title, abstract)])
# y = np.array(dataset['label_included'])

In [None]:
# model = SentenceTransformer(model_checkpoint)

In [None]:
# X_embedded = model.encode(X)

In [None]:
# print(len(y), '-', len(X), '-', len(X_embedded))

In [None]:
# labels_1_idx = [i for i, _ in enumerate(y) if y[i] == 1]

# for example_id in labels_1_idx:

#   label = y[example_id]

#   abstract_emb = X_embedded[example_id]

#   list_other_labels = [l for i, l in enumerate(y) if i != example_id]
#   list_other_abstracts = [x for i, x in enumerate(X_embedded) if i != example_id]

#   results = []

#   for other_label, other_abstract_emb in zip(list_other_labels, list_other_abstracts):

#     similarity = util.cos_sim(abstract_emb, other_abstract_emb)

#     results.append((other_label, similarity.item()))

#   results.sort(key=lambda x: x[1], reverse=True)

#   print(results)

#   break

In [None]:
# df = pd.DataFrame(results, columns=['label', 'similarity'])