<a href="https://colab.research.google.com/github/nildodnjunior/mestrado_comp_ifes_dissertacao/blob/master/occ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the third-party packages this notebook imports (-q keeps pip quiet),
# then run the synergy_dataset command-line `get` step.
# NOTE(review): `get` presumably downloads the SYNERGY datasets locally and
# prompts about converting abstracts to plaintext — confirm against the CLI docs.
!pip install synergy-dataset -q
!pip install sentence-transformers -q
!pip install transformers -q
!python -m synergy_dataset get

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[?25h
Due to legal constraints, paper abstracts in SYNERGY cannot be published in
plaintext. Abstracts are instead stored as an inverted index. Inverted
indexes store information about each word in a body of text, including
the number of occurrences and the position of each occurrence. Read
more:
- https://learn.microsoft.com/en-us/academic-services/graph/resources-faq
- https://docs.openalex.org/api-entities/works/work-object

For machine learning purposes, it can be helpful to convert the inverted
abstract back into plaintext locally. Keep in mind that paper abstracts
in SYNERGY cannot be published as plaintext again. Therefore you can refer
to the version of the SYNERGY dataset.

Would you like to convert the inverted abstract to plaintext? ([

In [2]:
import numpy as np
import pandas as pd
import os
import csv
import random

from synergy_dataset import Dataset, iter_datasets
from sentence_transformers import SentenceTransformer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [3]:
def cria_dataset(dataset):
    """Load a SYNERGY dataset and return its texts and inclusion labels.

    Parameters
    ----------
    dataset : str
        Name passed to ``synergy_dataset.Dataset`` (e.g. ``'Nelson_2002'``).

    Returns
    -------
    X : numpy.ndarray
        One string per record: the title and the abstract joined by a single
        space. Missing values are replaced by ``' '`` before joining.
    y : numpy.ndarray
        The ``label_included`` column of the dataset, one entry per record.
    """
    records = Dataset(dataset).to_dict(variables=['title', 'abstract'])
    frame = pd.DataFrame.from_dict(records, orient='index').fillna(' ')

    # Concatenate title and abstract record by record.
    texts = []
    for titulo, resumo in zip(frame['title'], frame['abstract']):
        texts.append(titulo + ' ' + resumo)

    X = np.array(texts)
    # np.array makes an independent copy, so callers may relabel y in place.
    y = np.array(frame['label_included'])
    return X, y

In [4]:
# Mount Google Drive at /content/drive (Colab-only API). The experiment cell
# writes its classification reports under /content/drive/MyDrive/occ/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Names of the one-class classifiers compared in the experiments.
occ_classifiers = [
    'OneClassSVM',
    'Isolation Forest',
    'Local Outlier Factor',
]

# Sentence-transformer checkpoints considered for text encoding.
model_checkpoints = [
    'sentence-transformers/all-MiniLM-L6-v2',
    'all-distilroberta-v1',
    'sentence-transformers/allenai-specter',
]

# SYNERGY datasets used in the experiments.
# Wassenaar_2017 and Hall_2012 were included because they are imbalanced
# (1.4% and 1.2%), to check whether the OCC classifiers perform better on
# datasets of that kind.
datasets = [
    'Nelson_2002',
    'Muthu_2021',
    'Hall_2012',
    'Wassenaar_2017',
]

In [14]:
# One-class classification experiment.
# For every classifier and dataset, fit the classifier on k randomly drawn
# positive (label 1) records and evaluate it on all remaining records of the
# dataset. Each report is printed and appended to a text file on Drive.
SEED = 42                # fixes the sampled training records so runs are reproducible
OUTLIER_FRACTION = 0.1   # value used for `nu` / `contamination` in every classifier
OUTPUT_ROOT = '/content/drive/MyDrive/occ/'

model = SentenceTransformer('all-distilroberta-v1')
rng = np.random.default_rng(SEED)

# dataset name -> (embeddings, labels). Encoding is by far the most expensive
# step and does not depend on the classifier or on k, so it is done once per
# dataset instead of once per (classifier, k) combination.
_encoded_datasets = {}


def _make_classifier(clf_name):
    """Return a fresh, unfitted one-class classifier for `clf_name`.

    Raises
    ------
    ValueError
        If `clf_name` is not one of the supported classifier names, so that a
        typo cannot silently reuse a classifier from a previous iteration.
    """
    if clf_name == 'OneClassSVM':
        return OneClassSVM(gamma='scale', nu=OUTLIER_FRACTION)
    if clf_name == 'Isolation Forest':
        return IsolationForest(contamination=OUTLIER_FRACTION, random_state=SEED)
    if clf_name == 'Local Outlier Factor':
        # novelty=True is required to call predict() on unseen data.
        return LocalOutlierFactor(contamination=OUTLIER_FRACTION, novelty=True)
    raise ValueError(f'Unknown one-class classifier: {clf_name!r}')


def _load_encoded(dataset):
    """Return `(embeddings, y)` for `dataset`, encoding the texts only once.

    `y` uses the scikit-learn outlier convention: 1 for included records and
    -1 for excluded ones (label 0 in the raw data).
    """
    if dataset not in _encoded_datasets:
        X, y = cria_dataset(dataset)
        y = np.where(y == 0, -1, y)
        _encoded_datasets[dataset] = (model.encode(X.tolist()), y)
    return _encoded_datasets[dataset]


for clf_name in occ_classifiers:
    print('\n\n' + clf_name)

    folder = os.path.join(OUTPUT_ROOT, clf_name)
    os.makedirs(folder, exist_ok=True)

    for dataset in datasets:
        print(dataset)
        embeddings, y = _load_encoded(dataset)
        positive_idx = np.flatnonzero(y == 1)

        # k was first varied from 1 to 10, but the F1 score only rose above
        # zero for k > 5 and changed very little between iterations. It is
        # therefore varied from 5 to 20 training samples in steps of 5.
        for k in range(5, 21, 5):
            if k > len(positive_idx):
                # Sampling without replacement would fail; skip this setting.
                print(f'{dataset} tem apenas {len(positive_idx)} amostras positivas; '
                      f'pulando k={k}.')
                continue

            clf = _make_classifier(clf_name)

            # Boolean mask: True for the k positives used for training,
            # False for every record kept for evaluation.
            train_mask = np.zeros(len(y), dtype=bool)
            train_mask[rng.choice(positive_idx, k, replace=False)] = True

            X_train = embeddings[train_mask]
            X_test = embeddings[~train_mask]
            y_other = y[~train_mask]

            clf.fit(X_train)
            y_pred = clf.predict(X_test)

            print(f'Treinamento com {k} amostras:')
            cr = classification_report(y_other, y_pred, labels=[-1, 1] , zero_division=0)
            print(cr)

            # Append mode: reports from every k (and from earlier runs) are
            # accumulated in the same file.
            with open(f'{folder}/{clf_name} - {dataset}.txt', 'a') as f:
                f.write(f'\n\nTreinamento com {k} amostras\n{cr}')





OneClassSVM
Nelson_2002
Treinamento com 5 amostras:
              precision    recall  f1-score   support

          -1       0.79      1.00      0.88       286
           1       0.00      0.00      0.00        75

    accuracy                           0.79       361
   macro avg       0.40      0.50      0.44       361
weighted avg       0.63      0.79      0.70       361

Treinamento com 10 amostras:
              precision    recall  f1-score   support

          -1       0.82      0.94      0.88       286
           1       0.41      0.17      0.24        70

    accuracy                           0.79       356
   macro avg       0.62      0.56      0.56       356
weighted avg       0.74      0.79      0.75       356

Treinamento com 15 amostras:
              precision    recall  f1-score   support

          -1       0.84      0.95      0.89       286
           1       0.50      0.22      0.30        65

    accuracy                           0.81       351
   macro avg    



Treinamento com 5 amostras:
              precision    recall  f1-score   support

          -1       0.94      0.41      0.57       286
           1       0.29      0.91      0.43        75

    accuracy                           0.51       361
   macro avg       0.61      0.66      0.50       361
weighted avg       0.81      0.51      0.54       361





Treinamento com 10 amostras:
              precision    recall  f1-score   support

          -1       0.92      0.45      0.61       286
           1       0.27      0.84      0.41        70

    accuracy                           0.53       356
   macro avg       0.60      0.65      0.51       356
weighted avg       0.79      0.53      0.57       356





Treinamento com 15 amostras:
              precision    recall  f1-score   support

          -1       0.96      0.46      0.62       286
           1       0.28      0.92      0.43        65

    accuracy                           0.54       351
   macro avg       0.62      0.69      0.52       351
weighted avg       0.84      0.54      0.59       351

Treinamento com 20 amostras:
              precision    recall  f1-score   support

          -1       0.99      0.37      0.54       286
           1       0.25      0.98      0.40        60

    accuracy                           0.48       346
   macro avg       0.62      0.68      0.47       346
weighted avg       0.86      0.48      0.52       346

Muthu_2021




Treinamento com 5 amostras:
              precision    recall  f1-score   support

          -1       0.97      0.45      0.61      2383
           1       0.18      0.89      0.30       331

    accuracy                           0.50      2714
   macro avg       0.57      0.67      0.46      2714
weighted avg       0.87      0.50      0.57      2714





Treinamento com 10 amostras:
              precision    recall  f1-score   support

          -1       0.96      0.47      0.63      2383
           1       0.18      0.87      0.31       326

    accuracy                           0.52      2709
   macro avg       0.57      0.67      0.47      2709
weighted avg       0.87      0.52      0.59      2709





Treinamento com 15 amostras:
              precision    recall  f1-score   support

          -1       1.00      0.14      0.25      2383
           1       0.14      1.00      0.24       321

    accuracy                           0.24      2704
   macro avg       0.57      0.57      0.24      2704
weighted avg       0.89      0.24      0.24      2704

Treinamento com 20 amostras:
              precision    recall  f1-score   support

          -1       0.98      0.45      0.62      2383
           1       0.18      0.92      0.31       316

    accuracy                           0.51      2699
   macro avg       0.58      0.69      0.46      2699
weighted avg       0.89      0.51      0.58      2699

Hall_2012




Treinamento com 5 amostras:
              precision    recall  f1-score   support

          -1       1.00      0.99      1.00      8689
           1       0.57      0.60      0.58        99

    accuracy                           0.99      8788
   macro avg       0.78      0.80      0.79      8788
weighted avg       0.99      0.99      0.99      8788





Treinamento com 10 amostras:
              precision    recall  f1-score   support

          -1       1.00      0.99      1.00      8689
           1       0.53      0.72      0.61        94

    accuracy                           0.99      8783
   macro avg       0.76      0.86      0.80      8783
weighted avg       0.99      0.99      0.99      8783





Treinamento com 15 amostras:
              precision    recall  f1-score   support

          -1       1.00      0.99      0.99      8689
           1       0.41      0.87      0.56        89

    accuracy                           0.99      8778
   macro avg       0.71      0.93      0.78      8778
weighted avg       0.99      0.99      0.99      8778

Treinamento com 20 amostras:
              precision    recall  f1-score   support

          -1       1.00      0.98      0.99      8689
           1       0.34      0.89      0.50        84

    accuracy                           0.98      8773
   macro avg       0.67      0.94      0.74      8773
weighted avg       0.99      0.98      0.99      8773

Wassenaar_2017




Treinamento com 5 amostras:
              precision    recall  f1-score   support

          -1       1.00      0.83      0.90      7557
           1       0.07      0.89      0.12       106

    accuracy                           0.83      7663
   macro avg       0.53      0.86      0.51      7663
weighted avg       0.99      0.83      0.89      7663





Treinamento com 10 amostras:
              precision    recall  f1-score   support

          -1       1.00      0.83      0.91      7557
           1       0.06      0.79      0.11       101

    accuracy                           0.83      7658
   macro avg       0.53      0.81      0.51      7658
weighted avg       0.98      0.83      0.89      7658





Treinamento com 15 amostras:
              precision    recall  f1-score   support

          -1       1.00      0.80      0.89      7557
           1       0.05      0.88      0.10        96

    accuracy                           0.80      7653
   macro avg       0.53      0.84      0.49      7653
weighted avg       0.99      0.80      0.88      7653

Treinamento com 20 amostras:
              precision    recall  f1-score   support

          -1       1.00      0.67      0.80      7557
           1       0.03      0.96      0.06        91

    accuracy                           0.67      7648
   macro avg       0.52      0.81      0.43      7648
weighted avg       0.99      0.67      0.79      7648



[5, 10, 15, 20]

In [None]:
# for model_encoding in model_checkpoints:
#     print('\n\n' + model_encoding)
#     for model_name in occ_models:
#         print('\n\n' + model_name)
#         for dataset in datasets:
#             folder_model = model_encoding.split("/")[-1]
#             print(dataset)

#             X, y = cria_dataset(dataset)

#             nu = len(y[y==1]) / len(y)

#             if model_name == 'OneClassSVM':
#                 model = OneClassSVM(gamma='scale', nu=nu)
#             elif model_name == 'Isolation Forest':
#                 model = IsolationForest(contamination=nu)
#             elif model_name == 'Local Outlier Factor':
#                 model = LocalOutlierFactor(contamination=nu, novelty=True)

#             enc = SentenceTransformer(model_encoding)

#             X = enc.encode(X)

#             X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#             y_test[y_test==1] = -1
#             y_test[y_test==0] = 1

#             model.fit(X_train[y_train==1])

#             y_pred = model.predict(X_test)
#             print(classification_report(y_true=y_test, y_pred=y_pred, labels=[-1, 1] , zero_division=0))

#             if not os.path.exists(os.path.join('/content/drive/MyDrive/occ/sentence_transformers', folder_model)):
#                 os.makedirs(os.path.join('/content/drive/MyDrive/occ/sentence_transformers', folder_model))
#             if not os.path.exists(os.path.join('/content/drive/MyDrive/occ/sentence_transformers', folder_model, model_name)):
#                 os.makedirs(os.path.join('/content/drive/MyDrive/occ/sentence_transformers', folder_model, model_name))
#             folder = os.path.join('/content/drive/MyDrive/occ/sentence_transformers', folder_model, model_name)

#             df = pd.DataFrame(classification_report(y_true=y_test, y_pred=y_pred, labels=[-1, 1] , zero_division=0, output_dict=True)).transpose()
#             df.to_csv(f'{folder}/{dataset}.csv')

### Feature extraction usando TF-IDF

In [None]:
# # X, y = cria_dataset('Brouwer_2019')
# X, y = cria_dataset('Leenaars_2020')

# nu = len(y[y==1]) / len(y)
# print(f'Percentual da classe positiva: {nu}')

# vectorizer = TfidfVectorizer()
# X_vec = vectorizer.fit_transform(X)

# X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.3, random_state=42, stratify=y)
# y_test[y_test==1] = -1
# y_test[y_test==0] = 1

### Feature extraction usando sentence-transformers

In [None]:
# # X, y = cria_dataset('Moran_2021')
# X, y = cria_dataset('Brouwer_2019')

# nu = len(y[y==1]) / len(y)
# print(f'Percentual da classe positiva: {nu}')

# model = SentenceTransformer('google-bert/bert-base-uncased')

# X = model.encode(X)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# y_test[y_test==1] = -1
# y_test[y_test==0] = 1

### OneClassSVM

In [None]:
# model_ocsvm = OneClassSVM(gamma='scale', nu=nu)

# model_ocsvm.fit(X_train[y_train==1])

# y_pred = model_ocsvm.predict(X_test)

# print('F1 Score: %.3f' % f1_score(y_test, y_pred, pos_label=1))
# print(classification_report(y_true=y_test, y_pred=y_pred, labels=[-1, 1]))

### Isolation Forest

In [None]:
# model_ocif = IsolationForest(contamination=nu)

# model_ocif.fit(X_train[y_train==1])

# y_pred = model_ocif.predict(X_test)

# print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, pos_label=1))
# print(classification_report(y_true=y_test, y_pred=y_pred, labels=[-1, 1]))

### Local Outlier Factor

In [None]:
# model_oclof = LocalOutlierFactor(contamination=nu, novelty=True)

# model_oclof.fit(X_train[y_train==1])

# y_pred = model_oclof.predict(X_test)

# print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, pos_label=1))
# print(classification_report(y_true=y_test, y_pred=y_pred, labels=[-1, 1]))

### Elliptic Envelope

In [None]:
# model_ocee = EllipticEnvelope(contamination=nu)

# model_ocee.fit(X_train[y_train==1].toarray())

# y_pred = model_ocee.predict(X_test)

# print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, pos_label=1))
# print(classification_report(y_true=y_test, y_pred=y_pred, labels=[-1, 1]))