In [1]:
import numpy as np
import pandas as pd
import os
import csv
import random

from synergy_dataset import Dataset, iter_datasets
from sentence_transformers import SentenceTransformer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def cria_dataset(dataset):
    ds = Dataset(dataset)
    ds = ds.to_frame()
    ds = ds.fillna('')
    title = ds['title']
    abstract = ds['abstract']
    X = np.array([x[0] + ' ' + x[1] for x in zip(title, abstract)])
    y = np.array(ds['label_included'])

    return X, y

In [5]:
occ_models = ['OneClassSVM',
              'Isolation Forest',
              'Local Outlier Factor']

model_checkpoints = ['sentence-transformers/all-MiniLM-L6-v2', 
                     'all-distilroberta-v1', 
                     'sentence-transformers/allenai-specter']

datasets = ['Nelson_2002', 'Donners_2021', 'Oud_2018', 'van_der_Valk_2021']

In [7]:
X, y = cria_dataset('Hall_2012')
nu = len(y[y==1]) / len(y)
model = SentenceTransformer('sentence-transformers/allenai-specter')

y[y==0] = -1
k = 10

clf = IsolationForest(contamination=0.1, random_state=42)
labels_1_idx = [i for i, _ in enumerate(y) if y[i] == 1]
example_ids = np.random.choice(labels_1_idx, k, replace=False)
y_other = [l for i, l in enumerate(y) if i not in example_ids]
X_train = model.encode(X[example_ids])
X_test = model.encode([x for i, x in enumerate(X) if i not in example_ids])
clf.fit(X_train)

y_pred = clf.predict(X_test)
print(f'Treinamento com {k} amostras:')
print(classification_report(y_other, y_pred, labels=[-1, 1] , zero_division=0))

f = open('report.txt', 'w')
f.write('Title\n\nClassification Report\n\n{}\n'.format(test))
f.close()



Treinamento com 10 amostras:
              precision    recall  f1-score   support

          -1       1.00      0.98      0.99      8689
           1       0.25      0.74      0.37        94

    accuracy                           0.97      8783
   macro avg       0.62      0.86      0.68      8783
weighted avg       0.99      0.97      0.98      8783



IndexError: Replacement index 1 out of range for positional args tuple

In [15]:
y1 = [1, 1, 1, 0, 1]
y2 = [1, 0, 1, 1, 0]
test = classification_report(y1, y2, zero_division=0) 
print(test)
f = open('report.txt', 'a')
f.write(f'Treinamento com {10} amostras\n{test}\n\n')
f.close()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      0.50      0.57         4

    accuracy                           0.40         5
   macro avg       0.33      0.25      0.29         5
weighted avg       0.53      0.40      0.46         5



In [87]:
X, y = cria_dataset('Hall_2012')
nu = len(y[y==1]) / len(y)
model = SentenceTransformer('sentence-transformers/allenai-specter')
y[y==1] = -1
y[y==0] = 1

for k in range (1, 4):
    clf = IsolationForest(contamination=nu)
    labels_1_idx = [i for i, _ in enumerate(y) if y[i] == -1]
    example_ids = np.random.choice(labels_1_idx, k, replace=False)
    y_other = [l for i, l in enumerate(y) if i not in example_ids]
    X_train = model.encode(X[example_ids])
    X_test = model.encode([x for i, x in enumerate(X) if i not in example_ids])
    clf.fit(X_train)

    y_pred = clf.predict(X_test)
    print(f'Treinamento com {k} amostras:')
    print(classification_report(y_other, y_pred, labels=[-1, 1] , zero_division=0))



Treinamento com 1 amostras:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        79
           1       0.78      1.00      0.88       286

    accuracy                           0.78       365
   macro avg       0.39      0.50      0.44       365
weighted avg       0.61      0.78      0.69       365

Treinamento com 2 amostras:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        78
           1       0.79      1.00      0.88       286

    accuracy                           0.79       364
   macro avg       0.39      0.50      0.44       364
weighted avg       0.62      0.79      0.69       364

Treinamento com 3 amostras:
              precision    recall  f1-score   support

          -1       0.18      0.73      0.29        77
           1       0.58      0.10      0.17       286

    accuracy                           0.23       363
   macro avg       0.38      0.41      0.23 

In [20]:
X, y = cria_dataset('Moran_2021')

nu = len(y[y==1]) / len(y)
print(f'Percentual da classe positiva: {nu}')

model = SentenceTransformer('google-bert/bert-base-uncased')

X = model.encode(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
y_test[y_test==1] = -1
y_test[y_test==0] = 1

No sentence-transformers model found with name google-bert/bert-base-uncased. Creating a new one with MEAN pooling.


###One Class SVM

In [4]:
model_ocsvm = OneClassSVM(gamma='scale', nu=nu)

model_ocsvm.fit(X_train[y_train==0])

y_pred = model_ocsvm.predict(X_test)

print(f1_score(y_true=y_test, y_pred=y_pred, pos_label=-1))
print(classification_report(y_true=y_test, y_pred=y_pred, labels=[-1, 1]))

0.0
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        20
           1       1.00      0.95      0.97     10624

    accuracy                           0.95     10644
   macro avg       0.50      0.48      0.49     10644
weighted avg       1.00      0.95      0.97     10644



###Isolation Forest

In [6]:
model_ocif = IsolationForest(contamination=nu)

model_ocif.fit(X_train[y_train==0])

y_pred = model_ocif.predict(X_test)

print(f1_score(y_true=y_test, y_pred=y_pred, pos_label=-1))
print(classification_report(y_true=y_test, y_pred=y_pred, labels=[-1, 1]))

ValueError: X should be in csr_matrix format, got <class 'scipy.sparse._csc.csc_matrix'>

###Local Outlier Factor

In [7]:
model_oclof = LocalOutlierFactor(contamination=nu)

model_oclof.fit(X_train[y_train==0])

y_pred = model_ocif.predict(X_test)

print(f1_score(y_true=y_test, y_pred=y_pred, pos_label=-1))
print(classification_report(y_true=y_test, y_pred=y_pred, labels=[-1, 1]))

###Eliptic Envelope

In [68]:
model_ocee = EllipticEnvelope(contamination=nu)

model_ocee.fit(X_train[y_train==0])

y_pred = model_ocee.predict(X_test)

print(f1_score(y_true=y_test, y_pred=y_pred, pos_label=-1))
print(classification_report(y_true=y_test, y_pred=y_pred, labels=[-1, 1]))



KeyboardInterrupt: 