In [1]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import auc
from sklearn.svm import OneClassSVM

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import os, json

In [2]:
#preprocess
def get_data(hierClass, outlier):
    """Build one leave-one-class-out split for a hierarchical branch.

    Training rows are those whose ``hierClass`` column equals ``hierClass``,
    minus the rows whose ``classALeRCE`` equals ``outlier``; the held-out rows
    are appended to the test set instead. Test rows are those whose
    ``hierPred`` column equals ``hierClass``.

    The feature columns (names loaded from ``features_RF_model.pkl``) are
    passed through a ``QuantileTransformer`` fitted on the training rows only;
    NaNs remaining after the transform are replaced by 0.

    Parameters
    ----------
    hierClass : value compared against the ``hierClass`` / ``hierPred`` columns.
    outlier : value compared against the ``classALeRCE`` column (held-out class).

    Returns
    -------
    train : ndarray
        Transformed training features.
    test_features : ndarray
        Transformed test features.
    test_labels : ndarray of int8, 1-D
        0 = inlier, 1 = Type1 (``hierClass`` differs from ``hierPredtmp``),
        2 = Type2 (``classALeRCE == outlier``). Type2 takes precedence.
    """
    feature_list = pd.read_pickle('../../data_raw/features_RF_model.pkl')
    train = pd.read_pickle('../../data/train_data_filtered.pkl')
    test = pd.read_pickle('../../data/test_data_filtered.pkl')

    # .copy() makes the filtered frames independent, so adding a column below
    # does not trigger pandas' SettingWithCopyWarning.
    train = train[train.hierClass == hierClass].copy()
    train['hierPredtmp'] = train['hierClass']

    test = test[test['hierPred'] == hierClass].copy()
    test['hierPredtmp'] = test['hierPred']

    # Move the held-out class from the training pool into the test set.
    test = pd.concat([test, train[train.classALeRCE == outlier]], sort=False)
    train = train[train.classALeRCE != outlier]

    train = train[feature_list]
    # Asking for more quantiles than samples only makes scikit-learn warn and
    # clip the value, so clip it here and keep the output clean.
    scaler = QuantileTransformer(n_quantiles=max(1, min(1000, len(train))))
    scaler.fit(train)
    train = scaler.transform(train)
    train[np.isnan(train)] = 0  # NaN to 0.

    test_features = test[feature_list]
    test_features = scaler.transform(test_features)
    test_features[np.isnan(test_features)] = 0

    is_type1 = (test['hierClass'] != test['hierPredtmp']).values
    is_type2 = (test['classALeRCE'] == outlier).values
    test_labels = np.zeros(len(test), dtype='int8')  # Inlier:0
    test_labels[is_type1] = 1  # Type1:1
    test_labels[is_type2] = 2  # Type2:2 (overrides Type1)
    return train, test_features, test_labels

In [3]:
#utils 
def save_metrics(metrics, root_dir, mode='val'):
    """Write ``metrics`` as JSON to ``<root_dir>/metrics_<mode>.json``.

    An existing file with the same name is overwritten.
    """
    file_name = 'metrics_{}.json'.format(mode)
    target_path = os.path.join(root_dir, file_name)
    with open(target_path, 'w') as handle:
        json.dump(metrics, handle)

def plot_histogram(in_scores, out1_scores, out2_scores, directory):
    """Overlay density histograms of the three score groups and save the figure.

    The image is written to ``<directory>/plots/histogram.png`` and the
    current matplotlib figure is closed afterwards.
    """
    groups = (
        (in_scores, 'k', 'Inlier'),
        (out1_scores, 'b', 'Outlier1'),
        (out2_scores, 'purple', 'Outlier2'),
    )
    for values, colour, name in groups:
        plt.hist(values, color=colour, alpha=0.3, density=True, label=name)
    plt.title('Inliers vs Outliers (OCSVM)')
    plt.legend()
    image_path = '{}/plots/histogram.png'.format(directory)
    plt.savefig(image_path)
    plt.close()
    
def compute_metrics(scores, labels, plot_hist=True, directory=None):
    """
    Compute ROC and PR areas for inliers against each outlier group.

    ``scores`` is split by ``labels``: 0 -> inliers, 2 -> "Out1", 1 -> "Out2".
    The "Out12" metrics use both outlier groups concatenated. When
    ``plot_hist`` is true the three groups are also drawn via
    ``plot_histogram`` (which writes under ``directory``).

    Returns a dict with keys 'AU ROC Out1', 'AU PR Out1', 'AU ROC Out2',
    'AU PR Out2', 'AU ROC Out12' and 'AU PR Out12'.
    """
    inlier_scores = scores[labels == 0]
    # NOTE(review): get_data's comments call label 1 "Type1" and label 2
    # "Type2", yet label 2 is reported here as Out1 and label 1 as Out2 --
    # confirm the naming is intended.
    first_group = scores[labels == 2]
    second_group = scores[labels == 1]
    combined = np.concatenate((first_group, second_group), axis=0)

    roc_1, pr_1 = compute_roc_pr(inlier_scores, first_group)
    roc_2, pr_2 = compute_roc_pr(inlier_scores, second_group)
    roc_12, pr_12 = compute_roc_pr(inlier_scores, combined)

    metrics = {}
    metrics['AU ROC Out1'] = roc_1
    metrics['AU PR Out1'] = pr_1
    metrics['AU ROC Out2'] = roc_2
    metrics['AU PR Out2'] = pr_2
    metrics['AU ROC Out12'] = roc_12
    metrics['AU PR Out12'] = pr_12

    if plot_hist:
        plot_histogram(inlier_scores, first_group, second_group, directory)
    return metrics

def compute_roc_pr(inliers_scores, outlier_scores):
    """Return the pair ``(auroc, aupr)`` for the given inlier/outlier scores."""
    return (
        auroc(inliers_scores, outlier_scores),
        aupr(inliers_scores, outlier_scores),
    )

def auroc(in_scores, out_scores):
    """Area under the ROC curve, treating low scores as outlier evidence.

    The range ``[min, max)`` of all scores is swept with thresholds spaced
    ``(max - min) / 100000`` apart. At each threshold the true-positive rate
    is the fraction of ``out_scores`` strictly below it and the
    false-positive rate is the fraction of ``in_scores`` at or below it.
    The resulting curve is integrated with ``auc``.

    Parameters
    ----------
    in_scores, out_scores : array-like of float
        Scores of inlier and outlier samples.

    Returns
    -------
    float
        Area under the (fpr, tpr) curve.
    """
    scores = np.concatenate((in_scores, out_scores), axis=0)
    start = np.min(scores)
    end = np.max(scores)
    gap = (end - start) / 100000
    thresholds = np.arange(start, end, gap)  # half-open: `end` itself is not used

    # Sorting once and binary-searching every threshold gives the same counts
    # as comparing the whole array against each threshold in a Python loop.
    sorted_out = np.sort(np.ravel(out_scores))
    sorted_in = np.sort(np.ravel(in_scores))

    # NOTE(review): outliers are counted with strict '<' (side='left') but
    # inliers with '<=' (side='right'); confirm this tie handling is intended.
    # Builtin float replaces the np.float alias, which NumPy >= 1.24 removed.
    tprs = np.searchsorted(sorted_out, thresholds, side='left') / float(len(out_scores))
    fprs = np.searchsorted(sorted_in, thresholds, side='right') / float(len(in_scores))
    return auc(fprs, tprs)

def aupr(in_scores, out_scores):
    """Area under the precision-recall curve, treating low scores as outliers.

    The range ``[min, max)`` of all scores is swept with thresholds spaced
    ``(max - min) / 100000`` apart. At each threshold, samples with a score at
    or below it are flagged: flagged outliers are true positives and flagged
    inliers are false positives. Thresholds that flag nothing are skipped.
    The (recall, precision) curve is integrated with ``auc``.

    Parameters
    ----------
    in_scores, out_scores : array-like of float
        Scores of inlier and outlier samples.

    Returns
    -------
    float
        Area under the (recall, precision) curve.
    """
    scores = np.concatenate((in_scores, out_scores), axis=0)
    start = np.min(scores)
    end = np.max(scores)
    gap = (end - start) / 100000
    thresholds = np.arange(start, end, gap)  # half-open: `end` itself is not used

    # side='right' counts the scores that are <= each threshold; sorting once
    # replaces a full array comparison per threshold.
    tp = np.searchsorted(np.sort(np.ravel(out_scores)), thresholds, side='right')
    fp = np.searchsorted(np.sort(np.ravel(in_scores)), thresholds, side='right')

    flagged = tp + fp
    has_flagged = flagged > 0  # precision is undefined when nothing is flagged
    precisions = tp[has_flagged] / flagged[has_flagged]
    # Builtin float replaces the np.float alias, which NumPy >= 1.24 removed.
    recalls = tp[has_flagged] / float(len(out_scores))
    return auc(recalls, precisions)

def print_metrics(metrics, directory):
    """Print every metric as ``name: value`` (three decimals), then a separator line.

    ``directory`` is accepted but not used.
    """
    for name in metrics:
        line = "{}: {:.3f}".format(name, metrics[name])
        print(line)
    print("##########################################")

In [4]:
def train(hierClass, outliers, train_features, directory):
    """Fit a one-class SVM on ``train_features`` and pickle it to disk.

    Parameters
    ----------
    hierClass, outliers : accepted but not used by this function.
    train_features : ndarray
        Training matrix. Its rows are shuffled IN PLACE with NumPy's global
        (unseeded here) random generator before fitting.
    directory : str
        Existing folder; the model is written to ``<directory>/model.pkl``.

    Returns
    -------
    The fitted ``OneClassSVM`` (RBF kernel, ``nu=0.001``).
    """
    np.random.shuffle(train_features)
    clf = OneClassSVM(kernel='rbf', nu=0.001).fit(train_features)
    # The context manager closes the file even if pickling raises; passing a
    # bare open() to pickle.dump left the handle open.
    with open('{}/model.pkl'.format(directory), 'wb') as model_file:
        pickle.dump(clf, model_file)
    return clf

def test(model, test_features, test_labels, directory):
    """Score the test set with ``model`` and report the resulting metrics.

    Scores come from ``model.score_samples``; the metrics (and the histogram
    plot) are produced by ``compute_metrics``, printed with ``print_metrics``
    and written by ``save_metrics`` with mode ``'test'``. Returns nothing.
    """
    sample_scores = model.score_samples(test_features)
    results = compute_metrics(
        sample_scores,
        test_labels,
        plot_hist=True,
        directory=directory,
    )
    print_metrics(results, directory)
    save_metrics(results, directory, mode='test')

## Transient Experiments

In [5]:
# Leave-one-class-out runs for the Transient branch: each class in `outliers`
# is held out in turn, and every configuration is repeated five times.
hierClass = 'Transient'
outliers = ['SLSN',
            'SNII',
            'SNIa',
            'SNIbc']

for outlier in outliers:
    for run in range(5):
        directory = '../experiments/OCSVM_{}_{}_run{}'.format(hierClass, outlier, run)
        plots_dir = '{}/plots'.format(directory)
        # Creating the nested plots folder also creates `directory`;
        # exist_ok=True makes re-running the cell safe without a prior
        # existence check.
        os.makedirs(plots_dir, exist_ok=True)

        train_features, test_features, test_labels = get_data(hierClass, outlier)
        model = train(hierClass, outlier, train_features, directory)
        test(model, test_features, test_labels, directory)

  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.615
AU PR Out1: 0.256
AU ROC Out2: 0.836
AU PR Out2: 0.297
AU ROC Out12: 0.696
AU PR Out12: 0.400
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.614
AU PR Out1: 0.256
AU ROC Out2: 0.835
AU PR Out2: 0.297
AU ROC Out12: 0.695
AU PR Out12: 0.399
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.614
AU PR Out1: 0.255
AU ROC Out2: 0.835
AU PR Out2: 0.297
AU ROC Out12: 0.695
AU PR Out12: 0.399
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.614
AU PR Out1: 0.255
AU ROC Out2: 0.835
AU PR Out2: 0.297
AU ROC Out12: 0.695
AU PR Out12: 0.399
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.615
AU PR Out1: 0.256
AU ROC Out2: 0.835
AU PR Out2: 0.297
AU ROC Out12: 0.695
AU PR Out12: 0.399
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.694
AU PR Out1: 0.748
AU ROC Out2: 0.894
AU PR Out2: 0.506
AU ROC Out12: 0.707
AU PR Out12: 0.774
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.693
AU PR Out1: 0.748
AU ROC Out2: 0.894
AU PR Out2: 0.500
AU ROC Out12: 0.706
AU PR Out12: 0.774
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.692
AU PR Out1: 0.747
AU ROC Out2: 0.894
AU PR Out2: 0.499
AU ROC Out12: 0.705
AU PR Out12: 0.773
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.695
AU PR Out1: 0.749
AU ROC Out2: 0.894
AU PR Out2: 0.505
AU ROC Out12: 0.707
AU PR Out12: 0.775
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.700
AU PR Out1: 0.748
AU ROC Out2: 0.893
AU PR Out2: 0.488
AU ROC Out12: 0.713
AU PR Out12: 0.774
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.492
AU PR Out1: 0.905
AU ROC Out2: 0.774
AU PR Out2: 0.414
AU ROC Out12: 0.483
AU PR Out12: 0.910
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.477
AU PR Out1: 0.905
AU ROC Out2: 0.779
AU PR Out2: 0.422
AU ROC Out12: 0.483
AU PR Out12: 0.910
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.476
AU PR Out1: 0.905
AU ROC Out2: 0.778
AU PR Out2: 0.419
AU ROC Out12: 0.482
AU PR Out12: 0.910
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.478
AU PR Out1: 0.905
AU ROC Out2: 0.777
AU PR Out2: 0.421
AU ROC Out12: 0.484
AU PR Out12: 0.910
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.474
AU PR Out1: 0.905
AU ROC Out2: 0.791
AU PR Out2: 0.421
AU ROC Out12: 0.496
AU PR Out12: 0.910
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.547
AU PR Out1: 0.234
AU ROC Out2: 0.834
AU PR Out2: 0.291
AU ROC Out12: 0.608
AU PR Out12: 0.349
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.547
AU PR Out1: 0.234
AU ROC Out2: 0.834
AU PR Out2: 0.294
AU ROC Out12: 0.609
AU PR Out12: 0.349
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.547
AU PR Out1: 0.235
AU ROC Out2: 0.833
AU PR Out2: 0.288
AU ROC Out12: 0.608
AU PR Out12: 0.350
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.546
AU PR Out1: 0.234
AU ROC Out2: 0.833
AU PR Out2: 0.293
AU ROC Out12: 0.608
AU PR Out12: 0.349
##########################################


  % (self.n_quantiles, n_samples))


AU ROC Out1: 0.547
AU PR Out1: 0.234
AU ROC Out2: 0.832
AU PR Out2: 0.290
AU ROC Out12: 0.608
AU PR Out12: 0.348
##########################################


## Stochastic Experiments

In [6]:
# Leave-one-class-out runs for the Stochastic branch: each class in `outliers`
# is held out in turn, and every configuration is repeated five times.
hierClass = 'Stochastic'
outliers = ['AGN',
            'Blazar',
            'CV/Nova',
            'QSO',
            'YSO']

for outlier in outliers:
    for run in range(5):
        # NOTE(review): 'CV/Nova' contains a path separator, so its runs land
        # in a nested folder ('OCSVM_Stochastic_CV/Nova_runN'). The path is
        # left as-is in case other code reads results from that location.
        directory = '../experiments/OCSVM_{}_{}_run{}'.format(hierClass, outlier, run)
        plots_dir = '{}/plots'.format(directory)
        # Creating the nested plots folder also creates `directory`;
        # exist_ok=True makes re-running the cell safe without a prior
        # existence check.
        os.makedirs(plots_dir, exist_ok=True)

        train_features, test_features, test_labels = get_data(hierClass, outlier)
        model = train(hierClass, outlier, train_features, directory)
        test(model, test_features, test_labels, directory)

AU ROC Out1: 0.559
AU PR Out1: 0.439
AU ROC Out2: 0.647
AU PR Out2: 0.008
AU ROC Out12: 0.559
AU PR Out12: 0.441
##########################################
AU ROC Out1: 0.559
AU PR Out1: 0.439
AU ROC Out2: 0.646
AU PR Out2: 0.008
AU ROC Out12: 0.559
AU PR Out12: 0.441
##########################################
AU ROC Out1: 0.558
AU PR Out1: 0.439
AU ROC Out2: 0.646
AU PR Out2: 0.008
AU ROC Out12: 0.559
AU PR Out12: 0.441
##########################################
AU ROC Out1: 0.558
AU PR Out1: 0.439
AU ROC Out2: 0.646
AU PR Out2: 0.008
AU ROC Out12: 0.559
AU PR Out12: 0.441
##########################################
AU ROC Out1: 0.559
AU PR Out1: 0.439
AU ROC Out2: 0.647
AU PR Out2: 0.008
AU ROC Out12: 0.559
AU PR Out12: 0.441
##########################################
AU ROC Out1: 0.560
AU PR Out1: 0.165
AU ROC Out2: 0.641
AU PR Out2: 0.007
AU ROC Out12: 0.562
AU PR Out12: 0.170
##########################################
AU ROC Out1: 0.560
AU PR Out1: 0.165
AU ROC Out2: 0.641
AU PR Ou

## Periodic Experiments

In [7]:
# Leave-one-class-out runs for the Periodic branch: each class in `outliers`
# is held out in turn, and every configuration is repeated five times.
hierClass = 'Periodic'
outliers = ['CEP',
            'DSCT',
            'E',
            'RRL',
            'LPV']


for outlier in outliers:
    for run in range(5):
        directory = '../experiments/OCSVM_{}_{}_run{}'.format(hierClass, outlier, run)
        plots_dir = '{}/plots'.format(directory)
        # Creating the nested plots folder also creates `directory`;
        # exist_ok=True makes re-running the cell safe without a prior
        # existence check.
        os.makedirs(plots_dir, exist_ok=True)

        train_features, test_features, test_labels = get_data(hierClass, outlier)
        model = train(hierClass, outlier, train_features, directory)
        test(model, test_features, test_labels, directory)

AU ROC Out1: 0.456
AU PR Out1: 0.018
AU ROC Out2: 0.601
AU PR Out2: 0.024
AU ROC Out12: 0.513
AU PR Out12: 0.038
##########################################
AU ROC Out1: 0.456
AU PR Out1: 0.018
AU ROC Out2: 0.601
AU PR Out2: 0.024
AU ROC Out12: 0.513
AU PR Out12: 0.038
##########################################
AU ROC Out1: 0.456
AU PR Out1: 0.018
AU ROC Out2: 0.601
AU PR Out2: 0.024
AU ROC Out12: 0.513
AU PR Out12: 0.038
##########################################
AU ROC Out1: 0.456
AU PR Out1: 0.018
AU ROC Out2: 0.601
AU PR Out2: 0.024
AU ROC Out12: 0.513
AU PR Out12: 0.038
##########################################
AU ROC Out1: 0.456
AU PR Out1: 0.018
AU ROC Out2: 0.601
AU PR Out2: 0.024
AU ROC Out12: 0.513
AU PR Out12: 0.038
##########################################
AU ROC Out1: 0.533
AU PR Out1: 0.032
AU ROC Out2: 0.601
AU PR Out2: 0.024
AU ROC Out12: 0.554
AU PR Out12: 0.052
##########################################
AU ROC Out1: 0.533
AU PR Out1: 0.032
AU ROC Out2: 0.601
AU PR Ou