In [141]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing, impute
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import precision_recall_curve

# Load training feats
train = pd.read_hdf('../data/train_feats_od.h5')

# Load tgt and add to feats
train['tgt'] = np.load('../data/target_dummy.npy')

In [65]:
train.head()

Unnamed: 0,hostgal_photoz,flux_mean_0,flux_mean_5,flux_min_1,flux_min_2,flux_min_5,flux_std_0,flux_std_1,flux_std_2,flux_std_5,...,abs_magnitude_max_5,absmagmax_ratio_bands_2_3,absmagmax_ratio_bands_2_4,absmagmax_ratio_bands_2_5,absmagmax_ratio_bands_3_4,absmagmax_ratio_bands_3_5,absmagmax_ratio_bands_4_5,spike_back_mean,spike_front_mean,tgt
0,0.0,-10.051806,-54.414902,-1100.440063,-681.858887,-422.815094,83.275841,596.576904,451.180817,292.182281,...,,,,,,,,17.451452,107.970627,92
1,1.6267,-3.119513,-1.848958,-11.715749,-10.067919,-14.211164,7.062516,5.661101,5.718981,7.030447,...,-48.329815,0.998342,,0.992395,,0.994043,,2.357979,1.904276,88
2,0.2262,-0.042413,4.834683,-3.39308,-2.848838,-19.159811,1.816127,1.789767,5.505767,13.201397,...,-44.443485,0.988435,0.983498,0.980152,0.995005,0.99162,0.996598,1.175826,1.0,42
3,0.2813,1.479138,9.389122,-3.61841,-2.159753,-10.249387,4.34396,25.731789,31.671373,25.822132,...,-46.172092,1.00193,1.004307,1.01046,1.002372,1.008514,1.006127,,1.029715,90
4,0.2415,0.747304,6.299269,-2.622109,-2.084535,-10.86054,2.341279,8.037329,21.135263,21.245771,...,-45.511734,0.993583,0.992954,0.99945,0.999367,1.005906,1.006542,,1.062956,90


In [66]:
train['tgt'].value_counts()

90    2313
42    1193
65     981
16     924
15     495
62     484
88     370
92     239
67     208
52     183
95     175
6      151
64     102
53      30
Name: tgt, dtype: int64

In [157]:
# Lets first try novelty detection on class 95 eg.

# First, build the training and test sets from one start kfold
y_tgt = train['tgt'].values
num_folds = 8
folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1)
_train, _eval = next(folds.split(y_tgt, y_tgt))
test_X = train.iloc[_eval]

# Build train data - test class eg. 95 (will remain out of train data)
TEST_CLASS = 95
train_X = train.iloc[_train]
pure_mask = train_X['tgt']!=TEST_CLASS
train_X = train_X[pure_mask]

# Get balanced weights
w = compute_sample_weight('balanced', y_tgt)
train_w = w[_train]
train_w = train_w[pure_mask]

# Preprocessing
X_train = train_X.values[:,:-1]
X_test = test_X.values[:,:-1]

# Scale
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Impute
imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_train)
X_train = imp.transform(X_train)
X_test = imp.transform(X_test)

# Fit oneclass SVM
nu = train['tgt'].value_counts()[TEST_CLASS] / train.shape[0]
clf = svm.OneClassSVM(nu=nu, kernel='rbf', gamma='auto')

clf.fit(X_train, sample_weight=train_w)

# Predict on train and test set
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Setup target vectors
y_train = np.copy(train_X.values[:,-1])
y_train[y_train==TEST_CLASS] = -1
y_train[np.logical_and(y_train!=TEST_CLASS, y_train!=-1)] = 1

y_test = np.copy(test_X.values[:,-1])
y_test[y_test==TEST_CLASS] = -1
y_test[np.logical_and(y_test!=TEST_CLASS, y_test!=-1)] = 1

# Compute precision / recall for train and test, at different thresholds

def get_stats(y_true, y_pred, pos_label):
    '''
    return tp, fp, fn, tn tuple
    '''
    
    pmask = y_true == pos_label # Positive mask on truth
    nmask = np.logical_not(pmask) # negative mask on truth
    
    tp = np.sum(y_true[pmask] == y_pred[pmask])
    fp = np.sum(y_true[pmask] != y_pred[pmask])
    tn = np.sum(y_true[nmask] == y_pred[nmask])
    fn = np.sum(y_true[nmask] != y_pred[nmask])
    
    return tp, fp, fn ,tn

for name, iset, truth in zip(['train', 'test'], [X_train, X_test], [y_train, y_test]):
    
    # Get scoring function
    y_pred_raw = clf.decision_function(iset)
    
    # Compute precision / recall for each thresh
    for thresh in np.linspace(np.min(y_pred_raw), np.max(y_pred_raw), 30):
        
        y_pred = np.copy(y_pred_raw)
        y_pred[y_pred_raw <= thresh] = -1
        y_pred[y_pred_raw > thresh] = 1
        
        # Get stats
        tp, fp, tn, fn = get_stats(truth, y_pred, pos_label=-1)
        
        # Compute precision and recall
        prec = tp / (tp + fp)
        recall = tp / (tp + fn)
        
        # Print results
        print(f'> {name}, threshold {thresh:.2f} : Precision = {prec:.2f}, Recall = {recall:.2f}')

> train, threshold -1.65 : Precision = nan, Recall = 0.00
> train, threshold -1.48 : Precision = nan, Recall = 0.00
> train, threshold -1.31 : Precision = nan, Recall = 0.00
> train, threshold -1.14 : Precision = nan, Recall = 0.00
> train, threshold -0.96 : Precision = nan, Recall = 0.00
> train, threshold -0.79 : Precision = nan, Recall = 0.00
> train, threshold -0.62 : Precision = nan, Recall = 0.00
> train, threshold -0.45 : Precision = nan, Recall = 0.00
> train, threshold -0.28 : Precision = nan, Recall = 0.00
> train, threshold -0.11 : Precision = nan, Recall = 0.00
> train, threshold 0.06 : Precision = nan, Recall = 0.00
> train, threshold 0.23 : Precision = nan, Recall = 0.00
> train, threshold 0.41 : Precision = nan, Recall = 0.00
> train, threshold 0.58 : Precision = nan, Recall = 0.00
> train, threshold 0.75 : Precision = nan, Recall = 0.00
> train, threshold 0.92 : Precision = nan, Recall = 0.00
> train, threshold 1.09 : Precision = nan, Recall = 0.00
> train, threshold 1.



In [154]:
precision

array([0.02831403, 0.02706186, 0.02709677, 0.02713178, 0.02716688,
       0.02720207, 0.02723735, 0.02727273, 0.02730819, 0.02734375,
       0.0273794 , 0.02741514, 0.02745098, 0.02748691, 0.02752294,
       0.02755906, 0.02759527, 0.02631579, 0.02635046, 0.02638522,
       0.02642008, 0.02645503, 0.02649007, 0.0265252 , 0.02656042,
       0.02659574, 0.02663116, 0.02666667, 0.02670227, 0.02673797,
       0.02677376, 0.02680965, 0.02684564, 0.02688172, 0.0269179 ,
       0.02695418, 0.02699055, 0.02702703, 0.0270636 , 0.02710027,
       0.02713704, 0.02717391, 0.02721088, 0.02724796, 0.02728513,
       0.0273224 , 0.02735978, 0.02739726, 0.02743484, 0.02747253,
       0.02751032, 0.02754821, 0.02758621, 0.02762431, 0.02766252,
       0.02770083, 0.02773925, 0.02777778, 0.02781641, 0.02785515,
       0.027894  , 0.02793296, 0.02797203, 0.0280112 , 0.02805049,
       0.02808989, 0.0281294 , 0.02816901, 0.02820874, 0.02824859,
       0.02828854, 0.02832861, 0.02836879, 0.02840909, 0.02844

In [131]:
dists = clf.decision_function(X_test)
argsort = np.argsort(dists)
y_test_classes = np.copy(test_X.values[:,-1])
print(dists[argsort][:50])
print(y_pred_test[argsort][:50])
print(y_test_classes[argsort][:50])

[-2.20288233 -2.20288233 -2.20288233 -2.20288233 -1.886992   -1.67446009
 -1.39338274 -1.29191496 -1.07049611 -1.03685624 -0.85022202 -0.76589362
 -0.75658388 -0.67849053 -0.6636646  -0.65950955 -0.60731244 -0.51514714
 -0.4916576  -0.46814636 -0.41664931 -0.36481261 -0.31444321 -0.25179382
 -0.20764888 -0.20195471 -0.18245475 -0.17493513 -0.10651349 -0.05986415
 -0.04314528 -0.0298297  -0.01930595 -0.01420751  0.00868229  0.01044209
  0.0359878   0.05462911  0.06130867  0.08823436  0.13244776  0.13653954
  0.14163357  0.14210526  0.15506513  0.16744092  0.19276852  0.19900146
  0.25272167  0.2695222 ]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1]
[65. 92.  6. 53. 65. 16. 64. 65. 88. 16. 64. 42. 64. 64. 15.  6. 65. 90.
 62. 42. 62. 16. 16. 64. 53. 92.  6. 16. 67. 64. 65. 16. 16. 88. 88. 65.
 65. 65. 65. 65. 16. 90. 15. 65. 65. 42. 65. 16. 92. 95.]


In [158]:
# TEMP MLP MERGE ANALYSIS

gal_oof = pd.read_hdf('../level_1_preds/mlp_v8.27_galactic_0.3062_oof.h5')

In [161]:
gal_oof.shape

(4044, 15)

In [169]:
extra_oof = pd.read_hdf('../level_1_preds/mlp_v8.28_extra_1.1252_oof.h5')

In [177]:
extra_oof.shape

(5523, 15)

In [176]:
_oof = pd.read_hdf('../level_1_preds/mlp_v8.20_0.8799_oof.h5')

In [175]:
full_oof.head()

Unnamed: 0,object_id,mlp_v8.20_0.8799__6,mlp_v8.20_0.8799__15,mlp_v8.20_0.8799__16,mlp_v8.20_0.8799__42,mlp_v8.20_0.8799__52,mlp_v8.20_0.8799__53,mlp_v8.20_0.8799__62,mlp_v8.20_0.8799__64,mlp_v8.20_0.8799__65,mlp_v8.20_0.8799__67,mlp_v8.20_0.8799__88,mlp_v8.20_0.8799__90,mlp_v8.20_0.8799__92,mlp_v8.20_0.8799__95
0,13,0.000141,0.020444,4e-06,0.766997,0.071683,0.000253,0.115005,0.000251,9.7e-05,0.003228,0.000499,0.019538,0.000101,0.001759
1,14,0.022175,0.087481,0.001768,0.139899,0.327391,0.00296,0.019983,0.000316,0.102005,0.004038,0.000505,0.287927,0.000579,0.002974
2,17,0.002427,0.071703,0.007001,0.163758,0.211467,0.001264,0.152283,0.09478,0.007937,0.096456,0.002374,0.184011,0.000531,0.004006
3,23,0.003023,0.015836,0.001396,0.07392,0.231724,0.001057,0.159054,0.009946,0.001073,0.309197,0.000219,0.191808,0.00021,0.001538
4,34,0.002734,0.011857,0.000295,0.160961,0.379045,0.000308,0.073131,0.000142,0.004454,0.01811,6.8e-05,0.341364,0.000113,0.007417
