In [1]:
from keras.layers import Input, Dense, TimeDistributed, LSTM, Dropout, Activation
from keras.layers import Convolution2D, MaxPooling2D, Flatten
from sklearn.model_selection import StratifiedShuffleSplit
from keras.layers.normalization import BatchNormalization
from sklearn.model_selection import train_test_split
from keras.layers.advanced_activations import ELU
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential, Model
from keras.callbacks import Callback
from keras.optimizers import Adam
from scipy.ndimage import imread
from keras.utils import np_utils
from space_utils import metrics
from time import process_time
from astropy.io import fits
from os.path import isfile
from keras import backend

import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np

import pickle
import random
import json
import sys
import os

# Show up to 999 columns when a DataFrame is displayed.
# The fully qualified key is used because the short form "max_columns" is
# resolved by pattern matching and becomes ambiguous (OptionError) once pandas
# registers another option whose name also contains "max_columns".
pd.set_option("display.max_columns", 999)

# Seed NumPy's global RNG.
np.random.seed(1)

%matplotlib inline

Using Theano backend.


In [2]:
# Load engineered data from pickle.
# The file is opened in a `with` block so the handle is closed as soon as the
# object has been deserialised.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('/home/ubuntu/transients-data-processing/data/engineered-data.pkl', 'rb') as engineered_file:
    data = pickle.load(engineered_file)

In [3]:
# Column groups of the engineered table.
# In the cells below, `targets` selects the label column, `ids` + `targets` are
# dropped from the model inputs, and `categorical` is one-hot encoded.
# `continuous`, `ordinal` and `booleans` are not referenced in the visible
# cells — presumably kept as a record of each feature's type (TODO confirm).

targets = ["OBJECT_TYPE"]

ids = ["ID"]

continuous = [
    "AMP", "A_IMAGE", "A_REF", "B_IMAGE", "B_REF",
    "COLMEDS", "DIFFSUMRN", "ELLIPTICITY", "FLUX_RATIO", "GAUSS",
    "GFLUX", "L1", "LACOSMIC", "MAG", "MAGDIFF",
    "MAG_FROM_LIMIT", "MAG_REF", "MAG_REF_ERR", "MASKFRAC",
    "MIN_DISTANCE_TO_EDGE_IN_NEW",
    "NN_DIST_RENORM", "SCALE", "SNR", "SPREADERR_MODEL", "SPREAD_MODEL",
]

categorical = ["BAND", "CCDID", "FLAGS"]

ordinal = [
    "N2SIG3", "N2SIG3SHIFT", "N2SIG5", "N2SIG5SHIFT",
    "N3SIG3", "N3SIG3SHIFT", "N3SIG5", "N3SIG5SHIFT",
    "NUMNEGRN",
]

booleans = ["MAGLIM"]

# Disabled alternative: derive `continuous` as "every column not in another
# group". Note that `columns` and `special` are not defined in the visible cells.
# continuous = [c for c in columns if c not in (special + categorical + ordinal + booleans)]

In [4]:
# Replace each column listed in `categorical` with one indicator column per
# level, named "<column>_<level>". `dummy_na=True` additionally emits a
# missing-value indicator column for every encoded column.
# NOTE: the result is bound back to `data`, so this cell is meant to be run
# once per kernel session (after it runs, the source columns no longer exist).
data = data.pipe(
    pd.get_dummies,
    columns=categorical,
    prefix=categorical,
    prefix_sep='_',
    dummy_na=True,
    drop_first=False,
    sparse=False,
)

In [5]:
# Labels: the column(s) named in `targets`, kept as a DataFrame.
target = data.loc[:, targets]
# Features: every remaining column once the id and label columns are removed.
inputs = data.drop(columns=[*ids, *targets])

In [6]:
# Shuffle and split the data: 20% is held out, stratified on the label so both
# parts keep the same class proportions; random_state pins the shuffle.
# `.values` is used instead of DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in pandas 1.0; both return the same NumPy array.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target.values,
)

In [7]:
# NumPy arrays of the split frames. `.values` replaces DataFrame.as_matrix()
# (deprecated in pandas 0.23, removed in 1.0) and yields the same arrays, so the
# labels keep their 2-D single-column shape for the cells that follow.
train_x, train_y = X_train.values, y_train.values
valid_x, valid_y = X_test.values, y_test.values

# Build (in-memory) DMatrix objects for the native xgboost API; nothing is
# written to disk here. Feature names are passed as a plain list of column
# labels rather than a pandas Index.
xgtrain = xgb.DMatrix(train_x, label=train_y, feature_names=list(X_train.columns))
xgvalid = xgb.DMatrix(valid_x, label=valid_y, feature_names=list(X_test.columns))

In [31]:
# Keyword arguments forwarded to xgb.XGBClassifier(**param).
# The original literal listed 'scale_pos_weight' twice (0.5, then 1); in a dict
# literal the last occurrence wins, so the effective value was already 1. The
# dead 0.5 entry is removed so the dict says what it does.
param = {
    'max_depth': 6,
    'learning_rate': 0.1,
    'silent': 1,
    'objective': 'binary:logistic',
    'n_estimators': 40,
    "gamma": 0,
    "min_child_weight": 1,
    "max_delta_step": 0,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "colsample_bylevel": 0.9,
    "reg_alpha": 0,
    "reg_lambda": 1,
    "scale_pos_weight": 1,
    "base_score": 0.5,
    "seed": 23,
}

param['nthread'] = 4

# Metrics reported for every evaluation set during fitting.
blah_metric = ['error', 'auc']

# (DMatrix, name) pairs for the native xgb.train API; the scikit-learn style
# fit below does not use this list.
evallist = [(xgtrain, 'train'), (xgvalid, 'valid')]

bst = xgb.XGBClassifier(**param)

# Fit while evaluating on both splits: they are reported as validation_0
# (training data) and validation_1 (held-out data) in the log.
# The label arrays are single-column 2-D arrays; they are flattened here so the
# estimator receives 1-D labels and the `column_or_1d` conversion warning is
# not emitted. The global train_y / valid_y arrays are left unchanged.
bst.fit(
    train_x,
    np.ravel(train_y),
    eval_set=[(train_x, np.ravel(train_y)), (valid_x, np.ravel(valid_y))],
    eval_metric=blah_metric,
    verbose=True
)

# Disabled alternative using the native API (note: `param` has no 'num_round'
# key, so that would need to be added first):
# bst = xgb.train(param, xgtrain, evals=evallist, num_boost_round=param['num_round'])

# bst.save_model('xgb_' + str(2018) + '_v1.model')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[0]	validation_0-error:0.068671	validation_0-auc:0.970949	validation_1-error:0.068896	validation_1-auc:0.971142
[1]	validation_0-error:0.062219	validation_0-auc:0.980348	validation_1-error:0.062327	validation_1-auc:0.980248
[2]	validation_0-error:0.058633	validation_0-auc:0.981885	validation_1-error:0.058918	validation_1-auc:0.981718
[3]	validation_0-error:0.05676	validation_0-auc:0.983614	validation_1-error:0.057305	validation_1-auc:0.983367
[4]	validation_0-error:0.054767	validation_0-auc:0.984728	validation_1-error:0.055074	validation_1-auc:0.98452
[5]	validation_0-error:0.053922	validation_0-auc:0.985305	validation_1-error:0.054502	validation_1-auc:0.985167
[6]	validation_0-error:0.053553	validation_0-auc:0.985502	validation_1-error:0.054107	validation_1-auc:0.985338
[7]	validation_0-error:0.052818	validation_0-auc:0.985721	validation_1-error:0.053662	validation_1-auc:0.985568
[8]	validation_0-error:0.052549	validation_0-auc:0.986001	validation_1-error:0.053117	validation_1-auc:0.9

XGBClassifier(base_score=0.5, colsample_bylevel=0.9, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=40, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=23, silent=1, subsample=0.9)

In [54]:
# Probability of class index 1 for each validation row. Slicing with `1:`
# (rather than indexing with `1`) keeps the result two-dimensional.
ypred = bst.predict_proba(valid_x)[:, 1:]

# Reference labels for the same rows (disabled alternative: xgvalid.get_label()).
ytrue = valid_y

# Disabled: score a previously saved Booster instead of the in-memory model.
# bst = xgb.Booster({'nthread': 4}) #init model
# bst.load_model("xgb_" + str(2018) + "_v0.model") # load data

In [36]:
# Full predict_proba output for the validation features (all class columns).
# NOTE(review): `a` is not referenced again in the visible cells — looks like
# leftover exploration; confirm before removing.
a = bst.predict_proba(valid_x)

In [49]:
# Sanity check: labels and predictions should have the same shape before scoring.
print(ytrue.shape, ypred.shape, sep="\n")

(179793, 1)
(179793, 1)


In [55]:
# Score the validation predictions with the project helper
# `space_utils.metrics`, cutting probabilities at 0.5.
# NOTE(review): predictions are passed before the true labels — confirm this
# matches the helper's expected argument order.
metrics(ypred, ytrue, threshold=0.5)

{'FPR': 0.046201785969896166, 'MDR': 0.038213410659293727}

In [56]:
# Display the predicted-probability array as the cell output.
ypred

array([[ 0.98915815],
       [ 0.04424884],
       [ 0.01285026],
       ..., 
       [ 0.01371142],
       [ 0.1729524 ],
       [ 0.05201735]], dtype=float32)