# 1) Using the DataProcessor class to preprocess the data

Run the run_preprocessing.py file to execute preprocessing, this will store 4 pickle files:
- train_data.p: a list [X_train, y_train] with some cleaned but not preprocessed data
- test-data.p: a similar list [X_test, y_test]
- train_preproc: a list [X_train_preproc, y_train] with the preprocessed training data
- test_preproc.p: a similar list [X_test_preproc, y_test]

These files can then be imported.

To rerun the preprocessing pipeline on some data X, call

>> dp = DatProcessor(X)

>> X_train = dp.preprocess_data(scale)

where scale (True by default) determines whether Standardization is applied.

In [1]:
import pickle
import os
import numpy as np
from Processing.DataProcessor import DataProcessor

In [16]:
# Import data that has already been preprocessed
data_dir = os.getcwd()
with open(os.path.join(data_dir, "Processing", r"train_preproc.p"), 'rb') as data_file:
    train_data = pickle.load(data_file)
X_train, y_train = train_data[0], train_data[1]

with open(os.path.join(data_dir, "Processing", r"test_preproc.p"), 'rb') as data_file:
    test_data = pickle.load(data_file)
X_test, y_test = test_data[0], test_data[1]

In [4]:
# If preprocessing needs to be run again, the full pipeline can be run like this
data_proc = DataProcessor(X_test)
X_test = data_proc.preprocess_data(False)

In [17]:
# Test that there are no nans in train data
for column in X_train.columns:
    assert sum(X_train[column].isnull()) == 0


# 2) Evaluating models with the ModelEvaluator class

The ModelEvaluator class is abstract, so that it is general enough to work for different types of models. You might have to build your child class if the syntax differs (see SKEvaluator example)

In [19]:
import xgboost as xgb
from ModelEvaluationTools.SKEvaluator import SKEvaluator

In [20]:
xg = xgb.XGBClassifier(learning_rate=0.3, max_depth=8, subsample=0.5, objective='binary:logistic', verbosity=2)

Create instance of RFEvaluator, a child of the ModelEvaluator class

In [21]:
xg_model = SKEvaluator(xg, 'xg')

In [22]:
xg_model.fit(X_train, y_train)



[10:11:41] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/tree/updater_prune.cc:101: tree pruning end, 58 extra nodes, 0 pruned nodes, max_depth=8
[10:11:41] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/tree/updater_prune.cc:101: tree pruning end, 60 extra nodes, 0 pruned nodes, max_depth=8
[10:11:42] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/tree/updater_prune.cc:101: tree pruning end, 70 extra nodes, 0 pruned nodes, max_depth=8
[10:11:42] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/tree/updater_prune.cc:101: tree pruning end, 108 extra nodes, 0 pruned nodes, max_depth=8
[10:11:43] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/tree/updater_prune.cc:101: tree pruning end, 92 extra nodes, 0 pruned nodes, max_depth=8
[10:11:43] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/tree/updater_prune.cc:101: tree pruning end, 136 extra nodes, 

In [23]:
test_auc = xg_model.evaluate(X_test, y_test)

Accuracy - Test: 0.9980333333333333
AUC - Test: 0.9267513760582924


In [None]:
# saves model to fitted_models folder
xg_model.save_model()


In [24]:
from keras.models import Sequential
from keras.layers import Dense,Dropout,Flatten
from keras.layers.normalization import BatchNormalization
import keras
import tensorflow as tf
from keras import optimizers, regularizers


In [71]:
print('Building Neural Network model...')
adam = optimizers.Adam(lr = 0.001)
model = Sequential()

model.add(Dense(4, input_dim=X_train.shape[1],
                kernel_initializer='normal',
                activation="relu"))
model.add(BatchNormalization())
# model.add(Dropout(0.3))
model.add(Dense(4,
                activation="relu"))
# model.add(Dropout(0.3))
model.add(Dense(1,activation="sigmoid"))

Building Neural Network model...


NameError: name 'BatchNormalization' is not defined

In [72]:
METRICS = [
     tf.keras.metrics.AUC()
]

In [73]:
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=METRICS)

In [74]:
history = model.fit(np.asarray(X_train).astype(np.float32), np.asarray(y_train), epochs=20, validation_data = (np.asarray(X_test).astype(np.float32),np.asarray(y_test)),batch_size=100)

Epoch 1/20


ValueError: in user code:

    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:789 run_step  **
        outputs = model.train_step(data)
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:759 train_step
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:409 update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\keras\utils\metrics_utils.py:90 decorated
        update_op = update_state_fn(*args, **kwargs)
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\keras\metrics.py:176 update_state_fn
        return ag_update_state(*args, **kwargs)
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\keras\metrics.py:2069 update_state  **
        return metrics_utils.update_confusion_matrix_variables(
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\keras\utils\metrics_utils.py:353 update_confusion_matrix_variables
        y_pred.shape.assert_is_compatible_with(y_true.shape)
    C:\Users\oscar\anaconda3\lib\site-packages\tensorflow\python\framework\tensor_shape.py:1134 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (100, 4) and (100, 1) are incompatible


In [None]:
from sklearn.metrics import roc_curve
y_pred_keras =model.predict_proba(X_test).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_pred_keras)
from sklearn.metrics import auc
auc_keras = auc(fpr_keras, tpr_keras)



In [None]:
print(np.unique(y_pred_keras))