In [35]:
import numpy as np
import csv
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap

In [36]:
# import models
from xgboost import XGBClassifier, XGBRegressor
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [37]:
# import metrics, other utilities
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, average_precision_score,  mean_squared_error, confusion_matrix, r2_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import RandomOverSampler

In [42]:
# load data
# notice the windows size of feature points
reader = csv.reader(open('Calve_gpr_win_12_core.csv', "r"), delimiter=",")
data = list(reader)
dataset = [np.array([float(x) if x != '' else 1e6 for x in y]) for y in data]
dataset = np.array(dataset)

feature_dim = [0, 96]
label_dim = [96]

# split raw data into features X and labels y
X = dataset[0:2000, feature_dim[0]: feature_dim[1]]
Y = dataset[0:2000, label_dim[0]]

In [43]:
# Set alarm period, notice that alarm hours = alarm period * 2
Alarm_period = 2
# Reassign labels according to alarm period
Y = [0 if x > Alarm_period else 1 for x in Y]
Y = np.array(Y)

# Show number of label=1
print("Number of Y=1:\n", sum(Y))

# Preprocess the data
X = StandardScaler().fit_transform(X)

Number of Y=1:
 57


In [44]:
# Handle the imbalanced labels of data
# Oversampling for data with minority label
ros = RandomOverSampler()
X_resampled, Y_resampled = ros.fit_sample(X, Y)

# Show new number of label=1
print("Number of Y=1:\n", sum(Y_resampled))

Number of Y=1:
 1943


In [45]:
# split features for train and test sets
seed = 7
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X_resampled, Y_resampled,
                                                    test_size=test_size,
                                                    random_state=seed)

In [46]:
# Grid Search of parameters
# Set a smaller search set


search_size = 0.1
X_, X_search, y_, y_search = train_test_split(X_resampled, Y_resampled,
                                             test_size = search_size,
                                             random_state=seed)

X_search.shape

(389, 96)

In [47]:
# Start searching
model = SVC(kernel="linear")
# grid search
C_ = [0.025, 0.25, 2.5, 25., 250.]
param_grid = dict(C=C_)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="f1", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_search, y_search)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.875856 using {'C': 25.0}


In [48]:
# Initiate Models with parameters

names = ["Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree",
         "Naive Bayes", "XGBoost"]

classifiers = [
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    GaussianNB(), 
    XGBClassifier()]

In [49]:
# iterate over models
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # count true positive/false negative/true negative/false potivie
    result = confusion_matrix(y_test, y_pred)
    FN_r = round(result[0,1]/ (result[0,1] + result[1,1]), 2)
    TP_r = round(result[1,1]/ (result[1,1] + result[1,0]), 2)

    print(name,'MSE: %s' % round(mean_squared_error(y_test, y_pred),4), '\n')
    print("FN rate is:", FN_r, '\n')
    print("TP rate is:", TP_r, '\n')
    print(result)

Linear SVM MSE: 0.0938 

FN rate is: 0.11 

TP rate is: 0.92 

[[362  44]
 [ 29 343]]
RBF SVM MSE: 0.0 

FN rate is: 0.0 

TP rate is: 1.0 

[[406   0]
 [  0 372]]
Gaussian Process MSE: 0.0051 

FN rate is: 0.01 

TP rate is: 1.0 

[[402   4]
 [  0 372]]
Decision Tree MSE: 0.0578 

FN rate is: 0.11 

TP rate is: 1.0 

[[361  45]
 [  0 372]]
Naive Bayes MSE: 0.2121 

FN rate is: 0.26 

TP rate is: 0.86 

[[293 113]
 [ 52 320]]
XGBoost MSE: 0.0064 

FN rate is: 0.01 

TP rate is: 1.0 

[[401   5]
 [  0 372]]


  if diff:
