In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import metrics

import os, sys
from time import time

from phm08ds.models import experiment

## Load Dataset

In [2]:
folderpath = '../../../data/interim/'
data_op_04 = pd.read_csv(folderpath + 'data_op_04.csv')
data_op_04.head()

Unnamed: 0,unit,time_step,operational_setting_1,operational_setting_2,operational_setting_3,Sensor_2,Sensor_3,Sensor_4,Sensor_7,Sensor_11,Sensor_12,Sensor_15,Operational_condition,Health_state
4,1,5,42.0041,0.8405,40.0,1354.48,1124.32,3.91,2211.8,130.44,2387.89,0.02,4,1
6,1,7,41.9998,0.84,40.0,1354.43,1131.44,3.91,2211.82,130.16,2387.88,0.02,4,1
9,1,10,42.0066,0.84,40.0,1353.19,1125.09,3.91,2211.84,130.32,2387.91,0.02,4,1
12,1,13,42.0029,0.8409,40.0,1350.9,1123.53,3.91,2211.78,130.6,2387.88,0.02,4,1
20,1,21,42.003,0.8404,40.0,1349.16,1117.33,3.91,2211.82,130.48,2387.9,0.02,4,1


## Data preprocessing

### Get rid of informations there are not sensor readings

Wang (2008) reports Sensor 15 has importat information. However, there are no relevant informations of this sensor. The data seems to be corrupted like this:

Let's remove it from our database creating an object transformer.

In [3]:
from phm08ds.features.feature_selection import RemoveSensor

tf_remove_sensor_15 = RemoveSensor(sensors=[15])
data_op_04 = tf_remove_sensor_15.fit_transform(data_op_04)
data_op_04.head()

Unnamed: 0,unit,time_step,operational_setting_1,operational_setting_2,operational_setting_3,Sensor_2,Sensor_3,Sensor_4,Sensor_7,Sensor_11,Sensor_12,Operational_condition,Health_state
4,1,5,42.0041,0.8405,40.0,1354.48,1124.32,3.91,2211.8,130.44,2387.89,4,1
6,1,7,41.9998,0.84,40.0,1354.43,1131.44,3.91,2211.82,130.16,2387.88,4,1
9,1,10,42.0066,0.84,40.0,1353.19,1125.09,3.91,2211.84,130.32,2387.91,4,1
12,1,13,42.0029,0.8409,40.0,1350.9,1123.53,3.91,2211.78,130.6,2387.88,4,1
20,1,21,42.003,0.8404,40.0,1349.16,1117.33,3.91,2211.82,130.48,2387.9,4,1


Before feeding to the classifier, let's remove unwanted information, such as unit, time_step and operational settings.

In [4]:
from phm08ds.features.feature_selection import RemoveInfo

tf_remove_info = RemoveInfo()

data_with_features = tf_remove_info.fit_transform(data_op_04)
data_with_features.head()

Unnamed: 0,Sensor_2,Sensor_3,Sensor_4,Sensor_7,Sensor_11,Sensor_12,Health_state
4,1354.48,1124.32,3.91,2211.8,130.44,2387.89,1
6,1354.43,1131.44,3.91,2211.82,130.16,2387.88,1
9,1353.19,1125.09,3.91,2211.84,130.32,2387.91,1
12,1350.9,1123.53,3.91,2211.78,130.6,2387.88,1
20,1349.16,1117.33,3.91,2211.82,130.48,2387.9,1


We need to normalize our data. Let's use Z-score standardization.

In [5]:
from sklearn.preprocessing import StandardScaler

tf_std_scaller = preprocessing.StandardScaler()
data_with_features_std = tf_std_scaller.fit_transform(data_with_features.drop(labels='Health_state', axis=1))
data_with_features_std

array([[-0.03471011, -0.51398323,  1.        , -0.50007728, -0.27185   ,
        -0.42928696],
       [-0.04353727,  0.4286228 ,  1.        , -0.43380467, -1.08895102,
        -0.45999999],
       [-0.26245095, -0.41204409,  1.        , -0.36753205, -0.62203615,
        -0.36786091],
       ...,
       [ 1.87019199,  2.54551472,  1.        , -2.05748362, -0.94304012,
        -1.96493836],
       [ 1.91962411,  1.85444681,  1.        , -2.19002884, -0.85549359,
        -2.30278167],
       [ 0.80033965,  2.9056114 ,  1.        , -2.35571037, -1.96441641,
        -2.14921653]])

In [6]:
labels = np.array(data_with_features['Health_state'])
labels

array([1, 1, 1, ..., 4, 4, 4])

# Classification steps

## Load Experiment model

In [7]:
from phm08ds.models import experiment

## Define classifiers and its specifications

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [9]:
# SVM
svm_linear_clf = SVC(kernel='linear')
svm_rbf_clf = SVC(kernel='rbf')
svm_poly_clf = SVC(kernel='poly')
svm_sigmoid_clf = SVC(kernel='sigmoid')

## Put all clf in a dictionary:

In [10]:
classifiers = {'SVM-Linear': svm_linear_clf, 'SVM-RBF': svm_rbf_clf, 'SVM-Poly': svm_poly_clf, 'SVM-Sigmoid': svm_sigmoid_clf}

Since we are using SVM and MLP we need to extract all power from those methods. Let's perform a Random Search to parameters optimizations.

### Hyperparameter tunning

In [11]:
from sklearn.pipeline import Pipeline

data_preprocessing = Pipeline([('remove_sensor_15', tf_remove_sensor_15),
                               ('remove_info', tf_remove_info),
                               ('std_scaler', tf_std_scaller)
                              ])

In [12]:
from sklearn.model_selection import RandomizedSearchCV

random_search = dict((k,[]) for k in classifiers.keys())

In [13]:
param_dist_dict = {
                   'SVM-Linear': {'C': [2**i for i in range(-5,15)]},
                   'SVM-RBF': {'gamma': [2**i for i in range(-15,3)], 'C': [2**i for i in range(-5,15)]},
                   'SVM-Poly': {'gamma': [2**i for i in range(-15,3)], 'C': [2**i for i in range(-5,15)]},
                   'SVM-Sigmoid': {'gamma': [2**i for i in range(-15,3)], 'C': [2**i for i in range(-5,15)]}
                  }

In [None]:
for clf in param_dist_dict.keys():
    start = time()
    random_search[clf] = RandomizedSearchCV(classifiers[clf], param_dist_dict[clf], cv=10, n_iter=5, verbose=5, n_jobs=100, scoring='accuracy')
    random_search[clf].fit(data_with_features_std, labels)
    
    experiment.save_models(random_search, name='clf_svm')
    experiment.save_pipeline(data_preprocessing)
    
    print('Elapsed time:')
    print(time() - start)

Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV] C=256 ...........................................................
[CV] C=256 ...........................................................
[CV] C=256 ...........................................................
[CV] C=256 ...........................................................
[CV] C=256 ...........................................................
[CV] C=256 ...........................................................
[CV] C=256 ...........................................................
[CV] C=256 ...........................................................
[CV] C=256 ...........................................................
[CV] C=256 ...........................................................
[CV] C=512 ...........................................................
[CV] C=512 ...........................................................
[CV] C=512 ...........................................................
[CV] C=512 .....

[Parallel(n_jobs=100)]: Done   5 out of  50 | elapsed:   45.9s remaining:  6.9min


[CV] ............... C=0.0625, score=0.5936151855047455, total=  29.2s
[CV] ............... C=0.03125, score=0.616580310880829, total=  29.6s
[CV] ............... C=0.0625, score=0.6300777873811582, total=  29.4s
[CV] ............... C=0.0625, score=0.6183074265975821, total=  29.6s
[CV] ............... C=0.0625, score=0.6034632034632035, total=  29.7s
[CV] ............... C=0.0625, score=0.6387208297320657, total=  29.6s
[CV] ............................. C=0.03125, score=0.6, total=  29.4s
[CV] ................ C=0.0625, score=0.616580310880829, total=  30.8s
[CV] ............... C=0.0625, score=0.6041486603284356, total=  30.6s
[CV] ............... C=0.0625, score=0.6029411764705882, total=  30.6s
[CV] .............. C=0.03125, score=0.6032843560933449, total=  31.4s


[Parallel(n_jobs=100)]: Done  16 out of  50 | elapsed:   48.5s remaining:  1.7min


[CV] ............... C=0.0625, score=0.6091458153580673, total=  31.3s
[CV] ............... C=0.0625, score=0.6008658008658009, total=  31.6s
[CV] .................... C=8, score=0.5936151855047455, total=  55.6s
[CV] .................... C=8, score=0.6008658008658009, total=  56.2s
[CV] .................... C=8, score=0.6034632034632035, total=  56.4s
[CV] .................... C=8, score=0.6108714408973253, total=  57.8s
[CV] .................... C=8, score=0.6041486603284356, total=  57.9s
[CV] ..................... C=8, score=0.616580310880829, total=  57.8s
[CV] .................... C=8, score=0.6309420916162489, total=  58.3s


[Parallel(n_jobs=100)]: Done  27 out of  50 | elapsed:  1.2min remaining:   59.2s


[CV] .................... C=8, score=0.6191709844559585, total=  59.5s
[CV] .................... C=8, score=0.6369922212618842, total=  59.7s
[CV] .................... C=8, score=0.6020761245674741, total= 1.0min


## Savel results, models and pipeline to a .pkl file 

In [None]:
from sklearn.pipeline import Pipeline

data_preprocessing = Pipeline([('remove_sensor_15', tf_remove_sensor_15),
                               ('remove_info', tf_remove_info),
                               ('std_scaler', tf_std_scaller)
                              ])

In [None]:
experiment.save_models(random_search, name='clf_svm')
experiment.save_pipeline(data_preprocessing)