In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import metrics

import os, sys
from time import time

from phm08ds.models import experiment

## Load Dataset

In [2]:
# Directory that holds the interim CSV files, relative to this notebook.
folderpath = '../../../data/interim/'

# Read the data_op_06 CSV from that directory and preview the first rows.
data_op_06 = pd.read_csv(os.path.join(folderpath, 'data_op_06.csv'))
data_op_06.head()

Unnamed: 0,unit,time_step,operational_setting_1,operational_setting_2,operational_setting_3,Sensor_2,Sensor_3,Sensor_4,Sensor_7,Sensor_11,Sensor_12,Sensor_15,Operational_condition,Health_state
3,1,4,20.0031,0.7005,0.0,1488.44,1249.18,9.35,2323.85,314.84,2388.07,0.02,6,1
5,1,6,20.0032,0.7017,0.0,1480.46,1258.9,9.35,2323.94,315.36,2388.05,0.02,6,1
22,1,23,20.0025,0.7011,0.0,1482.68,1252.0,9.35,2323.91,314.87,2388.1,0.02,6,1
30,1,31,20.0045,0.7006,0.0,1483.93,1256.88,9.35,2323.91,314.94,2388.12,0.02,6,2
44,1,45,20.0062,0.7,0.0,1477.33,1250.58,9.35,2323.91,315.06,2388.09,0.02,6,2


## Data preprocessing

### Remove information that is not a sensor reading

Wang (2008) reports that Sensor 15 carries important information. However, in this dataset the sensor shows no relevant information. The data seems to be corrupted, like this:

Let's remove it from our dataset by creating a transformer object.

In [3]:
from phm08ds.features.feature_selection import RemoveSensor

# Transformer configured with sensors=[15]; the preview below no longer shows Sensor_15.
tf_remove_sensor_15 = RemoveSensor(sensors=[15])
# NOTE(review): data_op_06 is rebound to the transformed frame, so re-running this
# cell feeds already-transformed data back in — confirm RemoveSensor tolerates that.
data_op_06 = tf_remove_sensor_15.fit_transform(data_op_06)
data_op_06.head()

Unnamed: 0,unit,time_step,operational_setting_1,operational_setting_2,operational_setting_3,Sensor_2,Sensor_3,Sensor_4,Sensor_7,Sensor_11,Sensor_12,Operational_condition,Health_state
3,1,4,20.0031,0.7005,0.0,1488.44,1249.18,9.35,2323.85,314.84,2388.07,6,1
5,1,6,20.0032,0.7017,0.0,1480.46,1258.9,9.35,2323.94,315.36,2388.05,6,1
22,1,23,20.0025,0.7011,0.0,1482.68,1252.0,9.35,2323.91,314.87,2388.1,6,1
30,1,31,20.0045,0.7006,0.0,1483.93,1256.88,9.35,2323.91,314.94,2388.12,6,2
44,1,45,20.0062,0.7,0.0,1477.33,1250.58,9.35,2323.91,315.06,2388.09,6,2


Before feeding to the classifier, let's remove unwanted information, such as unit, time_step and operational settings.

In [4]:
from phm08ds.features.feature_selection import RemoveInfo

# Transformer used to strip the non-sensor columns (unit, time_step, operational settings).
tf_remove_info = RemoveInfo()

# The result is stored under a new name; per the preview below it keeps the
# sensor columns together with Health_state.
data_with_features = tf_remove_info.fit_transform(data_op_06)
data_with_features.head()

Unnamed: 0,Sensor_2,Sensor_3,Sensor_4,Sensor_7,Sensor_11,Sensor_12,Health_state
3,1488.44,1249.18,9.35,2323.85,314.84,2388.07,1
5,1480.46,1258.9,9.35,2323.94,315.36,2388.05,1
22,1482.68,1252.0,9.35,2323.91,314.87,2388.1,1
30,1483.93,1256.88,9.35,2323.91,314.94,2388.12,2
44,1477.33,1250.58,9.35,2323.91,315.06,2388.09,2


We need to normalize our data. Let's use Z-score standardization.

In [5]:
from sklearn.preprocessing import StandardScaler

# Z-score standardisation of the feature columns only: the Health_state target
# column is dropped before fitting so that it is not scaled.
# The scaler is built from the StandardScaler name imported in this cell rather
# than through the separately imported `preprocessing` module, so the cell does
# not depend on another cell's import and the import above is actually used.
tf_std_scaller = StandardScaler()
data_with_features_std = tf_std_scaller.fit_transform(
    data_with_features.drop(labels='Health_state', axis=1)
)
data_with_features_std

array([[ 0.42240019, -0.57734709,  1.        , -1.33580941, -0.01630742,
        -0.65158097],
       [-0.9665274 ,  0.66883265,  1.        , -0.63627355,  1.04756387,
        -0.80400561],
       [-0.58013401, -0.21580111,  1.        , -0.86945217,  0.04506977,
        -0.42294403],
       ...,
       [ 0.90452168,  0.80857915,  1.        , -0.63627355, -0.60962026,
        -0.42294403],
       [ 0.66781221, -0.15810761,  1.        , -0.0921901 , -0.50732494,
         0.03432987],
       [ 1.2474023 ,  1.29064045,  1.        , -0.01446389, -0.54824307,
        -0.49915634]])

In [6]:
# Target vector: the Health_state column copied into a standalone NumPy array.
labels = np.array(data_with_features.loc[:, 'Health_state'])
labels

array([1, 1, 1, ..., 4, 4, 4])

# Classification steps

In [7]:
from phm08ds.models import experiment

## Load the best MLP from the random search

In [8]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations where the standalone package is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
random_search_mlp = joblib.load('Results/old_models/clf_mlp.pkl')
# random_search_svm = joblib.load('Results/old_models/clf_svm.pkl')

In [9]:
# Pick the entry stored under 'MLP' in the loaded object and keep its best estimator.
mlp_search = random_search_mlp['MLP']
clf_mlp = mlp_search.best_estimator_

## Put all classifiers in a dictionary

In [10]:
# Name -> estimator mapping handed to the experiment helpers below.
classifiers = dict(MLP=clf_mlp)

Since we are using SVM and MLP, we need to get the most out of those methods. Let's perform a random search for hyperparameter optimization.

In [11]:
# Number of folds passed to the experiment runner.
kfolds = 10

# Run every entry of `classifiers` on the standardised features and their labels.
clf_outputs = experiment.run_classifiers(
    data_with_features_std,
    labels,
    classifiers,
    kfolds,
)

## Performance assessment

Saving variables in a dictionary:

In [12]:
# One results entry per split, built from that split's true and predicted labels.
# The leading 4 is passed through unchanged (presumably the number of
# Health_state classes — TODO confirm against experiment.results_clf).
results = {
    split: experiment.results_clf(
        4, clf_outputs[split]['true'], clf_outputs[split]['pred']
    )
    for split in ('train', 'test')
}

## Save results, models and pipeline to a .pkl file

In [13]:
from sklearn.pipeline import Pipeline

# Chain the three already-fitted transformers in the order they were applied above.
# NOTE(review): the scaler was fitted without the Health_state column, while the
# RemoveInfo preview still shows it — confirm the chained pipeline handles that.
data_preprocessing = Pipeline(
    steps=[
        ('remove_sensor_15', tf_remove_sensor_15),
        ('remove_info', tf_remove_info),
        ('std_scaler', tf_std_scaller),
    ]
)

In [14]:
# Persist the classifier dictionary and the preprocessing pipeline through the
# project's experiment helpers (output location is decided inside those helpers).
experiment.save_models(classifiers)
experiment.save_pipeline(data_preprocessing)

## Save results to CSVs and figures

In [15]:
# Export the results of each split, test first and then train.
for split in ('test', 'train'):
    experiment.export_results(results[split], split)

/mnt/Work/Mestrado/Mestrado/MP-Safety_ITA/Analyses_and_Experiments/PHM08_data_science/notebooks/E03_PHM08-train-best_MLP_and_SVM/model_selection-OP_06-navarmn
MLP
/mnt/Work/Mestrado/Mestrado/MP-Safety_ITA/Analyses_and_Experiments/PHM08_data_science/notebooks/E03_PHM08-train-best_MLP_and_SVM/model_selection-OP_06-navarmn
MLP


## Save results, models and pipeline to a .pkl file

In [16]:
from sklearn.pipeline import Pipeline

# Rebuilds the preprocessing pipeline from the same three fitted transformers,
# in the order they were applied to the data.
data_preprocessing = Pipeline(
    steps=[
        ('remove_sensor_15', tf_remove_sensor_15),
        ('remove_info', tf_remove_info),
        ('std_scaler', tf_std_scaller),
    ]
)

In [17]:
# Persist the MLP estimator under the name 'clf_mlp' together with the
# preprocessing pipeline.
# NOTE(review): cell In [14] calls save_models with the `classifiers` dict and no
# name, while this call passes a single estimator plus `name` — confirm that
# experiment.save_models supports both call forms.
experiment.save_models(clf_mlp, name='clf_mlp')
experiment.save_pipeline(data_preprocessing)