In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import metrics
import os, sys

from phm08ds.models import experiment

## Load Dataset

In [2]:
# Folder holding the interim (pre-processed) PHM08 CSV files, relative to this notebook.
folderpath = '../../../data/interim/'
# Assemble the full path first so the read call stays short and readable.
csv_path_op_1 = os.path.join(folderpath, 'data_op_01.csv')
data_op_1 = pd.read_csv(csv_path_op_1)
data_op_1.head()

Unnamed: 0,unit,time_step,operational_setting_1,operational_setting_2,operational_setting_3,Sensor_0,Sensor_1,Sensor_2,Sensor_3,Sensor_4,...,Sensor_13,Sensor_14,Sensor_15,Sensor_16,Sensor_17,Sensor_18,Sensor_19,Sensor_20,Health_state,Operational_condition
2,1,3,34.9986,0.8401,60.0,449.44,555.42,1368.17,1122.49,5.48,...,8063.84,9.3557,0.02,334,2223,100.0,14.83,8.8555,1,1
11,1,12,35.0029,0.8413,60.0,449.44,555.85,1360.54,1130.69,5.48,...,8063.79,9.2878,0.02,335,2223,100.0,14.69,8.7988,1,1
23,1,24,34.9987,0.84,60.0,449.44,555.19,1359.28,1131.95,5.48,...,8063.93,9.3254,0.02,334,2223,100.0,14.96,8.8405,2,1
38,1,39,35.001,0.84,60.0,449.44,555.85,1370.11,1126.43,5.48,...,8065.11,9.3512,0.02,333,2223,100.0,15.0,8.7529,2,1
40,1,41,35.0018,0.84,60.0,449.44,555.3,1359.77,1129.48,5.48,...,8065.92,9.2991,0.02,334,2223,100.0,14.79,9.0533,2,1


## Data preprocessing

Keep only the sensors I prefer to work with.

In [3]:
from phm08ds.features.feature_selection import SelectSensors

# Keep only a hand-picked subset of sensors; with sensors=[3,14] the frame
# displayed for this cell retains Sensor_3 and Sensor_14.
tf_select_sensors = SelectSensors(kind='custom', sensors=[3,14])
# NOTE(review): data_op_1 is rebound to the filtered frame, so the raw data is
# no longer reachable under this name, and re-running this cell on its own
# feeds the already-filtered frame back into the transformer.
data_op_1 = tf_select_sensors.fit_transform(data_op_1)
data_op_1.head()

Unnamed: 0,unit,time_step,operational_setting_1,operational_setting_2,operational_setting_3,Sensor_3,Sensor_14,Operational_condition,Health_state
2,1,3,34.9986,0.8401,60.0,1122.49,9.3557,1,1
11,1,12,35.0029,0.8413,60.0,1130.69,9.2878,1,1
23,1,24,34.9987,0.84,60.0,1131.95,9.3254,1,2
38,1,39,35.001,0.84,60.0,1126.43,9.3512,1,2
40,1,41,35.0018,0.84,60.0,1129.48,9.2991,1,2


Before feeding to the classifier, let's remove unwanted information, such as unit, time_step and operational settings.

In [4]:
from phm08ds.features.feature_selection import RemoveInfo

# Project transformer used to strip the bookkeeping columns before classification.
tf_remove_info = RemoveInfo()

# In the output displayed for this cell only the sensor columns and
# Health_state remain.
data_with_features = tf_remove_info.fit_transform(data_op_1)
data_with_features.head()

Unnamed: 0,Sensor_3,Sensor_14,Health_state
2,1122.49,9.3557,1
11,1130.69,9.2878,1
23,1131.95,9.3254,2
38,1126.43,9.3512,2
40,1129.48,9.2991,2


We need to normalize our data. Let's use Z-score standardization.

In [5]:
from sklearn.preprocessing import StandardScaler

# Z-score standardization is fitted on the feature columns only; the
# Health_state label column is dropped first so it is not scaled.
tf_std_scaller = StandardScaler()
feature_columns_only = data_with_features.drop(labels='Health_state', axis=1)
data_with_features_std = tf_std_scaller.fit_transform(feature_columns_only)
data_with_features_std

array([[-1.28328598e+00,  5.50787303e-01],
       [-1.71913583e-01, -1.23028949e+00],
       [-1.14172757e-03, -2.44008378e-01],
       ...,
       [ 9.06930839e-01,  1.19082079e+00],
       [ 9.55722798e-01,  1.94889324e+00],
       [ 1.83397806e+00,  2.26890998e+00]])

In [6]:
# Class labels for the classifiers: the Health_state column copied into a
# plain NumPy array.
health_state_column = data_with_features['Health_state']
labels_op_1 = np.array(health_state_column)
labels_op_1

array([1, 1, 2, ..., 4, 4, 4])

# Classification steps

## Load Experiment model

In [7]:
from phm08ds.models import experiment

## Define classifiers and their specifications

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

  from numpy.core.umath_tests import inner1d


In [9]:
# Fixed seed for the estimators whose training is randomized and whose
# random_state defaults to None; without it the reported results change on
# every Restart & Run All.
RANDOM_SEED = 42

# All estimators otherwise use scikit-learn's default hyper-parameters.
knn_clf = KNeighborsClassifier()
random_forest_clf = RandomForestClassifier(random_state=RANDOM_SEED)
naive_bayes_clf = GaussianNB()
gaussian_linear_clf = LinearDiscriminantAnalysis()
gaussian_quadratic_clf = QuadraticDiscriminantAnalysis()
# Perceptron is left untouched: per the scikit-learn documentation its
# random_state already defaults to a fixed value, so seeding it here would
# only change existing results.
perceptron_clf = Perceptron()
sgd_clf = SGDClassifier(random_state=RANDOM_SEED)

## Put all classifiers in a dictionary

In [10]:
# Map a short display name to each estimator instance. The mapping is built
# from an ordered list of (name, estimator) pairs, one per line.
classifiers = dict([
    ('KNN', knn_clf),
    ('RF', random_forest_clf),
    ('Naive_bayes', naive_bayes_clf),
    ('Gaussian_linear', gaussian_linear_clf),
    ('Gaussian_quadratic', gaussian_quadratic_clf),
    ('Perceptron', perceptron_clf),
    ('SGDClassifier', sgd_clf),
])

## Train Classifiers and test them

Stratified cross-validation is going to be used for model selection.

In [11]:
# Presumably the number of cross-validation folds used by the project helper.
kfolds = 10
# Run every estimator in `classifiers` on the standardized features and labels.
# The result is indexed later as clf_outputs[<'train'|'test'>][<'true'|'pred'>].
# NOTE(review): if run_classifiers shuffles the folds, confirm that it seeds
# the split; otherwise the results are not reproducible.
clf_outputs = experiment.run_classifiers(data_with_features_std, labels_op_1, classifiers, kfolds)



## Performance assessment

Saving variables in a dictionary:

In [12]:
# Collect the performance summary for each split under the split's name.
# NOTE(review): the literal 4 is presumably the number of Health_state
# classes; confirm against experiment.results_clf.
results = {}
for split_name in ('train', 'test'):
    split_outputs = clf_outputs[split_name]
    results[split_name] = experiment.results_clf(4, split_outputs['true'], split_outputs['pred'])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  metrics_class[i,2] = TP / (TP + FP)


## Save results, models and pipeline to a .pkl file

In [13]:
from sklearn.pipeline import Pipeline

# Chain the already-fitted transformers in the order they were applied above.
# NOTE(review): tf_std_scaller was fitted on the frame with Health_state
# dropped, while the RemoveInfo output shown earlier still contained that
# column. Confirm the pipeline is only fed data for which the scaler sees
# the same feature columns.
preprocessing_steps = [
    ('select_sensors', tf_select_sensors),
    ('remove_info', tf_remove_info),
    ('std_scaler', tf_std_scaller),
]
data_preprocessing = Pipeline(preprocessing_steps)

In [14]:
# Persist the classifier dictionary and the preprocessing pipeline through the
# project's experiment helpers; the output location and format are decided
# inside phm08ds.models.experiment (not visible here).
# NOTE(review): confirm run_classifiers fitted these estimator instances in
# place (rather than clones); otherwise unfitted estimators are saved.
experiment.save_models(classifiers)
experiment.save_pipeline(data_preprocessing)

## Save results to CSVs and figures

In [15]:
# Export the per-split results through the project helper, test split first
# and then train, with the split name passed as the tag.
for split_name in ('test', 'train'):
    experiment.export_results(results[split_name], split_name)

/mnt/Work/Mestrado/Mestrado/MP-Safety_ITA/Analyses_and_Experiments/PHM08_data_science/notebooks/E06_PHM08-train_CLF/model_selection-OP_01-navarmn
KNN
RF
Naive_bayes
Gaussian_linear
Gaussian_quadratic
Perceptron
SGDClassifier
/mnt/Work/Mestrado/Mestrado/MP-Safety_ITA/Analyses_and_Experiments/PHM08_data_science/notebooks/E06_PHM08-train_CLF/model_selection-OP_01-navarmn
KNN
RF
Naive_bayes
Gaussian_linear
Gaussian_quadratic
Perceptron
SGDClassifier
