In [1]:
import keras
import keras.backend as K
from keras.layers.core import Activation
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, LSTM

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import os
from sklearn import preprocessing

# Setting seed for reproducibility
np.random.seed(1234)
PYTHONHASHSEED = 0

# define path to save model
model_path = 'regression_model.h5'


# read training data - It is the aircraft engine run-to-failure data. (failure and testing)
# read test data - It is the aircraft engine operating data without failure events recorded.
# read ground truth data - It contains the information of true remaining cycles for each engine in the testing data.
train_df = pd.read_csv('Dataset/PM_train.txt', sep=" ", header=None)
test_df = pd.read_csv('Dataset/PM_test.txt', sep=" ", header=None)
truth_df = pd.read_csv('Dataset/PM_truth.txt', sep=" ", header=None)

# Drop missing data columns(redundant)
train_df.drop(train_df.columns[[26, 27]], axis=1, inplace=True)
test_df.drop(test_df.columns[[26, 27]], axis=1, inplace=True)
truth_df.drop(truth_df.columns[[1]], axis=1, inplace=True)

# Sorting and indicating columns
train_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3',
                     's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14',
                     's15', 's16', 's17', 's18', 's19', 's20', 's21']

train_df = train_df.sort_values(['id','cycle'])

test_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3',
                     's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14',
                     's15', 's16', 's17', 's18', 's19', 's20', 's21']


# print ("train_df  \n", train_df)
# print ("test_df \n", test_df)
# print ("truth_df \n", truth_df)





Using TensorFlow backend.


In [2]:
##################################
# Data Preprocessing
##################################

#######
# TRAIN
#######
# Data Labeling - generate column RUL(Remaining Usefull Life or Time to Failure)
rul = pd.DataFrame(train_df.groupby('id')['cycle'].max()).reset_index() # fail occurred cycle eahc R2F simulation
rul.columns = ['id', 'max'] # assign column labels
train_df = train_df.merge(rul, on=['id'], how='left') # merge the cycle of failure for each id
train_df['RUL'] = train_df['max'] - train_df['cycle'] # Calulate and assign RUL
train_df.drop('max', axis=1, inplace=True) #Drop the cycle of failure for each id(max)




# generate label columns for training data
# we will only make use of "label1" for binary classification,
# while trying to answer the question: is a specific engine going to fail within w1 cycles?
# column 'RUL' is used for regression / 'label1' and 'label2' are used for classification
w1 = 30 # binary classification  - going to fail within 30 cycles?
w0 = 15 # binary classification  - going to fail within 15 cycles?
train_df['label1'] = np.where(train_df['RUL'] <= w1, 1, 0)
train_df['label2'] = train_df['label1']
train_df.loc[train_df['RUL'] <= w0, 'label2'] = 2



## preprocessing(normalization) with sklearn lib for using DL
# MinMax normalization (from 0 to 1)
train_df['cycle_norm'] = train_df['cycle']
cols_normalize = train_df.columns.difference(['id', 'cycle', 'RUL', 'label1', 'label2'])
min_max_scaler = preprocessing.MinMaxScaler()
norm_train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df[cols_normalize]),
                             columns=cols_normalize,
                             index=train_df.index)
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)
train_df = join_df.reindex(columns=train_df.columns)

# processed data save into csv if needed
# train_df.to_csv('PredictiveManteinanceEngineTraining.csv', encoding='utf-8',index = None)





######
# TEST
######
## preprocessing(normalization) with sklearn lib for using DL
# MinMax normalization (from 0 to 1)
test_df['cycle_norm'] = test_df['cycle']
norm_test_df = pd.DataFrame(min_max_scaler.transform(test_df[cols_normalize]),
                            columns=cols_normalize,
                            index=test_df.index)
test_join_df = test_df[test_df.columns.difference(cols_normalize)].join(norm_test_df)
test_df = test_join_df.reindex(columns=test_df.columns)
test_df = test_df.reset_index(drop=True)
print(test_df.head())



# We use the ground truth dataset to generate labels for the test data.
# generate column max for test data, for calculating RUL
rul = pd.DataFrame(test_df.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
truth_df.columns = ['more']
truth_df['id'] = truth_df.index + 1
truth_df['max'] = rul['max'] + truth_df['more']
truth_df.drop('more', axis=1, inplace=True)
# generate RUL for test data
test_df = test_df.merge(truth_df, on=['id'], how='left')
test_df['RUL'] = test_df['max'] - test_df['cycle']
test_df.drop('max', axis=1, inplace=True)

# generate label columns w0 and w1 for test data (It is for classification)
test_df['label1'] = np.where(test_df['RUL'] <= w1, 1, 0)
test_df['label2'] = test_df['label1']
test_df.loc[test_df['RUL'] <= w0, 'label2'] = 2

# test_df.to_csv('PredictiveManteinanceEngineValidation.csv', encoding='utf-8',index = None)




   id  cycle  setting1  setting2  setting3   s1        s2        s3        s4  \
0   1      1  0.632184  0.750000       0.0  0.0  0.545181  0.310661  0.269413   
1   1      2  0.344828  0.250000       0.0  0.0  0.150602  0.379551  0.222316   
2   1      3  0.517241  0.583333       0.0  0.0  0.376506  0.346632  0.322248   
3   1      4  0.741379  0.500000       0.0  0.0  0.370482  0.285154  0.408001   
4   1      5  0.580460  0.500000       0.0  0.0  0.391566  0.352082  0.332039   

    s5  ...       s13       s14       s15  s16       s17  s18  s19       s20  \
0  0.0  ...  0.220588  0.132160  0.308965  0.0  0.333333  0.0  0.0  0.558140   
1  0.0  ...  0.264706  0.204768  0.213159  0.0  0.416667  0.0  0.0  0.682171   
2  0.0  ...  0.220588  0.155640  0.458638  0.0  0.416667  0.0  0.0  0.728682   
3  0.0  ...  0.250000  0.170090  0.257022  0.0  0.250000  0.0  0.0  0.666667   
4  0.0  ...  0.220588  0.152751  0.300885  0.0  0.166667  0.0  0.0  0.658915   

        s21  cycle_norm  
0  0.6

In [3]:

# pick a large window size of 50 cycles
sequence_length = 50


# function to reshape features into (samples, time steps, features)
# The output of the function is a number of numpy array with fixed sequence length(window size) 
def gen_sequence(id_df, seq_length, seq_cols):
    """ Only sequences that meet the window-length are considered, no padding is used. This means for testing
    we need to drop those which are below the window-length. An alternative would be to pad sequences so that
    we can use shorter ones """
    # for one id I put all the rows in a single matrix
    # only take features and its values (convert DataFrame to numpy array)
    data_matrix = id_df[seq_cols].values
    num_elements = data_matrix.shape[0] # number of rows in the array (the number of instances for each machine)
    # Iterate over two lists in parallel.
    # For example id1 have 192 rows and sequence_length is equal to 50
    # so zip iterate over two following list of numbers (0,142),(50,192)
    # zip iteration : Iterate over multiple lists simultaneously
    # Iterate minimum among all the lists  
    # 0 50 -> from row 0 to row 50
    # 1 51 -> from row 1 to row 51
    # 2 52 -> from row 2 to row 52
    # ...
    # 142 191 -> from row 142 to 191
    # 142 number of numpy array, each has the shape of (sequence length, number of features) = (50,25)
    for start, stop in zip(range(0, num_elements - seq_length), range(seq_length, num_elements)):
        yield data_matrix[start:stop, :]


# pick the feature columns (exclude RUL and labels )
sensor_cols = ['s' + str(i) for i in range(1, 22)]
sequence_cols = ['setting1', 'setting2', 'setting3', 'cycle_norm']
sequence_cols.extend(sensor_cols)


print ("sequence_cols \n",  sequence_cols)


# id_df = train_df[train_df['id'] == 1]
# print ("id_df \n", id_df)


# print ("id_df[sequence_cols] \n", id_df[sequence_cols])



# print ("id_df[sequence_cols].values \n", id_df[sequence_cols].values)


# id_df_values = id_df[sequence_cols].values

# print ("type(id_df_values)", type(id_df_values))
# print ("id_df_values.shape", id_df_values.shape)



# num_elements = id_df_values.shape[0]
# seq_length = sequence_length
# i = 0
# for start, stop in zip(range(0, num_elements - seq_length), range(seq_length, num_elements)):
#     print ("i: ", i)
#     print ("start,stop:", start, stop)
#     print ("id_df_values[start:stop, :] \n", id_df_values[start:stop, :]) 
#     i +=1
    

# print ("range(0, num_elements - seq_length) \n", range(0, num_elements - seq_length))
# print ("range(seq_length, num_elements) \n", range(seq_length, num_elements))





# val is a list of 192 - 50 = 142 bi-dimensional array (50 rows x 25 columns)



# val = list(gen_sequence(train_df[train_df['id'] == 1], sequence_length, sequence_cols))
# print(len(val))

# print ("type(val)", type(val))
# print ("val[0]", val[0])
# print ("type(val[0])", type(val[0]))
# print ("val[0].shape", val[0].shape)

# generator for the sequences
# transform each id of the train dataset in a sequence
seq_gen = (list(gen_sequence(train_df[train_df['id'] == id], sequence_length, sequence_cols))
           for id in train_df['id'].unique())



print ("seq_gen", seq_gen)
print ("type(seq_gen)", type(seq_gen))

print ("type(list(seq_gen))", type(list(seq_gen)))


# generate sequences and convert to numpy array
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)
print(seq_array.shape)



print("end of lines")

sequence_cols 
 ['setting1', 'setting2', 'setting3', 'cycle_norm', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']
seq_gen <generator object <genexpr> at 0x7ff09d8c6050>
type(seq_gen) <class 'generator'>
type(list(seq_gen)) <class 'list'>


ValueError: need at least one array to concatenate

In [None]:
# function to generate labels
def gen_labels(id_df, seq_length, label):
    """ Only sequences that meet the window-length are considered, no padding is used. 
    This means for testing we need to drop those which are below the window-length. 
    An alternative would be to pad  sequences so that we can use shorter ones """
    # For one id I put all the labels in a single matrix.
    # For example:
    # [[1]
    # [4]
    # [1]
    # [5]
    # [9]
    # ...
    # [200]]
    data_matrix = id_df[label].values
    num_elements = data_matrix.shape[0]
    # I have to remove the first seq_length labels
    # because for one id the first sequence of seq_length size have as target
    # the last label (the previus ones are discarded).
    # All the next id's sequences will have associated step by step one label as target.
    return data_matrix[seq_length:num_elements, :]


# generate labels
label_gen = [gen_labels(train_df[train_df['id'] == id], sequence_length, ['RUL'])
             for id in train_df['id'].unique()]

print (gen_labels(train_df[train_df['id'] == id], sequence_length, ['RUL'])
             for id in train_df['id'].unique())

# print ("label_gen", label_gen)
print ("len(label_gen)", len(label_gen))

label_array = np.concatenate(label_gen).astype(np.float32)
print ("label_array.shape", label_array.shape)



df_one = train_df[train_df['id'] == 1]
print ("df_one \n",df_one )

print ("df_one[['RUL']]", df_one['RUL'].values)
print ("df_one[['RUL']]", df_one[['RUL']].values)




In [None]:

def r2_keras(y_true, y_pred):
    """Coefficient of Determination
    """
    SS_res =  K.sum(K.square( y_true - y_pred ))
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

# Next, we build a deep network.
# The first layer is an LSTM layer with 100 units followed by another LSTM layer with 50 units.
# Dropout is also applied after each LSTM layer to control overfitting.
# Final layer is a Dense output layer with single unit and linear activation since this is a regression problem.
nb_features = seq_array.shape[2]
nb_out = label_array.shape[1]

model = Sequential()
model.add(LSTM(
         input_shape=(sequence_length, nb_features),
         units=100,
         return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(
          units=50,
          return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=nb_out))
model.add(Activation("linear"))
model.compile(loss='mean_squared_error', optimizer='rmsprop',metrics=['mae',r2_keras])

print(model.summary())



