In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt


from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error, make_scorer


#from btb.tuning import GPTuner, UniformTuner
#from btb import HyperParameter, ParamTypes

# Read Data

In [3]:
folder = 'CMAPSSData'
df_rul = pd.read_table(os.path.join(folder,'RUL_FD001.txt'), header=None)
df_train = pd.read_table(os.path.join(folder, 'train_FD001.txt'), sep=' ', header=None)
df_test = pd.read_table(os.path.join(folder, 'test_FD001.txt'), sep=' ', header=None)

In [4]:
ranges = [(0, 2), (6, 9), (11, 14), (15, 20), (21, 22), (24,26)]
print([item for start, end in ranges for item in df_train.columns[start:end]])
df_train = df_train[[item for start, end in ranges for item in df_train.columns[start:end]]]

[0, 1, 6, 7, 8, 11, 12, 13, 15, 16, 17, 18, 19, 21, 24, 25]


In [5]:
ranges = [(0, 2), (6, 9), (11, 14), (15, 20), (21, 22), (24,26)]
print([item for start, end in ranges for item in df_test.columns[start:end]])
df_test = df_test[[item for start, end in ranges for item in df_test.columns[start:end]]]

[0, 1, 6, 7, 8, 11, 12, 13, 15, 16, 17, 18, 19, 21, 24, 25]


In [6]:
# remove redundant columns due to extra spacing
df_train = df_train[df_train.columns[:26]]
df_test = df_test[df_test.columns[:26]]

In [7]:
# see data structure
print('RUL: ', df_rul.shape)
print('Train: ', df_train.shape)
print('Test: ', df_test.shape)

RUL:  (100, 1)
Train:  (20631, 16)
Test:  (13096, 16)


In [8]:
#df_train = pd.concat([df_train, df_test])

# Feature List

 * 1)	unit number
 * 2)	time, in cycles
 * 3)	operational setting 1
 * 4)	operational setting 2
 * 5)	operational setting 3
 * 6)	sensor measurement  1
 * 7)	sensor measurement  2
 * ...
 * 26)	sensor measurement  26

# Preprocess Train

generate RUL for y-label by reversing the cycle till failure

In [9]:
df_train.rename(columns = {0 : 'unit', 1 : 'cycle'}, inplace = True)

total_cycles = df_train.groupby(['unit']).agg({'cycle' : 'max'}).reset_index()
total_cycles.rename(columns = {'cycle' : 'total_cycles'}, inplace = True)
print(df_train.head())
df_train = df_train.merge(total_cycles, how = 'left', left_on = 'unit', right_on = 'unit')
df_train['RUL'] = df_train.apply(lambda r: int(min(r['total_cycles'] - r['cycle'], 130)), axis = 1)


df_train2 = df_train.copy()
del df_train2['cycle']

X_train = df_train2[df_train2.columns[:15]]
print(X_train.head())
y_train = df_train['RUL']

   unit  cycle       6        7        8      11       12       13     15  \
0     1      1  641.82  1589.70  1400.60  554.36  2388.06  9046.19  47.47   
1     1      2  642.15  1591.82  1403.14  553.75  2388.04  9044.07  47.49   
2     1      3  642.35  1587.99  1404.20  554.26  2388.08  9052.94  47.27   
3     1      4  642.35  1582.79  1401.87  554.45  2388.11  9049.48  47.13   
4     1      5  642.37  1582.85  1406.22  554.00  2388.06  9055.15  47.28   

       16       17       18      19   21     24       25  
0  521.66  2388.02  8138.62  8.4195  392  39.06  23.4190  
1  522.28  2388.07  8131.49  8.4318  392  39.00  23.4236  
2  522.42  2388.03  8133.23  8.4178  390  38.95  23.3442  
3  522.86  2388.08  8133.83  8.3682  392  38.88  23.3739  
4  522.19  2388.04  8133.80  8.4294  393  38.90  23.4044  
   unit       6        7        8      11       12       13     15      16  \
0     1  641.82  1589.70  1400.60  554.36  2388.06  9046.19  47.47  521.66   
1     1  642.15  1591.82  1

# Preprocess Test

Get last cycle for each engine to match with the true test_y RUL label

In [10]:
# number of engines
engines = df_test[0].unique()

df_list = []
# get last cycle for each engine
for i in engines:
    df = df_test[df_test[0]==i]
    last = (df[-1:])
    df_list.append(last)

# union all rows in a dataframe
X_test = pd.concat(df_list)
del X_test[1]

In [11]:
df_test.columns

Int64Index([0, 1, 6, 7, 8, 11, 12, 13, 15, 16, 17, 18, 19, 21, 24, 25], dtype='int64')

In [12]:
y_test = df_rul.values.flatten()

In [13]:
y_train

0        130
1        130
2        130
3        130
4        130
        ... 
20626      4
20627      3
20628      2
20629      1
20630      0
Name: RUL, Length: 20631, dtype: int64

In [14]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
#X_test = sc.fit_transform(X_test)

## RFR

In [111]:
model = RandomForestRegressor(n_estimators=1050, max_depth=9, n_jobs=-1)
model.fit(X_train, y_train)

y_predicted_train = model.predict(X_train)
y_predicted_test = model.predict(X_test)

MAE_train = mean_absolute_error(y_train, y_predicted_train)
MAE_test = mean_absolute_error(y_test, y_predicted_test)

print('Train MAE', MAE_train)
print('Train MAE', MAE_test)

MSE_train = mean_squared_error(y_train, y_predicted_train)
MSE_test = mean_squared_error(y_test, y_predicted_test)

print('Train RMSE:', np.sqrt(MSE_train))
print('Test RMSE:', np.sqrt(MSE_test))

Train MAE 12.482906553615507
Train MAE 15.417990651038757
Train RMSE: 16.912544284784396
Test RMSE: 21.289836333185818


## SVR

In [112]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

model = SVR(gamma='auto')
parameters = {'C':[0.8, 1, 1.2, 1.5, 2]}
clf = GridSearchCV(model, parameters, verbose=2, n_jobs=-1)
clf.fit(X_train, y_train)

y_predicted_train = clf.predict(X_train)
y_predicted_test = clf.predict(X_test)

MAE_train = mean_absolute_error(y_train, y_predicted_train)
MAE_test = mean_absolute_error(y_test, y_predicted_test)

print('Train MAE', MAE_train)
print('Train MAE', MAE_test)

MSE_train = mean_squared_error(y_train, y_predicted_train)
MSE_test = mean_squared_error(y_test, y_predicted_test)

print('Train RMSE:', np.sqrt(MSE_train))
print('Test RMSE:', np.sqrt(MSE_test))

print(clf.best_score_)



[CV] C=0.8 ...........................................................
[CV] C=0.8 ...........................................................
[CV] C=0.8 ...........................................................
[CV] C=1 .............................................................
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] ............................................ C=0.8, total=  27.5s
[CV] C=1 .............................................................
[CV] .............................................. C=1, total=  27.5s
[CV] ............................................ C=0.8, total=  27.6s
[CV] C=1 .............................................................
[CV] C=1.2 ...........................................................
[CV] ............................................ C=0.8, total=  27.6s
[CV] C=1.2 ...........................................................
[CV] .............................................. C=1, total=  26.3s
[CV] ............

[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.7min finished


Train MAE 13.606406972128797
Train MAE 17.047916667036393
Train RMSE: 19.731502404148273
Test RMSE: 23.036102651202118
0.7669348270539598


In [118]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=100)
model.fit(X_train, y_train)

y_predicted_train = model.predict(X_train)
y_predicted_test = model.predict(X_test)

MAE_train = mean_absolute_error(y_train, y_predicted_train)
MAE_test = mean_absolute_error(y_test, y_predicted_test)

print('Train MAE', MAE_train)
print('Train MAE', MAE_test)

MSE_train = mean_squared_error(y_train, y_predicted_train)
MSE_test = mean_squared_error(y_test, y_predicted_test)

print('Train RMSE:', np.sqrt(MSE_train))
print('Test RMSE:', np.sqrt(MSE_test))

Train MAE 12.46653364956291
Train MAE 15.977951652487286
Train RMSE: 16.635657549894752
Test RMSE: 22.13147864475551


In [None]:
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(hidden_layer_sizes=(150,100,50), max_iter=100,activation = 'relu',solver='adam',random_state=1)
model.fit(X_train, y_train)

y_predicted_train = model.predict(X_train)
y_predicted_test = model.predict(X_test)

MAE_train = mean_absolute_error(y_train, y_predicted_train)
MAE_test = mean_absolute_error(y_test, y_predicted_test)

print('Train MAE', MAE_train)
print('Train MAE', MAE_test)

MSE_train = mean_squared_error(y_train, y_predicted_train)
MSE_test = mean_squared_error(y_test, y_predicted_test)

print('Train RMSE:', np.sqrt(MSE_train))
print('Test RMSE:', np.sqrt(MSE_test))

print(np.mean(np.sqrt(abs(cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=10)))))

In [183]:
from sklearn.neighbors import KNeighborsRegressor


model = KNeighborsRegressor(n_neighbors=8)
model.fit(X_train, y_train)

y_predicted_train = model.predict(X_train)
y_predicted_test = model.predict(X_test)

MAE_train = mean_absolute_error(y_train, y_predicted_train)
MAE_test = mean_absolute_error(y_test, y_predicted_test)

print('Train MAE', MAE_train)
print('Train MAE', MAE_test)

MSE_train = mean_squared_error(y_train, y_predicted_train)
MSE_test = mean_squared_error(y_test, y_predicted_test)

print('Train RMSE:', np.sqrt(MSE_train))
print('Test RMSE:', np.sqrt(MSE_test))

Train MAE 12.406996752459891
Train MAE 74.895
Train RMSE: 17.566236383486626
Test RMSE: 85.65121496511301


In [195]:
from sklearn.model_selection import cross_val_score

for i in np.arange(78, 100, 4):
    print(i)
    model = KNeighborsRegressor(n_neighbors=i)

    print(np.mean(np.sqrt(abs(cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=10)))))

78
19.963534210110772
82
19.965186199832388
86
19.963009625530322
90
19.963229920276923
94
19.95695666636781
98
19.961911370266556


In [16]:
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

for i in np.arange(0.5, 2.4, 0.2):
    print(i)
    model = SVR(gamma='auto', C=i)
    print(np.mean(np.sqrt(abs(cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=10)))))
    

0.5


KeyboardInterrupt: 

In [22]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

for i in np.arange(30, 80, 5):
    print(i)
    model = GradientBoostingRegressor(n_estimators=i ,loss='huber')
    print(np.mean(np.sqrt(abs(cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=10)))))

30
20.443343559000255
35
20.264698893564912
40
20.237306580131897
45
20.263781699545266
50


KeyboardInterrupt: 

In [192]:
for i in np.arange(900, 1201, 50):
    print(i)
    model = RandomForestRegressor(n_estimators=i)
    print(np.mean(np.sqrt(abs(cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=10)))))

900
20.482129503739593
950
20.48090157870141
1000
20.47063601497059
1050
20.47823276938245
1100


KeyboardInterrupt: 

In [25]:
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(hidden_layer_sizes=(150,100,50), max_iter=150,activation = 'relu',solver='adam',random_state=1, n_iter_no_change=10)
print(np.mean(np.sqrt(abs(cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=10)))))



24.157762303303773


