In [6]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.externals import joblib

In [7]:
# Importing the dataset
# The whole CSV is read into `dataset`; features and target are then picked
# purely by column position.
csv_path = '../../data/engine_data_error_egt.csv'
dataset = pd.read_csv(csv_path)

# X: positional columns 1..8 (the end of the slice is exclusive).
# y: positional column 16, kept two-dimensional by indexing with a list.
# NOTE(review): the frame header displayed further down lists 'Unnamed: 0'
# first, which would make position 16 the 'normal' flag and position 17
# 'failure_prob' -- confirm that 16 is the intended target column.
X, y = dataset.iloc[:, 1:9], dataset.iloc[:, [16]]

In [8]:
# Splitting the dataset manually
# Order-preserving split by row position: the first 4500 rows are used for
# training and rows 4500..5999 for testing (no shuffling).
train_rows = slice(0, 4500)
test_rows = slice(4500, 6000)

# Features stay as DataFrames; targets are converted to NumPy arrays.
X_train, X_test = X.iloc[train_rows, :], X.iloc[test_rows, :]
y_train, y_test = y.iloc[train_rows, :].values, y.iloc[test_rows, :].values

In [9]:
# Encoding categorical data
#
# Column 0 of the feature matrix is treated as categorical and one-hot
# encoded; the remaining columns are passed through unchanged and placed
# after the dummy columns (same layout as before).
#
# The encoder is fitted on the TRAINING rows only and then re-used to
# transform the test rows. Fitting a second encoder on the test set (as this
# cell used to do) lets the same category land in a different dummy column,
# or yields a different number of columns, whenever the two splits do not
# contain exactly the same category values.
#
# `OneHotEncoder(categorical_features=...)` is no longer available in current
# scikit-learn releases, so the categorical column is sliced out explicitly.
# The LabelEncoder pre-step is not needed for this, and nothing is written
# back into the `.values` arrays any more, so X_train / X_test stay untouched.
train_values = X_train.values
test_values = X_test.values

# handle_unknown='ignore': a category that appears only in the test rows is
# encoded as an all-zero dummy block instead of raising at transform time.
onehotencoder = OneHotEncoder(handle_unknown='ignore')
train_dummies = onehotencoder.fit_transform(train_values[:, [0]]).toarray()
test_dummies = onehotencoder.transform(test_values[:, [0]]).toarray()

# Dummy columns first, then the remaining features cast to float.
X_train_encode = np.hstack([train_dummies, train_values[:, 1:].astype(float)])
X_test_encode = np.hstack([test_dummies, test_values[:, 1:].astype(float)])

In [10]:
# Avoiding dummy variable trap
#
# For every one-hot encoded feature, drop the first of its dummy columns.
# The dummy groups sit at the front of the encoded matrix in the order given
# by `categories`, so the first column of a group is found at the running
# sum of the widths of all groups before it.
#
# The previous formula multiplied the cardinality by the column index and
# read it from `dataset` rather than from the feature frame; that only gave
# the right answer ([0]) for the single category at position 0.
#
# NOTE: this cell overwrites X_train_encode / X_test_encode, so running it a
# second time removes one more column.
categories = [0]  # positions, within X, of the one-hot encoded features
dummies = []
dummies_sum = 0

for category in categories:
    # First dummy column of this group.
    dummies.append(dummies_sum)
    # Advance past this group: one dummy column per distinct training value.
    dummies_sum += X_train.iloc[:, category].unique().size

X_train_encode = np.delete(X_train_encode, dummies, 1)
X_test_encode = np.delete(X_test_encode, dummies, 1)

In [11]:
# Fitting Multiple Linear Regression to the Training Set
from sklearn.linear_model import LinearRegression

# Ordinary least squares with the estimator's default settings, trained on
# the encoded training features and the training target.
regressor = LinearRegression()
regressor.fit(X=X_train_encode, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [12]:
# Predicting the Test Set results
y_pred = regressor.predict(X_test_encode)

# Cap predictions at 100 and flatten to one dimension.
y_pred[y_pred > 100] = 100
y_pred = y_pred.ravel()

# Rescale to the 0..1 range. Predictions below zero are clamped to 0;
# taking the absolute value (as before) turned a prediction of e.g. -30
# into 0.30 instead of 0.
y_p = np.clip(y_pred, 0, 100) / 100

In [47]:
# Preview the first rows whose 'normal' flag is False.
dataset[dataset['normal'].eq(False)].head()

Unnamed: 0,engine,month,air_temp,noise,epr,egt,ff,n1,n2,noise_n,epr_n,egt_n,ff_n,n1_n,n2_n,normal,failure_prob
14,1,15,-7,133,50,1760,5134,12519,10523,True,True,False,True,True,True,False,1
15,1,16,0,133,50,1770,5142,12526,10528,True,True,False,True,True,True,False,2
16,1,17,-3,133,48,1780,5147,12567,10568,True,True,False,True,True,True,False,3
17,1,18,-14,133,50,1793,5153,12596,10617,True,True,False,True,True,True,False,4
18,1,19,-14,133,49,1799,5166,12654,10650,True,True,False,True,True,True,False,5


In [49]:
#Final Output
# Keep the first 46 rows whose 'normal' flag is False and display them.
abnormal_mask = dataset['normal'].eq(False)
normal_false = dataset[abnormal_mask].iloc[:46]
normal_false

Unnamed: 0,engine,month,air_temp,noise,epr,egt,ff,n1,n2,noise_n,epr_n,egt_n,ff_n,n1_n,n2_n,normal,failure_prob
14,1,15,-7,133,50,1760,5134,12519,10523,True,True,False,True,True,True,False,1
15,1,16,0,133,50,1770,5142,12526,10528,True,True,False,True,True,True,False,2
16,1,17,-3,133,48,1780,5147,12567,10568,True,True,False,True,True,True,False,3
17,1,18,-14,133,50,1793,5153,12596,10617,True,True,False,True,True,True,False,4
18,1,19,-14,133,49,1799,5166,12654,10650,True,True,False,True,True,True,False,5
19,1,20,-2,134,50,1811,5176,12682,10666,True,True,False,True,True,True,False,6
20,1,21,-14,134,50,1825,5180,12717,10716,True,True,False,True,True,True,False,7
21,1,22,-9,134,48,1841,5189,12746,10746,True,True,False,True,True,True,False,8
22,1,23,-9,134,48,1843,5199,12776,10787,True,True,False,True,True,True,False,9
23,1,24,-14,134,48,1858,5212,12815,10821,True,True,False,True,True,True,False,10


In [50]:
# Names of the per-reading flag columns (each ends in '_n').
all_normal_columns = ['noise_n', 'epr_n', 'egt_n', 'ff_n', 'n1_n', 'n2_n']

# Select the flag columns that are False in the FIRST row of `normal_false`;
# only that first row is inspected.
flag_frame = normal_false[all_normal_columns]
first_row_is_false = (flag_frame == False).iloc[0]
cols = flag_frame.columns[first_row_is_false]


In [52]:
# For each selected flag column, print its name followed by the column whose
# name is the flag name without its last two characters ('egt_n' -> 'egt').
for flag_name in cols:
    reading_name = flag_name[:-2]
    print(flag_name, normal_false[reading_name])

egt_n 14    1760
15    1770
16    1780
17    1793
18    1799
19    1811
20    1825
21    1841
22    1843
23    1858
24    1865
25    1876
26    1887
27    1898
28    1911
29    1928
30    1940
31    1945
32    1959
33    1965
34    1979
35    1987
36    1997
37    2013
38    2019
39    2034
40    2042
41    2055
42    2068
43    2083
44    2088
45    2097
46    2113
47    2119
48    2128
49    2143
50    2152
51    2168
52    2180
53    2187
54    2194
55    2212
56    2218
57    2235
58    2245
59    2256
Name: egt, dtype: int64


In [None]:
# NOTE(review): every line in this cell is commented out, so the cell does
# nothing when executed. Before re-enabling it:
#   * `y_normal` is used in the first plot call but is not defined in any
#     cell visible in this notebook.
#   * `joblib` is imported at the top via `sklearn.externals`, which recent
#     scikit-learn releases no longer provide -- presumably a plain
#     `import joblib` is needed there; verify against the installed version.
## Saving the model
#joblib.dump(regressor, 'multiple_reg_error.pkl')
#
## Visualising the results
#
## First Observation
#plt.plot(X_test['month'][0:60], y_normal, color = 'red')
#plt.plot(X_test['month'][0:60], y_p[0:60], color = 'blue')
#plt.xticks(np.arange(0, 61, 2))
#plt.yticks(np.arange(0, 1.05, 0.05))
#plt.title('Age (in months) vs Probability of Failure')
#plt.xlabel('Age (in months)')
#plt.ylabel('Probability of Failure')
#plt.show()