# Machine failure forecast using sensor data
## 1 Setting up the environment
### 1.1 Importing libraries

In [8]:
# Import libraries

import pandas as pd
import numpy as np
import seaborn as sns

In [29]:
# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

# 2 Creating functions for models

In [30]:
# To extract names of column in dataset
def data_col(data):
    return list(data.columns.values)

#train logistic regression model
def train_log_reg(x_val, y_val):
    log_reg = LogisticRegression()
    log_reg.fit(x_val, y_val)
    return log_reg

#train random forest model
# Importing the model
# Initiating model with 1000 decision trees
def train_rf(x_val, y_val):
    rf = RandomForestRegressor(n_estimators = 1000)
    rf.fit(x_val, y_val)
    return rf


# Function to return Predicted values
def score_data(trained_model, x_val):
    ypredict = trained_model.predict(x_val)
    return ypredict

#Prediction accuracy calculation
def model_accuracy(trained_model, variables, targets):
    accuracy_score = trained_model.score(variables, targets)
    return accuracy_score

# Confusion matrix function
def confusion_matrix(actfail, predictfail):
  # Compute Confusion matrix
  print("Actual, Predicted Observations:  ",len(actfail), len(predictfail))
  anpn = 0
  anpy = 0
  aypn = 0
  aypy = 0
  
  for i in range(len(actfail)):
      if (actfail[i]==0 and predictfail[i]==0):
          anpn = anpn + 1
      elif (actfail[i]==0 and predictfail[i]==1):
          anpy = anpy + 1
      elif (actfail[i]==1 and predictfail[i]==0):
          aypn = aypn + 1
      else:
          aypy = aypy + 1

  print ("--------------------------------------------")
  print ("Confusion Matrix")
  print ("--------------------------------------------")
  print ("              ", "Predicted N", "Predicted Y")
  print ("Actual N      ", anpn,"          ", anpy) 
  print ("Actual Y      ", aypn,"          ", aypy)
  print ("--------------------------------------------")
  print ("Total observations  :  ", anpn+anpy+aypn+aypy)
  print ("False Positives     :  ", anpy)
  print ("False Negatives     :  ", aypn)
  print ("Overall Accuracy    :  ", round((float(anpn+aypy)/float(anpn+anpy+aypn+aypy))*100, 2), "%")
  print ("Sensitivity/Recall  :  ", round((float(aypy)/float(aypn+aypy))*100, 2), "%")
  print ("Specificity         :  ", round((float(anpn)/float(anpn+anpy))*100, 2), "%")
  print ("Precision           :  ", round((float(aypy)/float(anpy+aypy))*100, 2), "%")
  print ("--------------------------------------------") 

# 3 Read the data

In [11]:
data_df = pd.read_csv('machine_dataset.csv')
display(data_df)

Unnamed: 0,id,a,b,c,d,e,outpressure,inpressure,fail
0,1,7,7,1,6,6,36,3,1
1,2,1,3,3,5,1,20,4,0
2,3,7,2,2,6,1,24,6,0
3,4,4,3,4,5,1,28,6,0
4,5,7,5,6,4,0,68,6,0
...,...,...,...,...,...,...,...,...,...
939,940,7,7,1,6,4,73,6,1
940,941,7,5,2,6,6,50,6,1
941,942,3,6,2,7,5,43,6,1
942,943,6,6,2,5,6,46,7,1


In [14]:
# Printing the dataset column names
datacol = data_col(data_df)
print("Dataset columns : ", list(datacol))

Dataset columns :  ['id', 'a', 'b', 'c', 'd', 'e', 'outpressure', 'inpressure', 'fail']


In [15]:
# Features
col_to_retain = ['a', 'b', 'c', 'd','e', 'outpressure', 'inpressure']


# 4 Running the algorithm for machine failure
## 4.1 Data splitting into training and testing

In [16]:
# Training and testing data split
train_x, test_x, train_y, test_y = train_test_split(data_df[col_to_retain], data_df['fail'], train_size=0.7)
print("Train x count : ", len(train_x), len(train_x.columns.values))
print("Train y count : ", len(train_y))

print("Test x count : ", len(test_x), len(test_x.columns.values))
print("Test y count : ", len(test_y))


Train x count :  660 7
Train y count :  660
Test x count :  284 7
Test y count :  284


## 5.2 Training the predictive model

In [18]:
# Logistic regression model training
trained_log_reg = train_log_reg(train_x, train_y)
train_accuracy = model_accuracy(trained_log_reg, train_x, train_y)


# Logistic regression model testing
test_accuracy = model_accuracy(trained_log_reg, test_x, test_y)
print ("Training Accuracy : ", round(train_accuracy * 100, 2), "%")


Training Accuracy :  90.91 %


## 5.3 Scoring test data

In [19]:
# Accuracy of model : Constructing confusion matrix for test data and score

actfail = test_y.values
predictfail = score_data(trained_log_reg, test_x)
# Calculate the absolute errors
errors = abs(predictfail - actfail)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))


Mean Absolute Error: 0.07


# 6 Prediction accuracy analysis using confusion matrix

In [20]:
# Printing actual fails and predicted fail
confusion_matrix(actfail, predictfail)

Actual, Predicted Observations:   284 284
--------------------------------------------
Confusion Matrix
--------------------------------------------
               Predicted N Predicted Y
Actual N       160            9
Actual Y       12            103
--------------------------------------------
Total observations  :   284
False Positives     :   9
False Negatives     :   12
Overall Accuracy    :   92.61 %
Sensitivity/Recall  :   89.57 %
Specificity         :   94.67 %
Precision           :   91.96 %
--------------------------------------------


# 7 Applying Random Forest

In [28]:
# Random Forest model training
trained_rf = train_rf(train_x, train_y)
train_accuracy = model_accuracy(trained_rf, train_x, train_y)


# Logistic regression model testing
test_accuracy = model_accuracy(trained_rf, test_x, test_y)
print ("Training Accuracy : ", round(train_accuracy * 100, 2), "%")

Training Accuracy :  95.27 %


## 7.1 Training the predictive model

In [22]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_x)
# Calculate the absolute errors
errors = abs(predictions - test_y)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))


Mean Absolute Error: 0.13
