In [116]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os


"""
AUTHOR'S WORDS:


[Machine Predictive Maintenance Classification Dataset]

Since real predictive maintenance datasets are generally difficult to obtain and in particular difficult to publish, we present and provide a synthetic 
dataset that reflects real predictive maintenance encountered in the industry to the best of our knowledge.

The dataset consists of 10 000 data points stored as rows with 14 features in columns


UID                     : unique identifier ranging from 1 to 10000

productID               : consisting of a letter L, M, or H for low (50% of all products), medium (30%), and high (20%) as product quality variants 
                          and a variant-specific serial number

air temperature [K]     : generated using a random walk process later normalized to a standard deviation of 2 K around 300 K

process temperature [K] : generated using a random walk process normalized to a standard deviation of 1 K, added to the air temperature plus 10 K.

rotational speed [rpm]  : calculated from powepower of 2860 W, overlaid with a normally distributed noise

torque [Nm]             : torque values are normally distributed around 40 Nm with an Ïƒ = 10 Nm and no negative values.

tool wear [min]         : The quality variants H/M/L add 5/3/2 minutes of tool wear to the used tool in the process. and a 'machine failure' label that indicates, whether the machine has failed in this particular data point for any of the following failure modes are true.


Important : There are two Targets - Do not make the mistake of using one of them as feature, as it will lead to leakage.

Target       : Failure or Not
Failure Type : Type of Failure

In [117]:
df = pd.read_csv('./predictive_maintenance.csv',index_col='UDI')
df.head()

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,78912314860,789123,298.1,308.6,1551,42.8,0,0,0
2,65432147181,654321,298.2,308.7,1408,46.3,3,0,0
3,65432147182,654321,298.1,308.5,1498,49.4,5,0,0
4,65432147183,654321,298.2,308.6,1433,39.5,7,0,0
5,65432147184,654321,298.2,308.7,1408,40.0,9,0,0


* Product ID is unique for every member of the dataframe, so they contain no usefull data and will be deleted
* The next column (Type) shows the quality of the tool used, which is usefull in predicting the life of the tool

Remaining columns until Target are all usefull data about the working conditions of the tool

* Target column indicates wether the tool has failed (Target==1) or not (Target==0)
* Failure Type column shows the same data as the target column, but in the cases where there was a failure it also mentiones the type of the failure the tool has experienced

So based on the prediction we are going to make (wether we only want to predict the failure or we want to predict the type sa well), we have to use only one of the last two columns in order to stop any data leakage from happening.
The intention of this nptebook is predicting failure, not failure type. Because of this, the failure type column will be deleted later.

In [118]:
df.drop(columns = ['Product ID'],inplace=True)

In [119]:
from sklearn.preprocessing import LabelEncoder
type_enc = LabelEncoder()
df['Type'] = type_enc.fit_transform(df['Type'])
#encoding 'Type' column to int values
df.isna().sum()

Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Target                     0
Failure Type               0
dtype: int64

We have to make sure there are no NA values inside the dataframe as well, since if there are any we have to resolve them.

In [120]:
df[(df['Target']==1)&(df['Failure Type']=='No Failure')]

Unnamed: 0_level_0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In some cases, the data available in the Target column indicates that the tool has failed, but the Failure Type column indicated that the tool has not had any failure. 

In [121]:
df.drop(df[(df['Target']==1)&(df['Failure Type']=='No Failure')].index,inplace=True)

In [122]:
# The first part of the project is to predict whether the tool fails or not, so we drop the failure type
#we will address it later
df.drop(columns = ['Failure Type'],inplace=True)
df.groupby('Target').size()

Target
0    9661
1     339
dtype: int64

It can be seen that the dataset is severly imbalanced. So first we divide the dataframe into training and testing sections and then apply oversampling to the training section of the dataset in order to balance the information used for training the model. This is done after dividing the datasets in order to stop any data leakage from happening and also for the testing dataset of the model to be representative of the real world.

In [123]:
from sklearn.model_selection import train_test_split
#making test and train datasets from the base dataset
train_data,test_data = train_test_split(df,stratify=df['Target'],test_size=0.2)
#stratifying based on the target column so that the data imbalance of the original dataset is also present in the training and testing datasets

In [124]:
from imblearn.over_sampling import SMOTE

#Oversampling function written with SMOTE to improve group balance in the data
def sm_oversamp(inp_x,inp_y):
    ov_samp = SMOTE(k_neighbors=4,sampling_strategy='minority')
    return ov_samp.fit_resample(inp_x,inp_y)

x_train = train_data.drop(columns=['Target'])
y_train = train_data['Target']

#variables with _os are oversampled and will be used for training the model
x_train_os,y_train_os = sm_oversamp(x_train,y_train)

In [125]:
#Scaling the feature columns to imptove model efficiency
from sklearn.preprocessing import MinMaxScaler
feature_scaler = MinMaxScaler()
#feature scaler is first fit on the features of the training dataset
feature_scaler.fit(x_train_os)
print(x_train_os.head())

   Type  Air temperature [K]  Process temperature [K]  Rotational speed [rpm]  \
0     2                299.0                    310.2                    1628   
1     1                298.7                    309.9                    1593   
2     0                298.6                    309.4                    1439   
3     1                297.5                    308.5                    1340   
4     2                298.8                    308.5                    1510   

   Torque [Nm]  Tool wear [min]  
0         35.8               64  
1         42.5              146  
2         47.1              149  
3         57.4              143  
4         38.7               66  


In [126]:
#feature scaler is then applied to the features of both the training and testing datasets
train_features_scaled = pd.DataFrame(feature_scaler.transform(x_train_os))
test_features_scaled = pd.DataFrame(feature_scaler.transform(test_data.drop(columns=['Target'])))
print(x_train_os)
train_features_scaled

       Type  Air temperature [K]  Process temperature [K]  \
0         2           299.000000               310.200000   
1         1           298.700000               309.900000   
2         0           298.600000               309.400000   
3         1           297.500000               308.500000   
4         2           298.800000               308.500000   
...     ...                  ...                      ...   
15453     1           302.837939               311.237939   
15454     1           302.162455               311.037545   
15455     1           300.415707               311.705758   
15456     2           302.688236               311.411764   
15457     1           300.486991               311.747806   

       Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  
0                        1628    35.800000               64  
1                        1593    42.500000              146  
2                        1439    47.100000              149  
3                  

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.402174,0.550000,0.267753,0.439560,0.252964
1,0.5,0.369565,0.512500,0.247381,0.531593,0.577075
2,0.0,0.358696,0.450000,0.157742,0.594780,0.588933
3,0.5,0.239130,0.337500,0.100116,0.736264,0.565217
4,1.0,0.380435,0.337500,0.199069,0.479396,0.260870
...,...,...,...,...,...,...
15453,0.5,0.819341,0.679742,0.096624,0.694116,0.375494
15454,0.5,0.745919,0.654693,0.058207,0.852750,0.731225
15455,0.5,0.556055,0.738220,0.059953,0.778897,0.786561
15456,1.0,0.803069,0.701470,0.115832,0.691846,0.462451


In [127]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
#XGBClassifier which is based on ensemble learning is chosen for this application
xgb_model = XGBClassifier()

params = {
    'n_estimators': [50, 75, 100, 150, 200],
    'max_depth': [2, 5, 8, 10, 12],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

#for hyperparameter tuning, RandomizedSearchCV is chosen. For filure prediction, recal is the most important scoring method
#Because of this it is chosen as the value to be optimized
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=params,
    n_iter=10,
    scoring='recall',
    cv=5
)

random_search.fit(train_features_scaled,y_train_os)

In [128]:
best_model = random_search.best_estimator_
random_search.best_params_

{'subsample': 0.9,
 'n_estimators': 100,
 'max_depth': 8,
 'learning_rate': 0.1,
 'colsample_bytree': 0.8}

In [129]:
y_pred = best_model.predict(test_features_scaled)

from sklearn.metrics import classification_report
print(classification_report(test_data['Target'],y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1932
           1       0.46      0.76      0.57        68

    accuracy                           0.96      2000
   macro avg       0.73      0.87      0.78      2000
weighted avg       0.97      0.96      0.97      2000



In [132]:
data = [654321,297.9,309.8,1336,91.6,31]
print(feature_scaler.transform([data]))
features = feature_scaler.transform([data])
response = best_model.predict(features)
print(response)


[[3.27160500e+05 2.82608696e-01 5.00000000e-01 9.77881257e-02
  1.20604396e+00 1.22529644e-01]]
[1]




In [131]:
import pickle
with (open('model.pkl','wb')) as file:
    pickle.dump(best_model,file)