In [56]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [57]:
df = pd.read_csv('./engine_data.csv')
df.head()

Unnamed: 0,Engine rpm,Lub oil pressure,Fuel pressure,Coolant pressure,lub oil temp,Coolant temp,Engine Condition
0,700,2.493592,11.790927,3.178981,84.144163,81.632187,1
1,876,2.941606,16.193866,2.464504,77.640934,82.445724,0
2,520,2.961746,6.553147,1.064347,77.752266,79.645777,1
3,473,3.707835,19.510172,3.727455,74.129907,71.774629,1
4,619,5.672919,15.738871,2.052251,78.396989,87.000225,0


* Product ID is unique for every member of the dataframe, so they contain no usefull data and will be deleted
* The next column (Type) shows the quality of the tool used, which is usefull in predicting the life of the tool

Remaining columns until Target are all usefull data about the working conditions of the tool

* Target column indicates wether the tool has failed (Target==1) or not (Target==0)
* Failure Type column shows the same data as the target column, but in the cases where there was a failure it also mentiones the type of the failure the tool has experienced

So based on the prediction we are going to make (wether we only want to predict the failure or we want to predict the type sa well), we have to use only one of the last two columns in order to stop any data leakage from happening.
The intention of this nptebook is predicting failure, not failure type. Because of this, the failure type column will be deleted later.

We have to make sure there are no NA values inside the dataframe as well, since if there are any we have to resolve them.

In some cases, the data available in the Target column indicates that the tool has failed, but the Failure Type column indicated that the tool has not had any failure. 

It can be seen that the dataset is severly imbalanced. So first we divide the dataframe into training and testing sections and then apply oversampling to the training section of the dataset in order to balance the information used for training the model. This is done after dividing the datasets in order to stop any data leakage from happening and also for the testing dataset of the model to be representative of the real world.

In [58]:
from sklearn.model_selection import train_test_split
#making test and train datasets from the base dataset
train_data,test_data = train_test_split(df,stratify=df['Engine Condition'],test_size=0.2)
#stratifying based on the target column so that the data imbalance of the original dataset is also present in the training and testing datasets

In [59]:
from imblearn.over_sampling import SMOTE

#Oversampling function written with SMOTE to improve group balance in the data
def sm_oversamp(inp_x,inp_y):
    ov_samp = SMOTE(k_neighbors=4,sampling_strategy='minority')
    return ov_samp.fit_resample(inp_x,inp_y)

x_train = train_data.drop(columns=['Engine Condition'])
y_train = train_data['Engine Condition']

#variables with _os are oversampled and will be used for training the model
x_train_os,y_train_os = sm_oversamp(x_train,y_train)

In [60]:
#Scaling the feature columns to imptove model efficiency
from sklearn.preprocessing import MinMaxScaler
feature_scaler = MinMaxScaler()
#feature scaler is first fit on the features of the training dataset
feature_scaler.fit(x_train_os)

In [61]:
#feature scaler is then applied to the features of both the training and testing datasets
train_features_scaled = pd.DataFrame(feature_scaler.transform(x_train_os))
test_features_scaled = pd.DataFrame(feature_scaler.transform(test_data.drop(columns=['Engine Condition'])))
train_features_scaled

Unnamed: 0,0,1,2,3,4,5
0,0.282369,0.242287,0.382989,0.343306,0.353045,0.198605
1,0.475666,0.253317,0.437041,0.144579,0.160878,0.179935
2,0.426079,0.540938,0.247937,0.431505,0.286702,0.143792
3,0.501377,0.344000,0.176570,0.145761,0.841608,0.145583
4,0.187787,0.333320,0.295986,0.379724,0.252844,0.165459
...,...,...,...,...,...,...
19703,0.570707,0.415113,0.383832,0.311093,0.645464,0.178684
19704,0.417355,0.625902,0.253190,0.233757,0.290511,0.129190
19705,0.579890,0.455444,0.202361,0.297460,0.529309,0.195200
19706,0.280533,0.423098,0.168330,0.371520,0.356181,0.157558


In [62]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
#XGBClassifier which is based on ensemble learning is chosen for this application
xgb_model = XGBClassifier()

params = {
    'n_estimators': [50, 75, 100, 150, 200],
    'max_depth': [2, 5, 8, 10, 12],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

#for hyperparameter tuning, RandomizedSearchCV is chosen. For filure prediction, recal is the most important scoring method
#Because of this it is chosen as the value to be optimized
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=params,
    n_iter=10,
    scoring='recall',
    cv=5
)

random_search.fit(train_features_scaled,y_train_os)

In [63]:
best_model = random_search.best_estimator_
random_search.best_params_

{'subsample': 0.9,
 'n_estimators': 200,
 'max_depth': 10,
 'learning_rate': 0.1,
 'colsample_bytree': 0.9}

In [64]:
y_pred = best_model.predict(test_features_scaled)

from sklearn.metrics import classification_report
print(classification_report(test_data['Engine Condition'],y_pred))

              precision    recall  f1-score   support

           0       0.49      0.58      0.53      1444
           1       0.73      0.65      0.68      2463

    accuracy                           0.62      3907
   macro avg       0.61      0.62      0.61      3907
weighted avg       0.64      0.62      0.63      3907



In [67]:
data = [700,2.493591821,11.79092738,3.178980794,84.14416293,81.6321865]
print("Scaler Data: ", feature_scaler.transform([data]))
result = best_model.predict(feature_scaler.transform([data]))
print(result)

Scaler Data:  [[0.29338843 0.34290078 0.55773187 0.42790588 0.70655695 0.14910854]]
[1]




In [66]:
import pickle
with open("fake_model.pkl","wb") as files:
    pickle.dump(best_model,files)
