In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
df = pd.read_csv('./predictive_maintenance.csv',index_col='UDI')
df.head()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 2


* Product ID is unique for every member of the dataframe, so they contain no usefull data and will be deleted
* The next column (Type) shows the quality of the tool used, which is usefull in predicting the life of the tool

Remaining columns until Target are all usefull data about the working conditions of the tool

* Target column indicates wether the tool has failed (Target==1) or not (Target==0)
* Failure Type column shows the same data as the target column, but in the cases where there was a failure it also mentiones the type of the failure the tool has experienced

So based on the prediction we are going to make (wether we only want to predict the failure or we want to predict the type sa well), we have to use only one of the last two columns in order to stop any data leakage from happening.
The intention of this nptebook is predicting failure, not failure type. Because of this, the failure type column will be deleted later.

In [None]:
df.drop(columns = ['Product ID'],inplace=True)

In [3]:
from sklearn.preprocessing import LabelEncoder
type_enc = LabelEncoder()
df['Type'] = type_enc.fit_transform(df['Type'])
#encoding 'Type' column to int values
df.isna().sum()

NameError: name 'df' is not defined

We have to make sure there are no NA values inside the dataframe as well, since if there are any we have to resolve them.

In [None]:
df[(df['Target']==1)&(df['Failure Type']=='No Failure')]

In some cases, the data available in the Target column indicates that the tool has failed, but the Failure Type column indicated that the tool has not had any failure. 

In [6]:
df.drop(df[(df['Target']==1)&(df['Failure Type']=='No Failure')].index,inplace=True)

In [7]:
# The first part of the project is to predict whether the tool fails or not, so we drop the failure type
#we will address it later
df.drop(columns = ['Failure Type'],inplace=True)
df.groupby('Target').size()

Target
0    9661
1     330
dtype: int64

It can be seen that the dataset is severly imbalanced. So first we divide the dataframe into training and testing sections and then apply oversampling to the training section of the dataset in order to balance the information used for training the model. This is done after dividing the datasets in order to stop any data leakage from happening and also for the testing dataset of the model to be representative of the real world.

In [8]:
from sklearn.model_selection import train_test_split
#making test and train datasets from the base dataset
train_data,test_data = train_test_split(df,stratify=df['Target'],test_size=0.2)
#stratifying based on the target column so that the data imbalance of the original dataset is also present in the training and testing datasets

In [9]:
from imblearn.over_sampling import SMOTE

#Oversampling function written with SMOTE to improve group balance in the data
def sm_oversamp(inp_x,inp_y):
    ov_samp = SMOTE(k_neighbors=4,sampling_strategy='minority')
    return ov_samp.fit_resample(inp_x,inp_y)

x_train = train_data.drop(columns=['Target'])
y_train = train_data['Target']

#variables with _os are oversampled and will be used for training the model
x_train_os,y_train_os = sm_oversamp(x_train,y_train)

In [10]:
#Scaling the feature columns to imptove model efficiency
from sklearn.preprocessing import MinMaxScaler
feature_scaler = MinMaxScaler()
#feature scaler is first fit on the features of the training dataset
feature_scaler.fit(x_train_os)

In [11]:
#feature scaler is then applied to the features of both the training and testing datasets
train_features_scaled = pd.DataFrame(feature_scaler.transform(x_train_os))
test_features_scaled = pd.DataFrame(feature_scaler.transform(test_data.drop(columns=['Target'])))
train_features_scaled

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.913043,0.888889,0.193548,0.510989,0.130435
1,1.0,0.489130,0.679012,0.307918,0.348901,0.584980
2,0.0,0.347826,0.456790,0.229912,0.442308,0.383399
3,1.0,0.445652,0.530864,0.194135,0.541209,0.185771
4,0.5,0.347826,0.469136,0.227566,0.510989,0.604743
...,...,...,...,...,...,...
15451,0.5,0.678461,0.656043,0.073900,0.623084,0.885375
15452,1.0,0.310104,0.384539,0.044575,0.902891,0.426877
15453,0.5,0.358255,0.480481,0.101466,0.681374,0.833992
15454,0.5,0.778438,0.596420,0.085044,0.666523,0.569170


In [12]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
#XGBClassifier which is based on ensemble learning is chosen for this application
xgb_model = XGBClassifier()

params = {
    'n_estimators': [50, 75, 100, 150, 200],
    'max_depth': [2, 5, 8, 10, 12],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

#for hyperparameter tuning, RandomizedSearchCV is chosen. For filure prediction, recal is the most important scoring method
#Because of this it is chosen as the value to be optimized
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=params,
    n_iter=10,
    scoring='recall',
    cv=5
)

random_search.fit(train_features_scaled,y_train_os)

In [13]:
best_model = random_search.best_estimator_
random_search.best_params_

{'subsample': 1.0,
 'n_estimators': 75,
 'max_depth': 12,
 'learning_rate': 0.1,
 'colsample_bytree': 0.9}

In [14]:
y_pred = best_model.predict(test_features_scaled)

from sklearn.metrics import classification_report
print(classification_report(test_data['Target'],y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1933
           1       0.58      0.85      0.69        66

    accuracy                           0.97      1999
   macro avg       0.79      0.91      0.84      1999
weighted avg       0.98      0.97      0.98      1999

