In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

In [2]:
# trying to understand how our dataset changes once we change it to a multi-label dataset by finding out the different
# types of failure

In [3]:
# Reading the csv and store it in a dataframe
data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [4]:
# Machine failure counts
data.groupby('Machine failure').count()['UDI']

Machine failure
0    9661
1     339
Name: UDI, dtype: int64

In [5]:
# Now we need to find out what type of failures are there, because there is also a possibility for a 
# combination of two failures or more 

# So to get the different combinations we will change all the 1s to the name of the failure and 0s to an empty value ""

In [6]:
data['TWF'] = data['TWF'].replace(1, "TWF") # This means that if column TWF is equal to 1 then the new column will be equal to 1
data['TWF'] = data['TWF'].replace(0, "") # This means that if column TWF is equal to 0 then the new column value will depend on the other failure types
data['HDF'] = data['HDF'].replace(1, "HDF")
data['HDF'] = data['HDF'].replace(0, "")
data["PWF"] = data['PWF'].replace(1,"PWF")
data["PWF"] = data['PWF'].replace(0,"")
data["OSF"] = data['OSF'].replace(1,"OSF")
data["OSF"] = data['OSF'].replace(0,"")
data["RNF"] = data['RNF'].replace(1,"RNF")
data["RNF"] = data['RNF'].replace(0,"")

data["error_class"] = data["TWF"] + data["HDF"]+data["PWF"]+data["OSF"]+data["RNF"]
data["error_class"].unique()



array(['', 'PWF', 'PWFOSF', 'TWF', 'OSF', 'RNF', 'HDF', 'TWFRNF',
       'HDFPWF', 'HDFOSF', 'TWFOSF', 'TWFPWFOSF'], dtype=object)

In [7]:
# Based on the above results we see that there are 12 labels, and out of that 11 labels are machine failures 
# and 1 label is no machine failures.
# So now lets check the frequency of each label

In [8]:
data.groupby('error_class').count()['UDI']

error_class
             9652
HDF           106
HDFOSF          6
HDFPWF          3
OSF            78
PWF            80
PWFOSF         11
RNF            18
TWF            42
TWFOSF          2
TWFPWFOSF       1
TWFRNF          1
Name: UDI, dtype: int64

In [9]:
# From the above results we see that the No machine failure should be equal to 9661 but we get 9652 which shows
# there must be something more in the data 

# By going through the whole dataset based on the failures we see that when RNF failure occurs the machine does not
# fail

In [10]:
# If the data is not converted to string and if the binary values are used then the below commented code is
# executed

# data[(data['Machine failure'] == 0) & (data['RNF'] == '1')]

data[(data['Machine failure'] == 0) & (data['RNF'] == 'RNF')]

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,error_class
1221,1222,M16081,M,297.0,308.3,1399,46.4,132,0,,,,,RNF,RNF
1302,1303,L48482,L,298.6,309.8,1505,45.7,144,0,,,,,RNF,RNF
1748,1749,H31162,H,298.4,307.7,1626,31.1,166,0,,,,,RNF,RNF
2072,2073,L49252,L,299.6,309.5,1570,35.5,189,0,,,,,RNF,RNF
2559,2560,L49739,L,299.3,309.0,1447,50.4,140,0,,,,,RNF,RNF
3065,3066,M17925,M,300.1,309.2,1687,27.7,95,0,,,,,RNF,RNF
3452,3453,H32866,H,301.6,310.5,1602,32.3,2,0,,,,,RNF,RNF
5471,5472,L52651,L,302.7,312.3,1346,61.2,170,0,,,,,RNF,RNF
5489,5490,L52669,L,302.6,312.1,1499,35.0,215,0,,,,,RNF,RNF
5495,5496,H34909,H,302.9,312.5,1357,55.0,12,0,,,,,RNF,RNF


In [11]:
# Based on the above results we see that RNF failure does not cause any machine failure but may require fixing or
# repairs which is beyond the scope of the project but RNF is a failure but not machine failure

# But it still does not solve the issue of No machine errors being 9652 as shown above. So this could mean that there
# could occur a machine failure without any of the failures occuring. In short, if all the known failures are 0, 
# the machine can still fail. Let's find out if this is true

In [12]:
# If the data is not converted to string and if the binary values are used then the below commented code is
# executed

# data[(data['Machine failure'] == 1) & (data['PWF'] == 0) & (data['TWF'] == 0) & (data['RNF'] == 0) & 
#            (data['OSF'] == 0) & (data['HDF'] == 0)]

data[(data['Machine failure'] == 1) & (data['PWF'] == "") & (data['TWF'] == "") & (data['RNF'] == "") &
           (data['OSF'] == "") & (data['HDF'] == "")]

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,error_class
1437,1438,H30851,H,298.8,309.9,1439,45.2,40,1,,,,,,
2749,2750,M17609,M,299.7,309.2,1685,28.9,179,1,,,,,,
4044,4045,M18904,M,301.9,310.9,1419,47.7,20,1,,,,,,
4684,4685,M19544,M,303.6,311.8,1421,44.8,101,1,,,,,,
5536,5537,M20396,M,302.3,311.8,1363,54.0,119,1,,,,,,
5941,5942,L53121,L,300.6,310.7,1438,48.5,78,1,,,,,,
6478,6479,L53658,L,300.5,309.8,1663,29.1,145,1,,,,,,
8506,8507,L55686,L,298.4,309.6,1710,27.3,163,1,,,,,,
9015,9016,L56195,L,297.2,308.1,1431,49.7,210,1,,,,,,


In [13]:
# So based on the above results we were right, there are instances where the machine failure occurs but none 
# of the other failures occur or are set to 1, so we need to distinguish it to another type. Hence bringing
# the total types to 13 labels which are 

# 0 = No Error
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UD, Machine failure equal to 1 without any known failure occuring, so we categorize it as UnDefined


In [14]:
# so now lets try to classify the multi-label dataset

# We will be doing it differently since we ran into a few errors when using the above dataset since there were 
# NaN values that were difficult to remove and hence we had to make a few changes to the code as well

In [15]:
# Unbalanced dataset of multi-labels

In [16]:
# Gaussian Naive Bayes using Log Probabilities

# using  additive smoothing so that probability will not be 0
def Additive_smoothing(numerator, denominator, alpha, features):
    return (numerator + alpha) / (denominator + (alpha * features))

# applying log to the probabilities for easy computations
def Log_probability(smoothing_probability):
    return np.log(smoothing_probability)


# The predictions is done by getting the probability for each class-feature using additive smoothing
# and log transformation
# query consits of the target values or y values
def predict(query, number_of_features):
   
    # stores log probabilities of each class
    log_class = {}
    n_features = query.shape[0]
    
    # for each feature class we will calculate the log probabilities and sum them up to get 
    for target in class_list:
        
        logProbability = 0
        
        for col, feature in enumerate(query):
            
            # storing the name of current column
            column = column_seq[col]
            
                
            # For calculating log probability we will apply additive smoothing first P(A/B)
            # P(A/B) = P(B/A) P(A) / P(B)
            # P(A/B) = posterior (Log probability)
            # P(B/A) = number_of_features[target][feature] likelihood
            # P(B) = class_count[target], alpha, n_features, probbaility of B being true
            logProbability += Log_probability(Additive_smoothing(number_of_features[target][feature],\
                                                                        class_count[target], alpha, n_features))
        # We are calculating the prior here P(A)
        #  adding log prior probability 
        # P(A) = class_count[target],\ n_rows, alpha, n_features
        logProbability += Log_probability(Additive_smoothing(class_count[target],\
                                                                    n_rows, alpha, n_features))
        
        log_class[target] = logProbability
    return log_class

In [17]:
# Reading the csv and store it in a dataframe
import pandas as pd
test_data = pd.read_csv('ai4i2020.csv')
test_data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [18]:
# Here we set the machine failure to different unique values in order to distinguish them, by doing encoding instead
# it would mix up the labels causing a confusion on what is what. 

# If we try to match the frequency after encoding we then fall into a problem since the frequency 1 is repeated twice
# and so we won't be able to identify which is which.

# Hence we use .where and set a unique value to each, then match and convert into an order from 0-12

test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['PWF'] == 1)  & (test_data['OSF'] == 1), 1, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['OSF'] == 1), 4, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['OSF'] == 1), 8, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['PWF'] == 1), 12, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['RNF'] == 1), 16, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1) & (test_data['OSF'] == 1), 20, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1), 24, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1), 30, 0)
test_data['Machine failure'] += np.where((test_data['OSF'] == 1), 36, 0)
test_data['Machine failure'] += np.where((test_data['RNF'] == 1), 42, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1), 48, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 0) & (test_data['TWF'] == 0) & (test_data['OSF'] == 0) &
               (test_data['HDF'] == 0) & (test_data['Machine failure'] == 1), 52, 0)



In [19]:
# Here the unique labels are converted to a proper order 

test_data.loc[test_data['Machine failure'] == 116, "Machine failure"] = 1 # TWFPWFOSF
test_data.loc[test_data['Machine failure'] == 89, "Machine failure"] = 2 # TWFRNF
test_data.loc[test_data['Machine failure'] == 71, "Machine failure"] = 3 # TWFOSF
test_data.loc[test_data['Machine failure'] == 81, "Machine failure"] = 4 # PWFOSF
test_data.loc[test_data['Machine failure'] == 85, "Machine failure"] = 5 # HDFPWF
test_data.loc[test_data['Machine failure'] == 93, "Machine failure"] = 6 # HDFOSF
test_data.loc[test_data['Machine failure'] == 31, "Machine failure"] = 7 # TWF
test_data.loc[test_data['Machine failure'] == 42, "Machine failure"] = 8 # RNF
test_data.loc[test_data['Machine failure'] == 25, "Machine failure"] = 9 # PWF
test_data.loc[test_data['Machine failure'] == 37, "Machine failure"] = 10 # OSF
test_data.loc[test_data['Machine failure'] == 49, "Machine failure"] = 11 # HDF
test_data.loc[test_data['Machine failure'] == 53, "Machine failure"] = 12 # UF


# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure



In [20]:
# Cluster sampling the dataset to 8000 where 6000 is used for training and 2000 is used for testing
test_data = test_data.sample(frac=0.8, replace=True)

In [21]:
# We check to see whether any of the label is missing, since we did cluster sampling, there is a possibility that
# the label with a low frequency would have been in the set of the remove data accounting to 0.2

test_data.groupby('Machine failure').count()

Unnamed: 0_level_0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
Machine failure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7694,7694,7694,7694,7694,7694,7694,7694,7694,7694,7694,7694,7694
2,2,2,2,2,2,2,2,2,2,2,2,2,2
3,3,3,3,3,3,3,3,3,3,3,3,3,3
4,9,9,9,9,9,9,9,9,9,9,9,9,9
5,2,2,2,2,2,2,2,2,2,2,2,2,2
6,4,4,4,4,4,4,4,4,4,4,4,4,4
7,30,30,30,30,30,30,30,30,30,30,30,30,30
8,18,18,18,18,18,18,18,18,18,18,18,18,18
9,68,68,68,68,68,68,68,68,68,68,68,68,68
10,69,69,69,69,69,69,69,69,69,69,69,69,69


In [22]:
# We remove the failure columns since we have merged them with machine_failure
test_data = test_data.drop(labels = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis = 1)

In [23]:
# Setting the column 'Type' with datatype object to int64 by encoding
le =LabelEncoder()

test_data['Type'] = le.fit_transform(test_data['Type'])
test_data['Air temperature [K]'] = le.fit_transform(test_data['Air temperature [K]'])
test_data['Process temperature [K]'] = le.fit_transform(test_data['Process temperature [K]'])
test_data['Rotational speed [rpm]'] = le.fit_transform(test_data['Rotational speed [rpm]'])
test_data['Torque [Nm]'] = le.fit_transform(test_data['Torque [Nm]'])
test_data['Tool wear [min]'] = le.fit_transform(test_data['Tool wear [min]'])




In [24]:

# Defining X and y
X = test_data.drop(labels = ['UDI', 'Product ID', 'Machine failure'], axis =1 )
y = test_data['Machine failure']

# X=test_data.iloc[:, 2:8]
# y = test_data.iloc[:,8]


# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101)


# defining terms to be used for calculation
n_rows = X_train.shape[0] # Instances in X_train
class_list = y_train.unique() # Number of labels
column_seq = X_train.columns # Column names
df = X_train.copy() # storing X_train (features) in a dataframe for calculating
column_list = df.columns # List of column names in df

# add target column
df["target"] = y_train

In [25]:
# Computing the probabilities

feature_count = defaultdict(lambda: defaultdict(int)) # stores class- feature pair count
class_count = defaultdict(int) # store frequency of class globally
class_probability = defaultdict(int) # prior probability of class (class count / number of rows)
alpha = 1

if not df.empty:
    
    # for each row we will count the number of features
    for row in df.values:
        # storing the target class values
        target_row = row[-1] 
        # going through each feature and add 1 to sum up to the number of features
        for column in (row[:-1]):
            feature_count[target_row][column] += 1 

for target in y_train:
    # storing the number of classes
    target_class = len(class_count) 
    class_count[target] += 1

# calculation class_count into probabilities
for feature_class, count in class_count.items():
    # converting counts to probabilities
    class_probability[feature_class] = count / n_rows 




In [26]:

# hold predictions by our model
y_pred = []

data = X_test.copy()
data['target'] = y_test

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)



In [27]:
# Printing out the test labels vs the predicted labels
print("10 Test labels: ", y_test[:10].values)
print("Predicted labels:", y_pred[:10])

10 Test labels:  [0 0 0 0 0 0 0 0 0 0]
Predicted labels: [ 5 12  4  2  8  2  5  3  3  2]


In [28]:
# Based on the predicted labels we can match the label number to the below machine failure except 
# for No machine failure and RNF since the machine does not fail in these two cases

# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure

In [29]:
# compute accuracy
test_accuracy = accuracy_score(y_test, y_pred)

In [30]:
# Printing accuracy
# We notice that the accuracy is less and this is due to how the Naive bayes is created, since we are doing a 
# Probabilistic Naive Bayes we are getting a very low accuracy
print("Test Accuracy: ",test_accuracy)

Test Accuracy:  0.0155


In [31]:
# printing the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 14, 576, 585, 145, 194, 238,   5,  73,   0,   3,   0,  87],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   1,   1,   1,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   1,   2,   1,   0,   1,   2,   0,   0,   0,   0,   1],
       [  0,   0,   0,   2,   0,   0,   0,   5,   0,   0,   0,   0],
       [  0,   3,   3,   2,   3,   1,   0,   2,   4,   0,   0,   1],
       [  0,   6,   0,   1,   0,   3,   0,   2,   0,   2,   0,   0],
       [  0,   9,   1,   1,   4,   3,   0,   2,   0,   0,   2,   4],
       [  0,   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0]])

In [32]:
# printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.01      0.01      1920
           2       0.00      0.00      0.00         0
           3       0.00      1.00      0.00         1
           4       0.01      0.33      0.01         3
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.29      0.25      0.27         8
           8       0.06      0.71      0.11         7
           9       1.00      0.21      0.35        19
          10       0.40      0.14      0.21        14
          11       1.00      0.08      0.14        26
          12       0.00      0.00      0.00         1

    accuracy                           0.02      2000
   macro avg       0.31      0.23      0.09      2000
weighted avg       0.99      0.02      0.02      2000



  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
# hold predictions by our model
y_pred = []

data = X_train.copy()
data['target'] = y_train

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)



In [34]:
print("Train Accuracy: ",accuracy_score(y_train, y_pred))

Train Accuracy:  0.023


In [35]:
# Unbalanced dataset binning

In [36]:
# The file is read again so that any changes made will not affect the output this experimentation for this section
# Also taking into consideration of the memory space

In [37]:
# Reading the csv and store it in a dataframe
import pandas as pd
test_data = pd.read_csv('ai4i2020.csv')
test_data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [38]:
# Here we set the machine failure to different unique values in order to distinguish them, by doing encoding instead
# it would mix up the labels causing a confusion on what is what. 

# If we try to match the frequency after encoding we then fall into a problem since the frequency 1 is repeated twice
# and so we won't be able to identify which is which.

# Hence we use .where and set a unique value to each, then match and convert into an order from 0-12

test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['PWF'] == 1)  & (test_data['OSF'] == 1), 1, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['OSF'] == 1), 4, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['OSF'] == 1), 8, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['PWF'] == 1), 12, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['RNF'] == 1), 16, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1) & (test_data['OSF'] == 1), 20, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1), 24, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1), 30, 0)
test_data['Machine failure'] += np.where((test_data['OSF'] == 1), 36, 0)
test_data['Machine failure'] += np.where((test_data['RNF'] == 1), 42, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1), 48, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 0) & (test_data['TWF'] == 0) & (test_data['OSF'] == 0) &
               (test_data['HDF'] == 0) & (test_data['Machine failure'] == 1), 52, 0)



In [39]:
# Here the unique labels are converted to a proper order 

test_data.loc[test_data['Machine failure'] == 116, "Machine failure"] = 1 # TWFPWFOSF
test_data.loc[test_data['Machine failure'] == 89, "Machine failure"] = 2 # TWFRNF
test_data.loc[test_data['Machine failure'] == 71, "Machine failure"] = 3 # TWFOSF
test_data.loc[test_data['Machine failure'] == 81, "Machine failure"] = 4 # PWFOSF
test_data.loc[test_data['Machine failure'] == 85, "Machine failure"] = 5 # HDFPWF
test_data.loc[test_data['Machine failure'] == 93, "Machine failure"] = 6 # HDFOSF
test_data.loc[test_data['Machine failure'] == 31, "Machine failure"] = 7 # TWF
test_data.loc[test_data['Machine failure'] == 42, "Machine failure"] = 8 # RNF
test_data.loc[test_data['Machine failure'] == 25, "Machine failure"] = 9 # PWF
test_data.loc[test_data['Machine failure'] == 37, "Machine failure"] = 10 # OSF
test_data.loc[test_data['Machine failure'] == 49, "Machine failure"] = 11 # HDF
test_data.loc[test_data['Machine failure'] == 53, "Machine failure"] = 12 # UF


# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure



In [40]:
# Cluster sampling the dataset to 8000 where 6000 is used for training and 2000 is used for testing
test_data = test_data.sample(frac=0.8, replace=True)

In [41]:
# We remove the failure columns since we have merged them with machine_failure
test_data = test_data.drop(labels = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis = 1)

In [42]:
test_data.groupby('Machine failure').count()['UDI']

Machine failure
0     7700
1        2
2        1
3        2
4       10
5        2
6        1
7       41
8        7
9       62
10      70
11      97
12       5
Name: UDI, dtype: int64

In [43]:
# Setting the column'Type' with datatype object to int64 by encoding
le =LabelEncoder()
test_data['Type'] = le.fit_transform(test_data['Type'])

In [44]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high
min_value = test_data['Air temperature [K]'].min()
max_value = test_data['Air temperature [K]'].max()
print(min_value)
print(max_value)

# Store in Bins 
bins = np.linspace(min_value, max_value, 4)
bins

# Create labels
labels = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Air temperature [K]'] = pd.cut(test_data['Air temperature [K]'], bins=bins, labels=labels, include_lowest=True)

295.3
304.4


In [45]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_2 = test_data['Process temperature [K]'].min()
max_value_2 = test_data['Process temperature [K]'].max()
print(min_value_2)
print(max_value_2)

# Store in Bins 
bins_2 = np.linspace(min_value_2, max_value_2, 4)
bins_2

# Create labels
labels_2 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Process temperature [K]'] = pd.cut(test_data['Process temperature [K]'], bins=bins_2, labels=labels_2, include_lowest=True)

305.8
313.8


In [46]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_3 = test_data['Rotational speed [rpm]'].min()
max_value_3 = test_data['Rotational speed [rpm]'].max()
print(min_value_3)
print(max_value_3)

# Store in Bins 
bins_3 = np.linspace(min_value_3, max_value_3, 4)
bins_3

# Create labels
labels_3 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Rotational speed [rpm]'] = pd.cut(test_data['Rotational speed [rpm]'], bins=bins_3, labels=labels_3, include_lowest=True)





1168
2886


In [47]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_4 = test_data['Torque [Nm]'].min()
max_value_4 = test_data['Torque [Nm]'].max()
print(min_value_4)
print(max_value_4)

# Store in Bins 
bins_4 = np.linspace(min_value_4, max_value_4, 4)
bins_4

# Create labels
labels_4 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Torque [Nm]'] = pd.cut(test_data['Torque [Nm]'], bins=bins_4, labels=labels_4, include_lowest=True)




3.8
76.2


In [48]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_5 = test_data['Tool wear [min]'].min()
max_value_5 = test_data['Tool wear [min]'].max()
print(min_value_5)
print(max_value_5)

# Store in Bins 
bins_5 = np.linspace(min_value_5, max_value_5, 4)
bins_5

# Create labels
labels_5 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Tool wear [min]'] = pd.cut(test_data['Tool wear [min]'], bins=bins_5, labels=labels_5, include_lowest=True)





0
253


In [49]:
# Encoding the features for getting a better result

le =LabelEncoder()

test_data['Air temperature [K]'] = le.fit_transform(test_data['Air temperature [K]'])
test_data['Process temperature [K]'] = le.fit_transform(test_data['Process temperature [K]'])
test_data['Rotational speed [rpm]'] = le.fit_transform(test_data['Rotational speed [rpm]'])
test_data['Torque [Nm]'] = le.fit_transform(test_data['Torque [Nm]'])
test_data['Tool wear [min]'] = le.fit_transform(test_data['Tool wear [min]'])



In [50]:
# Defining X and y
X=test_data.iloc[:, 2:8]
y = test_data.iloc[:,8]

# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101)



# defining terms to be used for calculation
n_rows = X_train.shape[0] # Instances in X_train
class_list = y_train.unique() # Number of labels
column_seq = X_train.columns # Column names
df = X_train.copy() # storing X_train (features) in a dataframe for calculating
column_list = df.columns # List of column names in df

# add target column
df["target"] = y_train

In [51]:
# Computing the probabilities

feature_count = defaultdict(lambda: defaultdict(int)) # stores class- feature pair count
class_count = defaultdict(int) # store frequency of class globally
class_probability = defaultdict(int) # prior probability of class (class count / number of rows)
alpha = 1

if not df.empty:
    
    # for each row we will count the number of features
    for row in df.values:
        # storing the target class values
        target_row = row[-1] 
        # going through each feature and add 1 to sum up to the number of features
        for column in (row[:-1]):
            feature_count[target_row][column] += 1 

for target in y_train:
    # storing the number of classes
    target_class = len(class_count) 
    class_count[target] += 1

# calculation class_count into probabilities
for feature_class, count in class_count.items():
    # converting counts to probabilities
    class_probability[feature_class] = count / n_rows 




In [52]:
# This will take some time since we have 120,000 instances

# hold predictions by our model
y_pred = []

data = X_test.copy()
data['target'] = y_test

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)

In [53]:
# Printing out the test labels vs the predicted labels
print("10 Test labels: ", y_test[:10].values)
print("Predicted labels:", y_pred[:10])

10 Test labels:  [0 0 0 0 0 0 0 0 0 0]
Predicted labels: [0 0 0 0 0 0 0 0 0 0]


In [54]:
# Based on the predicted labels we can match the label number to the below machine failure except 
# for No machine failure and RNF since the machine does not fail in these two cases

# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure


In [55]:
# compute accuracy
test_accuracy = accuracy_score(y_test, y_pred)

# Printing accuracy
print("Test Accuracy: ",test_accuracy)

Test Accuracy:  0.9655


In [56]:
# printing the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1931,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   2,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   1,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   4,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   3,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   2,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [  13,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [  16,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [  26,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   2,    0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [57]:
# printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1931
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00        13
          10       0.00      0.00      0.00        16
          11       0.00      0.00      0.00        26
          12       0.00      0.00      0.00         2

    accuracy                           0.97      2000
   macro avg       0.10      0.10      0.10      2000
weighted avg       0.93      0.97      0.95      2000



  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
# hold predictions by our model
y_pred = []

data = X_train.copy()
data['target'] = y_train

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)



In [59]:
print("Train Accuracy: ",accuracy_score(y_train, y_pred))

Train Accuracy:  0.9615


In [60]:
# Balancing the dataset by undersampling
# 1000 instances for each label

In [61]:
# The file is read again so that any changes made will not affect the output this experimentation for this section
# Also taking into consideration of the memory space

In [62]:
import pandas as pd
# Reading the csv and store it in a dataframe
data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [63]:
# Here we set the machine failure to different unique values in order to distinguish them, by doing encoding instead
# it would mix up the labels causing a confusion on what is what. 

# If we try to match the frequency after encoding we then fall into a problem since the frequency 1 is repeated twice
# and so we won't be able to identify which is which.

# Hence we use .where and set a unique value to each, then match and convert into an order from 0-12

test_data = data.copy()

                                         
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['PWF'] == 1)  & (test_data['OSF'] == 1), 1, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['OSF'] == 1), 4, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['OSF'] == 1), 8, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['PWF'] == 1), 12, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['RNF'] == 1), 16, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1) & (test_data['OSF'] == 1), 20, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1), 24, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1), 30, 0)
test_data['Machine failure'] += np.where((test_data['OSF'] == 1), 36, 0)
test_data['Machine failure'] += np.where((test_data['RNF'] == 1), 42, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1), 48, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 0) & (test_data['TWF'] == 0) & (test_data['OSF'] == 0) &
               (test_data['HDF'] == 0) & (test_data['Machine failure'] == 1), 52, 0)


# Here the unique labels are converted to a proper order 

test_data.loc[test_data['Machine failure'] == 116, "Machine failure"] = 1 # TWFPWFOSF
test_data.loc[test_data['Machine failure'] == 89, "Machine failure"] = 2 # TWFRNF
test_data.loc[test_data['Machine failure'] == 71, "Machine failure"] = 3 # TWFOSF
test_data.loc[test_data['Machine failure'] == 81, "Machine failure"] = 4 # PWFOSF
test_data.loc[test_data['Machine failure'] == 85, "Machine failure"] = 5 # HDFPWF
test_data.loc[test_data['Machine failure'] == 93, "Machine failure"] = 6 # HDFOSF
test_data.loc[test_data['Machine failure'] == 31, "Machine failure"] = 7 # TWF
test_data.loc[test_data['Machine failure'] == 42, "Machine failure"] = 8 # RNF
test_data.loc[test_data['Machine failure'] == 25, "Machine failure"] = 9 # PWF
test_data.loc[test_data['Machine failure'] == 37, "Machine failure"] = 10 # OSF
test_data.loc[test_data['Machine failure'] == 49, "Machine failure"] = 11 # HDF
test_data.loc[test_data['Machine failure'] == 53, "Machine failure"] = 12 # UF


# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure

In [64]:
# We remove the failure columns since we have merged them with machine_failure
test_data = test_data.drop(labels = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis = 1)

In [65]:
# Undersampling the dataset to 1000 for each label
sample = 1000
test_data = test_data.groupby('Machine failure').apply(lambda x: x.sample(sample,replace=True))

In [66]:
# Since we had use group by machine failure, the index is now machine failure so in X_train we will be able to
# see the target label (Machine failure) so for that we need to change the index from group by machine failure
# to the normal count index
test_data = test_data.droplevel(0).reset_index()
test_data = test_data.drop(labels = 'index', axis =1)

In [67]:
len(test_data)

13000

In [68]:
# Cluster sampling the dataset to 8060 where 6045 is used for training and 2015 is used for testing
test_data = test_data.sample(frac=0.62, replace=True) # 8060 instances

In [69]:
# Setting the column'Type' with datatype object to int64 by encoding

le =LabelEncoder()
test_data['Type'] = le.fit_transform(test_data['Type'])


In [70]:
# Defining X and y
X=test_data.iloc[:, 2:8]
y = test_data.iloc[:,8]


# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = True, stratify = y)


# defining terms to be used for calculation
n_rows = X_train.shape[0] # Instances in X_train
class_list = y_train.unique() # Number of labels
column_seq = X_train.columns # Column names
df = X_train.copy() # storing X_train (features) in a dataframe for calculating
column_list = df.columns # List of column names in df

# add target column
df["target"] = y_train



In [71]:
# Computing the probabilities
feature_count = defaultdict(lambda: defaultdict(int)) # stores class- feature pair count
class_count = defaultdict(int) # store frequency of class globally
class_probability = defaultdict(int) # prior probability of class (class count / number of rows)
alpha = 1

if not df.empty:
    
    # for each row we will count the number of features
    for row in df.values:
        # storing the target class values
        target_row = row[-1] 
        # going through each feature and add 1 to sum up to the number of features
        for column in (row[:-1]):
            feature_count[target_row][column] += 1 

for target in y_train:
    # storing the number of classes
    target_class = len(class_count) 
    class_count[target] += 1

# calculation class_count into probabilities
for feature_class, count in class_count.items():
    # converting counts to probabilities
    class_probability[feature_class] = count / n_rows 



In [72]:
# hold predictions by our model
y_pred = []

data = X_test.copy()
data['target'] = y_test

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)




In [73]:
# Printing out the test labels vs the predicted labels
print("10 Test labels: ", y_test[:10].values)
print("Predicted labels:", y_pred[:10])

10 Test labels:  [ 1 11 10  6  8  4  8  5 10 12]
Predicted labels: [ 1 11 10  6  8  4  8  5 10 12]


In [74]:
# Based on the predicted labels we can match the label number to the below machine failure except 
# for No machine failure and RNF since the machine does not fail in these two cases

# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure


In [75]:
# compute accuracy
test_accuracy = accuracy_score(y_test, y_pred)


# Printing accuracy
# We notice that the accuracy is less and this is due to how the Naive bayes is created, since we are doing a 
# Probabilistic Naive Bayes we are getting a very low accuracy
print("Test Accuracy: ",test_accuracy)


Test Accuracy:  0.9687344913151364


In [76]:
# printing the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 93,   0,   2,   4,   2,   0,   0,   9,   4,   8,   7,  12,   5],
       [  0, 152,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0, 150,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0, 163,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0, 155,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0, 150,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0, 171,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0, 147,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0, 157,   0,   0,   0,   0],
       [  1,   0,   0,   0,   0,   0,   0,   0,   1, 158,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 156,   0,   0],
       [  1,   0,   0,   0,   0,   3,   2,   0,   0,   1,   0, 148,   1],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 152]])

In [77]:
# printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.64      0.77       146
           1       1.00      1.00      1.00       152
           2       0.99      1.00      0.99       150
           3       0.98      1.00      0.99       163
           4       0.99      1.00      0.99       155
           5       0.98      1.00      0.99       150
           6       0.99      1.00      0.99       171
           7       0.94      1.00      0.97       147
           8       0.97      1.00      0.98       157
           9       0.95      0.99      0.97       160
          10       0.96      1.00      0.98       156
          11       0.93      0.95      0.94       156
          12       0.96      1.00      0.98       152

    accuracy                           0.97      2015
   macro avg       0.97      0.97      0.97      2015
weighted avg       0.97      0.97      0.97      2015



In [78]:
# hold predictions by our model
y_pred = []

data = X_train.copy()
data['target'] = y_train

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)



In [79]:
print("Train Accuracy: ",accuracy_score(y_train, y_pred))

Train Accuracy:  0.9831265508684863


In [80]:
# Under-sampled Balanced dataset using binning
# 1000 instances for each label

In [81]:
import pandas as pd
# Reading the csv and store it in a dataframe
data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [82]:
# Here we set the machine failure to different unique values in order to distinguish them, by doing encoding instead
# it would mix up the labels causing a confusion on what is what. 

# If we try to match the frequency after encoding we then fall into a problem since the frequency 1 is repeated twice
# and so we won't be able to identify which is which.

# Hence we use .where and set a unique value to each, then match and convert into an order from 0-12


test_data = data.copy()

test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['PWF'] == 1)  & (test_data['OSF'] == 1), 1, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['OSF'] == 1), 4, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['OSF'] == 1), 8, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['PWF'] == 1), 12, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['RNF'] == 1), 16, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1) & (test_data['OSF'] == 1), 20, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1), 24, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1), 30, 0)
test_data['Machine failure'] += np.where((test_data['OSF'] == 1), 36, 0)
test_data['Machine failure'] += np.where((test_data['RNF'] == 1), 42, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1), 48, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 0) & (test_data['TWF'] == 0) & (test_data['OSF'] == 0) &
               (test_data['HDF'] == 0) & (test_data['Machine failure'] == 1), 52, 0)



# Here the unique labels are converted to a proper order 

test_data.loc[test_data['Machine failure'] == 116, "Machine failure"] = 1 # TWFPWFOSF
test_data.loc[test_data['Machine failure'] == 89, "Machine failure"] = 2 # TWFRNF
test_data.loc[test_data['Machine failure'] == 71, "Machine failure"] = 3 # TWFOSF
test_data.loc[test_data['Machine failure'] == 81, "Machine failure"] = 4 # PWFOSF
test_data.loc[test_data['Machine failure'] == 85, "Machine failure"] = 5 # HDFPWF
test_data.loc[test_data['Machine failure'] == 93, "Machine failure"] = 6 # HDFOSF
test_data.loc[test_data['Machine failure'] == 31, "Machine failure"] = 7 # TWF
test_data.loc[test_data['Machine failure'] == 42, "Machine failure"] = 8 # RNF
test_data.loc[test_data['Machine failure'] == 25, "Machine failure"] = 9 # PWF
test_data.loc[test_data['Machine failure'] == 37, "Machine failure"] = 10 # OSF
test_data.loc[test_data['Machine failure'] == 49, "Machine failure"] = 11 # HDF
test_data.loc[test_data['Machine failure'] == 53, "Machine failure"] = 12 # UF


# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure

In [83]:
# We remove the failure columns since we have merged them with machine_failure
test_data = test_data.drop(labels = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis = 1)

# Undersampling the dataset to 1000 for each label
sample = 1000
test_data = test_data.groupby('Machine failure').apply(lambda x: x.sample(sample,replace=True))


# Since we had use group by machine failure, the index is now machine failure so in X_train we will be able to
# see the target label (Machine failure) so for that we need to change the index from group by machine failure
# to the normal count index
test_data = test_data.droplevel(0).reset_index()
test_data = test_data.drop(labels = 'index', axis =1)


# Setting the column'Type' with datatype object to int64 by encoding
le =LabelEncoder()
test_data['Type'] = le.fit_transform(test_data['Type'])

In [84]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value = test_data['Air temperature [K]'].min()
max_value = test_data['Air temperature [K]'].max()
print(min_value)
print(max_value)

# Store in Bins 
bins = np.linspace(min_value, max_value, 4)
bins

# Create labels
labels = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Air temperature [K]'] = pd.cut(test_data['Air temperature [K]'], bins=bins, labels=labels, include_lowest=True)

295.3
304.4


In [85]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_2 = test_data['Process temperature [K]'].min()
max_value_2 = test_data['Process temperature [K]'].max()
print(min_value_2)
print(max_value_2)

# Store in Bins 
bins_2 = np.linspace(min_value_2, max_value_2, 4)
bins_2

# Create labels
labels_2 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Process temperature [K]'] = pd.cut(test_data['Process temperature [K]'], bins=bins_2, labels=labels_2, include_lowest=True)

305.7
313.7


In [86]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_3 = test_data['Rotational speed [rpm]'].min()
max_value_3 = test_data['Rotational speed [rpm]'].max()
print(min_value_3)
print(max_value_3)

# Store in Bins 
bins_3 = np.linspace(min_value_3, max_value_3, 4)
bins_3

# Create labels
labels_3 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Rotational speed [rpm]'] = pd.cut(test_data['Rotational speed [rpm]'], bins=bins_3, labels=labels_3, include_lowest=True)



1181
2886


In [87]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_4 = test_data['Torque [Nm]'].min()
max_value_4 = test_data['Torque [Nm]'].max()
print(min_value_4)
print(max_value_4)

# Store in Bins 
bins_4 = np.linspace(min_value_4, max_value_4, 4)
bins_4

# Create labels
labels_4 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Torque [Nm]'] = pd.cut(test_data['Torque [Nm]'], bins=bins_4, labels=labels_4, include_lowest=True)

3.8
76.6


In [88]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_5 = test_data['Tool wear [min]'].min()
max_value_5 = test_data['Tool wear [min]'].max()
print(min_value_5)
print(max_value_5)

# Store in Bins 
bins_5 = np.linspace(min_value_5, max_value_5, 4)
bins_5

# Create labels
labels_5 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Tool wear [min]'] = pd.cut(test_data['Tool wear [min]'], bins=bins_5, labels=labels_5, include_lowest=True)



0
253


In [89]:
# Encoding the features for getting a better result

le =LabelEncoder()

test_data['Type'] = le.fit_transform(test_data['Type'])
test_data['Air temperature [K]'] = le.fit_transform(test_data['Air temperature [K]'])
test_data['Process temperature [K]'] = le.fit_transform(test_data['Process temperature [K]'])
test_data['Rotational speed [rpm]'] = le.fit_transform(test_data['Rotational speed [rpm]'])
test_data['Torque [Nm]'] = le.fit_transform(test_data['Torque [Nm]'])
test_data['Tool wear [min]'] = le.fit_transform(test_data['Tool wear [min]'])

In [90]:
# Cluster sampling the dataset to 8060 where 6045 is used for training and 2015 is used for testing
test_data = test_data.sample(frac=0.62, replace=True) # 8060 instances

# Defining X and y
X = test_data.iloc[:, 2:8]
y = test_data.iloc[:,8]


# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = True)


# defining terms to be used for calculation
n_rows = X_train.shape[0] # Instances in X_train
class_list = y_train.unique() # Number of labels
column_seq = X_train.columns # Column names
df = X_train.copy() # storing X_train (features) in a dataframe for calculating
column_list = df.columns # List of column names in df

# add target column
df["target"] = y_train


In [91]:
# Computing the probabilities
feature_count = defaultdict(lambda: defaultdict(int)) # stores class- feature pair count
class_count = defaultdict(int) # store frequency of class globally
class_probability = defaultdict(int) # prior probability of class (class count / number of rows)
alpha = 1

if not df.empty:
    
    # for each row we will count the number of features
    for row in df.values:
        # storing the target class values
        target_row = row[-1] 
        # going through each feature and add 1 to sum up to the number of features
        for column in (row[:-1]):
            feature_count[target_row][column] += 1 

for target in y_train:
    # storing the number of classes
    target_class = len(class_count) 
    class_count[target] += 1

# calculation class_count into probabilities
for feature_class, count in class_count.items():
    # converting counts to probabilities
    class_probability[feature_class] = count / n_rows 




In [92]:
# hold predictions by our model
y_pred = []

data = X_test.copy()
data['target'] = y_test

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)



In [93]:
# Printing out the test labels vs the predicted labels
print("10 Test labels: ", y_test[:10].values)
print("Predicted labels:", y_pred[:10])


10 Test labels:  [ 8  5  4 11 10  3  4  3  0 11]
Predicted labels: [11  7  7 12  1 11  6  3 12 12]


In [94]:
# Based on the predicted labels we can match the label number to the below machine failure except 
# for No machine failure and RNF since the machine does not fail in these two cases

# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure


In [95]:
# compute accuracy
test_accuracy = accuracy_score(y_test, y_pred)


# Printing accuracy
# We notice that the accuracy is less and this is due to how the Naive bayes is created, since we are doing a 
# Probabilistic Naive Bayes we are getting a very low accuracy
print("Test Accuracy: ",test_accuracy)


Test Accuracy:  0.26004962779156326


In [96]:
# printing the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 83,   5,   0,   0,   3,   0,   1,   6,   0,   4,   9,  14,  24],
       [  0, 142,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 160,   0],
       [  0,   0,   0,  76,   0,   0,   0,   0,   0,   0,   0,  70,   0],
       [  0,   0,   0,  24,  12,   0,  29,  16,   0,   0,  54,  43,   0],
       [  0,   0,   0,   0,   0,   0,   0,  44,   0,   0,  48,  45,   0],
       [  0,   0,   0,   0,  91,   0,  90,   0,   0,   0,   0,   0,   0],
       [ 12,   6,   0,   1,  17,   0,   0,  28,   0,  13,  10,  17,  41],
       [ 45,   0,   0,   0,  19,   0,  26,   0,   0,   0,  12,  35,  29],
       [ 30,   5,   0,   0,  17,   0,   8,  10,   0,  17,  14,  21,  27],
       [  4,   6,   0,   1,  31,   0,  41,   0,   0,   1,   7,  66,  15],
       [ 13,   8,   0,   7,  20,   0,  18,  10,   0,   0,  19,  24,  33],
       [ 55,  12,   0,   0,   0,   0,   0,   8,   0,  18,   0,   0,  45]])

In [97]:
# printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.34      0.56      0.42       149
           1       0.77      1.00      0.87       142
           2       0.00      0.00      0.00       160
           3       0.70      0.52      0.60       146
           4       0.06      0.07      0.06       178
           5       0.00      0.00      0.00       137
           6       0.42      0.50      0.46       181
           7       0.23      0.19      0.21       145
           8       0.00      0.00      0.00       166
           9       0.32      0.11      0.17       149
          10       0.04      0.04      0.04       172
          11       0.05      0.16      0.07       152
          12       0.21      0.33      0.26       138

    accuracy                           0.26      2015
   macro avg       0.24      0.27      0.24      2015
weighted avg       0.24      0.26      0.24      2015



  _warn_prf(average, modifier, msg_start, len(result))


In [98]:

# hold predictions by our model
y_pred = []

data = X_train.copy()
data['target'] = y_train

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)



In [99]:
print("Train Accuracy: ",accuracy_score(y_train, y_pred))

Train Accuracy:  0.27758478081058724


In [100]:
# Oversampling dataset
# 10000 instances for each label
import pandas as pd
# Reading the csv and store it in a dataframe
data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [101]:
# Here we set the machine failure to different unique values in order to distinguish them, by doing encoding instead
# it would mix up the labels causing a confusion on what is what. 

# If we try to match the frequency after encoding we then fall into a problem since the frequency 1 is repeated twice
# and so we won't be able to identify which is which.

# Hence we use .where and set a unique value to each, then match and convert into an order from 0-12

test_data = data.copy()

test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['PWF'] == 1)  & (test_data['OSF'] == 1), 1, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['OSF'] == 1), 4, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['OSF'] == 1), 8, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['PWF'] == 1), 12, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['RNF'] == 1), 16, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1) & (test_data['OSF'] == 1), 20, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1), 24, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1), 30, 0)
test_data['Machine failure'] += np.where((test_data['OSF'] == 1), 36, 0)
test_data['Machine failure'] += np.where((test_data['RNF'] == 1), 42, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1), 48, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 0) & (test_data['TWF'] == 0) & (test_data['OSF'] == 0) &
               (test_data['HDF'] == 0) & (test_data['Machine failure'] == 1), 52, 0)


# Here the unique labels are converted to a proper order 

test_data.loc[test_data['Machine failure'] == 116, "Machine failure"] = 1 # TWFPWFOSF
test_data.loc[test_data['Machine failure'] == 89, "Machine failure"] = 2 # TWFRNF
test_data.loc[test_data['Machine failure'] == 71, "Machine failure"] = 3 # TWFOSF
test_data.loc[test_data['Machine failure'] == 81, "Machine failure"] = 4 # PWFOSF
test_data.loc[test_data['Machine failure'] == 85, "Machine failure"] = 5 # HDFPWF
test_data.loc[test_data['Machine failure'] == 93, "Machine failure"] = 6 # HDFOSF
test_data.loc[test_data['Machine failure'] == 31, "Machine failure"] = 7 # TWF
test_data.loc[test_data['Machine failure'] == 42, "Machine failure"] = 8 # RNF
test_data.loc[test_data['Machine failure'] == 25, "Machine failure"] = 9 # PWF
test_data.loc[test_data['Machine failure'] == 37, "Machine failure"] = 10 # OSF
test_data.loc[test_data['Machine failure'] == 49, "Machine failure"] = 11 # HDF
test_data.loc[test_data['Machine failure'] == 53, "Machine failure"] = 12 # UF


# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure

In [102]:
# We remove the failure columns since we have merged them with machine_failure
test_data = test_data.drop(labels = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis = 1)

In [103]:

# Oversampling the dataset to 10000 for each label
sample = 10000
test_data = test_data.groupby('Machine failure').apply(lambda x: x.sample(sample,replace=True))


# Since we had use group by machine failure, the index is now machine failure so in X_train we will be able to
# see the target label (Machine failure) so for that we need to change the index from group by machine failure
# to the normal count index
test_data = test_data.droplevel(0).reset_index()
test_data = test_data.drop(labels = 'index', axis =1)


# Setting the column'Type' with datatype object to int64 by encoding
le =LabelEncoder()
test_data['Type'] = le.fit_transform(test_data['Type'])

In [104]:
# Cluster sampling the dataset from 130000 to 13000 instances
test_data = test_data.sample(frac=0.1, replace=True) # 13000 instances


# Cluster sampling the dataset from 13000 to 8060 where 6045 is used for training and 2015 is used for testing
test_data = test_data.sample(frac=0.62, replace=True) # 8060 instances



# Defining X and y
X = test_data.drop(labels = ['UDI', 'Product ID', 'Machine failure'], axis =1 )
y = test_data['Machine failure']


# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y)



# defining terms to be used for calculation
n_rows = X_train.shape[0] # Instances in X_train
class_list = y_train.unique() # Number of labels
column_seq = X_train.columns # Column names
df = X_train.copy() # storing X_train (features) in a dataframe for calculating
column_list = df.columns # List of column names in df

# add target column
df["target"] = y_train



In [105]:
# Computing the probabilities
feature_count = defaultdict(lambda: defaultdict(int)) # stores class- feature pair count
class_count = defaultdict(int) # store frequency of class globally
class_probability = defaultdict(int) # prior probability of class (class count / number of rows)
alpha = 1

if not df.empty:
    
    # for each row we will count the number of features
    for row in df.values:
        # storing the target class values
        target_row = row[-1] 
        # going through each feature and add 1 to sum up to the number of features
        for column in (row[:-1]):
            feature_count[target_row][column] += 1 

for target in y_train:
    # storing the number of classes
    target_class = len(class_count) 
    class_count[target] += 1

# calculation class_count into probabilities
for feature_class, count in class_count.items():
    # converting counts to probabilities
    class_probability[feature_class] = count / n_rows 





In [106]:
# This will take some time since we have 120,000 instances

# hold predictions by our model
y_pred = []

data = X_test.copy()
data['target'] = y_test

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)



In [107]:
# Printing out the test labels vs the predicted labels
print("10 Test labels: ", y_test[:10].values)
print("Predicted labels:", y_pred[:10])

10 Test labels:  [10  5  2  9  8  3  7  9 11 10]
Predicted labels: [10  5  2  9  8  3  7  9 11 10]


In [108]:
# Based on the predicted labels we can match the label number to the below machine failure except 
# for No machine failure and RNF since the machine does not fail in these two cases

# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure


In [109]:
# compute accuracy
test_accuracy = accuracy_score(y_test, y_pred)


# Printing accuracy
# We notice that the accuracy is less and this is due to how the Naive bayes is created, since we are doing a 
# Probabilistic Naive Bayes we are getting a very low accuracy
print("Test Accuracy: ",test_accuracy)


Test Accuracy:  0.9652605459057072


In [110]:
# printing the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 95,   0,   1,   1,   4,   2,   0,   6,   6,   8,   9,  11,   1],
       [  0, 162,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0, 163,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0, 154,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0, 149,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0, 170,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0, 152,   0,   0,   0,   0,   0,   0],
       [  6,   0,   0,   0,   0,   0,   0, 151,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0, 149,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   1,   3, 148,   1,   1,   4],
       [  0,   0,   0,   0,   0,   3,   0,   0,   0,   0, 145,   1,   0],
       [  0,   0,   0,   0,   0,   0,   1,   0,   0,   0,   0, 159,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 148]])

In [111]:
# printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.66      0.78       144
           1       1.00      1.00      1.00       162
           2       0.99      1.00      1.00       163
           3       0.99      1.00      1.00       154
           4       0.97      1.00      0.99       149
           5       0.97      1.00      0.99       170
           6       0.99      1.00      1.00       152
           7       0.96      0.96      0.96       157
           8       0.94      1.00      0.97       149
           9       0.95      0.94      0.94       158
          10       0.94      0.97      0.95       149
          11       0.92      0.99      0.96       160
          12       0.97      1.00      0.98       148

    accuracy                           0.97      2015
   macro avg       0.96      0.96      0.96      2015
weighted avg       0.97      0.97      0.96      2015



In [112]:
# hold predictions by our model
y_pred = []

data = X_train.copy()
data['target'] = y_train

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)



In [113]:
print("Train Accuracy: ",accuracy_score(y_train, y_pred))

Train Accuracy:  0.9839536807278743


In [114]:
# Over sampling with binning
import pandas as pd
# Reading the csv and store it in a dataframe
data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [115]:
# Here we set the machine failure to different unique values in order to distinguish them, by doing encoding instead
# it would mix up the labels causing a confusion on what is what. 

# If we try to match the frequency after encoding we then fall into a problem since the frequency 1 is repeated twice
# and so we won't be able to identify which is which.

# Hence we use .where and set a unique value to each, then match and convert into an order from 0-12


test_data = data.copy()

test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['PWF'] == 1)  & (test_data['OSF'] == 1), 1, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['OSF'] == 1), 4, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['OSF'] == 1), 8, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1) & (test_data['PWF'] == 1), 12, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1) & (test_data['RNF'] == 1), 16, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1) & (test_data['OSF'] == 1), 20, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 1), 24, 0)
test_data['Machine failure'] += np.where((test_data['TWF'] == 1), 30, 0)
test_data['Machine failure'] += np.where((test_data['OSF'] == 1), 36, 0)
test_data['Machine failure'] += np.where((test_data['RNF'] == 1), 42, 0)
test_data['Machine failure'] += np.where((test_data['HDF'] == 1), 48, 0)
test_data['Machine failure'] += np.where((test_data['PWF'] == 0) & (test_data['TWF'] == 0) & (test_data['OSF'] == 0) &
               (test_data['HDF'] == 0) & (test_data['Machine failure'] == 1), 52, 0)


# Here the unique labels are converted to a proper order 

test_data.loc[test_data['Machine failure'] == 116, "Machine failure"] = 1 # TWFPWFOSF
test_data.loc[test_data['Machine failure'] == 89, "Machine failure"] = 2 # TWFRNF
test_data.loc[test_data['Machine failure'] == 71, "Machine failure"] = 3 # TWFOSF
test_data.loc[test_data['Machine failure'] == 81, "Machine failure"] = 4 # PWFOSF
test_data.loc[test_data['Machine failure'] == 85, "Machine failure"] = 5 # HDFPWF
test_data.loc[test_data['Machine failure'] == 93, "Machine failure"] = 6 # HDFOSF
test_data.loc[test_data['Machine failure'] == 31, "Machine failure"] = 7 # TWF
test_data.loc[test_data['Machine failure'] == 42, "Machine failure"] = 8 # RNF
test_data.loc[test_data['Machine failure'] == 25, "Machine failure"] = 9 # PWF
test_data.loc[test_data['Machine failure'] == 37, "Machine failure"] = 10 # OSF
test_data.loc[test_data['Machine failure'] == 49, "Machine failure"] = 11 # HDF
test_data.loc[test_data['Machine failure'] == 53, "Machine failure"] = 12 # UF


# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure


In [116]:
# We remove the failure columns since we have merged them with machine_failure
test_data = test_data.drop(labels = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis = 1)

In [117]:
# Oversampling the dataset to 10000 for each label
sample = 10000
test_data = test_data.groupby('Machine failure').apply(lambda x: x.sample(sample,replace=True))

# Since we had use group by machine failure, the index is now machine failure so in X_train we will be able to
# see the target label (Machine failure) so for that we need to change the index from group by machine failure
# to the normal count index
test_data = test_data.droplevel(0).reset_index()
test_data = test_data.drop(labels = 'index', axis =1)


# Setting the column'Type' with datatype object to int64 by encoding
le =LabelEncoder()
test_data['Type'] = le.fit_transform(test_data['Type'])

In [118]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value = test_data['Air temperature [K]'].min()
max_value = test_data['Air temperature [K]'].max()
print(min_value)
print(max_value)

# Store in Bins 
bins = np.linspace(min_value, max_value, 4)
bins

# Create labels
labels = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Air temperature [K]'] = pd.cut(test_data['Air temperature [K]'], bins=bins, labels=labels, include_lowest=True)

295.3
304.5


In [119]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_2 = test_data['Process temperature [K]'].min()
max_value_2 = test_data['Process temperature [K]'].max()
print(min_value_2)
print(max_value_2)

# Store in Bins 
bins_2 = np.linspace(min_value_2, max_value_2, 4)
bins_2

# Create labels
labels_2 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Process temperature [K]'] = pd.cut(test_data['Process temperature [K]'], bins=bins_2, labels=labels_2, include_lowest=True)

305.8
313.8


In [120]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high


min_value_3 = test_data['Rotational speed [rpm]'].min()
max_value_3 = test_data['Rotational speed [rpm]'].max()
print(min_value_3)
print(max_value_3)

# Store in Bins 
bins_3 = np.linspace(min_value_3, max_value_3, 4)
bins_3

# Create labels
labels_3 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Rotational speed [rpm]'] = pd.cut(test_data['Rotational speed [rpm]'], bins=bins_3, labels=labels_3, include_lowest=True)




1168
2886


In [121]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high


min_value_4 = test_data['Torque [Nm]'].min()
max_value_4 = test_data['Torque [Nm]'].max()
print(min_value_4)
print(max_value_4)

# Store in Bins 
bins_4 = np.linspace(min_value_4, max_value_4, 4)
bins_4

# Create labels
labels_4 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Torque [Nm]'] = pd.cut(test_data['Torque [Nm]'], bins=bins_4, labels=labels_4, include_lowest=True)

3.8
76.6


In [122]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_5 = test_data['Tool wear [min]'].min()
max_value_5 = test_data['Tool wear [min]'].max()
print(min_value_5)
print(max_value_5)

# Store in Bins 
bins_5 = np.linspace(min_value_5, max_value_5, 4)
bins_5

# Create labels
labels_5 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
test_data['Tool wear [min]'] = pd.cut(test_data['Tool wear [min]'], bins=bins_5, labels=labels_5, include_lowest=True)



0
253


In [123]:
# Encoding the features for getting a better result

le =LabelEncoder()

# test_data['Type'] = le.fit_transform(test_data['Type'])
test_data['Air temperature [K]'] = le.fit_transform(test_data['Air temperature [K]'])
test_data['Process temperature [K]'] = le.fit_transform(test_data['Process temperature [K]'])
test_data['Rotational speed [rpm]'] = le.fit_transform(test_data['Rotational speed [rpm]'])
test_data['Torque [Nm]'] = le.fit_transform(test_data['Torque [Nm]'])
test_data['Tool wear [min]'] = le.fit_transform(test_data['Tool wear [min]'])

In [124]:
# Cluster sampling the dataset from 130000 to 13000 instances
test_data = test_data.sample(frac=0.1, replace=True) # 13000 instances


# Cluster sampling the dataset from 13000 to 8060 where 6045 is used for training and 2015 is used for testing
test_data = test_data.sample(frac=0.62, replace=True) # 8060 instances



# Defining X and y
X = test_data.drop(labels = ['UDI', 'Product ID', 'Machine failure'], axis =1 )
y = test_data['Machine failure']
# X = test_data.iloc[:, 2:8]
# y = test_data.iloc[:,8]



# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101)





# defining terms to be used for calculation
n_rows = X_train.shape[0] # Instances in X_train
class_list = y_train.unique() # Number of labels
column_seq = X_train.columns # Column names
df = X_train.copy() # storing X_train (features) in a dataframe for calculating
column_list = df.columns # List of column names in df

# add target column
df["target"] = y_train





In [125]:
# Computing the probabilities
feature_count = defaultdict(lambda: defaultdict(int)) # stores class- feature pair count
class_count = defaultdict(int) # store frequency of class globally
class_probability = defaultdict(int) # prior probability of class (class count / number of rows)
alpha = 1

if not df.empty:
    
    # for each row we will count the number of features
    for row in df.values:
        # storing the target class values
        target_row = row[-1] 
        # going through each feature and add 1 to sum up to the number of features
        for column in (row[:-1]):
            feature_count[target_row][column] += 1 

for target in y_train:
    # storing the number of classes
    target_class = len(class_count) 
    class_count[target] += 1

# calculation class_count into probabilities
for feature_class, count in class_count.items():
    # converting counts to probabilities
    class_probability[feature_class] = count / n_rows 



In [126]:
# This will take some time since we have 120,000 instances

# hold predictions by our model
y_pred = []

data = X_test.copy()
data['target'] = y_test

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)

In [127]:
# Printing out the test labels vs the predicted labels
print("10 Test labels: ", y_test[:10].values)
print("Predicted labels:", y_pred[:10])


10 Test labels:  [ 3 11  1  5  0 10 10  7  3  5]
Predicted labels: [2 6 1 2 7 6 6 3 3 2]


In [128]:
# Based on the predicted labels we can match the label number to the below machine failure except 
# for No machine failure and RNF since the machine does not fail in these two cases

# 0 - No machine failure
# 1 = TWFPWFOSF
# 2 = TWFRNF
# 3 = TWFOSF
# 4 = PWFOSF
# 5 = HDFPWF
# 6 = HDFOSF
# 7 = TWF
# 8 = RNF - No machine failure but failure occurs
# 9 = PWF
# 10 = OSF
# 11 = HDF
# 12 = UF, Machine failure is equal to 1 but none of the failure types had occured 
# which is basically an undefined failure


In [129]:
# compute accuracy
test_accuracy = accuracy_score(y_test, y_pred)


# Printing accuracy
# We notice that the accuracy is less and this is due to how the Naive bayes is created, since we are doing a 
# Probabilistic Naive Bayes we are getting a very low accuracy
print("Test Accuracy: ",test_accuracy)


Test Accuracy:  0.337468982630273


In [130]:
# printing the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 40,   3,  14,   4,   0,   0,   2,   5,   0,  15,   6,   0,  56],
       [  0, 182,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0, 176,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,  88,  79,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,  49,  25,   0,   0,  28,  12,   0,   0,  38,   0,   0],
       [  0,   0,  51,   0,   0,   0,   0,  53,   0,   0,  71,   0,   0],
       [  0,   0,   0,  81,   0,   0,  87,   0,   0,   0,   0,   0,   0],
       [ 17,   4,  21,  24,   0,   0,   0,  23,   0,   6,   4,   0,  48],
       [ 15,   0,  24,  11,   0,   0,  25,   0,   0,   0,   8,   0,  56],
       [ 31,   3,  21,  22,   0,   0,   7,   4,   0,  25,   7,   0,  38],
       [  0,   2,  63,  24,   0,   0,  37,   0,   0,   2,   6,   0,  19],
       [ 13,   5,  24,  15,   0,   0,   8,  13,   0,   0,  16,   0,  26],
       [ 26,  15,   0,   0,   0,   0,   0,  15,   0,  15,   0,   0,  62]])

In [131]:
# printing the classification report
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.28      0.28      0.28       145
           1       0.85      1.00      0.92       182
           2       0.33      1.00      0.50       176
           3       0.28      0.47      0.35       167
           4       0.00      0.00      0.00       152
           5       0.00      0.00      0.00       175
           6       0.45      0.52      0.48       168
           7       0.18      0.16      0.17       147
           8       0.00      0.00      0.00       139
           9       0.40      0.16      0.23       158
          10       0.04      0.04      0.04       153
          11       0.00      0.00      0.00       120
          12       0.20      0.47      0.28       133

    accuracy                           0.34      2015
   macro avg       0.23      0.31      0.25      2015
weighted avg       0.25      0.34      0.27      2015



  _warn_prf(average, modifier, msg_start, len(result))


In [132]:

# hold predictions by our model
y_pred = []

data = X_train.copy()
data['target'] = y_train

# for each row in the dataset
for row in data.values:
    
    # parse the target
    target = row[-1]
    
    # query point
    query = row[:-1]
    
    # get the log probability distribution
    # pd = probability distribution
    pd = predict(query, feature_count) #, class_feature_pair_mean, class_feature_pair_std)

    # maximum a posteriori to get the class index
    pred_class =np.argmax(list(pd.values()))

    # get the class label
    pred_label = list(feature_count.keys())[pred_class]
    
    # append the prediction to the list
    y_pred.append(pred_label)

y_pred = np.array(y_pred, dtype= int)




In [133]:
print("Train Accuracy: ",accuracy_score(y_train, y_pred))

Train Accuracy:  0.3301902398676592
