In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
class GaussianNaiveBayes:
    
    # fit methods trains the data
    def fit(self,X,y,spar=10e-3): # here self is the variable which refers to current object of class 
        #spar method for selecting the smoothing parameter
        sample,features = X.shape # returns shape of X which is in 2d dimensional
        #sample stores the instances in X while the features store the columns (attributes) in X
        # target contains unique labels of Y  
        self.target=np.unique(y)
        
        # classes contain the target labels
        classes=len(self.target)
        
        # Initialization of Mean, Variance and Priors
        
        # Initializing the mean of each feature and setting the type as float
        self.gaussian_mean=np.zeros((classes,features),dtype=np.float64)
        
        # Initializing the variance (standard deviation) of each feature and setting the type as float
        self.gaussian_var=np.zeros((classes,features),dtype=np.float64)
        
        # Initializing the prior (the number of labels in target class) and setting the type as float
        self.log_prior=np.zeros((classes),dtype=np.float64)
        
        
        # Calculation of Mean, Variance, Priors based on the target class
        for label in self.target:
            X_class= X [label == y] # grouping features array based on the target class
            
            # calculating the mean by adding each row in a column and then dividing it (average) and storing the 
            # avearge to the particular feature (attributes)
            self.gaussian_mean[label,:]=X_class.mean(axis=0) 
            
            # calculating the mean (standard deviation) between each row in a column corresponding to 
            # a particular feature (attributes)
            self.gaussian_var[label,:]=X_class.var(axis=0)
            
            
            # Calculating the prior by getting the number of instances (sample) from X (feature class) and dividing
            # with the sample in X for each label to check if the samples match
            self.log_prior[label]=np.log(X_class.shape[0]/float(sample)) 
        
        
        
        
        # Predicting the labels 
    def predict(self,X):
        # Posterior proabability is calculated by the X class by the number of unique labels (P(A/B))
        posterior =np.zeros((X.shape[0],len(self.target)))
        
         # calculating posterior with log of probablity districution by X, mean, variance and log prior
        for label in self.target: 
            # calculating 
            # Compute MVN PDF (Gaussian Distribution)
            posterior[:,label]=mvn.logpdf(X,
                                             mean=self.gaussian_mean[label,:],
                                             cov=self.gaussian_var[label,:]) + self.log_prior[label]
            
        return np.argmax(posterior,axis=1)
        
    def accuracy(self,y_test,y_pred):
        return np.mean(y_test == y_pred)
        
            
        

In [3]:
# Unbalanced dataset

# Reading the csv and store it in a dataframe
data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [4]:
# Setting the column'Type' with datatype object to int64 by encoding
le =LabelEncoder()
data['Type'] = le.fit_transform(data['Type'])

# Cluster sampling the dataset to 8000 where 6000 is used for training and 2000 is used for testing
df = data.sample(frac=0.8, replace=True, random_state=1)

# Defining X and y
X = df.iloc[:,2:8]
y = df.iloc[:, 8]

# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = True)


In [5]:
data

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,2,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,1,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,1,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,1,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,1,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,2,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,0,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,2,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,0,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [6]:
# using the Gaussian Naive bayes to fit X_train and y_train for helping the model to predict new data (y_test)
gnb=GaussianNaiveBayes()
gnb.fit(X_train,y_train)

In [7]:
# Storing the prediction of the training and testing sets
y_train_pred = gnb.predict(X_train)
y_pred = gnb.predict(X_test)

In [8]:
# Getting the accuracy of the training and testing set
print("Training Accuracy :",gnb.accuracy(y_train,y_train_pred))
print("Testing Accuracy :",gnb.accuracy(y_test,y_pred))


print(classification_report(y_test,y_pred))

Training Accuracy : 0.9583333333333334
Testing Accuracy : 0.959
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1937
           1       0.31      0.25      0.28        63

    accuracy                           0.96      2000
   macro avg       0.64      0.62      0.63      2000
weighted avg       0.96      0.96      0.96      2000



In [9]:
confusion_matrix(y_test, y_pred)

array([[1902,   35],
       [  47,   16]])

In [10]:
# Unbalanced data is binned

In [11]:
# The file is read again so that any changes made will not affect the output this experimentation for this section
# Also taking into consideration of the memory space

In [12]:
# Reading the csv and store it in a dataframe
data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [13]:

# Setting the column'Type' with datatype object to int64 by encoding
le =LabelEncoder()
data['Type'] = le.fit_transform(data['Type'])

# Cluster sampling the dataset to 8000 where 6000 is used for training and 2000 is used for testing
df = data.sample(frac=0.8, replace=True, random_state=1)


In [14]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high
min_value = df['Air temperature [K]'].min()
max_value = df['Air temperature [K]'].max()
print(min_value)
print(max_value)

# Store in Bins 
bins = np.linspace(min_value, max_value, 4)
bins

# Create labels
labels = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Air temperature [K]'] = pd.cut(df['Air temperature [K]'], bins=bins, labels=labels, include_lowest=True)

295.3
304.5


In [15]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_2 = df['Process temperature [K]'].min()
max_value_2 = df['Process temperature [K]'].max()
print(min_value_2)
print(max_value_2)

# Store in Bins 
bins_2 = np.linspace(min_value_2, max_value_2, 4)
bins_2

# Create labels
labels_2 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Process temperature [K]'] = pd.cut(df['Process temperature [K]'], bins=bins_2, labels=labels_2, include_lowest=True)

305.7
313.7


In [16]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_3 = df['Rotational speed [rpm]'].min()
max_value_3 = df['Rotational speed [rpm]'].max()
print(min_value_3)
print(max_value_3)

# Store in Bins 
bins_3 = np.linspace(min_value_3, max_value_3, 4)
bins_3

# Create labels
labels_3 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Rotational speed [rpm]'] = pd.cut(df['Rotational speed [rpm]'], bins=bins_3, labels=labels_3, include_lowest=True)



1181
2886


In [17]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_4 = df['Torque [Nm]'].min()
max_value_4 = df['Torque [Nm]'].max()
print(min_value_4)
print(max_value_4)

# Store in Bins 
bins_4 = np.linspace(min_value_4, max_value_4, 4)
bins_4

# Create labels
labels_4 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Torque [Nm]'] = pd.cut(df['Torque [Nm]'], bins=bins_4, labels=labels_4, include_lowest=True)




3.8
74.5


In [18]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_5 = df['Tool wear [min]'].min()
max_value_5 = df['Tool wear [min]'].max()
print(min_value_5)
print(max_value_5)

# Store in Bins 
bins_5 = np.linspace(min_value_5, max_value_5, 4)
bins_5

# Create labels
labels_5 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Tool wear [min]'] = pd.cut(df['Tool wear [min]'], bins=bins_5, labels=labels_5, include_lowest=True)




0
251


In [19]:
# Encoding the features for getting a better result
le =LabelEncoder()

df['Type'] = le.fit_transform(df['Type'])
df['Air temperature [K]'] = le.fit_transform(df['Air temperature [K]'])
df['Process temperature [K]'] = le.fit_transform(df['Process temperature [K]'])
df['Rotational speed [rpm]'] = le.fit_transform(df['Rotational speed [rpm]'])
df['Torque [Nm]'] = le.fit_transform(df['Torque [Nm]'])
df['Tool wear [min]'] = le.fit_transform(df['Tool wear [min]'])



In [20]:

# Defining X and y
X = df.iloc[:,2:8]
y = df.iloc[:, 8]

# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = True)


# using the Gaussian Naive bayes to fit X_train and y_train for helping the model to predict new data (y_test)
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train) 

# Storing the prediction of the training and testing sets
y_train_pred = gnb.predict(X_train)
y_pred = gnb.predict(X_test)

# Getting the accuracy of the training and testing set
print("Training Accuracy :",gnb.accuracy(y_train,y_train_pred))
print("Testing Accuracy :",gnb.accuracy(y_test,y_pred))

confusion_matrix(y_test, y_pred)

print(classification_report(y_test,y_pred))

Training Accuracy : 0.9638333333333333
Testing Accuracy : 0.9685
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1937
           1       0.50      0.08      0.14        63

    accuracy                           0.97      2000
   macro avg       0.74      0.54      0.56      2000
weighted avg       0.96      0.97      0.96      2000



In [21]:
confusion_matrix(y_test, y_pred)

array([[1932,    5],
       [  58,    5]])

In [22]:
# Balancing dataset with undersampling 
# 1 is increased to 5000 from 339
# 0 is reduced to 5000 from 10000

# Reading the csv and store it in a dataframe
data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [23]:
# Undersampling the dataset to 1000 for each label
sample = 5000
df = data.groupby('Machine failure').apply(lambda x: x.sample(sample,replace=True))

# Since we had use group by machine failure, the index is now machine failure so in X_train we will be able to
# see the target label (Machine failure) so for that we need to change the index from group by machine failure
# to the normal count index
df = df.droplevel(0).reset_index()
df = df.drop(labels = 'index', axis =1)

In [24]:
# Cluster sampling the dataset to 10,000 where 8000 is used for training and 2000 is used for testing
df = df.sample(frac=0.8, replace=True) # 8000 instances


In [25]:
# Setting the column'Type' with datatype object to int64 by encoding
le =LabelEncoder()
df['Type'] = le.fit_transform(df['Type'])



# Defining X and y
X = df.iloc[:,2:8]
y = df.iloc[:, 8]

# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = True)



In [26]:
# using the Gaussian Naive bayes to fit X_train and y_train for helping the model to predict new data (y_test)
gnb=GaussianNaiveBayes()
gnb.fit(X_train,y_train)

In [27]:
# Storing the prediction of the training and testing sets
y_train_pred = gnb.predict(X_train)
y_pred = gnb.predict(X_test)

# Getting the accuracy of the training and testing set
print("Training Accuracy :",gnb.accuracy(y_train,y_train_pred))
print("Testing Accuracy :",gnb.accuracy(y_test,y_pred))


print(classification_report(y_test,y_pred))

Training Accuracy : 0.8306666666666667
Testing Accuracy : 0.82
              precision    recall  f1-score   support

           0       0.80      0.86      0.83      1030
           1       0.84      0.78      0.81       970

    accuracy                           0.82      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.82      0.82      0.82      2000



In [28]:
confusion_matrix(y_test, y_pred)

array([[885, 145],
       [215, 755]])

In [29]:
# Balanced dataset (Undersampling) using binning 
# 1 is increased to 5000 from 339
# 0 is reduced to 5000 from 10000

# Reading the csv and store it in a dataframe
data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [30]:
# Undersampling the dataset to 1000 for each label
sample = 5000
df = data.groupby('Machine failure').apply(lambda x: x.sample(sample,replace=True))

# Since we had use group by machine failure, the index is now machine failure so in X_train we will be able to
# see the target label (Machine failure) so for that we need to change the index from group by machine failure
# to the normal count index
df = df.droplevel(0).reset_index()
df = df.drop(labels = 'index', axis =1)

In [31]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value = df['Air temperature [K]'].min()
max_value = df['Air temperature [K]'].max()
print(min_value)
print(max_value)

# Store in Bins 
bins = np.linspace(min_value, max_value, 4)
bins

# Create labels
labels = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Air temperature [K]'] = pd.cut(df['Air temperature [K]'], bins=bins, labels=labels, include_lowest=True)

295.3
304.4


In [32]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_2 = df['Process temperature [K]'].min()
max_value_2 = df['Process temperature [K]'].max()
print(min_value_2)
print(max_value_2)

# Store in Bins 
bins_2 = np.linspace(min_value_2, max_value_2, 4)
bins_2

# Create labels
labels_2 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Process temperature [K]'] = pd.cut(df['Process temperature [K]'], bins=bins_2, labels=labels_2, include_lowest=True)

305.8
313.8


In [33]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_3 = df['Rotational speed [rpm]'].min()
max_value_3 = df['Rotational speed [rpm]'].max()
print(min_value_3)
print(max_value_3)

# Store in Bins 
bins_3 = np.linspace(min_value_3, max_value_3, 4)
bins_3

# Create labels
labels_3 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Rotational speed [rpm]'] = pd.cut(df['Rotational speed [rpm]'], bins=bins_3, labels=labels_3, include_lowest=True)




1168
2886


In [34]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_4 = df['Torque [Nm]'].min()
max_value_4 = df['Torque [Nm]'].max()
print(min_value_4)
print(max_value_4)

# Store in Bins 
bins_4 = np.linspace(min_value_4, max_value_4, 4)
bins_4

# Create labels
labels_4 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Torque [Nm]'] = pd.cut(df['Torque [Nm]'], bins=bins_4, labels=labels_4, include_lowest=True)



3.8
76.6


In [35]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_5 = df['Tool wear [min]'].min()
max_value_5 = df['Tool wear [min]'].max()
print(min_value_5)
print(max_value_5)

# Store in Bins 
bins_5 = np.linspace(min_value_5, max_value_5, 4)
bins_5

# Create labels
labels_5 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Tool wear [min]'] = pd.cut(df['Tool wear [min]'], bins=bins_5, labels=labels_5, include_lowest=True)





0
253


In [36]:
# Encoding the features for getting a better result

le =LabelEncoder()

df['Type'] = le.fit_transform(df['Type'])
df['Air temperature [K]'] = le.fit_transform(df['Air temperature [K]'])
df['Process temperature [K]'] = le.fit_transform(df['Process temperature [K]'])
df['Rotational speed [rpm]'] = le.fit_transform(df['Rotational speed [rpm]'])
df['Torque [Nm]'] = le.fit_transform(df['Torque [Nm]'])
df['Tool wear [min]'] = le.fit_transform(df['Tool wear [min]'])



In [37]:
# Cluster sampling the dataset to 10,000 where 8000 is used for training and 2000 is used for testing
df = df.sample(frac=0.8, replace=True) # 8000 instances



# Defining X and y
X = df.iloc[:,2:8]
y = df.iloc[:, 8]

# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = True)

# using the Gaussian Naive bayes to fit X_train and y_train for helping the model to predict new data (y_test)
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train) 


# Storing the prediction of the training and testing sets
y_train_pred = gnb.predict(X_train)
y_pred = gnb.predict(X_test)

# Getting the accuracy of the training and testing set
print("Training Accuracy :",metrics.accuracy_score(y_train, y_train_pred))
print("Testing Accuracy :",metrics.accuracy_score(y_test, y_pred))



print(classification_report(y_test,y_pred))

Training Accuracy : 0.785
Testing Accuracy : 0.787
              precision    recall  f1-score   support

           0       0.75      0.86      0.80      1014
           1       0.84      0.71      0.77       986

    accuracy                           0.79      2000
   macro avg       0.79      0.79      0.79      2000
weighted avg       0.79      0.79      0.79      2000



In [38]:
confusion_matrix(y_test, y_pred)

array([[877, 137],
       [289, 697]])

In [39]:
# Balancing dataset with oversampling 
# 0 increases to 10000 from 9661
# 1 increases to 10000 from 339

# Reading the csv and store it in a dataframe

data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [40]:
# Oversampling the dataset to 10000 for each label
sample = 10000
df = data.groupby('Machine failure').apply(lambda x: x.sample(sample,replace=True))

# Since we had use group by machine failure, the index is now machine failure so in X_train we will be able to
# see the target label (Machine failure) so for that we need to change the index from group by machine failure
# to the normal count index
df = df.droplevel(0).reset_index()
df = df.drop(labels = 'index', axis =1)

In [41]:
# Now we have 20000 instances 

# Setting the column'Type' with datatype object to int64 by encoding
le =LabelEncoder()
df['Type'] = le.fit_transform(df['Type'])

# Cluster sampling the dataset from 20,000 to 10,000 
df = df.sample(frac=0.5, replace=True) # 10000 instances

# Cluster sampling the dataset to 10,000 where 8000 is used for training and 2000 is used for testing
df = df.sample(frac=0.8, replace=True) # 8000 instances

df.count()['UDI']

# Defining X and y
X = df.iloc[:,2:8]
y = df.iloc[:, 8]

# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = True)


In [42]:
# using the Gaussian Naive bayes to fit X_train and y_train for helping the model to predict new data (y_test)
gnb=GaussianNaiveBayes()
gnb.fit(X_train,y_train)

In [43]:
# Storing the prediction of the training and testing sets
y_train_pred = gnb.predict(X_train)
y_pred = gnb.predict(X_test)

In [44]:
# Getting the accuracy of the training and testing set
print("Training Accuracy :",metrics.accuracy_score(y_train, y_train_pred))
print("Testing Accuracy :",metrics.accuracy_score(y_test, y_pred))


print(classification_report(y_test,y_pred))

Training Accuracy : 0.836
Testing Accuracy : 0.8475
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1031
           1       0.87      0.81      0.84       969

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



In [45]:
confusion_matrix(y_test, y_pred)

array([[913, 118],
       [187, 782]])

In [46]:
# Balanced dataset (oversampling) using binning 
# 1 = 10000 instances
# 0 = 10000 instances

# Reading the csv and store it in a dataframe
data = pd.read_csv('ai4i2020.csv')
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [47]:
# Oversampling the dataset to 10000 for each label
sample = 10000
df = data.groupby('Machine failure').apply(lambda x: x.sample(sample,replace=True))

# Since we had use group by machine failure, the index is now machine failure so in X_train we will be able to
# see the target label (Machine failure) so for that we need to change the index from group by machine failure
# to the normal count index
df = df.droplevel(0).reset_index()
df = df.drop(labels = 'index', axis =1)

In [48]:
# Now we have 20000 instances 

# Setting the column'Type' with datatype object to int64 by encoding
le =LabelEncoder()
df['Type'] = le.fit_transform(df['Type'])


In [49]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value = df['Air temperature [K]'].min()
max_value = df['Air temperature [K]'].max()
print(min_value)
print(max_value)

# Store in Bins 
bins = np.linspace(min_value, max_value, 4)
bins

# Create labels
labels = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Air temperature [K]'] = pd.cut(df['Air temperature [K]'], bins=bins, labels=labels, include_lowest=True)

295.3
304.4


In [50]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_2 = df['Process temperature [K]'].min()
max_value_2 = df['Process temperature [K]'].max()
print(min_value_2)
print(max_value_2)

# Store in Bins 
bins_2 = np.linspace(min_value_2, max_value_2, 4)
bins_2

# Create labels
labels_2 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Process temperature [K]'] = pd.cut(df['Process temperature [K]'], bins=bins_2, labels=labels_2, include_lowest=True)

305.7
313.8


In [51]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_3 = df['Rotational speed [rpm]'].min()
max_value_3 = df['Rotational speed [rpm]'].max()
print(min_value_3)
print(max_value_3)

# Store in Bins 
bins_3 = np.linspace(min_value_3, max_value_3, 4)
bins_3

# Create labels
labels_3 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Rotational speed [rpm]'] = pd.cut(df['Rotational speed [rpm]'], bins=bins_3, labels=labels_3, include_lowest=True)


1168
2886


In [52]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_4 = df['Torque [Nm]'].min()
max_value_4 = df['Torque [Nm]'].max()
print(min_value_4)
print(max_value_4)

# Store in Bins 
bins_4 = np.linspace(min_value_4, max_value_4, 4)
bins_4

# Create labels
labels_4 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Torque [Nm]'] = pd.cut(df['Torque [Nm]'], bins=bins_4, labels=labels_4, include_lowest=True)

3.8
76.6


In [53]:
# Implementing binning
# Taking the Minimum and maximum and diving it into three sections of low, medium and high

min_value_5 = df['Tool wear [min]'].min()
max_value_5 = df['Tool wear [min]'].max()
print(min_value_5)
print(max_value_5)

# Store in Bins 
bins_5 = np.linspace(min_value_5, max_value_5, 4)
bins_5

# Create labels
labels_5 = ['low', 'medium', 'high']

# Replace to the value in the bin based on the label
df['Tool wear [min]'] = pd.cut(df['Tool wear [min]'], bins=bins_5, labels=labels_5, include_lowest=True)


0
253


In [54]:
# Encoding the features for getting a better result

le =LabelEncoder()


df['Air temperature [K]'] = le.fit_transform(df['Air temperature [K]'])
df['Process temperature [K]'] = le.fit_transform(df['Process temperature [K]'])
df['Rotational speed [rpm]'] = le.fit_transform(df['Rotational speed [rpm]'])
df['Torque [Nm]'] = le.fit_transform(df['Torque [Nm]'])
df['Tool wear [min]'] = le.fit_transform(df['Tool wear [min]'])




In [55]:
# Cluster sampling the dataset from 20,000 to 10,000 
df = df.sample(frac=0.5, replace=True) # 10000 instances

# Cluster sampling the dataset to 10,000 where 8000 is used for training and 2000 is used for testing
df = df.sample(frac=0.8, replace=True) # 8000 instances


# Defining X and y
X = df.iloc[:,2:8]
y = df.iloc[:, 8]

# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = True)



In [56]:
# using the Gaussian Naive bayes to fit X_train and y_train for helping the model to predict new data (y_test)
gnb=GaussianNaiveBayes()
gnb.fit(X_train,y_train)

In [57]:
# Storing the prediction of the training and testing sets
y_train_pred = gnb.predict(X_train)
y_pred = gnb.predict(X_test)

In [58]:
# Getting the accuracy of the training and testing set
print("Training Accuracy :",metrics.accuracy_score(y_train, y_train_pred))
print("Testing Accuracy :",metrics.accuracy_score(y_test, y_pred))

print(classification_report(y_test,y_pred))

Training Accuracy : 0.7915
Testing Accuracy : 0.8065
              precision    recall  f1-score   support

           0       0.78      0.88      0.82      1035
           1       0.85      0.73      0.78       965

    accuracy                           0.81      2000
   macro avg       0.81      0.80      0.80      2000
weighted avg       0.81      0.81      0.81      2000



In [59]:
confusion_matrix(y_test, y_pred)

array([[907, 128],
       [259, 706]])