In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt 

In [2]:
# Load the sensor readings into a DataFrame. The path is relative, so the
# file must sit in the kernel's working directory. (Presumably already
# cleaned, going by the file name - confirm where it is produced.)
data = pd.read_csv("clean_data.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm]
0,298.1,308.6,1551,42.8
1,298.2,308.7,1408,46.3
2,298.1,308.5,1498,49.4
3,298.2,308.6,1433,39.5
4,298.2,308.7,1408,40.0


In [4]:
# Build the (not yet fitted) outlier detector.
#   contamination=0.1 : expected share of outliers; the decision threshold is
#                       chosen during fit so that about 10% of the training
#                       rows end up labelled as anomalies.
#   random_state=42   : fixed seed so repeated runs build the same trees.
# NOTE(review): because the flagged share is set by `contamination`, the
# anomaly percentage computed later in this notebook mirrors this parameter
# (it prints 10.0) rather than measuring the data - confirm this is intended.
iso_forest = IsolationForest(
    contamination=0.1,
    random_state=42,
)

In [5]:
# Train on the sensor feature columns only. A later cell adds an 'Anomaly'
# column to this same DataFrame in place, so re-running this cell without
# restarting the kernel would otherwise feed the model's own labels back in
# as an input feature. errors="ignore" makes the drop a no-op on a fresh run,
# when the column does not exist yet, so first-run results are unchanged.
iso_forest.fit(data.drop(columns=["Anomaly"], errors="ignore"))

In [6]:
# Label every row: predict() returns 1 for rows the model treats as normal
# and -1 for rows it flags as anomalies.
# Only the sensor feature columns are passed in. A later cell stores these
# labels back into `data` as 'Anomaly', so re-running this cell afterwards
# would otherwise hand the model a column it was not fitted on.
# errors="ignore" makes the drop a no-op on a fresh run.
anomalies = iso_forest.predict(data.drop(columns=["Anomaly"], errors="ignore"))

In [7]:
# Attach the model's labels (1 / -1) to the frame as a new 'Anomaly' column.
# This modifies `data` in place: any earlier cell that uses `data` will see
# the extra column if it is re-run without restarting the kernel.
data['Anomaly'] = anomalies

In [8]:
# Preview again to confirm the 'Anomaly' column was added.
data.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Anomaly
0,298.1,308.6,1551,42.8,1
1,298.2,308.7,1408,46.3,1
2,298.1,308.5,1498,49.4,1
3,298.2,308.6,1433,39.5,1
4,298.2,308.7,1408,40.0,1


In [9]:
# Rows labelled 1 are the ones the model considers normal.
data.query('Anomaly==1')

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Anomaly
0,298.1,308.6,1551,42.8,1
1,298.2,308.7,1408,46.3,1
2,298.1,308.5,1498,49.4,1
3,298.2,308.6,1433,39.5,1
4,298.2,308.7,1408,40.0,1
...,...,...,...,...,...
9995,298.8,308.4,1604,29.5,1
9996,298.9,308.4,1632,31.8,1
9997,299.0,308.6,1645,33.4,1
9998,299.0,308.7,1408,48.5,1


In [10]:
# Rows labelled -1 are the ones the model flagged as anomalies.
data.query('Anomaly==-1')

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Anomaly
14,298.6,309.2,2035,19.6,-1
50,298.9,309.1,2861,4.6,-1
101,298.8,308.8,1991,20.7,-1
139,298.8,308.6,1268,55.8,-1
155,298.4,308.2,1987,19.8,-1
...,...,...,...,...,...
9849,298.6,309.4,2312,15.5,-1
9865,298.9,309.8,2450,13.9,-1
9871,298.9,309.8,1947,21.7,-1
9939,298.3,308.1,2636,12.8,-1


In [11]:
# Risk bands applied to the anomaly percentage further down:
#   less than 20% of the data set anomalous -> low risk
#   between 20% and 40%                     -> medium risk
#   more than 40%                           -> high risk
#
# Group the rows by their 'Anomaly' label (1 = normal, -1 = anomaly).
groupByAnomaly = data.groupby(by='Anomaly')

In [12]:
# Number of rows flagged as anomalous (label -1), counted from the boolean
# mask, and the total number of rows in the frame.
# (The `n_anomoly` spelling is kept because a later cell reads it.)
n_anomoly = int((data['Anomaly'] == -1).sum())
n_total = data.shape[0]

In [13]:
# Share of rows flagged as anomalous, expressed in percent (0-100).
percentage_an = (n_anomoly / n_total) * 100

In [14]:
# Show the percentage of rows that were flagged as anomalous.
print(percentage_an)

10.0


In [15]:
# Translate the anomaly percentage into a coarse risk label:
#   below 20%            -> low
#   20% to 40% inclusive -> medium
#   above 40%            -> high
# The elif only needs the upper bound: reaching it already implies
# percentage_an >= 20. Writing it as "> 20 and < 40" would let a value of
# exactly 20 (or exactly 40) fall through to the "high" branch.
# NOTE(review): the detector is created with contamination=0.1, which makes
# it flag about 10% of the rows it was fitted on, so this check will report
# "low" regardless of the data - confirm the thresholds are meaningful.
if percentage_an < 20:
    print("Risk of failure is low.")
elif percentage_an <= 40:
    print("Risk of failure is medium")
else:
    print("Risk of failure is high")

Risk of failure is low.
