## References
 - __[Dataset: Kaggle](https://www.kaggle.com/datasets/boltzmannbrain/nab)__

## Info
> ![image.png](attachment:efedc41d-4006-41ac-b17c-f0b11d895f50.png)

## Expected Output
> ![image.png](attachment:8727fa4f-d3cc-4ba0-b925-f3b9e813e6ba.png)

## Best Final Output (Attempt #2)
> ![image.png](attachment:194607c2-baa6-4cfc-a222-2e9f21fa701e.png)

## Note
- Out of curiosity, I tried tuning the model by changing its inputs and parameters, then observed how the output changed.
- Final comment: My limited understanding of the Isolation Forest algorithm limits what I can do with the tool.

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Extra Libs
import matplotlib.dates as mdates
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.models import HoverTool
from IPython.display import HTML, display

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import IsolationForest

## Overview

In [5]:
# Read the machine-temperature series; `timestamp` is parsed into datetimes on load.
df = pd.read_csv(
    '../Python/Input/realKnownCause/machine_temperature_system_failure.csv',
    parse_dates=['timestamp'],
)

# Styled preview of the first five rows, with `value` rendered without decimals.
df.head(5).style.set_caption('Machine Temperature Failure').format({'value': "{:,.0f}"})

Unnamed: 0,timestamp,value
0,2013-12-02 21:15:00,74
1,2013-12-02 21:20:00,75
2,2013-12-02 21:25:00,76
3,2013-12-02 21:30:00,78
4,2013-12-02 21:35:00,79


In [6]:
# Summary statistics per column (count, mean, min, quartiles, max, std).
df.describe()

Unnamed: 0,timestamp,value
count,22695,22695.0
mean,2014-01-11 06:16:49.887640576,85.926498
min,2013-12-02 21:15:00,2.084721
25%,2013-12-22 14:02:30,83.080078
50%,2014-01-11 05:50:00,89.408246
75%,2014-01-30 22:37:30,94.016252
max,2014-02-19 15:25:00,108.510543
std,,13.746912


In [7]:
# Column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22695 entries, 0 to 22694
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  22695 non-null  datetime64[ns]
 1   value      22695 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 354.7 KB


In [8]:
def _temperature_curve(frame, title):
    """Build a temperature curve from `frame` with the styling shared by all three panels."""
    return hv.Curve(frame).opts(
        opts.Curve(title=title, xlabel="", ylabel="Temperature",
                   width=700, height=300, tools=['hover'], show_grid=True))


# Time-indexed view used for the hourly and daily averages.
by_timestamp = df.set_index('timestamp')

FiveMin = _temperature_curve(df, "Machine's Temperature Sensor Data Every 5 Min")
Hourly = _temperature_curve(by_timestamp.resample('H').mean(), "Machine's Temperature Sensor Data Hourly")
Daily = _temperature_curve(by_timestamp.resample('D').mean(), "Machine's Temperature Sensor Data Daily")

# Stack the three resolutions vertically, each with its own axes.
(FiveMin + Hourly + Daily).opts(shared_axes=False).cols(1)

## Feature Engineering

In [10]:
# Feature Engineering: derive calendar fields from the timestamp plus two
# short-memory views of the signal. Columns are added to `df` itself because
# the exploration cells that follow read them from `df`.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# The category order (Mon..Sun) controls the order of grouped results.
df['weekday']      = pd.Categorical(df['timestamp'].dt.strftime('%A'), categories=weekday_order)
df['hour']         = df['timestamp'].dt.hour
df['day']          = df['timestamp'].dt.weekday    # 0 = Monday ... 6 = Sunday
df['month']        = df['timestamp'].dt.month
df['year']         = df['timestamp'].dt.year
df['month_day']    = df['timestamp'].dt.day
df['lag']          = df['value'].shift(1)           # previous reading; NaN on the first row
df['rolling_mean'] = df['value'].rolling(7, min_periods=1).mean()  # mean of up to the 7 latest readings

# Backup: an independent snapshot of the feature-enriched frame.
# `df_copy = df` only created a second name for the same object, so it could
# not protect against later in-place changes. The copy is taken after the
# features are added because the modelling cells restore `df` from `df_copy`
# and expect these columns to be present.
df_copy = df.copy()

# Show datasets
print('Dataset')
display(df)

Dataset


Unnamed: 0,timestamp,value,weekday,hour,day,month,year,month_day,lag,rolling_mean
0,2013-12-02 21:15:00,73.967322,Monday,21,0,12,2013,2,,73.967322
1,2013-12-02 21:20:00,74.935882,Monday,21,0,12,2013,2,73.967322,74.451602
2,2013-12-02 21:25:00,76.124162,Monday,21,0,12,2013,2,74.935882,75.009122
3,2013-12-02 21:30:00,78.140707,Monday,21,0,12,2013,2,76.124162,75.792018
4,2013-12-02 21:35:00,79.329836,Monday,21,0,12,2013,2,78.140707,76.499582
...,...,...,...,...,...,...,...,...,...,...
22690,2014-02-19 15:05:00,98.185415,Wednesday,15,2,2,2014,19,97.360905,97.436008
22691,2014-02-19 15:10:00,97.804168,Wednesday,15,2,2,2014,19,98.185415,97.583951
22692,2014-02-19 15:15:00,97.135468,Wednesday,15,2,2,2014,19,97.804168,97.640466
22693,2014-02-19 15:20:00,98.056852,Wednesday,15,2,2,2014,19,97.135468,97.750791


## Visual Exploration

In [12]:
# Density estimate of every temperature reading in the series.
value_density = hv.Distribution(df['value'])

value_density.opts(
    opts.Distribution(title="Overall Value Distribution",
                      xlabel="Value",
                      ylabel="Density",
                      width=700, height=300,
                      tools=['hover'], show_grid=True)
)

In [13]:
# Mean temperature for every (hour, weekday) pair, pivoted so that rows are
# hours and columns are weekdays. `value` is selected before aggregating so
# only that column is averaged; averaging every column first and discarding
# the rest does needless work and fails as soon as a non-numeric column exists.
by_weekday = df.groupby(['hour', 'weekday'])['value'].mean().unstack()

display(by_weekday)

weekday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,83.268409,88.528922,87.671819,92.190361,89.208688,86.104478,86.254048
1,83.734766,86.934298,87.527097,91.893462,89.620775,85.544941,86.348646
2,83.854206,85.987696,88.063096,92.221296,89.512388,85.154698,85.900146
3,83.445865,85.397517,89.166532,92.122671,89.221499,84.689747,85.951564
4,82.852529,83.97135,89.934965,91.039336,89.441666,85.494146,85.796986
5,82.252554,83.836276,90.95624,90.749605,90.230178,83.781135,85.8333
6,80.799264,84.170226,90.39016,90.031857,90.281701,84.021773,86.087222
7,79.470874,84.001671,89.728558,89.725502,89.74338,85.320602,85.758503
8,79.332768,84.335797,89.478553,88.034738,87.793093,85.382386,84.778751
9,79.333308,84.640612,89.499686,84.613687,86.163575,85.02954,85.335989


In [14]:
# One density curve per weekday, overlaid in calendar order. Each curve is the
# distribution of that weekday's hourly mean temperatures (a column of by_weekday).
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
layers = [hv.Distribution(by_weekday[name], label=name) for name in weekday_names]

# The plot title is carried by the last layer of the overlay.
layers[-1] = layers[-1].opts(opts.Distribution(title="Machine's Temperature by Day & Hour"))

# Combine left to right with `*`, the overlay operator.
plot = layers[0]
for layer in layers[1:]:
    plot = plot * layer

# The x-axis is temperature and the y-axis is the estimated density; the
# previous "Demand" labels did not describe this data.
plot.opts(opts.Distribution(width=800, height=300, tools=['hover'], show_grid=True,
                            ylabel="Density", xlabel="Temperature"))

In [15]:
# Average temperature per weekday, kept as a one-column frame for the bar chart.
data = df.groupby('weekday')[['value']].mean()

display(data)

Unnamed: 0_level_0,value
weekday,Unnamed: 1_level_1
Monday,80.993944
Tuesday,84.413051
Wednesday,89.518365
Thursday,88.270918
Friday,87.228534
Saturday,84.642035
Sunday,86.402453


In [16]:
# Average temperature per hour of day, kept as a one-column frame for the curve.
data2 = df.groupby('hour')[['value']].mean()

display(data2)

Unnamed: 0_level_0,value
hour,Unnamed: 1_level_1
0,87.616389
1,87.368421
2,87.220841
3,87.145739
4,86.933507
5,86.820566
6,86.559046
7,86.265446
8,85.624194
9,84.998996


In [17]:
# Panel 1: mean temperature per weekday (y-axis limited to 75-90).
bar_style = opts.Bars(title="Machine's Temperature by Day", xlabel="", ylabel="Temperature", ylim=(75, 90),
                      width=700, height=300, tools=['hover'], show_grid=True)
bar = hv.Bars(data).opts(bar_style)

# Panel 2: every reading that falls on the selected weekday.
day = 'Monday'
day_rows = df[df.weekday == day]
mon = hv.Curve(day_rows).opts(
    opts.Curve(title=f"Machine's Temperature Sensor Data on {day}", xlabel="", ylabel="Temperature",
               width=700, height=300, tools=['hover'], show_grid=True))

# Panel 3: mean temperature per hour of day.
hour_style = opts.Curve(title="Machine's Temperature by Hour", xlabel="Hour", ylabel="Temperature",
                        width=700, height=300, tools=['hover'], show_grid=True)
hour = hv.Curve(data2).opts(hour_style)

# Stack the panels vertically, each with its own axes.
(bar + mon + hour).opts(shared_axes=False).cols(1)

### Note
- Monday: On Dec 16, 2013, the temperature dropped to its lowest value (2 °F), which explains the left skew.
- Wednesday: The lowest temperature was 49 °F, which explains the right skew.
- 1900 to 0700: Temp at its peak. Day, cold; Night, hot.....What kind of machine are you??

## Models

Note on anomaly scores:
-    the lower, the more abnormal.
-    negative scores represent outliers
-    positive scores represent inliers

## Attempt #1 (Additional Feature: 'Minute' to Further Refine the Data)

In [21]:
# df = pd.read_csv('../Python/Input/realKnownCause/machine_temperature_system_failure.csv', parse_dates=['timestamp'])

# df['minute']       = df['timestamp'].dt.minute
# df['hour']         = df['timestamp'].dt.hour
# df['day']          = df['timestamp'].dt.weekday
# df['month_day']    = df['timestamp'].dt.day
# df['month']        = df['timestamp'].dt.month
# df['year']         = df['timestamp'].dt.year
# df['lag']          = df['value'].shift(1)
# df['rolling_mean'] = df['value'].rolling(7, min_periods=1).mean()

# model_data = df.set_index('timestamp').dropna()

# # print('model_data')
# # display(model_data)

# # Fit Model & View Outliers
# def run_isolation_forest(model_data: pd.DataFrame, contamination=0.001, n_estimators=100, max_samples=0.7) -> pd.DataFrame:
#     IF = (IsolationForest(random_state  = 0,
#                           contamination = contamination,
#                           n_estimators  = n_estimators,
#                           max_samples   = max_samples)
#          )
#     IF.fit(model_data)
#     output = pd.Series(IF.predict(model_data)).apply(lambda x: 1 if x == -1 else 0)
#     score = IF.decision_function(model_data)
#     return output, score

# outliers, score = run_isolation_forest(model_data)

# df = (df
#       .dropna()
#       .assign(outliers = outliers)
#       .assign(score = score)
#      )

# # print('df')
# # display(df)

In [22]:
# frequencies, edges = np.histogram(score, 50)
# hist = hv.Histogram((edges, frequencies)).opts(width=1000, height=300,tools=['hover'], xlabel='Anamoly Score',
#                                                title="Machine's Temperature Anomalies")

# tooltips = [
#     ('timestamp', '@timestamp'),
#     ('value', '@value'),
#     ('weekday', '@weekday')
# ]
# hover = HoverTool(tooltips=tooltips)

# anamo = len(df[df.outliers == 1])
# line = (hv.Points(df.query("outliers == 1")).opts(size=10, color='#ff0000') * 
#              hv.Curve(df).opts(opts.Curve(title="Machine's Temperature Anomalies (Anamolies Count: {})".format(anamo), 
#                                          xlabel="", ylabel="Demand" , width=1000, height=300,
#                                          tools=[hover,'box_select', 'lasso_select', 'tap'],show_grid=True)))

# print('model_data')
# display(model_data)
# (hist + line).opts(shared_axes=False).cols(1)

## Attempt #2 (Fewer Features: Remove 'Minute')

In [24]:
# Start again from the saved feature frame.
df = df_copy

# Model input: every feature except the categorical weekday label, indexed by
# timestamp, with incomplete rows removed.
model_data = (
    df.drop(columns=['weekday'])
      .set_index('timestamp')
      .dropna()
)

# print('model_data')
# display(model_data)

# Fit Model & View Outliers
def run_isolation_forest(model_data: pd.DataFrame, contamination=0.001, n_estimators=100, max_samples=0.7) -> "tuple[pd.Series, np.ndarray]":
    """Fit an Isolation Forest on ``model_data`` and score the same rows.

    Parameters
    ----------
    model_data : pd.DataFrame
        Feature matrix passed to both ``fit`` and ``predict``.
    contamination, n_estimators, max_samples
        Forwarded unchanged to ``IsolationForest``; ``random_state`` is fixed
        at 0 so repeated runs give the same result.

    Returns
    -------
    output : pd.Series
        1 where the model labels a row an outlier, 0 otherwise, in the row
        order of ``model_data``. The Series has a default 0..n-1 index, not
        the index of ``model_data``, so attach it to other frames by position.
    score : np.ndarray
        ``decision_function`` values in the same row order.
    """
    forest = IsolationForest(random_state=0,
                             contamination=contamination,
                             n_estimators=n_estimators,
                             max_samples=max_samples)
    forest.fit(model_data)

    # predict() marks outliers with -1; recode to 1 (outlier) / 0 (inlier)
    # with a vectorised comparison instead of a per-element lambda.
    is_outlier = forest.predict(model_data) == -1
    output = pd.Series(is_outlier.astype("int64"))

    score = forest.decision_function(model_data)
    return output, score

outliers, score = run_isolation_forest(model_data)

# `outliers` comes back as a Series labelled 0..n-1, while the rows kept by
# dropna() still carry their original labels (the first row is dropped because
# its lag is NaN, so labels start at 1). Assigning the Series directly aligns
# on those labels, which shifts every flag by one row and leaves the last row
# NaN. Attach the flags by position instead, the same way the `score` array is
# attached; a length mismatch then raises instead of silently misaligning.
df = (df
      .dropna()
      .assign(outliers = np.asarray(outliers))
      .assign(score = score)
     )

# print('df')
# display(df)

In [25]:
# Histogram of the decision-function scores in 50 equal-width bins.
frequencies, edges = np.histogram(score, 50)
hist = hv.Histogram((edges, frequencies)).opts(width=1000, height=300, tools=['hover'], xlabel='Anomaly Score',
                                               title="Machine's Temperature Anomalies")

# Fields shown when hovering over the temperature curve.
tooltips = [
    ('timestamp', '@timestamp'),
    ('value', '@value'),
    ('weekday', '@weekday')
]
hover = HoverTool(tooltips=tooltips)

# Rows flagged as outliers; selected once and reused for the count and the markers.
flagged = df.query("outliers == 1")
anamo = len(flagged)

# Red markers for flagged readings on top of the full temperature curve.
# The y-axis is temperature; the previous "Demand" label did not match the data.
line = (hv.Points(flagged).opts(size=10, color='#ff0000') *
        hv.Curve(df).opts(opts.Curve(title="Machine's Temperature Anomalies (Anomalies Count: {})".format(anamo),
                                     xlabel="", ylabel="Temperature", width=1000, height=300,
                                     tools=[hover, 'box_select', 'lasso_select', 'tap'], show_grid=True)))

print('model_data')
display(model_data)
(hist + line).opts(shared_axes=False).cols(1)

model_data


Unnamed: 0_level_0,value,hour,day,month,year,month_day,lag,rolling_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-12-02 21:20:00,74.935882,21,0,12,2013,2,73.967322,74.451602
2013-12-02 21:25:00,76.124162,21,0,12,2013,2,74.935882,75.009122
2013-12-02 21:30:00,78.140707,21,0,12,2013,2,76.124162,75.792018
2013-12-02 21:35:00,79.329836,21,0,12,2013,2,78.140707,76.499582
2013-12-02 21:40:00,78.710418,21,0,12,2013,2,79.329836,76.868055
...,...,...,...,...,...,...,...,...
2014-02-19 15:05:00,98.185415,15,2,2,2014,19,97.360905,97.436008
2014-02-19 15:10:00,97.804168,15,2,2,2014,19,98.185415,97.583951
2014-02-19 15:15:00,97.135468,15,2,2,2014,19,97.804168,97.640466
2014-02-19 15:20:00,98.056852,15,2,2,2014,19,97.135468,97.750791


## Attempt #3 (Fewer Features: Remove 'Hour')

In [27]:
# df = df_copy

# model_data = df.drop(['weekday','hour'], axis = 1).set_index('timestamp').dropna()

# # print('model_data')
# # display(model_data)

# # Fit Model & View Outliers
# def run_isolation_forest(model_data: pd.DataFrame, contamination=0.001, n_estimators=100, max_samples=0.7) -> pd.DataFrame:
#     IF = (IsolationForest(random_state  = 0,
#                           contamination = contamination,
#                           n_estimators  = n_estimators,
#                           max_samples   = max_samples)
#          )
#     IF.fit(model_data)
#     output = pd.Series(IF.predict(model_data)).apply(lambda x: 1 if x == -1 else 0)
#     score = IF.decision_function(model_data)
#     return output, score

# outliers, score = run_isolation_forest(model_data)

# df = (df
#       .dropna()
#       .assign(outliers = outliers)
#       .assign(score = score)
#      )

# # print('df')
# # display(df)

In [28]:
# frequencies, edges = np.histogram(score, 50)
# hist = hv.Histogram((edges, frequencies)).opts(width=1000, height=300,tools=['hover'], xlabel='Anamoly Score',
#                                                title="Machine's Temperature Anomalies")

# tooltips = [
#     ('timestamp', '@timestamp'),
#     ('value', '@value'),
#     ('weekday', '@weekday')
# ]
# hover = HoverTool(tooltips=tooltips)

# anamo = len(df[df.outliers == 1])
# line = (hv.Points(df.query("outliers == 1")).opts(size=10, color='#ff0000') * 
#              hv.Curve(df).opts(opts.Curve(title="Machine's Temperature Anomalies (Anamolies Count: {})".format(anamo), 
#                                          xlabel="", ylabel="Demand" , width=1000, height=300,
#                                          tools=[hover,'box_select', 'lasso_select', 'tap'],show_grid=True)))

# print('model_data')
# display(model_data)
# (hist + line).opts(shared_axes=False).cols(1)

## Attempt #4 (Fewer Features: Remove 'Day', 'Month_Day')

In [30]:
# df = df_copy

# model_data = df.drop(['weekday','hour','day','month_day'], axis = 1).set_index('timestamp').dropna()

# # print('model_data')
# # display(model_data)

# # Fit Model & View Outliers
# def run_isolation_forest(model_data: pd.DataFrame, contamination=0.001, n_estimators=100, max_samples=0.7) -> pd.DataFrame:
#     IF = (IsolationForest(random_state  = 0,
#                           contamination = contamination,
#                           n_estimators  = n_estimators,
#                           max_samples   = max_samples)
#          )
#     IF.fit(model_data)
#     output = pd.Series(IF.predict(model_data)).apply(lambda x: 1 if x == -1 else 0)
#     score = IF.decision_function(model_data)
#     return output, score

# outliers, score = run_isolation_forest(model_data)

# df = (df
#       .dropna()
#       .assign(outliers = outliers)
#       .assign(score = score)
#      )

# # print('df')
# # display(df)

In [31]:
# frequencies, edges = np.histogram(score, 50)
# hist = hv.Histogram((edges, frequencies)).opts(width=1000, height=300,tools=['hover'], xlabel='Anamoly Score',
#                                                title="Machine's Temperature Anomalies")

# tooltips = [
#     ('timestamp', '@timestamp'),
#     ('value', '@value'),
#     ('weekday', '@weekday')
# ]
# hover = HoverTool(tooltips=tooltips)

# anamo = len(df[df.outliers == 1])
# line = (hv.Points(df.query("outliers == 1")).opts(size=10, color='#ff0000') * 
#              hv.Curve(df).opts(opts.Curve(title="Machine's Temperature Anomalies (Anamolies Count: {})".format(anamo), 
#                                          xlabel="", ylabel="Demand" , width=1000, height=300,
#                                          tools=[hover,'box_select', 'lasso_select', 'tap'],show_grid=True)))

# print('model_data')
# display(model_data)
# (hist + line).opts(shared_axes=False).cols(1)

## Attempt #5 (No Features: Using Data As-Is)

In [33]:
# df = df_copy

# model_data = df[['timestamp','value']].set_index('timestamp')

# # print('model_data')
# # display(model_data)

# # Fit Model & View Outliers
# def run_isolation_forest(model_data: pd.DataFrame, contamination=0.001, n_estimators=100, max_samples=0.7) -> pd.DataFrame:
#     IF = (IsolationForest(random_state  = 0,
#                           contamination = contamination,
#                           n_estimators  = n_estimators,
#                           max_samples   = max_samples)
#          )
#     IF.fit(model_data)
#     output = pd.Series(IF.predict(model_data)).apply(lambda x: 1 if x == -1 else 0)
#     score = IF.decision_function(model_data)
#     return output, score

# outliers, score = run_isolation_forest(model_data)

# df = (df
#       .assign(outliers = outliers)
#       .assign(score = score)
#      )

# # print('df')
# # display(df)

In [34]:
# frequencies, edges = np.histogram(score, 50)
# hist = hv.Histogram((edges, frequencies)).opts(width=1000, height=300,tools=['hover'], xlabel='Anamoly Score',
#                                                title="Machine's Temperature Anomalies")

# tooltips = [
#     ('timestamp', '@timestamp'),
#     ('value', '@value'),
#     ('weekday', '@weekday')
# ]
# hover = HoverTool(tooltips=tooltips)

# anamo = len(df[df.outliers == 1])
# line = (hv.Points(df.query("outliers == 1")).opts(size=10, color='#ff0000') * 
#              hv.Curve(df).opts(opts.Curve(title="Machine's Temperature Anomalies (Anamolies Count: {})".format(anamo), 
#                                          xlabel="", ylabel="Demand" , width=1000, height=300,
#                                          tools=[hover,'box_select', 'lasso_select', 'tap'],show_grid=True)))

# print('model_data')
# display(model_data)
# (hist + line).opts(shared_axes=False).cols(1)

## Attempt #6 (Contamination: Increase Percentages)

In [36]:
# df = df_copy

# model_data = df[['timestamp','value']].set_index('timestamp')

# # print('model_data')
# # display(model_data)

# # Fit Model & View Outliers
# def run_isolation_forest(model_data: pd.DataFrame, contamination=0.0025, n_estimators=100, max_samples=0.7) -> pd.DataFrame:
#     IF = (IsolationForest(random_state  = 0,
#                           contamination = contamination,
#                           n_estimators  = n_estimators,
#                           max_samples   = max_samples)
#          )
#     IF.fit(model_data)
#     output = pd.Series(IF.predict(model_data)).apply(lambda x: 1 if x == -1 else 0)
#     score = IF.decision_function(model_data)
#     return output, score

# outliers, score = run_isolation_forest(model_data)

# df = (df
#       .assign(outliers = outliers)
#       .assign(score = score)
#      )

# # print('df')
# # display(df)

In [37]:
# frequencies, edges = np.histogram(score, 50)
# hist = hv.Histogram((edges, frequencies)).opts(width=1000, height=300,tools=['hover'], xlabel='Anamoly Score',
#                                                title="Machine's Temperature Anomalies")

# tooltips = [
#     ('timestamp', '@timestamp'),
#     ('value', '@value'),
#     ('weekday', '@weekday')
# ]
# hover = HoverTool(tooltips=tooltips)

# anamo = len(df[df.outliers == 1])
# line = (hv.Points(df.query("outliers == 1")).opts(size=10, color='#ff0000') * 
#              hv.Curve(df).opts(opts.Curve(title="Machine's Temperature Anomalies (Anamolies Count: {})".format(anamo), 
#                                          xlabel="", ylabel="Demand" , width=1000, height=300,
#                                          tools=[hover,'box_select', 'lasso_select', 'tap'],show_grid=True)))

# print('model_data')
# display(model_data)
# (hist + line).opts(shared_axes=False).cols(1)