## References
 - __[Script: Kaggle](https://www.kaggle.com/code/joshuaswords/time-series-anomaly-detection/notebook)__
 - __[Dataset: Kaggle](https://www.kaggle.com/datasets/boltzmannbrain/nab)__
 - ![image.png](attachment:c6bbec45-605e-40c6-87d2-a06afefa12d7.png)

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Extra Libs
import matplotlib.dates as mdates
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.models import HoverTool
from IPython.display import HTML, display

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import IsolationForest

## Overview

In [9]:
# load dataset
df = pd.read_csv('../Python/Input/realKnownCause/nyc_taxi.csv', parse_dates=['timestamp'])

(df.head(5)
 .style
 .set_caption('New York City Taxi Demand')
 .format({'value':"{:,.0f}"})
)

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820


In [11]:
# Check for blank values, data types
def overview(df: pd.DataFrame, timestamp_col: str = None) -> None:
    print('Null Count:\n', df.isnull().sum(),'\n')
    print('Data Types:\n', df.dtypes)
    
    if timestamp_col is not None:
        print('\nDate Range:\n\nStart:\t',df[timestamp_col].min())
        print('End:\t',df[timestamp_col].max())
        print('Days:\t',(df[timestamp_col].max() - df[timestamp_col].min()))

overview(df, timestamp_col='timestamp')

Null Count:
 timestamp    0
value        0
dtype: int64 

Data Types:
 timestamp    datetime64[ns]
value                 int64
dtype: object

Date Range:

Start:	 2014-07-01 00:00:00
End:	 2015-01-31 23:30:00
Days:	 214 days 23:30:00


In [15]:
HalfHour = hv.Curve(df).opts(
    opts.Curve(title="New York City Taxi Demand Every 30 Min", xlabel="", ylabel="Demand",
               width=700, height=300,tools=['hover'],show_grid=True))

Hourly = hv.Curve(df.set_index('timestamp').resample('H').mean()).opts(
    opts.Curve(title="New York City Taxi Demand Hourly", xlabel="", ylabel="Demand",
               width=700, height=300,tools=['hover'],show_grid=True))

Daily = hv.Curve(df.set_index('timestamp').resample('D').mean()).opts(
    opts.Curve(title="New York City Taxi Demand Daily", xlabel="", ylabel="Demand",
               width=700, height=300,tools=['hover'],show_grid=True))

Weekly = hv.Curve(df.set_index('timestamp').resample('W').mean()).opts(
    opts.Curve(title="New York City Taxi Demand Weekly", xlabel="Date", ylabel="Demand",
               width=700, height=300,tools=['hover'],show_grid=True))


(HalfHour + Hourly + Daily + Weekly).opts(shared_axes=False).cols(1)

## Feature Engineering

In [21]:
# A variety of resamples which I may or may not use
df_hourly = df.set_index('timestamp').resample('H').mean().reset_index()
df_daily  = df.set_index('timestamp').resample('D').mean().reset_index()
df_weekly = df.set_index('timestamp').resample('W').mean().reset_index()

# Loop to cycle through both DataFrames
for DataFrame in [df_hourly, df_daily]:
    DataFrame['Weekday'] = (pd.Categorical(DataFrame['timestamp'].dt.strftime('%A'),
                                           categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday','Friday', 'Saturday', 'Sunday'])
                           )
    DataFrame['Hour'] = DataFrame['timestamp'].dt.hour
    DataFrame['Day'] = DataFrame['timestamp'].dt.weekday
    DataFrame['Month'] = DataFrame['timestamp'].dt.month
    DataFrame['Year'] = DataFrame['timestamp'].dt.year
    DataFrame['Month_day'] = DataFrame['timestamp'].dt.day
    DataFrame['Lag'] = DataFrame['value'].shift(1)
    DataFrame['Rolling_Mean'] = DataFrame['value'].rolling(7, min_periods=1).mean()
    DataFrame = DataFrame.dropna()

display(df_hourly)

Unnamed: 0,timestamp,value,Weekday,Hour,Day,Month,Year,Month_day,Lag,Rolling_Mean
0,2014-07-01 00:00:00,9485.5,Tuesday,0,1,7,2014,1,,9485.500000
1,2014-07-01 01:00:00,5433.0,Tuesday,1,1,7,2014,1,9485.5,7459.250000
2,2014-07-01 02:00:00,3346.5,Tuesday,2,1,7,2014,1,5433.0,6088.333333
3,2014-07-01 03:00:00,2216.5,Tuesday,3,1,7,2014,1,3346.5,5120.375000
4,2014-07-01 04:00:00,2189.5,Tuesday,4,1,7,2014,1,2216.5,4534.200000
...,...,...,...,...,...,...,...,...,...,...
5155,2015-01-31 19:00:00,28288.5,Saturday,19,5,1,2015,31,26665.0,23537.214286
5156,2015-01-31 20:00:00,24138.0,Saturday,20,5,1,2015,31,28288.5,23673.571429
5157,2015-01-31 21:00:00,24194.5,Saturday,21,5,1,2015,31,24138.0,24031.214286
5158,2015-01-31 22:00:00,26515.0,Saturday,22,5,1,2015,31,24194.5,24635.714286
