In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates #to format dates on our plots
%matplotlib inline
import seaborn as sns

# This is to make sure matplotlib doesn't throw the following error:
# The next line fixes "TypeError: float() argument must be a string or a number, not 'Timestamp' matplotlib"
pd.plotting.register_matplotlib_converters()

# Anomalies in Time Series Data
#### Lesson Goals

- Use entropy as a quick way to identify fields that may have anomalies.

- Use statistical properties to flag the data points that deviate from the expected.

#### The Data

Logs of API requests to our data containing sales information about our stores and items.

Type of target variable: Continuous or Discrete

Type of observations: Time Series or Point in Time

#### The Questions

Are there unusual IP addresses accessing our data via the API?

Have we seen any spikes or unusual patterns in the size of requests?

In general: Does this new value deviate from what we would expect based on historical data? If so, is it something to be concerned about? Remember, we aren't detecting anomalies for the sake of detecting anomalies.

# Goal:

- **Ask questions about the data!!!!** That's going to be the point of this project.
- Pick a cohort, measure # of visits per day, and do the %B and Bollinger Bands on that single cohort to learn and explore the data.
- I'm going to have to determine how to come up with a metric (probably count, just different ways to count).
    - Pick a cohort, do count of visits per day. I may not find anything, so find another cohort, and keep on trying until I find something. Nose to the grindstone "time".

In [2]:
# Acquire

In [3]:
colnames=['ip', 'timestamp', 'request_method', 'status', 'size',
          'destination', 'request_agent']
df_orig = pd.read_csv('https://python.zach.lol/access.log',          
                 engine='python',
                 header=None,
                 index_col=False,
                 names=colnames,
                 sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
                 na_values='"-"',
                 usecols=[0, 3, 4, 5, 6, 7, 8]
)

In [4]:
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13974 entries, 0 to 13973
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ip              13974 non-null  object
 1   timestamp       13974 non-null  object
 2   request_method  13974 non-null  object
 3   status          13974 non-null  int64 
 4   size            13974 non-null  int64 
 5   destination     25 non-null     object
 6   request_agent   13974 non-null  object
dtypes: int64(2), object(5)
memory usage: 764.3+ KB


In [None]:
my_ip = 96.8.130.213

In [5]:
df.info()

NameError: name 'df' is not defined

Resampling data
In [26]:

1
my_datetime_fmt = mdates.DateFormatter('%m-%d %H:%T')
2
my_datetime_fmt
Out[26]:
<matplotlib.dates.DateFormatter at 0x7f9f1aa0b8d0>
In [28]:

1
df_ts_user = df['user_id'].resample('30T').max()
2
df_ts_user
Out[28]:
date_time
2018-01-26 09:30:00      5.0
2018-01-26 10:00:00     11.0
2018-01-26 10:30:00     21.0
2018-01-26 11:00:00     24.0
2018-01-26 11:30:00     35.0
                       ...  
2020-11-02 14:30:00    785.0
2020-11-02 15:00:00    775.0
2020-11-02 15:30:00    782.0
2020-11-02 16:00:00    784.0
2020-11-02 16:30:00    773.0
Freq: 30T, Name: user_id, Length: 48543, dtype: float64
In [30]:

1
idx = pd.date_range(
2
    df_ts_user.sort_index().index.min(), 
3
    df_ts_user.sort_index().index.max(),
4
    freq='30min'
5
)
6
idx
Out[30]:
DatetimeIndex(['2018-01-26 09:30:00', '2018-01-26 10:00:00',
               '2018-01-26 10:30:00', '2018-01-26 11:00:00',
               '2018-01-26 11:30:00', '2018-01-26 12:00:00',
               '2018-01-26 12:30:00', '2018-01-26 13:00:00',
               '2018-01-26 13:30:00', '2018-01-26 14:00:00',
               ...
               '2020-11-02 12:00:00', '2020-11-02 12:30:00',
               '2020-11-02 13:00:00', '2020-11-02 13:30:00',
               '2020-11-02 14:00:00', '2020-11-02 14:30:00',
               '2020-11-02 15:00:00', '2020-11-02 15:30:00',
               '2020-11-02 16:00:00', '2020-11-02 16:30:00'],
              dtype='datetime64[ns]', length=48543, freq='30T')
In [31]:

1
df_ts_user = df_ts_user.reindex(idx, fill_value=0).fillna(value=0)
2
df_ts_user
Out[31]:
2018-01-26 09:30:00      5.0
2018-01-26 10:00:00     11.0
2018-01-26 10:30:00     21.0
2018-01-26 11:00:00     24.0
2018-01-26 11:30:00     35.0
                       ...  
2020-11-02 14:30:00    785.0
2020-11-02 15:00:00    775.0
2020-11-02 15:30:00    782.0
2020-11-02 16:00:00    784.0
2020-11-02 16:30:00    773.0
Freq: 30T, Name: user_id, Length: 48543, dtype: float64
In [32]:

1
df_ts_size.shape, df_ts_user.shape
Out[32]:
((48543,), (48543,))
In [33]:

1
df_ts_test = df['user_id'].resample('30T').max()
2
df_ts_test.shape
Out[33]:
(48543,)
In [35]:

1
df['user_id'].resample('30T').max().shape
Out[35]:
(48543,)
In [ ]:

1
​
2
df_ts_size = df['size_mb'].resample('30T').max()
3
​
4
idx = pd.date_range(
5
    df_ts_size.sort_index().index.min(), 
6
    df_ts_size.sort_index().index.max(),
7
    freq='30min'
8
)
9
​
10
df_ts_size = df_ts_size.reindex(idx, fill_value=0).fillna(value=0)
In [ ]:

1
df_ts_user = df['user_id'].resample('30T').max()
2
df_ts_user
In [40]:

1
# I don't know if I actually need to resample my data in this way...
In [ ]:

1
​
In [ ]:

1
​
In [ ]:

1
​
In [ ]:

1
​
In [ ]:

1
​
In [ ]:

1
data[col].str.contains('<what you are looking for>')