In [19]:
from pandas.core.interchange.dataframe_protocol import DataFrame
# ============================================================
# Notebook setup: run this before everything
# ============================================================
# -- Copied from lecture
%load_ext autoreload
%config IPCompleter.greedy=True
%autoreload 1
%aimport util
from util import util
import numpy as np
# from matplotlib import pyplot as plt
import pandas as pd
from sklearn.neighbors import KernelDensity
from sklearn.metrics import mean_squared_error
from IPython.display import display
from collections import Counter


# Only if environment does not support explicit display, like Pycharm
display = print

# Control figure size
interactive_figures = False
if interactive_figures:
    # Normal behavior
    %matplotlib widget
    figsize=(9, 3)
else:
    # PDF export behavior
    figsize=(14, 4)

data_folder = '../resources/dataset'
file_name = '7_gecco2019_train_water_quality.csv'
# Load the input data
data_path = f'{data_folder}/{file_name}'
data = pd.read_csv(data_path)
data['Time'] = pd.to_datetime(data['Time'])
data.set_index('Time', inplace=True)
data = data.drop(columns=["Unnamed: 0"]) # The index was stored as an unnamed column

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Missing Values
## Investigation

First, we see that most of our values are floating point numbers, except for the `Event` column which is our label, that is,
whether we are dealing with an anomaly or not.

In [20]:
print("Columns: ")
print({column: data[column].dtype for column in data.columns})
data.describe()

Columns: 
{'Tp': dtype('float64'), 'pH': dtype('float64'), 'Cond': dtype('float64'), 'Turb': dtype('float64'), 'SAC': dtype('float64'), 'PFM': dtype('float64'), 'Event': dtype('bool')}


Unnamed: 0,Tp,pH,Cond,Turb,SAC,PFM
count,132266.0,132272.0,132276.0,132275.0,132280.0,132269.0
mean,8.041982,8.533168,0.021095,0.128368,4.100425,68.66098
std,0.893687,0.052646,0.000347,0.005928,0.304104,11.420472
min,0.0,0.0,0.0,0.0,0.0,42.1631
25%,7.25,8.50278,0.020871,0.12716,3.81643,60.4849
50%,7.88,8.53963,0.021102,0.127754,4.125515,71.4407
75%,8.71,8.56349,0.021327,0.128686,4.39575,79.0769
max,10.3,8.66676,0.023214,1.78285,6.102071,84.8151


We will now look at the alignment and completeness of the sample timeslots:

In [21]:
display((data.index[1:] - data.index[:-1]).value_counts())

Time
0 days 00:01:00    132479
Name: count, dtype: int64


We see that the timeslots are regular and not missing.

Now, we will look at missing values:

In [22]:
print("Columns containing missing values:")
display(data.isna().any())
print("Some rows containing missing values:")
data.loc[data.isna().any(axis=1)].head()

Columns containing missing values:
Tp        True
pH        True
Cond      True
Turb      True
SAC       True
PFM       True
Event    False
dtype: bool
Some rows containing missing values:


Unnamed: 0_level_0,Tp,pH,Cond,Turb,SAC,PFM,Event
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-07-01 13:24:00,,,,,,,False
2017-07-01 17:00:00,,8.58873,,0.126537,3.58901,,False
2017-07-02 22:08:00,,,,,,,False
2017-07-03 01:46:00,7.1,8.55185,0.021717,,,44.7191,False
2017-07-03 05:23:00,,,,,,,False


As we can see, there are quite a few missing values in our dataset. Each column has some.

We will now try to characterize the distribution in which they show up.

In [35]:

for column in data.drop(columns=["Event"]).columns:
    print()
    print("Column:", util.bold(column))
    series = util.calculate_na_series(data[column])
    for length, count in Counter(map(int, series[:, 1])).most_common():
        print(f"There are {util.bold(count)} NA-sequences of length {util.bold(length)}")


Column: [1mTp[0m
There are [1m201[0m NA-sequences of length [1m1[0m
There are [1m1[0m NA-sequences of length [1m3[0m
There are [1m1[0m NA-sequences of length [1m10[0m

Column: [1mpH[0m
There are [1m195[0m NA-sequences of length [1m1[0m
There are [1m1[0m NA-sequences of length [1m3[0m
There are [1m1[0m NA-sequences of length [1m10[0m

Column: [1mCond[0m
There are [1m191[0m NA-sequences of length [1m1[0m
There are [1m1[0m NA-sequences of length [1m3[0m
There are [1m1[0m NA-sequences of length [1m10[0m

Column: [1mTurb[0m
There are [1m195[0m NA-sequences of length [1m1[0m
There are [1m1[0m NA-sequences of length [1m10[0m

Column: [1mSAC[0m
There are [1m188[0m NA-sequences of length [1m1[0m
There are [1m1[0m NA-sequences of length [1m2[0m
There are [1m1[0m NA-sequences of length [1m10[0m

Column: [1mPFM[0m
There are [1m198[0m NA-sequences of length [1m1[0m
There are [1m1[0m NA-sequences of length [1m3[0m
There are

We see that most missing values just come up randomly and not in a bunch.
The segments of 10 consecutive NA values across all columns seems suspicious though.

Let's see if this is the same for all.

In [24]:
collective_na_series = util.calculate_true_series(data.drop(columns="Event").isna().to_numpy().all(axis=1))
collective_na_series[collective_na_series[:,1] > 1]

array([[78550,    10]])

Indeed, we see that there is only one index with a series of timestamps where all entries are 0 (that is more than timeslot long).
It starts at index 78550.

Since most of them however, just come up individually, we would expect that naive filling algorithms should perform well enough as more complex ones.

By the way, respectively the other series' with more than one consecutive Na look like this. Note that all of them are classified as an anomaly.

In [25]:
for column in data.drop(columns="Event").columns:
    print()
    longer_segments = series[np.logical_and(series[:, 1] > 1, series[:, 1] < 10)]
    display([data.iloc[i:i+l] for i, l in longer_segments])


[                     Tp  pH  Cond      Turb      SAC  PFM  Event
Time                                                            
2017-08-24 12:42:00 NaN NaN   NaN  0.153299  4.24212  NaN   True
2017-08-24 12:43:00 NaN NaN   NaN  0.158719      NaN  NaN   True
2017-08-24 12:44:00 NaN NaN   NaN       NaN      NaN  NaN   True]

[                     Tp  pH  Cond      Turb      SAC  PFM  Event
Time                                                            
2017-08-24 12:42:00 NaN NaN   NaN  0.153299  4.24212  NaN   True
2017-08-24 12:43:00 NaN NaN   NaN  0.158719      NaN  NaN   True
2017-08-24 12:44:00 NaN NaN   NaN       NaN      NaN  NaN   True]

[                     Tp  pH  Cond      Turb      SAC  PFM  Event
Time                                                            
2017-08-24 12:42:00 NaN NaN   NaN  0.153299  4.24212  NaN   True
2017-08-24 12:43:00 NaN NaN   NaN  0.158719      NaN  NaN   True
2017-08-24 12:44:00 NaN NaN   NaN       NaN      NaN  NaN   True]

[              

# Filling missing values
We want to try and evaluate all methods separately. We will evaluate it using the RMSE analogously to the lecture.
I.e., we will look for mostly intact segments, remove values arbitrarily, and then fill them using the methods.
## Simple Filling
Because filling values for one column is independent of the others, we can fill the missing values for each column separately in order to maximize the size of the validation set.
Prior, we calculate the proportion of Na values in the entire column and delete the same
proportion from our validation set.
Assuming the missing values are uniformly randomly distributed, our calculated error will tend towards the true error of our method.

In [26]:
bold = util.bold
permitted_missing_values = 10
print("Permitted missing values in validation segment:", bold(permitted_missing_values))
for column in data.columns:
    if column == "Event":
        continue
    segment_bounds = util.find_best_segment_in_series(data[column], permitted_missing_values)
    segment = data[column].iloc[segment_bounds[0]:segment_bounds[1]]
    na_proportion = data[column].isna().sum() / len(segment)
    drop_count = round(len(segment) * na_proportion)
    np.random.seed(42) # seed (to get reproducible results)
    mv_idx = np.random.choice(np.arange(len(segment)), size=drop_count, replace=False)
    segment_mv = segment.copy()
    segment_mv.iloc[mv_idx] = np.nan
    ffill_error = np.sqrt(mean_squared_error(segment.ffill(), segment_mv.ffill()))
    bfill_error = np.sqrt(mean_squared_error(segment.bfill(), segment_mv.bfill()))
    print()
    print("Column:", bold(column))
    print(f"Validation segment length:", bold(len(segment)))
    print("Proportion of Na values in the column:", bold(f"{na_proportion:.3%}"))
    print("Deleted samples for evaluation:", bold(drop_count))
    print("FFill RMSE:", bold(ffill_error))
    print("BFill RMSE:", bold(bfill_error))


Permitted missing values in validation segment: [1m10[0m

Column: [1mTp[0m
Validation segment length: [1m15050[0m
Proportion of Na values in the column: [1m1.422%[0m
Deleted samples for evaluation: [1m214[0m
FFill RMSE: [1m0.0007944993617872808[0m
BFill RMSE: [1m0.0009329693959009103[0m

Column: [1mpH[0m
Validation segment length: [1m16071[0m
Proportion of Na values in the column: [1m1.294%[0m
Deleted samples for evaluation: [1m208[0m
FFill RMSE: [1m0.00021721817299058996[0m
BFill RMSE: [1m0.00021717736321615354[0m

Column: [1mCond[0m
Validation segment length: [1m15477[0m
Proportion of Na values in the column: [1m1.318%[0m
Deleted samples for evaluation: [1m204[0m
FFill RMSE: [1m6.527287269456962e-07[0m
BFill RMSE: [1m6.510298698614589e-07[0m

Column: [1mTurb[0m
Validation segment length: [1m15266[0m
Proportion of Na values in the column: [1m1.343%[0m
Deleted samples for evaluation: [1m205[0m
FFill RMSE: [1m0.00032659511767562993[0m
BFil

## Auto Correlation Plot

In [27]:
#üfor column in data.columns:
##    display(util.plot_autocorrelation(data[column], figsize))
#util.plot_autocorrelation(data["Tp"], figsize)

## Test for markov property

In the lectures, wen defined the alarm signal we strive to minimize:
$$
-\log f(x, θ) \geq ε
$$
where $$ f(x, θ) $$ is the true distribution function.

## Investigation
Determining the period:

# Multivariate Kernel Density Estimation
The first approach presented in the lecture is **Kernel Density Estimation**

In order to employ **KDE**, we need to determine the optimal **Kernel Function** and **Bandwidth**. 
Since we have multiple columns, we cannot use the Rule Of Thumb for the latter. Therefore, we need to optimize the following term according to the lecture: 
$$
\mathop{\arg\max}_{h} \mathbb{E}_{x \sim f(x), \bar{x} \sim f(x)}\left[ L(h, x, \bar{x})\right]
$$
where
- $$
L(h, x, \bar{x}) = \prod_{i=1}^m \hat{f}(x_i, \bar{x}_i, h)
$$
- $\hat{f}$ is the density estimator (which outputs a probability)
- $\bar{x}$ the training set

according to the lecture.
