In [1]:
# ============================================================
# Notebook setup: run this before everything
# ============================================================
# -- Copied from lecture
%load_ext autoreload
%config IPCompleter.greedy=True
%autoreload 1
%aimport util
from util import util
import numpy as np
# from matplotlib import pyplot as plt
import pandas as pd
from sklearn.neighbors import KernelDensity
from sklearn.metrics import mean_squared_error
from IPython.display import display

# Only if environment does not support explicit display, like Pycharm
display = print

# Control figure size
interactive_figures = False
if interactive_figures:
    # Normal behavior
    %matplotlib widget
    figsize=(9, 3)
else:
    # PDF export behavior
    figsize=(14, 4)

data_folder = '../resources/dataset'
file_name = '1_gecco2019_water_quality.csv'
# Load the input data
data_path = f'{data_folder}/{file_name}'
data = pd.read_csv(data_path)
data['Time'] = pd.to_datetime(data['Time'])
data.set_index('Time', inplace=True)
data = data.drop(columns=["Unnamed: 0"]) # The index was stored as an unnamed column

# Investigation

In [2]:
print("Columns: ")
{column: data[column].dtype for column in data.columns}

Columns: 


{'Tp': dtype('float64'),
 'pH': dtype('float64'),
 'Cond': dtype('float64'),
 'Turb': dtype('float64'),
 'SAC': dtype('float64'),
 'PFM': dtype('float64'),
 'Event': dtype('bool')}

In [3]:
display((data.index[1:] - data.index[:-1]).value_counts())
print("\nTherefore, there exists a sample for each timeslot.")
print("However, all columns themselves except for the label column contain missing values: \n")
display(data.isna().any())
data.loc[data.isna().any(axis=1)].head()

Time
0 days 00:01:00    132479
Name: count, dtype: int64

Therefore, there exists a sample for each timeslot.
However, all columns themselves except for the label column contain missing values: 

Tp        True
pH        True
Cond      True
Turb      True
SAC       True
PFM       True
Event    False
dtype: bool


Unnamed: 0_level_0,Tp,pH,Cond,Turb,SAC,PFM,Event
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-07-01 13:24:00,,,,,,,False
2017-07-01 17:00:00,,8.58873,,0.126537,3.58901,,False
2017-07-02 22:08:00,,,,,,,False
2017-07-03 01:46:00,7.1,8.55185,0.021717,,,44.7191,False
2017-07-03 05:23:00,,,,,,,False


# Filling missing values
We want to try and evaluate all methods separately. We will evaluate it using the RMSE analogously to the lecture.
I.e., we will look for mostly intact segments, remove values arbitrarily, and then fill them using the methods.
## Simple Filling
Because filling values for one column is independent of the others, we can fill the missing values for each column separately in order to maximize the size of the validation set.
Prior, we calculate the proportion of Na values in the entire column and delete the same
proportion from our validation set.
Assuming the missing values are uniformly randomly distributed, our calculated error will tend towards the true error of our method.

In [6]:
bold = util.bold
permitted_missing_values = 10
print("Permitted missing values in validation segment:", bold(permitted_missing_values))
for column in data.columns:
    if column == "Event":
        continue
    segment_bounds = util.find_best_segment_in_series(data[column], permitted_missing_values)
    segment = data[column].iloc[segment_bounds[0]:segment_bounds[1]]
    na_proportion = data[column].isna().sum() / len(segment)
    drop_count = round(len(segment) * na_proportion)
    np.random.seed(42) # seed (to get reproducible results)
    mv_idx = np.random.choice(np.arange(len(segment)), size=drop_count, replace=False)
    segment_mv = segment.copy()
    segment_mv.iloc[mv_idx] = np.nan
    error = np.sqrt(mean_squared_error(segment.ffill(), segment_mv.ffill()))
    print()
    print("Column:", bold(column))
    print(f"Validation segment length:", bold(len(segment)))
    print("Proportion of Na values in the column:", bold(f"{na_proportion:.3%}"))
    print("Deleted samples for evaluation:", bold(drop_count))
    print("Reconstruction error:", bold(f"{error:.3}"))


Permitted missing values in validation segment: [1m10[0m

Column: [1mTp[0m
Validation segment length: [1m15050[0m
Proportion of Na values in the column: [1m1.422%[0m
Deleted samples for evaluation: [1m214[0m
Reconstruction error: [1m0.000794[0m

Column: [1mpH[0m
Validation segment length: [1m16071[0m
Proportion of Na values in the column: [1m1.294%[0m
Deleted samples for evaluation: [1m208[0m
Reconstruction error: [1m0.000217[0m

Column: [1mCond[0m
Validation segment length: [1m15477[0m
Proportion of Na values in the column: [1m1.318%[0m
Deleted samples for evaluation: [1m204[0m
Reconstruction error: [1m6.53e-07[0m

Column: [1mTurb[0m
Validation segment length: [1m15266[0m
Proportion of Na values in the column: [1m1.343%[0m
Deleted samples for evaluation: [1m205[0m
Reconstruction error: [1m0.000327[0m

Column: [1mSAC[0m
Validation segment length: [1m13958[0m
Proportion of Na values in the column: [1m1.433%[0m
Deleted samples for evaluatio

## Auto Correlation Plot

In [5]:
#üfor column in data.columns:
##    display(util.plot_autocorrelation(data[column], figsize))
#util.plot_autocorrelation(data["Tp"], figsize)

## Test for markov property

In the lectures, wen defined the alarm signal we strive to minimize:
$$
-\log f(x, θ) \geq ε
$$
where $$ f(x, θ) $$ is the true distribution function.

## Investigation
Determining the period:

# Multivariate Kernel Density Estimation
The first approach presented in the lecture is **Kernel Density Estimation**

In order to employ **KDE**, we need to determine the optimal **Kernel Function** and **Bandwidth**. 
Since we have multiple columns, we cannot use the Rule Of Thumb for the latter. Therefore, we need to optimize the following term according to the lecture: 
$$
\mathop{\arg\max}_{h} \mathbb{E}_{x \sim f(x), \bar{x} \sim f(x)}\left[ L(h, x, \bar{x})\right]
$$
where
- $$
L(h, x, \bar{x}) = \prod_{i=1}^m \hat{f}(x_i, \bar{x}_i, h)
$$
- $\hat{f}$ is the density estimator (which outputs a probability)
- $\bar{x}$ the training set

according to the lecture.
