In [1]:
import os, sys, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
from settings import DATA_PATH

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
os.listdir(DATA_PATH)

['series1.csv', 'series2.csv', 'series3.csv']

In [7]:
df_1 = pd.read_csv(os.path.join(DATA_PATH, "series1.csv"))
df_2 = pd.read_csv(os.path.join(DATA_PATH, "series2.csv"))
df_3 = pd.read_csv(os.path.join(DATA_PATH, "series3.csv"))
df = pd.concat([df_1, df_2, df_3], ignore_index=True)

# Attempt to model data distributions

In [8]:
class NormalDistChecker:
    def __init__(self, diff_window_size):
        self._mean = 0
        self._std = 1
#         self._likelihood = 0
        self.history = {"mean": list(), "std": list(), "likelihood": list()}
        self.diff_window_size = diff_window_size
    
    def fit(self, data_batch):
        mean = np.mean(data_batch)
        std = np.std(data_batch)
        likelihood = np.mean(self.calculate_likelihood(data_batch[-self.diff_window_size:]))
        
        self._mean = mean
        self._std = std
        
        self.history["mean"].append(mean)
        self.history["std"].append(std)
        self.history["likelihood"].append(likelihood)
        
    def calculate_likelihood(self, data_batch, verbose=False):
        if verbose:
            print(data_batch - self._mean)
            print((data_batch - self._mean)/self._std)
            print(((data_batch - self._mean)/self._std) ** 2)
            print(- ((data_batch - self._mean)/self._std) ** 2 / 2)
            print(self._std * np.sqrt(2 * np.pi))
        return np.exp(- ((data_batch - self._mean)/self._std) ** 2 / 2) / (self._std * np.sqrt(2 * np.pi) + 1e-5)

## Model Unit Tests

In [9]:
model = NormalDistChecker(diff_window_size=10)
model._mean = 0
model._std = 1
model.calculate_likelihood(np.asarray([0, 1, 2, 3, 4, -4]), verbose=True)

[ 0  1  2  3  4 -4]
[ 0.  1.  2.  3.  4. -4.]
[ 0.  1.  4.  9. 16. 16.]
[-0.  -0.5 -2.  -4.5 -8.  -8. ]
2.5066282746310002


array([3.98940689e-01, 2.41969759e-01, 5.39907511e-02, 4.43183073e-03,
       1.33829692e-04, 1.33829692e-04])

## Experiments

In [10]:
x = df.x.values
model = NormalDistChecker(diff_window_size=10)
for i in range(len(x) - window_size):
    data_batch = x[i:i + window_size]
    model.fit(data_batch)

NameError: name 'window_size' is not defined

In [11]:
# model.history["likelihood"]
for i in range(0, len(model.history["likelihood"])):
    if model.history["likelihood"][i] < 0.35:
        print(i, model.history["likelihood"][i])

# New ideas
## Models

In [12]:
class CompareNextBatchesAnalyser:
    def __init__(self, window_size, threshold):
        # current distribution params
        self._mean = 0
        self._std = 1
        
        # window
        self._window_size = window_size
        self._window = np.zeros(shape=window_size)
        self._window_mean = 0
        self._window_std = 1
        self._current_index = -1

        # other params
        self._threshold = threshold
        
        # returned dictionary
        self.distributions = pd.DataFrame(columns=["start_index", "mean", "std"])
        
        # global index
        self._global_index = -1
    
    def fit(self, x):
        self._global_index += 1
        self._current_index += 1
        
        self._window[self._current_index] = x
        
        if self._current_index == self._window_size - 1:
            self._calculate_window_params()
            self._check_dist_if_changed()
            self._current_index = -1
            
    def get_distributions(self):
        return self.distributions            
            
    def _calculate_window_params(self):
        self._window_mean = np.mean(self._window)
        self._window_std = np.std(self._window)
        
    def _check_dist_if_changed(self):
        if np.abs(self._mean - self._window_mean) > self._threshold or np.abs(self._std - self._window_std) > self._threshold:
            self._mean = self._window_mean
            self._std = self._window_std
            start_index = self._global_index - self._window_size + 1
            self.distributions = self.distributions.append([{"start_index": start_index,
                                                            "mean": self._mean,
                                                            "std": self._std}], ignore_index=True)

## Experiments

In [13]:
stream = df_1.x
model = CompareNextBatchesAnalyser(window_size=100, threshold=0.2)
for i in range(len(stream)):
    model.fit(stream[i])
model.get_distributions()

Unnamed: 0,start_index,mean,std
0,0,1.238637,0.308834
1,2800,1.71551,0.434285
2,3000,1.279395,0.309475
3,3200,1.740668,0.438334
4,3400,1.293221,0.336348
5,3600,1.77713,0.425814
6,3800,1.226509,0.337892
7,4000,1.771789,0.349463
8,4200,1.282972,0.302951
9,4400,1.813593,0.420115


In [14]:
stream = df_2.x
model = CompareNextBatchesAnalyser(window_size=100, threshold=0.2)
for i in range(len(stream)):
    model.fit(stream[i])
model.get_distributions()

Unnamed: 0,start_index,mean,std
0,0,0.987104,0.196924
1,4100,1.194886,0.356508
2,5400,1.415543,0.289735


In [15]:
stream = df_3.x
model = CompareNextBatchesAnalyser(window_size=100, threshold=0.2)
for i in range(len(stream)):
    model.fit(stream[i])
model.get_distributions()

Unnamed: 0,start_index,mean,std
0,0,0.413841,0.694086
1,2600,0.20952,0.791659
2,3000,0.423469,0.793082
3,4200,0.199673,0.723433
4,4300,0.445625,0.752086
5,6500,0.232122,0.706892
6,7100,0.444803,0.835293
7,7400,0.137056,0.789461
8,7700,0.380942,0.640852
