# Introduction

This notebook duplicates the data preprocessing and modelling that Chronos+ does, with all the database and job queue code stripped out. As far as possible, the duplication is verbatim to facilitate the transfer of any changes made in this notebook back into Chronos+. All current data rejection points are also documented.

## Imports

In [11]:
import numpy as np
import pandas as pd
import statsmodels.tsa.stattools as sts
from scipy.signal import argrelextrema

## Define CSV filename, datetime and value columns

To test the modelling and scoring process on your own data, just change the following variables.

1. `datecol` --> the name of the column in the CSV file we wish to use as the time series index.
2. `ycols` --> the list of names of the value columns we wish to train and score on.
3. `fname` --> name of csv file.

In [2]:
# Name of the CSV column used as the time series index.
datecol = "Time"
# Names of the value columns to train and score on.
ycols = ["SA1900282"] # there can be more than one
# Path of the CSV file to ingest.
fname = "data/giordano-oct-nov.csv"

## Ingesting the data

1. `datecol` must exist in the CSV or we raise an exception
2. `ycols` must contain the name of at least one column or we raise an exception
3. every column in `ycols` must exist in the CSV or we raise an exception
4. if we cannot parse a value in the datetime column as a datetime, we discard the whole row
5. every value in the columns denoted by `ycols` is parsed as a float or set to np.nan
6. we set the `datecol` column as the index of the dataframe
7. we sort the dataframe by the index
8. we drop any columns not listed in `ycols` and return the result

In [3]:
class MissingColumnError(Exception):
    """Raised when a required column is absent from the ingested CSV.

    Attributes
    ----------
    column : str
        Name of the missing column.
    """

    def __init__(self, col, *args, **kwargs):
        self.column = col
        if not args:
            # Supply a default message so that str(exc) names the missing
            # column instead of being empty.
            args = (f"column {col!r} not found",)
        super().__init__(*args, **kwargs)
        
def ingest(csv, dscol, ycols):
    """Ingest a time series CSV into a datetime-indexed dataframe.

    Malformed cell values never raise: rows whose `dscol` value cannot be
    parsed as a datetime are dropped, and values in `ycols` that cannot be
    parsed as floats become NaN.

    1) parses `dscol` as datetimes and drops unparseable rows

    2) parses every column in `ycols` as floats

    3) sets `dscol` as the index and sorts by it

    4) keeps only the columns listed in `ycols`

    Parameters
    ----------
    csv : str
         path to csv with header and format `datetime,value`.
         datetime: {%Y-%m-%d (e.g. 2017-12-01),
                    %Y-%m-%d (e.g. 2017-12-1),
                    %Y-%b-%d (e.g. 2017-Dec-01),
                    %d-%b-%Y (e.g. 1-Dec-2017),
                    %Y-%m-%d %H:%M:%S (e.g. 2017-12-01 00:00:00)}
                 * note all represent 1st December 2017
         value: {float, int}

    dscol : str
                    the header name for the datetime column in the csv
    ycols: list of str
                    the names of the value columns

    Returns
    -------
    pandas.DataFrame
        indexed by the parsed `dscol` values, sorted by index, containing
        only the `ycols` columns.

    Raises
    ------
    MissingColumnError
        if `dscol` or any entry of `ycols` is not a column of the CSV.
    ValueError
        if `ycols` is empty.
    """
    df = pd.read_csv(csv)

    if dscol not in df:
        raise MissingColumnError(dscol)

    if not ycols:
        raise ValueError(f"ycols is empty: {ycols}")

    for ycol in ycols:
        if ycol not in df:
            raise MissingColumnError(ycol)

    # if the date column is not parseable as a date, then we should discard the row
    # NOTE(review): pandas >= 2.0 infers one datetime format from the first
    # value, so rows written in a different format are coerced to NaT and
    # dropped here -- confirm that inputs use a single consistent format.
    df[dscol] = pd.to_datetime(df[dscol], errors="coerce")
    # Take an explicit copy so the column assignments below write to an
    # independent frame (avoids pandas' SettingWithCopyWarning).
    df = df[df[dscol].notna()].copy()

    # parse value column as float. no exceptions
    for ycol in ycols:
        df[ycol] = df[ycol].apply(__float_or_nan)

    # A single sort is enough; row order depends only on the index.
    return df.set_index(dscol)[ycols].sort_index()

def __float_or_nan(o):
    """Return `o` converted to a float, or NaN when it cannot be converted."""
    try:
        # Use the builtin: the `np.float` alias was removed in NumPy 1.24, and
        # with a bare `except` the resulting AttributeError would silently
        # turn every value into NaN.
        return float(o)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparseable
        # string; OverflowError: integer too large for a float.
        return np.nan

In [4]:
# Load the CSV into a dataframe indexed by the parsed `datecol` values and
# holding only the `ycols` columns.
df = ingest(fname, datecol, ycols)

## Autoencoder preprocessing

This is code that needs to be run regardless of whether we are preprocessing data for training or scoring. That is why it has been refactored into its own function.

1. remove duplicate rows (identical datetime and values)
2. if we find rows with identical datetime but different values, we raise an exception because it is impossible for us to decide which row contains the correct value.
  1. the other option is to delete all such rows, but for now we have elected to reject the entire CSV
3. we try to detect the sampling frequency of each value column
  1. we take the mode of the time differences between successive values for each column
  2. if, for any column, no single time difference accounts for more than half of that column's differences, we reject the whole CSV
4. we return the resulting dataframe, along with the sampling frequencies for each column, as well as the minimum sampling frequency

In [5]:
def ae_preproc(df, ycols):
    """Deduplicate `df` and detect the sampling frequency of every value column.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe whose index holds the datetimes of the samples
    ycols : list of str
        names of the value columns to inspect

    Returns
    -------
    tuple
        `(df, colfreqs, minfreq)`: the dataframe with exact duplicate rows
        removed, a dict mapping each column in `ycols` to its detected
        sampling timedelta, and the smallest of those timedeltas.

    Raises
    ------
    Exception
        if a datetime is still repeated after exact duplicate rows have been
        removed, or if `detect_ds_frequency` does not return exactly one
        entry for some column.
    """
    deduped = remove_duplicate_rows(df)

    # A datetime that is still repeated at this point belongs to rows with
    # conflicting values, so the whole dataframe is rejected.
    if deduped.index.duplicated().any():
        raise Exception("rows with duplicate datetimes detected")

    colfreqs = {}
    for col in ycols:
        candidates = detect_ds_frequency(deduped[[col]].dropna())
        # exactly one entry means a modal sampling frequency was found
        if len(candidates) != 1:
            raise Exception("more than one sampling frequency was detected")
        colfreqs[col] = candidates.index[0]

    return deduped, colfreqs, min(colfreqs.values())

def remove_duplicate_rows(df):
    """Return `df` without rows that repeat both the index value and all column values.

    The first occurrence of each duplicated row is kept. The index itself is
    left untouched, so this also works when the index has no name.
    """
    # Move the index into the columns for the comparison only, so that rows
    # count as duplicates only when index AND values match.
    is_dupe = df.reset_index().duplicated().values
    return df.loc[~is_dupe].copy()

def detect_ds_frequency(df):
    """Detect the sampling interval(s) of the datetime index of `df`.

    1) subtract each datetime in the index from its successor to get the deltas

    2) count the number of occurrences of each delta

    Parameters
    ----------
    df : pandas.DataFrame
         The dataframe to detect the datetime frequency, needs to have a sorted datetime index

    Returns
    -------
    freq : pandas.Series
        index holds timedeltas, values hold how often each one occurred.

        * size 1 (the modal timedelta) if one timedelta accounts for more
          than 50% of all deltas
        * otherwise at most the 50 most frequent timedeltas, ordered by
          descending count, ties broken by ascending timedelta
        * empty if there are no deltas to count (fewer than two rows)
    """
    deltas = pd.Series(df.index[1:] - df.index[:-1])
    delta_counts = deltas.value_counts()
    if delta_counts.empty:
        # Nothing to count; callers treat any size other than 1 as
        # "no modal frequency".
        return pd.Series(dtype="int64")

    # value_counts() orders by descending count, so the first entry is the
    # mode whenever one delta holds a strict majority.
    if delta_counts.max() > len(deltas) / 2:
        return delta_counts.head(1)

    # `.items()` instead of `.iteritems()`, which was removed in pandas 2.0.
    ranked = sorted(delta_counts.items(), key=lambda kv: (-kv[1], kv[0]))[:50]

    return pd.Series(data=[count for _, count in ranked],
                     index=[delta for delta, _ in ranked])

In [6]:
# Deduplicate the dataframe and detect the sampling frequency of each value
# column; `minfreq` is the smallest of the detected timedeltas.
df, colfreqs, minfreq = ae_preproc(df, ycols)

## Check for sparse columns

Given a duration _d_ and a time series with a sampling frequency _f_, the expected number of values it should have over _d_ is 

_e_ = _d_/_f_

We define a sparse column as a column of the dataframe that, given its sampling frequency and the duration of the whole dataframe, has fewer than 75% of the expected number of values. We raise an exception if any column is sparse.

In [7]:
class SparseColumnError(Exception):
    """Raised when a value column has too few points for the dataset's duration."""

def check_sparse_cols(df, colfreqs):
    """Raise if any column has fewer than 75% of its expected number of points.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe whose first and last index values delimit the total duration
    colfreqs : dict
        maps each column name to its sampling timedelta

    Raises
    ------
    SparseColumnError
        for the first column whose count of non-missing rows is below
        0.75 * (total duration // sampling timedelta).
    """
    span = df.index[-1] - df.index[0]
    for col, interval in colfreqs.items():
        present = df[[col]].dropna().shape[0]
        # every column must cover 75% of the duration
        minimum = span // interval * 0.75
        if present >= minimum:
            continue
        raise SparseColumnError(f"column {col} has frequency {interval} and {present} points, but duration of dataset is {span}")

In [8]:
# Reject the dataset (SparseColumnError) if any value column is too sparse
# for the total duration covered by the dataframe.
check_sparse_cols(df, colfreqs)

## Get period

In [12]:
class NoPeriodError(Exception):
    """Raised when no repeating period can be detected in the data."""

def get_period(df, colfreqs, minfreq):
    """Estimate the dominant period from the summed autocorrelation of all columns.

    1) compute the autocorrelation of every column and add it into one
       shared array, placing each column's values at lags spaced by that
       column's sampling step

    2) find the local maxima of the summed autocorrelation that exceed 0.2
       and add lag 0 to them

    3) take all pairwise distances between those lags and, for each distinct
       distance, count how many distances are a multiple of it

    4) return the distance with the highest count if that count exceeds 5

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe holding the value columns
    colfreqs : dict
        maps each column name to its sampling timedelta
    minfreq : pandas.Timedelta
        smallest sampling timedelta among the columns

    Returns
    -------
    numpy integer
        the detected period as a number of lags of the summed
        autocorrelation (presumably steps of `minfreq` -- TODO confirm that
        the rows of `df` are spaced by `minfreq`).

    Raises
    ------
    ValueError
        if a column's sampling timedelta is not a multiple of `minfreq`.
    NoPeriodError
        if no candidate period is found or none is supported by more than
        5 pairwise distances.
    """
    # check that all sampling frequencies are multiples of the smallest sampling frequency
    notime = pd.Timedelta(0)
    for f in colfreqs.values():
        if f % minfreq != notime:
            raise ValueError("detected sampling frequency that is not a multiple of the minimum sampling frequency")

    # there is no point calculating autocorrelation for lags greater than n/2
    nlags = len(df)//2
    acfs = np.zeros((nlags+1,))
    for y in colfreqs:
        yvals = df[y].dropna()
        yacfs = sts.acf(yvals, nlags=len(yvals)//2)
        step_size = colfreqs[y]//minfreq
        yacfs_idx = range(0, len(acfs), step_size)
        # we simplify by cropping because if len(df) % len(yvals) != step_size we get tedious off-by-one errors
        yacfs = yacfs[:len(yacfs_idx)]
        # if y has missing values, yacfs can be shorter than yacfs_idx -_-
        yacfs_idx = yacfs_idx[:len(yacfs)]
        acfs[yacfs_idx] += yacfs

    # local maxima of the summed autocorrelation, keeping only strong peaks
    max_corr_points = argrelextrema(acfs, np.greater, order=max(len(df)//1000, 2))[0]
    max_corr_points = max_corr_points[acfs[max_corr_points]>0.2]
    # lag 0 is always treated as a peak
    max_corr_points = np.insert(max_corr_points, 0, 0, axis=0)

    # distances between every ordered pair of distinct peaks
    max_cor_diff = np.array([
        abs(point_1-point_2)
        for point_1 in max_corr_points
        for point_2 in max_corr_points
        if point_1 != point_2
    ])
    unique_vals, counts = np.unique(max_cor_diff, return_counts=True)
    if len(unique_vals) == 0:
        # Only lag 0 survived the threshold: there is nothing to take the
        # maximum of, so report the absence of a period explicitly.
        raise NoPeriodError("no period detected")

    # for each candidate distance, count the distances that are multiples of it
    adjust_counts = [np.sum(counts[unique_vals % candidate == 0]) for candidate in unique_vals]
    if np.max(adjust_counts) > 5:
        return unique_vals[np.argmax(adjust_counts)]
    raise NoPeriodError("no period detected")

In [13]:
# Estimate the dominant period from the summed autocorrelation of the value
# columns; raises NoPeriodError when no period is detected.
period = get_period(df, colfreqs, minfreq)

