# K-shape algorithm

## Required Python Libraries

In [1]:
import math
import numpy as np
import pandas as pd
import h5py
import os
import csv
import time
import multiprocessing
from numpy.random import randint
from numpy.linalg import norm, eigh
from numpy.fft import fft, ifft
from sklearn.base import ClusterMixin, BaseEstimator
from sklearn.metrics import rand_score, normalized_mutual_info_score, adjusted_rand_score

## Functions

### ***Z-score*** Function

Z-score is a mathematical approach to convert a "Generic" Random Variables to a Normal Random Variables. 

For more information about this approach: 
https://it.wikipedia.org/wiki/Standardizzazione_(statistica)

**Function Inputs**
1. ```a```: numpy array containing time-series values;
2. ```axis```: axis used to evaluate mean and standard deviation. For axis we intend rows or columns.
3. ```ddof```: Degrees of freedom correction in the calculation of the standard deviation. Default value equals to 0

**Function returned values**:
1. Normalized Time-Series

**Function Assumptions**:
1. ```axis = 0 ```: normalize always on column axis. This is true beacuse we want to normalize time instants values, to exlude possible outliers or errors

In [2]:
def zscore(a, axis=0, ddof=0):
    a = np.asanyarray(a) #convert the input a in an array
    mns = a.mean(axis=axis) #compute the mean of the time series along the column axis (mean for each time instant)
    sstd = a.std(axis=axis, ddof=ddof) #compute the standard deviation of the array along the column axis (mean for each time instant)

    ## If Condition-->
    ## If axis equals to row (axis = 1) and dimensions of mean array is lower than dimensions of time series array
    ## -----> calculate Z score expanding dimensions of mean and std array on row axis
    ## If axis equals to columns (axis = 0) or dimensions of mean array is equal to dimensions of time series array
    ## -----> calculate Z score as time-series array minus its mean, divided by standard deviation

    if axis and mns.ndim < a.ndim:
        res = ((a - np.expand_dims(mns, axis=axis)) / #compute normalized data and expand the the mean and sd to match the dimension with the array a (the mean and std have 1 dimension less=
               np.expand_dims(sstd, axis=axis))
    else:
        res = (a - mns) / sstd #compute normalized data

    ## nan_to_num method --> replace NaN values with 0 or large integer values for infinite values
    return np.nan_to_num(res)

   


### ***Roll_zeropad*** Function

```roll_zeropad``` is a custom function to shift time series values to the right. The number of positions required by the movement are evaluated with parameter ```shift```.
For each shift, a zero value is included in the time series.

**Function Inputs**
1. ```a```: numpy array containing time-series values;
2. ```shift```: Number of right movements required;
3. ```axis```: axis to consider for right movements.

**Function returned values**:
1. Shifted Time-Series

**Function Assumptions**:
1. ```axis = None ```: time series can be considered as a 1D array and not as a matrix

In [3]:
def roll_zeropad(a, shift, axis=None):
    # asanyarray method --> Convert the input to an ndarray, but pass ndarray subclasses through.
    a = np.asanyarray(a) # Ensure that the input is ndarrary data type

    # If no shift is required (shift = 0), it returns the original values of time series
    if shift == 0:
        return a

    # If axis parameter is not evaluated (axis = None)
    # ------> consider all elements of time series and not a specific dimensions, but this requires a reshape of data at the end
    # If axis is set
    # ------> extracted dimensions of a specific axis (rows or columns) and work on that
    if axis is None:
        n = a.size
        reshape = True
    else:
        n = a.shape[axis]
        reshape = False

    # zeros_like() method --> Return an array of zeros with the same shape and type as a given array.

    # If shift parameter is greater than size of the time series (but this requires that no axis are specified)
    # ------> it evaluates an array of 0 values with the same size of a
    if np.abs(shift) > n:
        res = np.zeros_like(a)
    # If shift parameter is lower than 0 (moving to the left)
    # ------> subtract the shift value to n size of the array
    # ------> extract the indices from 0 to (n-shift) values, considering a plain array (axis = None) or a specific dimensions
    # ------> generate a 0-valued array with size [0, (n-shift)]
    # ------> extract values from (n-shift) position to n position
    # ------> create an array with values in range [(n-shift),n] at the beginning and all 0 values after
    elif shift < 0:
        shift += n
        zeros = np.zeros_like(a.take(np.arange(n-shift), axis))
        res = np.concatenate((a.take(np.arange(n-shift, n), axis), zeros), axis)
    # If shift parameter is in the range [0, size of time series]
    # ------> extract values from (n-shift) position to n position
    # ------> generate a 0-valued array with size [(n-shift), n]
    # ------> extract the indices from 0 to (n-shift) values, considering a plain array (axis = None) or a specific dimensions
    # ------> create an array with 0 value at the beginning and then values in range [0, (n-shift)]
    else:
        zeros = np.zeros_like(a.take(np.arange(n-shift, n), axis))
        res = np.concatenate((zeros, a.take(np.arange(n-shift), axis)), axis)

    # If reshape is required
    # -----> change shape (row, column) of shifted array, considering shape of time series values
    if reshape:
        return res.reshape(a.shape)
    else:
        return res

### ***_ncc_c_3dim*** Function

```_ncc_c_3dim``` is a custom function to evaluate the **array** of NCCc values. \
Each element of the array is associated with a specific shift computed on a specific time series.\
NCCc is a normalization of Cross Correlation value, which represents with a value the similarity between 2 time series\
K-Shape Algorithm uses NCCc to evaluate SBD distance measure which is used to assign the time series to the closest cluster centroid.

**Function Inputs**
1. ```data```: numpy array containing 2 time series to compare;
    * Generally, in the algorithm, X is the cluster centroid while Y is the time series to assign.

**Function returned values**:
1. The optimal value of NCCc between X and Y.

**Function Assumptions**:
1. ```None ```

In [4]:
def _ncc_c_3dim(data):
    # Extract X and Y from data
    # Positions in data array are very important
    x, y = data[0], data[1]

    # np.norm function returns one of the eight matrix norms (X parameter considered is a 2D matrix.)
    # Evaluating axis parameter specifies in which dimension evaluate matrix norm.
    den = norm(x, axis=(0,1)) * norm(y, axis=(0,1))
    #print(den)

    # If the computed denominator of NCCc is so small, it is set to infinite value, in order to ignore NCCc value. 
    if den < 1e-9:
        den = np.inf

    # the variable represents the number of time instants considered for each time series.
    x_len = x.shape[0]
    #print(x_len)

    # This is operation is necessary to improve computational effort of FFT.
    # As the article said, in order to improve FFT performances, Cross Correlation must be an exact power-of-two.
    # The following line of code approximate the result to the next power-of-two value. 
    fft_size = 1 << (2*x_len-1).bit_length()

    # As the paper said, CC can be evaluated as convolution of two time series where one of two sequences is reversed in time domain. 
    # The convolution is computed as Inverse Discrete Fourier Transformer (IDFT) of the product of the individual Discrete Fourier Transforms (DFT) of the time series
    # To reduce computational effort, Fast Fourier Transformer (FFT) substitutes DFT.
    # np.conj function return the complex coniugate of a specified number. 
    # The complex conjugate of a complex number is obtained by changing the sign of its imaginary part.
    cc = ifft(fft(x, fft_size, axis=0) * np.conj(fft(y, fft_size, axis=0)), axis=0)

    # CC is the join, along axis 0 (rows), of two selected sequences
    # The selected sequences are extracted from convolution of two time series (IFFT)
    # CC is an array of complex numbers
    cc = np.concatenate((cc[-(x_len-1):], cc[:x_len]), axis=0)
    #print(cc)
    
    # Return array of NCCc values
    return np.real(cc).sum(axis=-1) / den

### ***_sbd*** Function

```_sbd``` function is used to evaluate Shape Based Distance between two time series\
In particular, SBD is similarity measures applied to two time series.\
SBD values are in range [0,2] with 0 assigning perfect match\
```_sbd``` function does not return similarity value, but it returns the optimal shift y.\
The optimal shift of Y represents the closest shift of Y compared to X.\
\
This function is associated with **Algorithm 1** of the paper

**Function Inputs**
1. ```x```: reference time series
    * Generally, in the algorithm, X is the cluster centroid.
2. ```y```: compared time series
    * Generally, in the algorithm, X is the cluster centroid.

**Function returned values**:
1. The optimal shift of Y.

**Function Assumptions**:
1. ```None ```

In [5]:
# It is possible that the algorithm can ignore this function
# In fact, if cluster centroids are evaluated as all zeros, the function is not used
def _sbd(x, y):
    # Evaluate NCCc array
    ncc = _ncc_c_3dim([x, y])
    #print(ncc)

    # Find the index where NCCc is maximized (w in the paper)
    idx = np.argmax(ncc)
    #print(ncc[idx])

    # The position w is used to evaluate the optimal shift as w-m
    # The shift index is passed to roll_zeropad function to generate optimal y shift
    yshift = roll_zeropad(y, (idx + 1) - max(len(x), len(y)))
    
    return yshift

### ***collect_shift*** Function

```collect_shift``` function is used compare a specific time series against a cluster centroid\
If the cluster centroid is represented by all zeros value, SBD distance is not evaluated\
\
It is part of the **Algorithm 2** presented in the paper


**Function Inputs**
1. ```data```: array of time series
    * The first position of the array contains the time series to analyze (X).
    * The second element is column vector, containing cluster centroids.

**Function returned values**:
1. Cluster centroid associated with specified time series.

**Function Assumptions**:
1. If cluster centroids are all zeros, single time series is considered as cluster centroids

In [6]:
def collect_shift(data):
    # Extract data from input parameter
    x, cur_center = data[0], data[1]
    #print(cur_center.shape)

    # If all the elements of cluster centroids
    # ------> Return the value of compared time series as cluster centroids
    # Else
    # ------> Return the optimal shift of X compared to current centroids.
    if np.all(cur_center==0):
        return x
    else:
        return _sbd(cur_center, x)

### ***_extract_shape*** Function

```_extract_shape``` function returns optimal and normalized cluster centroids to consider in K-Shape Algorithm.\
\
Main steps considered are presented in **Algorithm 2** of the paper.\
As the article summarizes, cluster centroids are evaluated considering an optimization approach and a minimizer function.\
The main idea is to minimize the sum of squared distances among all other time series. \
In our case, as Cross Correlation captures the similarity, a maximizer function is considered in optimization problem.\
So, we have to maximize the squared similarities to all other time series sequences.

**Function Inputs**
1. ```idx```: array containing cluster labels associated to each time series;
2. ```x```: array of time series to analyze;
3. ```j```: cluster to consider during the extraction;
4. ```cur_center```: initial values of cluster centroid for cluster ***j***:
    * Can be all zeros;
    * A random selected time series
    * Shape of ```cur_center``` is (len(<TIME_INSTANTS>, 1))

**Function returned values**:
1. Normalized Cluster Centroid for J Cluster

**Function Assumptions**:
1. As this approach is used in the context of iterative clustering, we use the previously computed centroid as reference and align all sequences towards this reference sequence.
    * This is a reasonable choice because the previous centroid will be very close to the new centroid;
    * SBD is used to evaluate optimal shift
2. The maximizer function is associated to well-known maximization problem: maximization of the Rayleigh Quotient.

In [55]:
def _extract_shape(idx, x, j, cur_center):
    _a=[]
    #print(idx)
    # For each cluster label:
    # -----> If it is equals to cluster to consider
    # ---------> Find the centroid to consider (_sbd function used)
    for i in range(len(idx)):
        if idx[i] == j:
            _a.append(collect_shift([x[i], cur_center]))

    # Convert collected cluster centroids in a numpy array
    a = np.array(_a)
    #print(a)

    # np.squeeze --> remove axes of length one from a
    # If the previous code does not select anything, it means that noone of the time series is associated with J Cluster
    # ------> Select a random index between 1 and number of time series (x.shape[0])
    # ------> Return the time series associated with generated random index as cluster centroid
    if len(a) == 0:
        indices = np.random.choice(x.shape[0], 1)
        return np.squeeze(x[indices].copy())
        #return np.zeros((x.shape[1]))

    # Extract column numbers of centroids --> time instants
    columns = a.shape[1]

    # Z-Normalization of centroids array
    y = zscore(a, axis=1, ddof=1)

    # Matrix preparation to apply Rayleigh Quotient maximization problem --> Algorithm 2 of paper
    # np.dot --> dot products of two array
    # np.empty --> create an empty array
    # np.eye --> create identity matrix

    # S matrix = product between centroids and theri transposed matrix
    s = np.dot(y[:, :, 0].transpose(), y[:, :, 0])

    # Create empty matrix P of dimensions [columns, columns]
    p = np.empty((columns, columns))

    # Fill the matrix p with a specific value, depending on problem dimensions
    # With this operation, matrix P represents the product (1/m)*O evaluated in Algorithm 2
    # matrix O is an all ones matrix
    # m parameter is called columns in the code
    p.fill(1.0/columns)

    # Now matrix P represents matrix Q of the Algorithm 2
    # It is the difference between the identity matrix and the product (1/m)*O
    p = np.eye(columns) - p

    # Evaluate matrix M with given formula
    m = np.dot(np.dot(p, s), p)

    # Calculate eigen vectors of matrix M
    _, vec = eigh(m)

    # Extract first centroid of matrix M --> it represents the centroid for specific cluster J
    # These equations depends on Rayleigh Quotient maximization problem
    centroid = vec[:, -1]

    print((centroid.reshape((x.shape[1], 1)).shape))
    # print(a - centroid.reshape((x.shape[1], 1)))
    # print((a - centroid.reshape((x.shape[1], 1))).shape)
    # print(np.linalg.norm(a - centroid.reshape((x.shape[1], 1)), axis=(1, 2)))

    # Vector norm --> non negative real number which could represent the distance from the origin in a vector space

    # The following operations create a matrix where each time series is differenced with centroid values
    # Then, the equations try to evaluate the distance from the origin of an ideal vector space
    # This approach is repeated adding centroid values with time series values
    finddistance1 = np.sum(np.linalg.norm(a - centroid.reshape((x.shape[1], 1)), axis=(1, 2)))
    finddistance2 = np.sum(np.linalg.norm(a + centroid.reshape((x.shape[1], 1)), axis=(1, 2)))

    # If the first distance is greater, you need to consider the symmetric quadrant
    # So, centroid values are multiplied with -1.
    if finddistance1 >= finddistance2:
        centroid *= -1

    # Return normalized centroid for cluster J
    return zscore(centroid, ddof=1)

### ***_kshape*** Function

```_kshape``` function represents the real intelligence behind K-Shape algorithm\
\
Main steps considered are presented in **Algorithm 3** of the paper.\
The general idea is to apply kshape approach until a stopping condition is reached.\
There are two possible stopping conditions:
- Maximum number of iterations is reached;
- In two consecutive iterations, the same output is produced.\


\
K-Shape Algorithm is composed by two main steps:
* Refinement step: during this step, new cluster centroids are evaluated;
* Assigment step: it describe the assigment of a specific time series to the closest cluster.

**Function Inputs**
1. ```x```: array of time series to analyze; 
2. ```k```: number of cluster to find out;
3. ```centroid_init```: how centroids values are initialized:
    * Possible values are ```zero``` or ```random```
    * Default value is ```zero```
4. ```max_iter```: maximum number of iterations executed by the algorithm:
    * Default values is 100.
5. ```n_jobs```: this parameter activates the multiprocessing approach inside Python. 
    * This could be useful to improve algorithm computational time;
    * Since multiprocessing is quite heavy, default value of parameter ```n_jobs``` is 1

**Function returned values**:
1. Assigned sequence: an array where the value of ith element represents the cluster where ith time series joins.
2. Extracted centroids: array of extracted centroids.

**Function Assumptions**:
1. Except from the first iteration, previous evaluated centroids are used to extract new centroids values. 
2. The first assignment sequence is randomly generated. 

In [57]:
def _kshape(x, k, centroid_init='zero', max_iter=100, n_jobs=1):
    # Extract number of time series to analyze
    m = x.shape[0]

    # Generate a random sequence assigment --> necessary only in the first execution
    idx = randint(0, k, size=m)

    # Initial centroid values
    # Zero value generated all centroids as zeros array
    # Random value extract k time series from available ones and use them as cluster centroids.
    if centroid_init == 'zero':
        centroids = np.zeros((k, x.shape[1], x.shape[2]))
    elif centroid_init == 'random':
        indices = np.random.choice(x.shape[0], k)
        centroids = x[indices].copy()
    
    # Distances matrix is used to track distances of each time series from all clusters
    # It is initialized as empty
    distances = np.empty((m, k))
    
    # For istruction is necessary to apply the first stopping condition --> the maximum number of iterations is reached
    for it in range(max_iter):
        # This assignment is necessary for the second stopping condition
        # With old_idx, we can compare if the previously found sequence is equal to the current one.
        old_idx = idx

        # Refinement step
        # Using _extract_shape function, new cluster centroids are evaluated on each iteration
        # To extract new cluster centroids, previous ones are used
        # This is applied for each cluster

        for j in range(k):
            for d in range(x.shape[2]):
                centroids[j, :, d] = _extract_shape(idx, np.expand_dims(x[:, :, d], axis=2), j, np.expand_dims(centroids[j, :, d], axis=1))
                #centroids[j] = np.expand_dims(_extract_shape(idx, x, j, centroids[j]), axis=1)

        # Using multiprocessing to apply multi threading of the same tasks
        pool = multiprocessing.Pool(n_jobs)
        args = []
        for p in range(m):
            for q in range(k):
                args.append([x[p, :], centroids[q, :]])
        # The task is NCC-C evaluation for each time series to all cluster centroids
        # Arguments of the task are time series and cluster centroids
        # A specific comparison represents a single task

        # Extract results
        result = pool.map(_ncc_c_3dim, args)

        # Close pool
        pool.close()

        # R index is necessary to access results of multiprocessing
        r = 0

        # For each time series p in m
        for p in range(m):
            # For each cluster q in k
            for q in range(k):
                # Distance matrix in [p,q] contains SBD value
                # SBD value represents the similarity between time series p and cluster centroids q 
                distances[p, q] = 1 - result[r].max()
                r = r + 1

        # Using argmin function, for each row of distance matrix (so, for each time series) the column index with the lowest value is extracted
        # So, this istruction extracts for each time series the column (so, the cluster) with the lowest SBD value
        idx = distances.argmin(1)

        # Second stopping conditions --> two consecutive sequences are equal
        if np.array_equal(old_idx, idx):
            break
    
    # Return extracted sequence and cluster centroids values
    return idx, centroids

### ***kshape*** Function

In [9]:
def kshape(x, k, centroid_init='zero', max_iter=100):
    idx, centroids = _kshape(np.array(x), k, centroid_init=centroid_init, max_iter=max_iter)
    clusters = []
    for i, centroid in enumerate(centroids):
        series = []
        for j, val in enumerate(idx):
            if i == val:
                series.append(j)
        clusters.append((centroid, series))

    return clusters

### ***KShapeClusteringCPU*** Class

In [10]:
class KShapeClusteringCPU(ClusterMixin,BaseEstimator):
    labels_= None
    centroids_ = None

    def __init__(self,n_clusters, centroid_init='zero', max_iter=100, n_jobs=None):
        self.n_clusters = n_clusters
        self.centroid_init = centroid_init
        self.max_iter = max_iter
        if n_jobs is None:
            self.n_jobs=1
        elif n_jobs == -1:
            self.n_jobs = multiprocessing.cpu_count()
        else:
            self.n_jobs=n_jobs
        


    def fit(self,X,y=None):
        clusters = self._fit(X,self.n_clusters, self.centroid_init, self.max_iter,self.n_jobs)
        self.labels_ = np.zeros(X.shape[0])
        self.centroids_ =np.zeros((self.n_clusters, X.shape[1], X.shape[2]))
        for i in range(self.n_clusters):
            self.labels_[clusters[i][1]] = i
            self.centroids_[i]=clusters[i][0]
        return self

    def predict(self, X):
        labels, _ = self._predict(X,self.centroids_)
        return labels
        
    
    def _predict(self,x, centroids):
        m = x.shape[0]
        idx = randint(0, self.n_clusters, size=m)
        distances = np.empty((m, self.n_clusters))
        

    
        pool = multiprocessing.Pool(self.n_jobs)
        args = []
        for p in range(m):
            for q in range(self.n_clusters):
                args.append([x[p, :], centroids[q, :]])
        result = pool.map(_ncc_c_3dim, args)
        pool.close()
        r = 0
        for p in range(m):
            for q in range(self.n_clusters):
                distances[p, q] = 1 - result[r].max()
                r = r + 1
    
        idx = distances.argmin(1)

        return idx, centroids
    
    
    def _fit(self,x, k, centroid_init='zero', max_iter=100,n_jobs=1):
        idx, centroids = _kshape(np.array(x), k, centroid_init=centroid_init, max_iter=max_iter, n_jobs=n_jobs)
        clusters = []
        for i, centroid in enumerate(centroids):
            series = []
            for j, val in enumerate(idx):
                if i == val:
                    series.append(j)
            clusters.append((centroid, series))
    
        return clusters

## Correlated Functions

```ClusterDataLoader``` class is used to read dataset input files and convert them in a set of arrays. \
The function will load every files (both TRAIN and TEST files) in a specific path. \
Every files must be a CSV files with comma separator\
\
Each row of the file represents a time-series and it contains:
* The first element is cluster which time-series joins;
* All successive elements are measurements of each time-series. Each measurements is associated with a specific time instants. 

It is important to highlight that there aren't any information about time-series or time instants order. \
We suppose that the order of time-series is not important for K-shape Clustering Algorithm.

In [11]:
class ClusterDataLoader:
    def __init__(self, dataset_path):
        self.path = dataset_path

    def load(self, sub_dataset_name):
        ts, labels = [], []
        for mode in ['_TRAIN']:
            with open(os.path.join(self.path, sub_dataset_name, sub_dataset_name + mode)) as csv_file:
                lines = csv.reader(csv_file, delimiter=',')
                for line in lines:
                    ts.append([float(x) for x in line[1:]])
                    labels.append(int(line[0])-1)

        if min(labels) == 1:
            labels = labels - 1
        if min(labels) == -1:
            labels = labels + 1

        return np.array(ts), np.array(labels), int(len(set(labels)))

## K-Shape Execution

Load a subset (100 hundred rows) of ```CROP``` dataset

In [12]:
DATASET_PATH = 'dataset/univariate_example/'
DATASET_NAME = 'Crop'

dataloder = ClusterDataLoader(DATASET_PATH)

In [13]:
ts, labels, num_clusters = dataloder.load(DATASET_NAME)

### Dataset Information

In [14]:
print("-------------------------\nTimeseries Dataset\n-------------------------\n")
print(ts)

-------------------------
Timeseries Dataset
-------------------------

[[-1.04911634 -0.96685745 -0.88459857 ... -0.51685297 -0.59911186
  -0.68137074]
 [-1.10145498 -1.1413895  -1.18132403 ... -1.02158593 -1.10544843
  -1.18931093]
 [-0.80873226 -0.98601697 -1.16330168 ... -0.78711217 -0.70063182
  -0.61415147]
 ...
 [-1.62102196 -1.41436412 -1.20770628 ... -1.28672251 -0.75184341
  -0.2169643 ]
 [-1.0246006  -1.48431623 -1.36938732 ... -1.04614977 -0.39967465
   0.24680046]
 [-1.10748362 -1.14969269 -1.19190177 ... -0.67695102 -0.19154663
   0.29385776]]


In [15]:
print("-------------------------\nTimeseries Dataset Dimensions\n-------------------------\n")
print(ts.shape)

-------------------------
Timeseries Dataset Dimensions
-------------------------

(100, 46)


In [16]:
print("-------------------------\nClusters Dataset\n-------------------------\n")
print(labels)

-------------------------
Clusters Dataset
-------------------------

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [17]:
print("-------------------------\nClusters Dataset Dimensions\n-------------------------\n")
print(labels.shape)

-------------------------
Clusters Dataset Dimensions
-------------------------

(100,)


In [18]:
print(num_clusters)

1


### K-Shape Single Execution

In [54]:
start_time = time.time()

ksc = KShapeClusteringCPU(n_clusters=num_clusters,max_iter=2,n_jobs=-1)
ksc.fit(np.expand_dims(ts, axis=2))
print(f' Execution time: {time.time() - start_time}')

(46, 1)
 Execution time: 0.0734097957611084


## Specific Tests

### Monovariate Time Series

Monovariate time-series can be represented as a matrix NxM with:
- N (rows) as the number of time-series. Each row represent a time series which can be connected to a specific date 
- M (columns) as time instants considered for each time series. Each time instants respects a frequency of analysis 

In [20]:
#Example of monovariate array of time series: 4 time series and 3 time instants
##
a_monovariate = np.array([[10, 12, 31],
              [4, 9, 10],
              [70, 81, 2],
              [12, 112, 12]])

### Multivariate Time Series

Multivariate time-series can be represented as a matrix NxM with:
- N (rows) as the number of time-series. Each row represent a time series which can be connected to a specific date 
- M (columns) as time instants considered for each time series. Each time instants respects a frequency of analysis 

The main difference with monovariate time-series is that each cell of the matrix is an array of values. The number of elements of the array can be considered as the number of variables involved, in addition to timestamp

In [21]:
#Example of multivariate array of time series:

#2 time series, with 3 time instants and 2 variables

a_multivariate = np.array([[[11, 1000], [25, 30], [350, 40]],
                           [[45, 0.2], [1, 0.111], [65, 70]],
                           [[17, 59], [11, 14], [45, 87]]])

#compute the mean for each time instant
print("Mean\n-----------")
print(a_multivariate.mean(axis=0))
print("Std\n-----------")
print(a_multivariate.std(axis=0))

Mean
-----------
[[ 24.33333333 353.06666667]
 [ 12.33333333  14.70366667]
 [153.33333333  65.66666667]]
Std
-----------
[[ 14.81740718 458.080352  ]
 [  9.84321537  12.21227362]
 [139.30382463  19.43078886]]


The output can be explained as:
- Cell 0,0 ==> mean (or std) of first elements of first time instant;
- Cell 0,1 ==> mean (or std) of second elements of first time instant;
- Cell 1,0 ==> mean (or std) of first elements of second time instant;
....

So the output is a matrix Nx(M-1)

### Multivariate Time Series - zScore

In [22]:
#we apply the zscore function
zscore_result = zscore(a_multivariate)
print(zscore_result)

[[[-0.89984254  1.41227042]
  [ 1.28684238  1.25253772]
  [ 1.41178225 -1.32092767]]

 [[ 1.39475594 -0.77031609]
  [-1.15138528 -1.19491809]
  [-0.63410559  0.22301376]]

 [[-0.4949134  -0.64195433]
  [-0.13545709 -0.05761963]
  [-0.77767666  1.09791391]]]


### Monovariate Time Series - zScore

In [23]:
#we apply the zscore function
zscore_result = zscore(a_monovariate, axis=0)
print(zscore_result)


[[-0.52393683 -0.93494794  1.62139887]
 [-0.74848119 -1.00253454 -0.35247801]
 [ 1.72150673  0.61954382 -1.10443111]
 [-0.44908871  1.31793866 -0.16448974]]


### Monovariate Time Series - Shifting

In [24]:
print(a_monovariate)

[[ 10  12  31]
 [  4   9  10]
 [ 70  81   2]
 [ 12 112  12]]


In [25]:
a_monovariate_shifted = roll_zeropad(a_monovariate, 1)
print(a_monovariate_shifted)

[[  0  10  12]
 [ 31   4   9]
 [ 10  70  81]
 [  2  12 112]]


Evaluating parameter ```shift``` to 1, the function shifts time series to the right of one position

In [26]:
a_monovariate_shifted = roll_zeropad(a_monovariate, -1)
print(a_monovariate_shifted)

[[ 12  31   4]
 [  9  10  70]
 [ 81   2  12]
 [112  12   0]]


Evaluating parameter ```shift``` to -1, the function shifts time series to the left of one position

In [27]:
a_monovariate_shifted = roll_zeropad(a_monovariate, 5)
print(a_monovariate_shifted)

[[ 0  0  0]
 [ 0  0 10]
 [12 31  4]
 [ 9 10 70]]


In [28]:
a_monovariate_shifted = roll_zeropad(a_monovariate, -3)
print(a_monovariate_shifted)

[[  4   9  10]
 [ 70  81   2]
 [ 12 112  12]
 [  0   0   0]]


### Multivariate Time Series - Shifting

In [29]:
print(a_multivariate)

[[[1.10e+01 1.00e+03]
  [2.50e+01 3.00e+01]
  [3.50e+02 4.00e+01]]

 [[4.50e+01 2.00e-01]
  [1.00e+00 1.11e-01]
  [6.50e+01 7.00e+01]]

 [[1.70e+01 5.90e+01]
  [1.10e+01 1.40e+01]
  [4.50e+01 8.70e+01]]]


In [30]:
a_multivariate_shifted = roll_zeropad(a_multivariate, 1)
print(a_multivariate_shifted)

[[[0.00e+00 1.10e+01]
  [1.00e+03 2.50e+01]
  [3.00e+01 3.50e+02]]

 [[4.00e+01 4.50e+01]
  [2.00e-01 1.00e+00]
  [1.11e-01 6.50e+01]]

 [[7.00e+01 1.70e+01]
  [5.90e+01 1.10e+01]
  [1.40e+01 4.50e+01]]]


The shifting works across the same row. Considering that each element of row has an array of values, which indexes means different variables to analyze (temp, humidity, etc), this can generate misunderstanding data because the operation changes the variable associated to data value

### Monovariate Time Series - NCCc

In [31]:
x = np.array([[2,5]])
y = np.array([[2,5]])

nccc = _ncc_c_3dim([x,y])

print(nccc)

[1. 0. 1.]


In [32]:
sbd = _sbd(x,y)

print(f"SBD: {sbd}")

SBD: [[2 5]]


### FFT

In [33]:
x_len = 300
xlen_trans = 2*x_len-1
print(xlen_trans)

599


In [34]:
print(xlen_trans.bit_length())

10


In [35]:
print (1 << xlen_trans.bit_length())

1024


In [36]:
print(2 ** 11)

2048


In [37]:
print(fft([2,5],))

[ 7.+0.j -3.+0.j]


In [38]:
print(np.zeros((2, 2, 3)))

[[[0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]]]


### Read HDF File Format - Multivariate Time Series Dataset

In [39]:
with h5py.File('dataset/multivariate_example/Heartbeat/Heartbeat_DATA.h5', 'r') as f: 
    print(f.keys())
    dset = list(f['data'])
    print(f['data'])

<KeysViewHDF5 ['data']>


<HDF5 dataset "data": shape (409, 405, 61), type "<f8">


In [40]:
first_element = dset[0]
print(first_element.shape)

(405, 61)


In [41]:
second_element = dset[1]
print(second_element.shape)
print(second_element)

(405, 61)
[[ 0.67344416  1.03344607  0.51781406 ...  2.60632811  1.78939962
  -0.92018153]
 [ 0.27982926  1.97446527  2.52269576 ...  0.9516411  -0.4143421
   3.00223469]
 [-1.05258329  1.70247735  1.72015895 ...  1.19972818  1.77575933
   4.79457689]
 ...
 [-1.13371095 -0.16836813 -1.03306481 ... -0.75880258 -0.7104573
   0.53581495]
 [-0.77082255 -0.90313236 -0.98622583 ... -0.20830294  0.13230556
  -0.54074809]
 [-0.83482596 -1.52393233 -1.22096163 ...  1.03060257  1.05737061
   0.4729672 ]]


In [42]:
print("First shape element is time series number")
print("Secondo shape element is the number of variables considered for each time series")
print("Third element is the number of time instants considered for each measurement")

First shape element is time series number
Secondo shape element is the number of variables considered for each time series
Third element is the number of time instants considered for each measurement
