# K-shape algorithm

## Required Python Libraries

In [16]:
import math
import numpy as np
import multiprocessing
from numpy.random import randint
from numpy.linalg import norm, eigh
from numpy.fft import fft, ifft
from sklearn.base import ClusterMixin, BaseEstimator

## Functions

### Z-score Function

Z-score is a mathematical approach to convert a "Generic" Random Variables to a Normal Random Variables. 

For more information about this approach: 
https://it.wikipedia.org/wiki/Standardizzazione_(statistica)

**Function Inputs**
1. ```a```: numpy array containing time-series values;
2. ```axis```: axis used to evaluate mean and standard deviation. For axis we intend rows or columns.
3. ```ddof```: Degrees of freedom correction in the calculation of the standard deviation. Default value equals to 0

**Function returned values**:
1. Normalized Time-Series

**Function Assumptions**:
1. ```axis = 0 ```: normalize always on column axis. This is true beacuse we want to normalize time instants values, to exlude possible outliers or errors

In [17]:
def zscore(a, axis=0, ddof=0):
    a = np.asanyarray(a) #convert the input a in an array
    mns = a.mean(axis=axis) #compute the mean of the time series along the column axis (mean for each time instant)
    sstd = a.std(axis=axis, ddof=ddof) #compute the standard deviation of the array along the column axis (mean for each time instant)

    ## If Condition-->
    ## If axis equals to row (axis = 1) and dimensions of mean array is lower than dimensions of time series array
    ## -----> calculate Z score expanding dimensions of mean and std array on row axis
    ## If axis equals to columns (axis = 0) or dimensions of mean array is equal to dimensions of time series array
    ## -----> calculate Z score as time-series array minus its mean, divided by standard deviation

    if axis and mns.ndim < a.ndim:
        res = ((a - np.expand_dims(mns, axis=axis)) / #compute normalized data and expand the the mean and sd to match the dimension with the array a (the mean and std have 1 dimension less=
               np.expand_dims(sstd, axis=axis))
    else:
        res = (a - mns) / sstd #compute normalized data

    ## nan_to_num method --> replace NaN values with 0 or large integer values for infinite values
    return np.nan_to_num(res)

   


### Roll_zeropad Function

```roll_zeropad``` is a custom function to shift time series values to the right. The number of positions required by the movement are evaluated with parameter ```shift```.
For each shift, a zero value is included in the time series.

**Function Inputs**
1. ```a```: numpy array containing time-series values;
2. ```shift```: Number of right movements required;
3. ```axis```: axis to consider for right movements.

**Function returned values**:
1. Shifted Time-Series

**Function Assumptions**:
1. ```axis = None ```: time series can be considered as a 1D array and not as a matrix

In [18]:
def roll_zeropad(a, shift, axis=None):
    # asanyarray method --> Convert the input to an ndarray, but pass ndarray subclasses through.
    a = np.asanyarray(a) # Ensure that the input is ndarrary data type

    # If no shift is required (shift = 0), it returns the original values of time series
    if shift == 0:
        return a

    # If axis parameter is not evaluated (axis = None)
    # ------> consider all elements of time series and not a specific dimensions, but this requires a reshape of data at the end
    # If axis is set
    # ------> extracted dimensions of a specific axis (rows or columns) and work on that
    if axis is None:
        n = a.size
        reshape = True
    else:
        n = a.shape[axis]
        reshape = False

    # zeros_like() method --> Return an array of zeros with the same shape and type as a given array.

    # If shift parameter is greater than size of the time series (but this requires that no axis are specified)
    # ------> it evaluates an array of 0 values with the same size of a
    if np.abs(shift) > n:
        res = np.zeros_like(a)
    # If shift parameter is lower than 0 (moving to the left)
    # ------> subtract the shift value to n size of the array
    # ------> extract the indices from 0 to (n-shift) values, considering a plain array (axis = None) or a specific dimensions
    # ------> generate a 0-valued array with size [0, (n-shift)]
    # ------> extract values from (n-shift) position to n position
    # ------> create an array with values in range [(n-shift),n] at the beginning and all 0 values after
    elif shift < 0:
        shift += n
        zeros = np.zeros_like(a.take(np.arange(n-shift), axis))
        res = np.concatenate((a.take(np.arange(n-shift, n), axis), zeros), axis)
    # If shift parameter is in the range [0, size of time series]
    # ------> extract values from (n-shift) position to n position
    # ------> generate a 0-valued array with size [(n-shift), n]
    # ------> extract the indices from 0 to (n-shift) values, considering a plain array (axis = None) or a specific dimensions
    # ------> create an array with 0 value at the beginning and then values in range [0, (n-shift)]
    else:
        zeros = np.zeros_like(a.take(np.arange(n-shift, n), axis))
        res = np.concatenate((zeros, a.take(np.arange(n-shift), axis)), axis)

    # If reshape is required
    # -----> change shape (row, column) of shifted array, considering shape of time series values
    if reshape:
        return res.reshape(a.shape)
    else:
        return res

### _ncc_c_3dim Function

```_ncc_c_3dim``` is a custom function 

**Function Inputs**
1. ```a```: numpy array containing time-series values;
2. ```shift```: Number of right movements required;
3. ```axis```: axis to consider for right movements.

**Function returned values**:
1. Shifted Time-Series

**Function Assumptions**:
1. ```axis = None ```: time series can be considered as a 1D array and not as a matrix

In [42]:
def _ncc_c_3dim(data):
    x, y = data[0], data[1]
    #print(x)
    #print(y)
    den = norm(x, axis=(0,1)) * norm(y, axis=(0,1))

    if den < 1e-9:
        den = np.inf

    #print(x.shape)
    x_len = x.shape[0]
    fft_size = 1 << (2*x_len-1).bit_length()

    cc = ifft(fft(x, fft_size, axis=0) * np.conj(fft(y, fft_size, axis=0)), axis=0)
    cc = np.concatenate((cc[-(x_len-1):], cc[:x_len]), axis=0)

    return np.real(cc).sum(axis=-1) / den

In [51]:
def _sbd(x, y):
    ncc = _ncc_c_3dim([x, y])
    print(type(ncc))
    idx = np.argmax(ncc)
    print(ncc[idx])
    #yshift = 1. - ncc[idx]
    yshift = roll_zeropad(y, (idx + 1) - max(len(x), len(y)))

    return yshift

## Time Series - Examples

### Monovariate Time Series

Monovariate time-series can be represented as a matrix NxM with:
- N (rows) as the number of time-series. Each row represent a time series which can be connected to a specific date 
- M (columns) as time instants considered for each time series. Each time instants respects a frequency of analysis 

In [21]:
#Example of monovariate array of time series: 4 time series and 3 time instants
##
a_monovariate = np.array([[10, 12, 31],
              [4, 9, 10],
              [70, 81, 2],
              [12, 112, 12]])

### Multivariate Time Series

Multivariate time-series can be represented as a matrix NxM with:
- N (rows) as the number of time-series. Each row represent a time series which can be connected to a specific date 
- M (columns) as time instants considered for each time series. Each time instants respects a frequency of analysis 

The main difference with monovariate time-series is that each cell of the matrix is an array of values. The number of elements of the array can be considered as the number of variables involved, in addition to timestamp

In [22]:
#Example of multivariate array of time series:

#2 time series, with 3 time instants and 2 variables

a_multivariate = np.array([[[11, 1000], [25, 30], [350, 40]],
                           [[45, 0.2], [1, 0.111], [65, 70]],
                           [[17, 59], [11, 14], [45, 87]]])

#compute the mean for each time instant
print("Mean\n-----------")
print(a_multivariate.mean(axis=0))
print("Std\n-----------")
print(a_multivariate.std(axis=0))

Mean
-----------
[[ 24.33333333 353.06666667]
 [ 12.33333333  14.70366667]
 [153.33333333  65.66666667]]
Std
-----------
[[ 14.81740718 458.080352  ]
 [  9.84321537  12.21227362]
 [139.30382463  19.43078886]]


The output can be explained as:
- Cell 0,0 ==> mean (or std) of first elements of first time instant;
- Cell 0,1 ==> mean (or std) of second elements of first time instant;
- Cell 1,0 ==> mean (or std) of first elements of second time instant;
....

So the output is a matrix Nx(M-1)

### Multivariate Time Series - zScore

In [23]:
#we apply the zscore function
zscore_result = zscore(a_multivariate)
print(zscore_result)

[[[-0.89984254  1.41227042]
  [ 1.28684238  1.25253772]
  [ 1.41178225 -1.32092767]]

 [[ 1.39475594 -0.77031609]
  [-1.15138528 -1.19491809]
  [-0.63410559  0.22301376]]

 [[-0.4949134  -0.64195433]
  [-0.13545709 -0.05761963]
  [-0.77767666  1.09791391]]]


### Monovariate Time Series - zScore

In [24]:
#we apply the zscore function
zscore_result = zscore(a_monovariate, axis=0)
print(zscore_result)


[[-0.52393683 -0.93494794  1.62139887]
 [-0.74848119 -1.00253454 -0.35247801]
 [ 1.72150673  0.61954382 -1.10443111]
 [-0.44908871  1.31793866 -0.16448974]]


### Monovariate Time Series - Shifting

In [25]:
print(a_monovariate)

[[ 10  12  31]
 [  4   9  10]
 [ 70  81   2]
 [ 12 112  12]]


In [26]:
a_monovariate_shifted = roll_zeropad(a_monovariate, 1)
print(a_monovariate_shifted)

[[  0  10  12]
 [ 31   4   9]
 [ 10  70  81]
 [  2  12 112]]


Evaluating parameter ```shift``` to 1, the function shifts time series to the right of one position

In [27]:
a_monovariate_shifted = roll_zeropad(a_monovariate, -1)
print(a_monovariate_shifted)

[[ 12  31   4]
 [  9  10  70]
 [ 81   2  12]
 [112  12   0]]


Evaluating parameter ```shift``` to -1, the function shifts time series to the left of one position

In [28]:
a_monovariate_shifted = roll_zeropad(a_monovariate, 5)
print(a_monovariate_shifted)

[[ 0  0  0]
 [ 0  0 10]
 [12 31  4]
 [ 9 10 70]]


In [29]:
a_monovariate_shifted = roll_zeropad(a_monovariate, -3)
print(a_monovariate_shifted)

[[  4   9  10]
 [ 70  81   2]
 [ 12 112  12]
 [  0   0   0]]


### Multivariate Time Series - Shifting

In [30]:
print(a_multivariate)

[[[1.10e+01 1.00e+03]
  [2.50e+01 3.00e+01]
  [3.50e+02 4.00e+01]]

 [[4.50e+01 2.00e-01]
  [1.00e+00 1.11e-01]
  [6.50e+01 7.00e+01]]

 [[1.70e+01 5.90e+01]
  [1.10e+01 1.40e+01]
  [4.50e+01 8.70e+01]]]


In [31]:
a_multivariate_shifted = roll_zeropad(a_multivariate, 1)
print(a_multivariate_shifted)

[[[0.00e+00 1.10e+01]
  [1.00e+03 2.50e+01]
  [3.00e+01 3.50e+02]]

 [[4.00e+01 4.50e+01]
  [2.00e-01 1.00e+00]
  [1.11e-01 6.50e+01]]

 [[7.00e+01 1.70e+01]
  [5.90e+01 1.10e+01]
  [1.40e+01 4.50e+01]]]


The shifting works across the same row. Considering that each element of row has an array of values, which indexes means different variables to analyze (temp, humidity, etc), this can generate misunderstanding data because the operation changes the variable associated to data value

### Monovariate Time Series - NCCc

In [32]:
x = np.array([[2,5]])
y = np.array([[2,5]])

nccc = _ncc_c_3dim([x,y])

print(nccc)

[[2 5]]
[[2 5]]
(1, 2)
[1. 0. 1.]


In [52]:
sbd = _sbd(x,y)

print(f"SBD: {sbd}")

<class 'numpy.ndarray'>
1.0000000000000002
SBD: [[2 5]]
