# R Code
## https://www.r-bloggers.com/measuring-time-series-characteristics/

```
# Quick and dirty implementation of characteristic based time series clustering as seen in: 
# https://www.r-bloggers.com/measuring-time-series-characteristics/
library(forecast)

# Function to find the frequency (dominant seasonal period) of the time series
# data inputted.
#
# An AR model is fitted to the longest NA-free stretch of `x` and its spectral
# density is evaluated with spec.ar(). The period is the rounded reciprocal of
# the frequency at which that spectrum peaks.
#
# Args:
#   x: numeric vector or ts object.
#
# Returns:
#   The estimated period, or 1 when the spectrum has no peak above the
#   threshold or no usable peak away from frequency zero.
find.freq <- function(x)
{
  spec <- spec.ar(c(na.contiguous(x)), plot = FALSE)
  n.freq <- length(spec$spec)

  if (max(spec$spec) <= 10) # Arbitrary threshold chosen by trial and error.
    return(1)

  period <- round(1 / spec$freq[which.max(spec$spec)])
  if (!is.infinite(period))
    return(period)

  # The peak sits at frequency zero (trend), so find the next local maximum:
  # start from the first point where the spectrum begins to rise again.
  j <- which(diff(spec$spec) > 0)
  if (length(j) == 0)
    return(1)

  # The searched slice starts at j[1] + 1, so j[1] + which.max(...) is an
  # index into the full spectrum (searching from j[1] itself would land one
  # position past the true maximum).
  nextmax <- j[1] + which.max(spec$spec[(j[1] + 1):n.freq])
  if (nextmax <= length(spec$freq))
    period <- round(1 / spec$freq[nextmax])
  else
    period <- 1

  return(period)
}


# Function that decomposes the data into trend and seasonal components.
#
# Args:
#   x:         a ts object (its frequency() decides which branch is used).
#   transform: if TRUE and the data are non-negative, apply a Box-Cox
#              transformation (lambda chosen by forecast::BoxCox.lambda)
#              before decomposing.
#
# Returns:
#   A list with the (possibly transformed) series `x`, its `trend`, `season`
#   (NULL for non-seasonal data) and `remainder`, plus `transform` (whether
#   the Box-Cox transform was actually applied) and `lambda` (NULL if not).
decomp <- function(x,transform=TRUE)
{
  # Transform series; packages are referenced by namespace so that a missing
  # package raises an error instead of require()'s silent FALSE.
  if (transform && min(x, na.rm = TRUE) >= 0)
  {
    lambda <- forecast::BoxCox.lambda(na.contiguous(x))
    x <- forecast::BoxCox(x, lambda)
  }
  else
  {
    lambda <- NULL
    transform <- FALSE
  }

  if (frequency(x) > 1)
  {
    # Seasonal data: STL on the longest NA-free stretch
    x.stl <- stl(x, s.window = "periodic", na.action = na.contiguous)
    trend <- x.stl$time.series[, 2]
    season <- x.stl$time.series[, 1]
    remainder <- x - trend - season
  }
  else
  {
    # Nonseasonal data: smooth trend from a GAM on the time index. gam()
    # drops missing observations, so fitted values are written back only to
    # the non-missing positions.
    tt <- seq_along(x)
    trend <- rep(NA, length(x))
    trend[!is.na(x)] <- fitted(mgcv::gam(x ~ s(tt)))
    season <- NULL
    remainder <- x - trend
  }

  return(list(x = x, trend = trend, season = season, remainder = remainder,
              transform = transform, lambda = lambda))
}

# Functions to map all the features onto a [0,1] scale
# f1 maps [0,infinity) to [0,1]
#
# Computes (exp(a*x) - 1) / (exp(a*x) + b). Where exp(a*x) overflows to Inf
# the limit value 1 is returned. Works element-wise on vectors and passes NA
# through (a scalar `if` on the comparison would error for both).
#
# Args:
#   x:    numeric value(s) to map.
#   a, b: shape constants of the mapping.
#
# Returns:
#   Numeric vector of the same length as `x`.
f1 <- function(x,a,b)
{
  eax <- exp(a * x)
  f1eax <- (eax - 1) / (eax + b)
  # Inf/Inf evaluates to NaN above; substitute the limit of the ratio
  f1eax[is.infinite(eax)] <- 1
  return(f1eax)
}

# f2 maps [0,1] onto [0,1]
#
# Same ratio as f1, (exp(a*x) - 1) / (exp(a*x) + b), rescaled by its value at
# x = 1 so that f2(0) = 0 and f2(1) = 1.
#
# Args:
#   x:    numeric value(s) to map.
#   a, b: shape constants of the mapping.
f2 <- function(x,a,b)
{
  ea <- exp(a)
  eax <- exp(a * x)
  unscaled <- (eax - 1) / (eax + b)
  unscaled * (ea + b) / (ea - 1)
}

# Finally, calculate measures
library(tseries)
library(fracdiff)
# Compute the 13 characteristic measures of a univariate series.
#
# Args:
#   x: numeric vector (or ts); its period is estimated with find.freq().
#
# Returns:
#   Named numeric vector: frequency, trend, seasonal, autocorrelation,
#   non-linear, skewness, kurtosis, Hurst, Lyapunov, followed by the
#   autocorrelation / non-linear / skewness / kurtosis measures of the
#   detrended and deseasonalised ("dc") series.
#
# Raises:
#   An error ("Insufficient data") when the estimated period exceeds
#   length(x) - 10.
measures <- function(x)
{
  N <- length(x)
  freq <- find.freq(x)
  fx <- c(frequency=(exp((freq-1)/50)-1)/(1+exp((freq-1)/50)))
  x <- ts(x, frequency = freq)

  # 1 - v.resid / v.ref clamped to [0,1]; a (near-)constant reference series
  # yields 0, and an NA reference variance yields NA.
  strength <- function(v.resid, v.ref)
  {
    if (is.na(v.ref))
      return(NA_real_)
    if (v.ref < 1e-10)
      return(0)
    max(0, min(1, 1 - v.resid / v.ref))
  }

  # Absolute skewness and kurtosis of y. The standard deviation is kept in
  # its own variable: reusing one name for the sd and the skewness would make
  # the kurtosis divide by skewness^4 instead of sd^4.
  shape <- function(y)
  {
    centred <- y - mean(y, na.rm = TRUE)
    sigma <- sd(y, na.rm = TRUE)
    list(skewness = abs(mean(centred^3, na.rm = TRUE) / sigma^3),
         kurtosis = mean(centred^4, na.rm = TRUE) / sigma^4)
  }

  # Decomposition
  decomp.x <- decomp(x)

  # Adjust data: remove the fitted components but keep the mean level
  if (freq > 1)
    fits <- decomp.x$trend + decomp.x$season
  else # Nonseasonal data
    fits <- decomp.x$trend
  adj.x <- decomp.x$x - fits + mean(decomp.x$trend, na.rm = TRUE)

  # Backtransformation of adjusted data
  if (decomp.x$transform)
    tadj.x <- forecast::InvBoxCox(adj.x, decomp.x$lambda)
  else
    tadj.x <- adj.x

  # Trend and seasonal measures
  v.adj <- var(adj.x, na.rm = TRUE)
  if (freq > 1)
  {
    detrend <- decomp.x$x - decomp.x$trend
    deseason <- decomp.x$x - decomp.x$season
    trend <- strength(v.adj, var(deseason, na.rm = TRUE))
    season <- strength(v.adj, var(detrend, na.rm = TRUE))
  }
  else # Nonseasonal data
  {
    trend <- strength(v.adj, var(decomp.x$x, na.rm = TRUE))
    season <- 0
  }

  m <- c(fx, trend, season)

  # ---- Measures on original data ----

  # Serial correlation
  Q <- Box.test(x, lag = 10)$statistic / (N * 10)
  fQ <- f2(Q, 7.53, 0.103)

  # Nonlinearity
  p <- terasvirta.test(na.contiguous(x))$statistic
  fp <- f1(p, 0.069, 2.304)

  # Skewness and kurtosis
  raw.shape <- shape(x)
  fs <- f1(raw.shape$skewness, 1.510, 5.993)
  fk <- f1(raw.shape$kurtosis, 2.273, 11567)

  # Hurst=d+0.5 where d is fractional difference.
  H <- fracdiff(na.contiguous(x), 0, 0)$d + 0.5

  # Lyapunov Exponent: for each point, take its nearest neighbour in value and
  # measure how far the two trajectories have diverged `freq` steps later.
  if (freq > N - 10)
    stop("Insufficient data")
  Ly <- numeric(N - freq)
  for (i in seq_len(N - freq))
  {
    idx <- order(abs(x[i] - x))
    idx <- idx[idx < (N - freq)]
    j <- idx[2] # idx[1] is normally the point itself (distance zero)
    Ly[i] <- log(abs((x[i + freq] - x[j + freq]) / (x[i] - x[j]))) / freq
    if (!is.finite(Ly[i])) # NA, NaN, Inf or -Inf
      Ly[i] <- NA
  }
  Lyap <- mean(Ly, na.rm = TRUE)
  fLyap <- exp(Lyap) / (1 + exp(Lyap))

  m <- c(m, fQ, fp, fs, fk, H, fLyap)

  # ---- Measures on adjusted data ----

  # Serial
  Q <- Box.test(adj.x, lag = 10)$statistic / (N * 10)
  fQ <- f2(Q, 7.53, 0.103)

  # Nonlinearity
  p <- terasvirta.test(na.contiguous(adj.x))$statistic
  fp <- f1(p, 0.069, 2.304)

  # Skewness and kurtosis (on the back-transformed adjusted series)
  adj.shape <- shape(tadj.x)
  fs <- f1(adj.shape$skewness, 1.510, 5.993)
  fk <- f1(adj.shape$kurtosis, 2.273, 11567)

  m <- c(m, fQ, fp, fs, fk)
  names(m) <- c("frequency", "trend","seasonal",
                "autocorrelation","non-linear","skewness","kurtosis",
                "Hurst","Lyapunov",
                "dc autocorrelation","dc non-linear","dc skewness","dc kurtosis")

  return(m)
}
```

# Now in Python!

## First find frequency (this unfortunately doesn't work yet, including for reference)

In [1]:
import numpy as np
import pandas as pd
from scipy.signal import periodogram
def find_freq(x):
    """Estimate the dominant period of a univariate time series.

    Pure-Python counterpart of the R ``find.freq`` function, using the
    periodogram (``scipy.signal.periodogram`` with ``scaling='spectrum'``)
    instead of an AR-based spectral estimate.

    Parameters
    ----------
    x : array-like (e.g. a single column of a pandas DataFrame)
        The observations. Only the longest NaN-free stretch is analysed.

    Returns
    -------
    int
        The period (observations per cycle) at the strongest spectral peak,
        or 1 when there is no peak above the threshold or too little data.
    """
    values = np.asarray(x, dtype=float).ravel()

    # Keep the longest run without NaN (what na.contiguous does in R)
    gaps = np.concatenate(([True], np.isnan(values), [True]))
    edges = np.flatnonzero(gaps[1:] != gaps[:-1]).reshape(-1, 2)
    if len(edges) == 0:
        return 1
    start, stop = edges[np.argmax(edges[:, 1] - edges[:, 0])]
    values = values[start:stop]
    if len(values) < 2:
        return 1

    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.periodogram.html
    freq, power = periodogram(values, scaling='spectrum')

    if power.max() <= 10: # Arbitrary threshold chosen by trial and error.
        return 1

    # Position (not value) of the strongest peak
    peak = int(np.argmax(power))
    if freq[peak] > 0:
        return int(round(1 / freq[peak]))

    # The peak is at frequency zero (infinite period): find the next local
    # maximum, starting where the spectrum first begins to rise again.
    rising = np.flatnonzero(np.diff(power) > 0)
    if len(rising) == 0:
        return 1
    first_rise = int(rising[0]) + 1
    next_peak = first_rise + int(np.argmax(power[first_rise:]))
    if freq[next_peak] > 0:
        return int(round(1 / freq[next_peak]))
    return 1

## In R as well (since I can't get it to work in Python)

#### This requires some manual work before these code blocks will work
1. Set R_HOME variable (can be found by typing ".Library" into R Studio)
2. The output from the above is the "library" subdirectory of the R_HOME directory established by R Studio
3. Take the output minus "/library" at the end, and save this to R_HOME in a terminal
4. Run pip3 install rpy2 (connects to R_HOME at install)

Will make an init.sh file to run all the necessary steps for this + other manual setup steps

In [2]:
# Sources: https://stackoverflow.com/questions/17573988/r-home-error-with-rpy2
# https://stackoverflow.com/questions/47585718/rpy2-installed-but-wont-run-packages
# https://stackoverflow.com/questions/24880493/how-to-find-out-r-library-location-in-mac-osx/24880594
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
from rpy2.robjects import r, pandas2ri, numpy2ri, Formula
from rpy2.robjects.vectors import IntVector,FloatVector
# Load necessary R packages
rtseries = importr('tseries')
rbase = importr('base')
rstats = importr('stats')
rfracdiff=importr('fracdiff')
rutils=importr('utils')
rmgcv=importr('mgcv')

# "Activate" pandas2ri and numpy2ri
#pandas2ri.activate()
#numpy2ri.activate()

# For some reason, running the above commands blocks me from being able to create R time series objects 
# Trying to create time series objects just leads to numpy arrays being created
# https://www.r-bloggers.com/using-r-in-python-for-statistical-learning-data-science-2/
# Example output when the below commands are NOT used:
'''
rbase.set_seed(123) # reproducibility seed
x = r.ts(r.rnorm(n=10)) # simulate the time series
print(x)

Time Series:
Start = 1 
End = 10 
Frequency = 1 
 [1] -0.56047565 -0.23017749  1.55870831  0.07050839  0.12928774  1.71506499
 [7]  0.46091621 -1.26506123 -0.68685285 -0.44566197
'''

# Example output when the below commands are used:
'''
rbase.set_seed(123) # reproducibility seed
x = r.ts(r.rnorm(n=10)) # simulate the time series
print(x)

[-0.56047565 -0.23017749  1.55870831  0.07050839  0.12928774  1.71506499
  0.46091621 -1.26506123 -0.68685285 -0.44566197]
'''

'\nrbase.set_seed(123) # reproducibility seed\nx = r.ts(r.rnorm(n=10)) # simulate the time series\nprint(x)\n\n[-0.56047565 -0.23017749  1.55870831  0.07050839  0.12928774  1.71506499\n  0.46091621 -1.26506123 -0.68685285 -0.44566197]\n'

In [3]:
# Seed R's RNG, draw 10 values with R's rnorm and wrap them in an R `ts` object
rbase.set_seed(123) # reproducibility seed
x = r.ts(r.rnorm(n=10)) # simulate the time series
print(x)

Time Series:
Start = 1 
End = 10 
Frequency = 1 
 [1] -0.56047565 -0.23017749  1.55870831  0.07050839  0.12928774  1.71506499
 [7]  0.46091621 -1.26506123 -0.68685285 -0.44566197



In [4]:
#pandas2ri.activate()
#numpy2ri.activate()
#print(x)

In [5]:
# Repeat the simulation with the same seed under a second name
rbase.set_seed(123) # reproducibility seed
y = r.ts(r.rnorm(n=10)) # simulate the time series
print(y)

Time Series:
Start = 1 
End = 10 
Frequency = 1 
 [1] -0.56047565 -0.23017749  1.55870831  0.07050839  0.12928774  1.71506499
 [7]  0.46091621 -1.26506123 -0.68685285 -0.44566197



In [197]:
# Call R's spec.ar on a fresh 30-point series (without plotting) and list the
# names of the components of the returned object
x = r.ts(r.rnorm(n=30)) # simulate the time series
spec = rstats.spec_ar(x,plot=False)
print(spec.names)

[1] "freq"   "spec"   "coh"    "phase"  "n.used" "series" "method"



In [6]:
# NOTE(review): r_repr is referenced without being called, so the cell shows
# the bound method (including the object's R classes) rather than its result
x.r_repr

<bound method RObjectMixin.r_repr of R object with classes: ('ts',) mapped to:
[-0.560476, -0.230177, 1.558708, 0.070508, ..., 0.460916, -1.265061, -0.686853, -0.445662]>

In [7]:
# Wrap R-generated values in an rpy2 FloatVector instead of an R `ts` object
z = FloatVector(r.rnorm(n=10))
# NOTE(review): r_repr is referenced without being called (shows the bound method)
z.r_repr

<bound method RObjectMixin.r_repr of R object with classes: ('numeric',) mapped to:
[1.224082, 0.359814, 0.400771, 0.110683, ..., 0.497850, -1.966617, 0.701356, -0.472791]>

In [191]:
def find_freq_r(x):
    """Estimate the dominant period of a series via R's ``spec.ar`` (rpy2).

    Port of the R ``find.freq`` function: the AR-based spectrum is computed
    in R, and the peak search is done on NumPy copies of the result.

    Parameters
    ----------
    x : R time-series / numeric vector (rpy2 object) without missing values.

    Returns
    -------
    int
        Rounded reciprocal of the frequency at the spectral peak, or 1 when
        the spectrum never exceeds the threshold or no usable peak exists
        away from frequency zero.
    """
    #spec = rstats.spec_ar(r.ts(na_contiguous(x)),plot=False)
    spec = rstats.spec_ar(x,plot=False)
    # spec is a "ListVector" whose names are:
    # "freq" "spec" "coh" "phase" "n.used" "series" "method"
    # so position 0 holds the frequencies and position 1 the spectral density.
    spec_freq = np.asarray(list(spec[0]), dtype=float)
    spec_vals = np.asarray(list(spec[1]), dtype=float)

    if spec_vals.max() <= 10: # Arbitrary threshold chosen by trial and error.
        return 1

    #period <- round(1/spec$freq[which.max(spec$spec)])
    peak = int(np.argmax(spec_vals))
    if spec_freq[peak] != 0:
        return int(round(1 / spec_freq[peak]))

    # Peak at frequency zero means an infinite period: find the next local
    # maximum. `rising` holds the *positions* where the spectrum increases
    # (R's which(diff(spec) > 0)), not the size of the increases.
    rising = np.flatnonzero(np.diff(spec_vals) > 0)
    if len(rising) == 0:
        return 1
    first_rise = int(rising[0]) + 1
    nextmax = first_rise + int(np.argmax(spec_vals[first_rise:]))
    if nextmax < len(spec_freq) and spec_freq[nextmax] != 0:
        return int(round(1 / spec_freq[nextmax]))
    return 1
    
'''
# Function to find the frequency of the time series data inputted
find.freq <- function(x)
{
  n <- length(x)
  spec <- spec.ar(c(na.contiguous(x)),plot=FALSE)
  if(max(spec$spec)>10) # Arbitrary threshold chosen by trial and error.
  {
    period <- round(1/spec$freq[which.max(spec$spec)])
    if(period==Inf) # Find next local maximum
    {
      j <- which(diff(spec$spec)>0)
      if(length(j)>0)
      {
        nextmax <- j[1] + which.max(spec$spec[j[1]:500])
        if(nextmax <= length(spec$freq))
          period <- round(1/spec$freq[nextmax])
        else
          period <- 1
      }
      else
        period <- 1
    }
  }
  else
    period <- 1

  return(period)
}
'''

'\n# Function to find the frequency of the time series data inputted\nfind.freq <- function(x)\n{\n  n <- length(x)\n  spec <- spec.ar(c(na.contiguous(x)),plot=FALSE)\n  if(max(spec$spec)>10) # Arbitrary threshold chosen by trial and error.\n  {\n    period <- round(1/spec$freq[which.max(spec$spec)])\n    if(period==Inf) # Find next local maximum\n    {\n      j <- which(diff(spec$spec)>0)\n      if(length(j)>0)\n      {\n        nextmax <- j[1] + which.max(spec$spec[j[1]:500])\n        if(nextmax <= length(spec$freq))\n          period <- round(1/spec$freq[nextmax])\n        else\n          period <- 1\n      }\n      else\n        period <- 1\n    }\n  }\n  else\n    period <- 1\n\n  return(period)\n}\n'

## Now let's do the decompose function

In [137]:
from scipy.stats import boxcox
#from statsmodels.gam.api import GLMGam, BSplines
from statsmodels.gam.generalized_additive_model import GLMGam
from statsmodels.gam.smooth_basis import BSplines,CyclicCubicSplines
import statsmodels.api as sm
#from statsmodels.gam.generalized_additive_model import GLMGam

def na_contiguous(x):
    """Return the longest consecutive stretch of non-missing values of ``x``.

    Recreates R's ``na.contiguous``. If several stretches share the maximum
    length, the first one is returned.
    https://stackoverflow.com/questions/41494444/pandas-find-longest-stretch-without-nan-values

    Parameters
    ----------
    x : pandas.Series

    Returns
    -------
    pandas.Series
        The slice of ``x`` (index preserved) covering the longest stretch; an
        empty slice when ``x`` is empty or contains only missing values.
    """
    # Pad with "missing" sentinels so every stretch has a start and a stop edge
    mask = np.concatenate(( [True], x.isnull().values, [True] ))
    # Positions where missingness flips, paired up as [start, stop) limits
    start_stop = np.flatnonzero(mask[1:] != mask[:-1]).reshape(-1,2)
    if len(start_stop) == 0:
        # No non-missing stretch at all
        return x.iloc[0:0]
    start,stop = start_stop[(start_stop[:,1] - start_stop[:,0]).argmax()]
    return x.iloc[start:stop]

def decompose(x, transform = True):
    """Decompose data into trend, seasonality and remainder.

    Parameters
    ----------
    x : pandas.Series with a datetime index
    transform : bool
        If True and all observed values are strictly positive, Box-Cox
        transform the longest NaN-free stretch first. The lambda is the one
        maximising the log-likelihood (the R version also offers the
        "guerrero" coefficient-of-variation method).

    Returns
    -------
    dict
        Keys 'x' (the possibly transformed series), 'trend', 'seasonality',
        'remainder', 'transform' (whether the transform was applied) and
        'lambda' (NaN when it was not).
    """
    observed = x.dropna()
    # scipy's boxcox only accepts strictly positive data, so zeros must skip
    # the transformation rather than reach it
    if transform and len(observed) > 0 and observed.min() > 0:
        contiguous = na_contiguous(x)
        transformed_values, var_lambda = boxcox(contiguous.values, lmbda = None)
        x_transformed = pd.Series(transformed_values, index=contiguous.index)
    else:
        x_transformed = x
        var_lambda = np.nan
        transform = False

    # The R code branches on frequency(x) > 1. A series wrapped with r.ts()
    # here reports a frequency of 1, while the reference R run always took
    # the seasonal branch, so the seasonal branch is keyed on == 1.
    freq = rstats.frequency(r.ts(FloatVector(x_transformed)))
    if list(freq)[0] == 1:
        # Seasonal data. rstats.stl(..., s_window='periodic') is not used
        # because R rejects the series with "series is not periodic or has
        # less than two periods".
        stl = sm.tsa.seasonal_decompose(na_contiguous(x_transformed))
        trend = stl.trend
        seasonality = stl.seasonal
        remainder = x_transformed - trend - seasonality

    else:
        # Nonseasonal data: smooth trend from R's mgcv::gam on the time index
        trend = pd.Series(np.nan, index=x_transformed.index)
        present = ~x_transformed.isnull()
        fmla = Formula('x ~ s(tt)')
        env = fmla.environment
        # Explicit R vectors, since the pandas/numpy converters are not activated
        env['tt'] = IntVector(range(1, len(x_transformed) + 1))
        env['x'] = FloatVector(x_transformed)
        # NOTE(review): assumes gam drops the missing rows so that the fitted
        # values line up with the non-missing positions -- confirm
        trend.loc[present] = np.asarray(rstats.fitted(rmgcv.gam(fmla)))
        seasonality = pd.Series(np.nan, index=x_transformed.index)
        remainder = x_transformed - trend

    return_dct = {
        'x': x_transformed,
        'trend': trend,
        'seasonality': seasonality,
        'remainder': remainder,
        'transform': transform,
        'lambda': var_lambda,
    }

    return return_dct

# Transformation Functions

In [10]:
import math
def f1_transformation(x, a, b):
    """Map [0, infinity) onto [0, 1] via (exp(a*x) - 1) / (exp(a*x) + b).

    Returns the limit value 1 when exp(a*x) is too large to represent.
    """
    try:
        eax = math.exp(a * x)
    except OverflowError:
        # math.exp raises instead of returning inf once the result no longer
        # fits in a float; the ratio tends to 1 there
        return 1
    if math.isinf(eax):
        # Only reachable when a * x is itself infinite
        return 1
    return (eax-1)/(eax+b)

def f2_transformation(x, a, b):
    """Map [0, 1] onto [0, 1].

    Uses the ratio (exp(a*x) - 1) / (exp(a*x) + b), rescaled by its value
    at x = 1 so that 0 maps to 0 and 1 maps to 1.
    """
    ea = math.exp(a)
    eax = math.exp(a*x)
    ratio = (eax-1)/(eax+b)
    rescaled = ratio*(ea+b)
    return rescaled/(ea-1)

'''
# Functions to map all the features onto a [0,1] scale
# f1 maps [0,infinity) to [0,1]
f1 <- function(x,a,b)
{
  eax <- exp(a*x)
  if (eax == Inf)
    f1eax <- 1
  else
    f1eax <- (eax-1)/(eax+b)
  return(f1eax)
}

# f2 maps [0,1] onto [0,1]
f2 <- function(x,a,b)
{
  eax <- exp(a*x)
  ea <- exp(a)
  return((eax-1)/(eax+b)*(ea+b)/(ea-1))
}
'''

'\n# Functions to map all the features onto a [0,1] scale\n# f1 maps [0,infinity) to [0,1]\nf1 <- function(x,a,b)\n{\n  eax <- exp(a*x)\n  if (eax == Inf)\n    f1eax <- 1\n  else\n    f1eax <- (eax-1)/(eax+b)\n  return(f1eax)\n}\n\n# f2 maps [0,1] onto [0,1]\nf2 <- function(x,a,b)\n{\n  eax <- exp(a*x)\n  ea <- exp(a)\n  return((eax-1)/(eax+b)*(ea+b)/(ea-1))\n}\n'

# Code for Neural Network Test
"Nonlinear time series models have been used extensively in recent years to model complex dynamics not adequately represented by linear models ... Because of the special characteristic (behavior) of time series data, the traditional linear models cannot handle the forecasting well compared to non-linear models. Therefore, non-linearity is an important characteristic of time series data to determine the selection of appropriate forecasting method.

There are many approaches to test the nonlinearity in time series regression models. Nonparametric kernel test and neural network test are the two major models appeared in the literature. In the comparative studies between these two approaches, neural network has been reported with better reliability. In this research, we used Teräsvirta's neural network test (Teräsvirta et al., 1993) for time series data non-linearity characteristics identification and extraction. It has been widely accepted and reported that it can correctly model the nonlinear structure of the data (Rocca and Perna, 2004). It is a test for neglected nonlinearity likely to have power against a range of alternatives based on neural network model (augmented single-hidden-layer feed forward neural network model). The test is based on a test function chosen as the activations of 'phantom' hidden units."

(4) (PDF) Characteristic-Based Clustering for Time Series Data. Available from: https://www.researchgate.net/publication/220451959_Characteristic-Based_Clustering_for_Time_Series_Data [accessed Jun 16 2020].

In [11]:
# https://machinelearningmastery.com/implement-backpropagation-algorithm-scratch-python/
from math import exp
from random import seed
from random import random
 
# Initialize a network
def initialize_network(n_inputs, n_hidden, n_outputs):
    """Build a two-layer network (hidden layer, then output layer).

    Each neuron is a dict whose 'weights' list holds one random weight per
    incoming value plus one trailing bias weight.
    """
    def make_layer(n_neurons, fan_in):
        layer = []
        for _ in range(n_neurons):
            layer.append({'weights': [random() for _ in range(fan_in + 1)]})
        return layer

    # Hidden layer first, so the random draws happen in layer order
    return [make_layer(n_hidden, n_inputs), make_layer(n_outputs, n_hidden)]
 
# Calculate neuron activation for an input
def activate(weights, inputs):
    """Return the bias (last weight) plus the weighted sum of the inputs.

    Only the first len(weights) - 1 inputs are read; any extra trailing
    entries in `inputs` are ignored.
    """
    total = weights[-1]
    for position, weight in enumerate(weights[:-1]):
        total += weight * inputs[position]
    return total
 
# Transfer neuron activation
def transfer(activation):
    """Sigmoid of `activation`, safe for large-magnitude inputs.

    For negative activations the equivalent form exp(a) / (1 + exp(a)) is
    used, because exp(-a) would overflow once -a gets large.
    """
    if activation >= 0:
        return 1.0 / (1.0 + exp(-activation))
    scaled = exp(activation)
    return scaled / (1.0 + scaled)
 
# Forward propagate input to a network output
def forward_propagate(network, row):
    """Feed `row` through every layer and return the last layer's outputs.

    Prints the incoming row, and stores each neuron's result under its
    'output' key as a side effect.
    """
    def fire(neuron, signal):
        neuron['output'] = transfer(activate(neuron['weights'], signal))
        return neuron['output']

    signal = row
    print('Inputs:', signal)
    for layer in network:
        # Every neuron of a layer sees the same outputs of the previous layer
        signal = [fire(neuron, signal) for neuron in layer]
    return signal
 
# Calculate the derivative of an neuron output
def transfer_derivative(output):
    """Slope of the sigmoid, expressed in terms of its output value."""
    complement = 1.0 - output
    return output * complement
 
# Backpropagate error and store in neurons
def backward_propagate_error(network, expected):
    """Walk the layers from last to first and set each neuron's 'delta'.

    The last layer's error is expected minus actual output; an earlier
    layer's error is the delta-weighted sum over the neurons of the layer
    after it. Each delta is the error times the transfer derivative of the
    neuron's stored output.
    """
    last = len(network) - 1
    for depth in range(last, -1, -1):
        layer = network[depth]
        if depth == last:
            errors = [expected[k] - layer[k]['output'] for k in range(len(layer))]
        else:
            downstream = network[depth + 1]
            errors = []
            for k in range(len(layer)):
                blame = 0.0
                for neuron in downstream:
                    blame += (neuron['weights'][k] * neuron['delta'])
                errors.append(blame)
        for k, neuron in enumerate(layer):
            neuron['delta'] = errors[k] * transfer_derivative(neuron['output'])
 
# Update network weights with error
def update_weights(network, row, l_rate):
    """Adjust every weight in place using the stored deltas.

    The first layer is fed by the row without its last element; each later
    layer is fed by the stored outputs of the layer before it. The trailing
    bias weight moves by l_rate * delta.
    """
    for depth, layer in enumerate(network):
        if depth == 0:
            inputs = row[:-1]
        else:
            inputs = [neuron['output'] for neuron in network[depth - 1]]
        for neuron in layer:
            step = l_rate * neuron['delta']
            for j, value in enumerate(inputs):
                neuron['weights'][j] += step * value
            neuron['weights'][-1] += step
 
# Train a network for a fixed number of epochs
def train_network_example(network, train, l_rate, n_epoch, n_outputs):
    """Train on `train` with per-row weight updates and print the epoch error.

    The last element of each row is used as the index of the target class,
    which is one-hot encoded over `n_outputs` positions.
    """
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            outputs = forward_propagate(network, row)
            expected = [0] * n_outputs
            expected[row[-1]] = 1
            row_error = 0
            for i in range(len(expected)):
                row_error += (expected[i]-outputs[i])**2
            sum_error += row_error
            backward_propagate_error(network, expected)
            update_weights(network, row, l_rate)
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
 
# Train a network for a fixed number of epochs
def train_network(network, train, l_rate, n_epoch, n_outputs):
    """Forward-propagate every row once per epoch without any learning.

    The error accumulation, backpropagation and weight update of
    train_network_example are disabled here; the per-row outputs are
    collected each epoch but not returned, and `l_rate` and `n_outputs`
    are not used.
    """
    for _ in range(n_epoch):
        network_outputs = [forward_propagate(network, row) for row in train]

# Test training backprop algorithm
# Seed the RNG so the random initial weights are reproducible
seed(1)
# Toy two-class dataset: each row is [feature_1, feature_2, class_label]
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
# Every column except the trailing label is a network input
n_inputs = len(dataset[0]) - 1
# One output neuron per distinct class label
n_outputs = len(set([row[-1] for row in dataset]))
# Two hidden neurons, trained with learning rate 0.5 for 25 epochs
network = initialize_network(n_inputs, 2, n_outputs)
train_network_example(network, dataset, 0.5, 25, n_outputs)
# Dump each layer's neurons (weights plus the stored 'output' and 'delta')
for layer in network:
    print(layer)


Inputs: [2.7810836, 2.550537003, 0]
Inputs: [1.465489372, 2.362125076, 0]
Inputs: [3.396561688, 4.400293529, 0]
Inputs: [1.38807019, 1.850220317, 0]
Inputs: [3.06407232, 3.005305973, 0]
Inputs: [7.627531214, 2.759262235, 1]
Inputs: [5.332441248, 2.088626775, 1]
Inputs: [6.922596716, 1.77106367, 1]
Inputs: [8.675418651, -0.242068655, 1]
Inputs: [7.673756466, 3.508563011, 1]
>epoch=0, lrate=0.500, error=6.350
Inputs: [2.7810836, 2.550537003, 0]
Inputs: [1.465489372, 2.362125076, 0]
Inputs: [3.396561688, 4.400293529, 0]
Inputs: [1.38807019, 1.850220317, 0]
Inputs: [3.06407232, 3.005305973, 0]
Inputs: [7.627531214, 2.759262235, 1]
Inputs: [5.332441248, 2.088626775, 1]
Inputs: [6.922596716, 1.77106367, 1]
Inputs: [8.675418651, -0.242068655, 1]
Inputs: [7.673756466, 3.508563011, 1]
>epoch=1, lrate=0.500, error=5.531
Inputs: [2.7810836, 2.550537003, 0]
Inputs: [1.465489372, 2.362125076, 0]
Inputs: [3.396561688, 4.400293529, 0]
Inputs: [1.38807019, 1.850220317, 0]
Inputs: [3.06407232, 3.005305

# Measure Calculations

## Load in RPy2 and configure settings so that we can perform the Neural Network Test + FARIMA test

In [193]:
from scipy.special import inv_boxcox
from statsmodels.stats.diagnostic import acorr_ljungbox
#from pypr.stattest.ljungbox import boxpierce
def _box_pierce_statistic(series, max_lag):
    """Return the Box-Pierce statistic of `series` at lag `max_lag`.

    acorr_ljungbox returns either a DataFrame (with a 'bp_stat' column) or a
    tuple (lb stat, lb p-value, bp stat, bp p-value) depending on the
    statsmodels version; both are handled. Values are reported per lag, so
    the last entry is taken.
    """
    result = acorr_ljungbox(series, lags=max_lag, boxpierce=True)
    if isinstance(result, pd.DataFrame):
        return float(result['bp_stat'].iloc[-1])
    return float(np.asarray(result[2])[-1])


def _explained_variance(v_adj, v_reference, threshold=1e-10):
    """1 - v_adj / v_reference clamped to [0, 1]; 0 for a near-constant reference."""
    if v_reference < threshold:
        return 0
    return max(0, min(1, 1 - (v_adj / v_reference)))


def calculate_measures(x):
    """Compute the 13 characteristic measures of a univariate time series.

    Parameters
    ----------
    x : pandas.Series with a datetime index

    Returns
    -------
    list
        [frequency, trend, seasonal, autocorrelation, non-linear, skewness,
        kurtosis, Hurst, Lyapunov, dc autocorrelation, dc non-linear,
        dc skewness, dc kurtosis], where "dc" refers to the detrended and
        deseasonalised series. The Lyapunov entry is None when the estimated
        period exceeds len(x) - 10 (insufficient data).
    """
    # Save ts version of our data for the R-backed functions below
    rbase.set_seed(123) # reproducibility seed
    x_contiguous = na_contiguous(x)
    x_ts_contiguous = r.ts(FloatVector(x_contiguous))

    N = len(x)
    freq = find_freq_r(x_ts_contiguous)
    fx = (math.exp((freq-1)/50)-1)/(1+math.exp((freq-1)/50))

    # Decomposition
    decomp_x = decompose(x)

    # Frequency is determined differently inside decompose, so the seasonal
    # component can be entirely null even when freq > 1; such series are
    # treated as non-seasonal everywhere below.
    seasonal = freq > 1 and (not decomp_x['seasonality'].isnull().all())

    # Adjust data: remove the fitted components but keep the mean level
    if seasonal:
        fit = decomp_x['trend'] + decomp_x['seasonality']
    else:
        # Nonseasonal data
        fit = decomp_x['trend']
    adj_x = decomp_x['x'] - fit + np.mean(decomp_x['trend'].dropna())

    # Backtransformation of adjusted data (inverse of scipy's Box-Cox):
    #     y = (x**lmbda - 1) / lmbda,  for lmbda != 0
    #         log(x),                  for lmbda = 0   (natural logarithm)
    if decomp_x['transform']:
        lmbda = decomp_x['lambda']
        if lmbda == 0:
            t_adj_x = np.exp(adj_x)
        else:
            # x = ((y * lambda) + 1) ^ (1/lambda)
            t_adj_x = ((adj_x * lmbda) + 1) ** (1/lmbda)
    else:
        t_adj_x = adj_x

    # Trend and seasonal measures
    v_adj = np.var(adj_x.dropna())
    if seasonal:
        detrend = decomp_x['x'] - decomp_x['trend']
        deseason = decomp_x['x'] - decomp_x['seasonality']
        trend = _explained_variance(v_adj, np.var(deseason.dropna()))
        seasonality = _explained_variance(v_adj, np.var(detrend.dropna()))
    else:
        # Nonseasonal data
        trend = _explained_variance(v_adj, np.var(decomp_x['x'].dropna()))
        seasonality = 0

    measures = [fx,trend,seasonality]

    # ---- Measures on original data ----
    xbar = np.mean(x.dropna())
    std = np.std(x.dropna())

    # Serial correlation, on the NaN-free stretch like the adjusted series below
    max_lag = 10
    Q = _box_pierce_statistic(x_contiguous, max_lag) / (N*max_lag)
    fQ = f2_transformation(Q,7.53,0.103)

    # Nonlinearity (THIS REQUIRES THE TIMESERIES OBJECT VERSION OF OUR DATA)
    # Result names: "statistic" "parameter" "p.value" "method" "data.name" "arguments"
    non_linear_test = rtseries.terasvirta_test_ts(x_ts_contiguous,type = "Chisq")
    p = list(non_linear_test[0])[0]
    fp = f1_transformation(p,0.069,2.304)

    # Skewness
    skew = abs(np.mean((x.dropna()-xbar) ** 3)/std ** 3)
    fs = f1_transformation(skew,1.510,5.993)

    # Kurtosis
    kurtosis = np.mean((x.dropna()-xbar) ** 4)/std ** 4
    fk = f1_transformation(kurtosis,2.273,11567)

    # Hurst=d+0.5 where d is fractional difference
    # "d" is the fourth component of the fracdiff result
    hurst = rfracdiff.fracdiff(x_ts_contiguous,0,0)
    H = list(hurst[3])[0] + 0.5

    # Lyapunov Exponent: for each point, take its nearest neighbour in value
    # and measure how far the two trajectories diverged `freq` steps later.
    if freq > (N-10):
        # There is insufficient data, declare this variable as none
        fLyap = None
    else:
        values = x.to_numpy(dtype=float)
        Ly = np.full(N-freq, np.nan)
        for i in range(N-freq):
            # Stable sort keeps the earliest position first among ties
            order = np.argsort(np.abs(values[i] - values), kind='stable')
            order = order[order < (N-freq)]
            j = order[1] # order[0] is normally the point itself
            with np.errstate(divide='ignore', invalid='ignore'):
                ratio = abs((values[i+freq] - values[j+freq])/(values[i]-values[j]))
                Ly[i] = np.log(ratio) / freq
        # log(0) and divisions by zero give inf/nan; as in the R code those
        # terms are excluded from the average rather than counted as 0
        Lyap = np.mean(Ly[np.isfinite(Ly)])
        fLyap = math.exp(Lyap) / (1+math.exp(Lyap))

    measures = measures + [fQ,fp,fs,fk,H,fLyap]

    # ---- Measures on adjusted data ----
    xbar = np.mean(t_adj_x.dropna())
    std = np.std(t_adj_x.dropna())

    # Serial correlation
    Q = _box_pierce_statistic(na_contiguous(adj_x), max_lag) / (N*max_lag)
    fQ = f2_transformation(Q,7.53,0.103)

    # Nonlinearity (THIS REQUIRES THE TIMESERIES OBJECT VERSION OF OUR DATA)
    adj_x_contiguous = r.ts(FloatVector(na_contiguous(adj_x)))
    non_linear_test = rtseries.terasvirta_test_ts(adj_x_contiguous,type = "Chisq")
    # Grab first element ("statistic")
    p = list(non_linear_test[0])[0]
    fp = f1_transformation(p,0.069,2.304)

    # Skewness
    skew = abs(np.mean((t_adj_x.dropna() - xbar) ** 3)/(std ** 3))
    fs = f1_transformation(skew,1.510,5.993)

    # Kurtosis
    kurtosis = np.mean((t_adj_x.dropna() - xbar) ** 4)/(std ** 4)
    fk = f1_transformation(kurtosis,2.273,11567)

    measures_list = measures + [fQ,fp,fs,fk]

    return measures_list

## Input Data

The code here was based on input data that looked like the following:
```
   DATE_COLUMN  ID_COL  COL_1   COL_2	COL_3	COL_4	COL_5	COL_6
0	2020-01-01	  1    71.068	71.880	48.989	91.525	268	25	1.94
1	2020-01-02	  1    67.620	58.387	60.222	93.487	323	54	5.65
2	2020-01-03	  2    79.221	82.174	66.070	86.466	476	103	6.59
3	2020-01-04	  2    74.678	80.550	66.839	70.774	464	68	10.56
...
```

The `ID_COL` column was an identifying value distinguishing between different users on a platform. The below code separates and groups data by this column and then does the feature extraction on each grouping; if there is no identifying value then that step can be skipped and run on a dataset consisting of a date column and a numerical column.

Ideally there is enough data for trends to take shape and for meaningful features to be extracted (preferably more than a year of observations).

Also, there isn't a specific method for addressing missing data in the paper; that will need to be chosen beforehand.

Example code is shown below of what the whole thing would look like:

In [1]:
'''
# Example workflow: split the input by ID and extract the measures per series.
# The whole cell is wrapped in a string literal so that executing it is a no-op;
# remove the surrounding quotes and load `analysis_df` to actually run it.

# Declare variables that we want to analyze 
#analysis_df = pd.read_csv(...)
analysis_col = 'COL_1'
date_col = 'DATE_COLUMN'
analysis_columns = ['ID_COL', date_col, analysis_col]
analysis_data = analysis_df[analysis_columns]

# Group data by ID: one date-indexed series of `analysis_col` per ID value
analysis_data_by_id = {}
for id in analysis_data['ID_COL'].unique():
    subset_df = analysis_data.loc[analysis_data['ID_COL'].isin([id])]
    sorted_subset_df = subset_df.sort_values(by=['ID_COL',date_col])
    analysis_data_by_id[id] = sorted_subset_df[[date_col,analysis_col]].set_index(date_col)[analysis_col]
    
# Run the feature extraction on every per-ID series built above
measures_dct = {}
for id,series in analysis_data_by_id.items():
    print('Now calculating measures for user ',id)
    measures_dct[id] = calculate_measures(series)
    
# One row per ID, one column per measure
measures_df = pd.DataFrame.from_dict(measures_dct,orient='index',columns=["frequency", "trend","seasonal", "autocorrelation","non-linear","skewness","kurtosis","Hurst","Lyapunov","dc autocorrelation","dc non-linear","dc skewness","dc kurtosis"])
display(measures_df)
'''

'\n# Declare variables that we want to analyze \n#analysis_df = pd.read_csv(...)\nanalysis_col = \'COL_1\'\ndate_col = \'DATE_COLUMN\'\nanalysis_columns = [\'ID_COL\', date_col, analysis_col]\nanalysis_data = analysis_df[analysis_columns]\n# Group data by ID\nanalysis_data_by_id = {}\nfor id in analysis_data[\'ID_COL\'].unique():\n    subset_df = analysis_data.loc[analysis_data[\'ID_COL\'].isin([id])]\n    sorted_subset_df = subset_df.sort_values(by=[\'ID_COL\',date_col])\n    analysis_data_by_id[id] = sorted_subset_df[[date_col,analysis_col]].set_index(date_col)[analysis_col]\n    \nmeasures_dct = {}\nfor id,series in analysis_data_by_user_id.items():\n    print(\'Now calculating measures for user \',id)\n    measures_dct[id] = calculate_measures(series)\n    \nmeasures_df = pd.DataFrame.from_dict(measures_dct,orient=\'index\',columns=["frequency", "trend","seasonal", "autocorrelation","non-linear","skewness","kurtosis","Hurst","Lyapunov","dc autocorrelation","dc non-linear","dc skewn

### Example output data

```
id_col frequency	trend	seasonal	autocorrelation	non-linear	skewness	kurtosis	Hurst	Lyapunov	dc autocorrelation	dc non-linear	dc skewness	dc kurtosis
1	0.059928	0.124919	0.243887	0.109589	0.028767	0.067714	0.046443	0.500046	0.618593	0.206500	0.044546	0.060988	0.237828
2	0.000000	0.267712	0.000000	0.240373	0.004038	0.140565	0.040838	0.729902	0.982717	0.230324	0.026835	0.142644	0.108635
3	0.069886	0.118976	0.456264	0.298858	0.053476	0.015268	0.037035	0.500046	0.619930	0.165401	0.002541	0.038293	0.087527
4	0.000000	0.166727	0.000000	0.106591	0.030656	0.281176	0.284982	0.666537	0.983842	0.139930	0.025257	0.431716	0.988605
5	0.000000	0.272094	0.000000	0.096659	0.046871	0.259884	0.222697	0.555352	0.985034	0.176478	0.006834	0.174641	0.065182
```