Contributors: Sridhar K. N. Rao

Copyright 2022-23 The Linux Foundation

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.


# Time-Series Analysis

This Notebook includes time-series analysis of:

1. CPU
2. Memory
3. System Load

The analysis covers each dataset individually as well as a comparison between datasets.

## Data Preparation

### CPU

In [None]:
import os
import pandas as pd

# When the CPU idle data is split into one "percent-idle*" file per core,
# collect every such file under the node directory and merge them into one
# dataset by averaging the rows that share the same position in each file.
paths_one = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk("/data/pod18-node4/")
    for name in names
    if name.startswith("percent-idle")
]

dfs_one = (pd.read_csv(csv_path, index_col=False) for csv_path in paths_one)
stacked_one = pd.concat(dfs_one)
data_one = stacked_one.groupby(level=0).mean()

# The second dataset is already available as a single file.
data_two = pd.read_csv('/data/node5/percent-idle.csv')

### Memory

In [None]:
import os
import pandas as pd
# Memory-used samples of the two nodes under comparison.
data_one = pd.read_csv(
    '/data/pod18-node4/memory/memory-used-2022-06-19',
    index_col=False,
)
data_two = pd.read_csv(
    '/data/pod18-node5/memory/memory-used.csv',
    index_col=False,
)

### System Load

In [None]:
import os
import pandas as pd
# System-load samples of the two nodes under comparison.
data_one = pd.read_csv(
    '/data/pod18-node4/load/load-2022-06-19',
    index_col=False,
)
data_two = pd.read_csv(
    '/data/pod18-node5/load/load.csv',
    index_col=False,
)

### Convert Epoch, Set Index, and Make the Two Datasets the Same Size

In [None]:
# Interpret the 'epoch' column of each frame as seconds since the Unix epoch,
# convert it to timestamps, and make it the index (both frames are modified
# in place).
for frame in (data_one, data_two):
    frame['epoch'] = pd.to_datetime(frame['epoch'], unit='s')
    frame.set_index('epoch', inplace=True)

In [None]:
# Trim the longer dataset so that both contain the same number of rows.
#
# Rows are selected by position (iloc) rather than removed with
# DataFrame.drop(): drop() works on index *labels*, so if the index contains
# repeated values (e.g. duplicate timestamps) it would also delete earlier
# rows that share a label with the trailing rows.
# .copy() keeps the results independent frames, so later column assignments
# do not operate on a slice of the original data.
common_rows = min(data_one.shape[0], data_two.shape[0])
data_one = data_one.iloc[:common_rows].copy()
data_two = data_two.iloc[:common_rows].copy()

print(data_one.shape[0])
print(data_two.shape[0])

## Autocorrelation

In [None]:
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# `pyplot` is not imported anywhere else in this notebook, so import it here;
# without it pyplot.show() raises NameError.
from matplotlib import pyplot

# Autocorrelation of the first dataset.
autocorrelation_plot(data_one)
pyplot.show()

In [None]:
# `pyplot` is not imported anywhere else in this notebook, so import it here;
# without it pyplot.show() raises NameError.
from matplotlib import pyplot

# Autocorrelation of the second dataset.
autocorrelation_plot(data_two)
pyplot.show()

## ARIMA

In [None]:
# Take independent deep copies so that changes made to the copies leave the
# original frames untouched.
c1_copy, c2_copy = (frame.copy(deep=True) for frame in (data_one, data_two))

In [None]:
# Replace the index of the first copy with its period representation at
# one-second frequency, then display the new index.
periods_one = c1_copy.index.to_period('s')
c1_copy.index = periods_one
c1_copy.index

In [None]:
# Replace the index of the second copy with its period representation at
# one-second frequency, then display the new index.
periods_two = c2_copy.index.to_period('s')
c2_copy.index = periods_two
c2_copy.index

In [None]:
# Fit an ARIMA model with order (p=1, d=1, q=0) to the second dataset and
# print the summary of the fit.
arima_order = (1, 1, 0)
model = ARIMA(c2_copy, order=arima_order)
model_fit = model.fit()
fit_summary = model_fit.summary()
print(fit_summary)

In [None]:
# Fit an ARIMA model with order (p=1, d=1, q=0) to the first dataset and
# print the summary of the fit.
arima_order = (1, 1, 0)
model = ARIMA(c1_copy, order=arima_order)
model_fit = model.fit()
fit_summary = model_fit.summary()
print(fit_summary)

## Histograms

In [None]:
# Histogram of the 'value' column of the second dataset, using 20 bins.
data_two.hist(bins=20, column="value")

In [None]:
# Histogram of the 'value' column of the first dataset, using 20 bins.
data_one.hist(bins=20, column="value")

### Load

In [None]:
# Histogram of the 'longterm' column of the first dataset, using 20 bins.
data_one.hist(bins=20, column="longterm")

In [None]:
# Histogram of the 'longterm' column of the second dataset, using 20 bins.
data_two.hist(bins=20, column="longterm")

# Compute Probabilities

In [None]:
import seaborn as sns

In [None]:
# Round 'value' to whole numbers (stored back into the frame), then compute
# the relative frequency of each rounded value.
rounded_one = data_one['value'].round(decimals=0)
data_one['value'] = rounded_one
probabilities_one = data_one['value'].value_counts(normalize=True)

In [None]:
# Round 'value' to whole numbers (stored back into the frame), then compute
# the relative frequency of each rounded value.
rounded_two = data_two['value'].round(decimals=0)
data_two['value'] = rounded_two
probabilities_two = data_two['value'].value_counts(normalize=True)

### Load

In [None]:
# Round 'longterm' to whole numbers (stored back into the frame), then
# compute the relative frequency of each rounded value.
rounded_one = data_one['longterm'].round(decimals=0)
data_one['longterm'] = rounded_one
probabilities_one = data_one['longterm'].value_counts(normalize=True)

In [None]:
# Round 'longterm' to whole numbers (stored back into the frame), then
# compute the relative frequency of each rounded value.
rounded_two = data_two['longterm'].round(decimals=0)
data_two['longterm'] = rounded_two
probabilities_two = data_two['longterm'].value_counts(normalize=True)

In [None]:
# Plot both probability distributions as grouped bars on one shared x axis.
#
# Drawing two separate barplots on the same axes in the same colour makes the
# datasets indistinguishable, and because barplot treats x as categorical the
# bars end up under the wrong labels whenever the two series do not contain
# exactly the same set of values. Aligning both series on the union of their
# values and using `hue` avoids both problems.
combined_probabilities = (
    pd.concat(
        {"data_one": probabilities_one, "data_two": probabilities_two},
        axis=1,
    )
    .fillna(0)  # a value never observed in one dataset has probability 0
    .sort_index()
    .rename_axis("value")
    .reset_index()
    .melt(id_vars="value", var_name="dataset", value_name="probability")
)
sns.barplot(data=combined_probabilities, x="value", y="probability", hue="dataset")

# Comparative Analysis

## Dynamic Time Warping

## Interpretation
Dynamic time warping is a seminal time series comparison technique. The objective of time series comparison methods is to produce a distance metric between two input time series. The similarity or dissimilarity of two time series is typically calculated by converting the data into vectors and calculating the Euclidean distance between those points in vector space.

In [None]:
import numpy as np 
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
# DTW distance between the two datasets (all columns of each frame), using
# the Euclidean distance between individual points.
x, y = data_one.to_numpy(), data_two.to_numpy()
dtw_result = fastdtw(x, y, dist=euclidean)
distance, path = dtw_result
print(distance)

In [None]:
from scipy.spatial.distance import chebyshev, cityblock
# DTW distance between the 'value' columns only, using the city-block
# (Manhattan) distance between individual points.
values_one = data_one.loc[:, ['value']].to_numpy()
values_two = data_two.loc[:, ['value']].to_numpy()
dtw_result = fastdtw(values_one, values_two, dist=cityblock)
dist, path = dtw_result
print(dist)

### Load

In [None]:
from scipy.spatial.distance import chebyshev, cityblock
# DTW distance between the 'longterm' columns only, using the city-block
# (Manhattan) distance between individual points.
values_one = data_one.loc[:, ['longterm']].to_numpy()
values_two = data_two.loc[:, ['longterm']].to_numpy()
dtw_result = fastdtw(values_one, values_two, dist=cityblock)
dist, path = dtw_result
print(dist)

# Wasserstein Distance

## Interpretation
Wasserstein distance provides a meaningful and smooth representation of the distance between distributions. It measures the minimal effort required to reconfigure the probability mass of one distribution in order to recover the other distribution.
Expectation: Less than 2.

In [None]:
from scipy.stats import wasserstein_distance
# Wasserstein distance between the 'value' distributions of the two datasets;
# the result is displayed as the cell output.
sample_one, sample_two = data_one['value'], data_two['value']
wd = wasserstein_distance(sample_one, sample_two)
wd

### Load

In [None]:
from scipy.stats import wasserstein_distance
# Wasserstein distance between the 'longterm' distributions of the two
# datasets; the result is displayed as the cell output.
sample_one, sample_two = data_one['longterm'], data_two['longterm']
wd = wasserstein_distance(sample_one, sample_two)
wd

# Maximum Mean Discrepancy

## Reference
https://www.kaggle.com/code/onurtunali/maximum-mean-discrepancy/notebook

## Interpretation
MMD is defined by the idea of representing distances between distributions as distances between mean embeddings of features. Two distributions are similar if their moments are similar. By applying a kernel, we can transform the variable such that all moments (first, second, third etc.) are computed. In the latent space we can compute the difference between the moments and average it. This gives a measure of the similarity/dissimilarity between the datasets.

In [None]:
import torch

def MMD(x, y, kernel):
    """Empirical maximum mean discrepancy (MMD) between two samples.

    The lower the result, the more evidence that the two samples come from
    the same distribution.

    All intermediate tensors are derived from the inputs, so the function
    does not rely on a global ``device`` variable and works on whatever
    device ``x`` and ``y`` live on.

    Args:
        x: first sample, distribution P; 2-D tensor with one row per
            observation.
        y: second sample, distribution Q; 2-D tensor with the same number of
            columns as ``x``. The number of rows may differ from ``x``.
        kernel: kernel type, either "multiscale" or "rbf".

    Returns:
        A scalar tensor equal to mean(k(x, x)) + mean(k(y, y))
        - 2 * mean(k(x, y)), where k is the chosen kernel summed over a fixed
        set of bandwidths.

    Raises:
        ValueError: if ``kernel`` is not "multiscale" or "rbf" (previously an
            unknown kernel silently produced 0).
    """
    if kernel == "multiscale":
        bandwidth_range = [0.2, 0.5, 0.9, 1.3]

        def kernel_fn(sq_dist, a):
            return a**2 * (a**2 + sq_dist)**-1
    elif kernel == "rbf":
        bandwidth_range = [10, 15, 20, 50]

        def kernel_fn(sq_dist, a):
            return torch.exp(-0.5 * sq_dist / a)
    else:
        raise ValueError(
            f"Unknown kernel {kernel!r}; expected 'multiscale' or 'rbf'"
        )

    xx, yy, zz = torch.mm(x, x.t()), torch.mm(y, y.t()), torch.mm(x, y.t())

    # Squared norm of every row, as column vectors so they broadcast against
    # their transposes to form pairwise sums.
    x_sq = xx.diag().unsqueeze(1)
    y_sq = yy.diag().unsqueeze(1)

    # Pairwise squared Euclidean distances: |a|^2 + |b|^2 - 2 * <a, b>.
    dxx = x_sq + x_sq.t() - 2. * xx  # within x
    dyy = y_sq + y_sq.t() - 2. * yy  # within y
    dxy = x_sq + y_sq.t() - 2. * zz  # between x and y

    # Each accumulator takes the shape of its own distance matrix, so x and y
    # may contain different numbers of observations.
    XX = torch.zeros_like(dxx)
    YY = torch.zeros_like(dyy)
    XY = torch.zeros_like(dxy)

    for a in bandwidth_range:
        XX += kernel_fn(dxx, a)
        YY += kernel_fn(dyy, a)
        XY += kernel_fn(dxy, a)

    # Averaging each term separately equals mean(XX + YY - 2 * XY) when the
    # samples have equal size, and stays well defined when they do not.
    return XX.mean() + YY.mean() - 2. * XY.mean()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from scipy.stats import dirichlet 
from torch.distributions.multivariate_normal import MultivariateNormal



In [None]:
device = "cpu"

# Parameters of two synthetic 2-D Gaussian distributions. They are only
# needed when testing MMD on sampled data instead of the measured datasets:
# draw the samples with px.sample([m]).to(device) / qy.sample([m]).to(device).
m = 20  # sample size
x_mean = torch.ones(2)
y_mean = torch.zeros(2)
# IMPORTANT: Covariance matrices must be positive definite
x_cov = 2 * torch.eye(2)
y_cov = 3 * torch.eye(2) - 1

px = MultivariateNormal(x_mean, x_cov)
qy = MultivariateNormal(y_mean, y_cov)

# Convert both datasets to float tensors on the selected device.
x = torch.from_numpy(data_one.to_numpy()).float().to(device)
y = torch.from_numpy(data_two.to_numpy()).float().to(device)
for shown in (x.t(), y.t(), type(x)):
    print(shown)

result = MMD(x, y, kernel="multiscale")

print(f"MMD result of X and Y is {result.item()}")

### Load

May have to change the device value and then test.

# Root mean square difference

## Interpretation
The closer the RMSE is to 0, the more similar the generated data is to the real data. However, RMSE is expressed on the same scale as the quantity being emulated, so there is no general rule for interpreting ranges of values.
For CPU usage, if it is more than 30, the result may need to be re-examined.

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE between the 'value' columns, computed as the square root of the MSE.
# The `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly to
# work across versions.
rmse = mean_squared_error(data_one['value'], data_two['value']) ** 0.5

In [None]:
# Show the RMSE of the 'value' columns.
print(rmse)

In [None]:
# RMSE computed as the square root of the MSE. The `squared=False` argument
# of mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so the root is taken explicitly to work across versions.
# NOTE(review): this uses the same columns as `rmse`, so it yields the same
# number; presumably a different column or dataset was intended - confirm.
rmse2 = mean_squared_error(data_one['value'], data_two['value']) ** 0.5

In [None]:
# Show the second RMSE result.
print(rmse2)

In [None]:
# RMSE between the two 'value' columns after sorting each one independently,
# i.e. the values are compared rank by rank rather than timestamp by
# timestamp.
# The `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly to
# work across versions.
sorted_one = data_one.sort_values(by='value')['value']
sorted_two = data_two.sort_values(by='value')['value']
rmse3 = mean_squared_error(sorted_one, sorted_two) ** 0.5
print(rmse3)

# Mutual Information

## Interpretation
Mutual information describes relationships in terms of uncertainty. The mutual information (MI) between two quantities is a measure of the extent to which knowledge of one quantity reduces uncertainty about the other. High mutual information indicates a large reduction in uncertainty; low mutual information indicates a small reduction; and zero mutual information between two random variables means the variables are independent. The least possible mutual information between quantities is 0.0. When MI is zero, the quantities are independent: neither can tell you anything about the other. Conversely, in theory there's no upper bound to what MI can be. In practice though values above 2.0 or so are uncommon. (Mutual information is a logarithmic quantity, so it increases very slowly.)

Expectation: the MI score should be 1.5 or higher.

In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    Args:
        X: DataFrame of features; its column names label the result.
        y: target values passed to ``mutual_info_regression``.
        discrete_features: forwarded unchanged as the ``discrete_features``
            argument of ``mutual_info_regression``.

    Returns:
        A pandas Series named "MI Scores", indexed by the columns of ``X``
        and sorted from the highest to the lowest score.
    """
    raw_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features
    )
    labelled_scores = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled_scores.sort_values(ascending=False)

In [None]:
# Boolean mask with one entry per column of data_one: True where the
# column's dtype compares equal to int. Used as the `discrete_features`
# argument for the mutual-information computation.
discrete_features = data_one.dtypes == int

In [None]:
# Mutual information between every column of the first dataset and the
# 'value' column of the second dataset.
target_values = data_two['value']
mi_scores = make_mi_scores(data_one, target_values, discrete_features)

In [None]:
# Show the mutual-information scores.
print(mi_scores)