# Time Series Processing

In [2]:
import stumpy
import pandas as pd
from fastdtw import fastdtw
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from IPython.display import display, HTML
from scipy.spatial.distance import euclidean

In [3]:
def get_subdirectories(directory=""):
    subdirectories = []
    p = Path("./../experiments_data/" + directory)
    for item in p.glob('*/'):
        if item.suffix not in (['.csv', '.zip']):
            subdirectories.append(directory + "/" + item.name)
    return subdirectories

In [4]:
all_directories = get_subdirectories()
nodes_directories = [x for x in all_directories if "node" in x]

data_directories = []
data_directories_groups = []
for directory in nodes_directories:
    cur_node_subdirectories = get_subdirectories(directory)
    data_directories.append(cur_node_subdirectories)

data_directories_groups = data_directories
data_directories = [item for sublist in data_directories for item in sublist]

function_names = data_directories_groups[1]
function_names = list(map(lambda x: x[8:], data_directories_groups[1]))
function_names

['/avgNetProfitGroupedBySoldDate',
 '/avgNetProfitGroupedBySoldDateWhereProfitNegative',
 '/avgNetProfitGroupedBySoldDateWhereYearAfter2000',
 '/avgWholeSaleCostGroupedBySoldDate',
 '/countDistinctTicketNumber',
 '/countNetProfitGroupedBySoldDate',
 '/countNetProfitGroupedBySoldDateWhereProfitNegative',
 '/countNetProfitGroupedBySoldDateWhereYearAfter2000',
 '/countWholeSaleCostGroupedBySoldDate',
 '/filterCatalogSalesWhereProfitNegative',
 '/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',
 '/filterCatalogSalesWhereYearAfter2000',
 '/filterStoreSalesWhereProfitNegative',
 '/filterStoreSalesWhereProfitNegativeAndYearAfter2000',
 '/filterStoreSalesWhereYearAfter2000',
 '/maxNetProfitGroupedBySoldDate',
 '/maxNetProfitGroupedBySoldDateWhereProfitNegative',
 '/maxNetProfitGroupedBySoldDateWhereYearAfter2000',
 '/maxWholeSaleCostGroupedBySoldDate',
 '/minNetProfitGroupedBySoldDate',
 '/minNetProfitGroupedBySoldDateWhereProfitNegative',
 '/minNetProfitGroupedBySoldDateWhereYearAfter2

### Aggregating labels by functions names

In [5]:
labels = pd.read_csv("./../notebooks/functions.csv")
labels = labels.groupby('function_name')['label'].apply(', '.join).reset_index()
labels

Unnamed: 0,function_name,label
0,avgNetProfitGroupedBySoldDate,aggregation
1,avgNetProfitGroupedBySoldDateWhereProfitNegative,"aggregation, filtration"
2,avgNetProfitGroupedBySoldDateWhereYearAfter2000,"aggregation, filtration, join"
3,avgWholeSaleCostGroupedBySoldDate,aggregation
4,countDistinctTicketNumber,aggregation
5,countNetProfitGroupedBySoldDate,aggregation
6,countNetProfitGroupedBySoldDateWhereProfitNega...,"aggregation, filtration"
7,countNetProfitGroupedBySoldDateWhereYearAfter2000,"aggregation, filtration, join"
8,countWholeSaleCostGroupedBySoldDate,aggregation
9,filterCatalogSalesWhereProfitNegative,filtration


### Adding symbols (ts1, ts2, ...) to function names

In [6]:
iterator = 1
symbolic_data = pd.DataFrame(columns=["sym", "f_name"])
for function in function_names:
    symbolic_data = symbolic_data.append({"sym": f"ts{iterator}", "f_name": function[1:]}, ignore_index=True)
    iterator += 1

### Merging labels with symbols

In [7]:
print("Legend:")
labeled_data = pd.merge(symbolic_data, labels, left_on='f_name', right_on='function_name', how='left').drop('function_name', axis=1)
labeled_data = labeled_data
display(labeled_data)
# print(labeled_data['label'].to_string(index=False))
labeled_data.to_csv(f"./../experiments_data_/preprocessed-data/corr_legend.csv")

Legend:


Unnamed: 0,sym,f_name,label
0,ts1,avgNetProfitGroupedBySoldDate,aggregation
1,ts2,avgNetProfitGroupedBySoldDateWhereProfitNegative,"aggregation, filtration"
2,ts3,avgNetProfitGroupedBySoldDateWhereYearAfter2000,"aggregation, filtration, join"
3,ts4,avgWholeSaleCostGroupedBySoldDate,aggregation
4,ts5,countDistinctTicketNumber,aggregation
5,ts6,countNetProfitGroupedBySoldDate,aggregation
6,ts7,countNetProfitGroupedBySoldDateWhereProfitNega...,"aggregation, filtration"
7,ts8,countNetProfitGroupedBySoldDateWhereYearAfter2000,"aggregation, filtration, join"
8,ts9,countWholeSaleCostGroupedBySoldDate,aggregation
9,ts10,filterCatalogSalesWhereProfitNegative,filtration


In [15]:
agg_labeled = labels.loc[labels["label"] == "aggregation"]
agg_fil_labeled = labels.loc[labels["label"] == "aggregation, filtration"]
fil_labeled = labels.loc[labels["label"] == "filtration"]
agg_fil_join_labeled = labels.loc[labels["label"] == "aggregation, filtration, join"]

avgNetProfitGroupedBySoldDate
avgWholeSaleCostGroupedBySoldDate
countDistinctTicketNumber
countNetProfitGroupedBySoldDate
countWholeSaleCostGroupedBySoldDate
maxNetProfitGroupedBySoldDate
maxWholeSaleCostGroupedBySoldDate
minNetProfitGroupedBySoldDate
minWholeSaleCostGroupedBySoldDate
sumNetProfitGroupedBySoldDate
sumWholeSaleCostGroupedBySoldDate
summaryNetProfitGroupedBySoldDate
summaryWholeSaleCostGroupedBySoldDate


In [40]:
def dtw_distance(x_labels, y_labels):
    tmp_euc_dist = []
    array_dist = np.empty((agg_labeled.shape[0], agg_labeled.shape[0]))
    x_index = 0
    y_index = 0
    for x_name in x_labels["function_name"]:
        try:
            x_data = pd.read_csv(f"./../experiments_data_/preprocessed-data/workers-mean-data/{x_name}/smooth_mean_data.csv")
            print("\n")
            print(x_name)
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))
        for y_name in y_labels["function_name"]:
            try:
                y_data = pd.read_csv(f"./../experiments_data_/preprocessed-data/workers-mean-data/{y_name}/smooth_mean_data.csv")
                distance, path = fastdtw(x_data["CPU"], y_data["CPU"], dist=euclidean)
                print(f"{round(distance, 2)} | {y_name}")
                # array_dist[x_index][y_index] = round(distance, 2)
                tmp_euc_dist.append(distance)
            except OSError as e:
                print("Error: %s - %s." % (e.filename, e.strerror))

            y_index =+ 1
        x_index += 1
    return

In [41]:
dtw_distance(agg_labeled, fil_labeled)



avgNetProfitGroupedBySoldDate
423.24 | filterCatalogSalesWhereProfitNegative
501.91 | filterStoreSalesWhereProfitNegative


avgWholeSaleCostGroupedBySoldDate
354.28 | filterCatalogSalesWhereProfitNegative
478.53 | filterStoreSalesWhereProfitNegative


countDistinctTicketNumber
770.2 | filterCatalogSalesWhereProfitNegative
808.64 | filterStoreSalesWhereProfitNegative


countNetProfitGroupedBySoldDate
431.86 | filterCatalogSalesWhereProfitNegative
508.15 | filterStoreSalesWhereProfitNegative


countWholeSaleCostGroupedBySoldDate
429.74 | filterCatalogSalesWhereProfitNegative
580.13 | filterStoreSalesWhereProfitNegative


maxNetProfitGroupedBySoldDate
192.83 | filterCatalogSalesWhereProfitNegative
314.57 | filterStoreSalesWhereProfitNegative


maxWholeSaleCostGroupedBySoldDate
471.75 | filterCatalogSalesWhereProfitNegative
594.76 | filterStoreSalesWhereProfitNegative


minNetProfitGroupedBySoldDate
370.08 | filterCatalogSalesWhereProfitNegative
509.94 | filterStoreSalesWhereProfitNegati

In [42]:
dtw_distance(agg_labeled, agg_fil_labeled)



avgNetProfitGroupedBySoldDate
410.67 | avgNetProfitGroupedBySoldDateWhereProfitNegative
404.61 | countNetProfitGroupedBySoldDateWhereProfitNegative
391.09 | maxNetProfitGroupedBySoldDateWhereProfitNegative
429.32 | minNetProfitGroupedBySoldDateWhereProfitNegative
425.87 | sumNetProfitGroupedBySoldDateWhereProfitNegative


avgWholeSaleCostGroupedBySoldDate
369.21 | avgNetProfitGroupedBySoldDateWhereProfitNegative
386.94 | countNetProfitGroupedBySoldDateWhereProfitNegative
358.39 | maxNetProfitGroupedBySoldDateWhereProfitNegative
375.01 | minNetProfitGroupedBySoldDateWhereProfitNegative
360.54 | sumNetProfitGroupedBySoldDateWhereProfitNegative


countDistinctTicketNumber
710.31 | avgNetProfitGroupedBySoldDateWhereProfitNegative
801.54 | countNetProfitGroupedBySoldDateWhereProfitNegative
688.65 | maxNetProfitGroupedBySoldDateWhereProfitNegative
743.97 | minNetProfitGroupedBySoldDateWhereProfitNegative
724.69 | sumNetProfitGroupedBySoldDateWhereProfitNegative


countNetProfitGroupedBySol

In [43]:
dtw_distance(fil_labeled, agg_fil_labeled)



filterCatalogSalesWhereProfitNegative
173.46 | avgNetProfitGroupedBySoldDateWhereProfitNegative
168.45 | countNetProfitGroupedBySoldDateWhereProfitNegative
186.68 | maxNetProfitGroupedBySoldDateWhereProfitNegative
189.44 | minNetProfitGroupedBySoldDateWhereProfitNegative
195.92 | sumNetProfitGroupedBySoldDateWhereProfitNegative


filterStoreSalesWhereProfitNegative
280.25 | avgNetProfitGroupedBySoldDateWhereProfitNegative
298.79 | countNetProfitGroupedBySoldDateWhereProfitNegative
299.79 | maxNetProfitGroupedBySoldDateWhereProfitNegative
248.6 | minNetProfitGroupedBySoldDateWhereProfitNegative
284.49 | sumNetProfitGroupedBySoldDateWhereProfitNegative


In [8]:
ts1 = pd.read_csv("./../experiments_data_/preprocessed-data/workers-mean-data/avgWholeSaleCostGroupedBySoldDate/smooth_mean_data.csv")
# ts1 = ts1["CPU"].tolist()
# print(ts1[:50])
ts2 = pd.read_csv("./../experiments_data_/preprocessed-data/workers-mean-data/countNetProfitGroupedBySoldDate/smooth_mean_data.csv")
# ts2 = ts2["CPU"].tolist()
# print(ts2[:50])
distance, path = fastdtw(ts1["CPU"], ts1["CPU"], dist=euclidean)

print(distance)

0.0


In [3]:
x = np.array([[1,1], [2,2], [3,3], [4,4], [5,5]])
y = np.array([[2,2], [3,3], [4,4]])

distance2, path2 = fastdtw(x, y, dist=euclidean)
print(distance2)

2.8284271247461903


### Filling missing values with '0'

In [27]:
corr_df = pd.DataFrame()
ts_counter = 1

for function in function_names:
    df_ts = pd.read_csv(f"./../experiments_data/preprocessed-data/workers-mean-data{function}/smooth_mean_data.csv")
    corr_df[f"ts{ts_counter}"] = df_ts["CPU"]
    ts_counter += 1

corr_df = corr_df.fillna(0)


### Time series correlation
* Pearson
* Kendall
* Spearman

In [28]:
measures = {"pearson", "kendall", "spearman"}
measures_data = []

for measure in measures:
    df = corr_df.corr(method=measure)
    measures_data.append(df)
    df.to_csv(f"./../experiments_data/preprocessed-data/smooth_corr_{measure}.csv")

