# Time Series Processing

In [37]:
import os
import pandas as pd
from fastdtw import fastdtw
from statistics import mean, pstdev
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from scipy.spatial.distance import euclidean


In [2]:
def get_subdirectories(directory=""):
    subdirectories = []
    p = Path("./../experiments_data_/" + directory)
    for item in p.glob('*/'):
        if item.suffix not in (['.csv', '.zip']):
            subdirectories.append(directory + "/" + item.name)
    return subdirectories

In [3]:
all_directories = get_subdirectories()
nodes_directories = [x for x in all_directories if "node" in x]

data_directories = []
data_directories_groups = []
for directory in nodes_directories:
    cur_node_subdirectories = get_subdirectories(directory)
    data_directories.append(cur_node_subdirectories)

data_directories_groups = data_directories
data_directories = [item for sublist in data_directories for item in sublist]

function_names = data_directories_groups[1]
function_names = list(map(lambda x: x[8:], data_directories_groups[1]))

### All functions in `experiments_data`

In [4]:
for function in function_names:
    print(f"- {function[1:]}")

- avgNetProfitGroupedBySoldDate
- avgNetProfitGroupedBySoldDateWhereProfitNegative
- avgWholeSaleCostGroupedBySoldDate
- countDistinctTicketNumber
- countNetProfitGroupedBySoldDate
- countNetProfitGroupedBySoldDateWhereProfitNegative
- countWholeSaleCostGroupedBySoldDate
- filterCatalogSalesWhereProfitNegative
- filterCatalogSalesWhereProfitNegativeAndYearAfter2000
- filterCatalogSalesWhereYearAfter2000
- filterStoreSalesWhereProfitNegative
- filterStoreSalesWhereProfitNegativeAndYearAfter2000
- filterStoreSalesWhereYearAfter2000
- maxNetProfitGroupedBySoldDate
- maxNetProfitGroupedBySoldDateWhereProfitNegative
- maxWholeSaleCostGroupedBySoldDate
- minNetProfitGroupedBySoldDate
- minNetProfitGroupedBySoldDateWhereProfitNegative
- minWholeSaleCostGroupedBySoldDate
- summaryNetProfitGroupedBySoldDate
- summaryWholeSaleCostGroupedBySoldDate
- sumNetProfitGroupedBySoldDate
- sumNetProfitGroupedBySoldDateWhereProfitNegative
- sumWholeSaleCostGroupedBySoldDate


### Aggregating labels by functions names

In [5]:
labels = pd.read_csv("./../notebooks/functions.csv")
labels = labels.groupby('function_name')['label'].apply(', '.join).reset_index()
labels

Unnamed: 0,function_name,label
0,avgNetProfitGroupedBySoldDate,aggregation
1,avgNetProfitGroupedBySoldDateWhereProfitNegative,"aggregation, filtration"
2,avgNetProfitGroupedBySoldDateWhereYearAfter2000,"aggregation, filtration, join"
3,avgWholeSaleCostGroupedBySoldDate,aggregation
4,countDistinctTicketNumber,aggregation
5,countNetProfitGroupedBySoldDate,aggregation
6,countNetProfitGroupedBySoldDateWhereProfitNega...,"aggregation, filtration"
7,countNetProfitGroupedBySoldDateWhereYearAfter2000,"aggregation, filtration, join"
8,countWholeSaleCostGroupedBySoldDate,aggregation
9,filterCatalogSalesWhereProfitNegative,filtration


### Adding symbols (ts1, ts2, ...) to function names

In [6]:
iterator = 1
symbolic_data = pd.DataFrame(columns=["sym", "f_name"])
for function in function_names:
    symbolic_data = symbolic_data.append({"sym": f"ts{iterator}", "f_name": function[1:]}, ignore_index=True)
    iterator += 1

### Merging labels with symbols

In [7]:
# print("Legend:")
# labeled_data = pd.merge(symbolic_data, labels, left_on='f_name', right_on='function_name', how='left').drop('function_name', axis=1)
# labeled_data = labeled_data
# display(labeled_data)
# # print(labeled_data['label'].to_string(index=False))
# labeled_data.to_csv(f"./../experiments_data_/preprocessed-data/corr_legend.csv")

Grouping labels by UDF type

In [7]:
agg_labeled = labels.loc[labels["label"] == "aggregation"]
agg_fil_labeled = labels.loc[labels["label"] == "aggregation, filtration"]
fil_labeled = labels.loc[labels["label"] == "filtration"]
agg_fil_join_labeled = labels.loc[labels["label"] == "aggregation, filtration, join"]

In [35]:
def translate_scale(dataframe):
    dataframe["translated"] = dataframe['CPU'] - dataframe['CPU'].mean()
    dataframe["scaled"] = dataframe["translated"] * (1/dataframe["translated"].std())
    return dataframe

def dtw_distance(x_labels, y_labels):
    distances = []
    for x_name in x_labels["function_name"]:
        try:
            x_data = pd.read_csv(f"./../experiments_data_/preprocessed-data/workers-mean-data/{x_name}/translated_scaled_smoothed_data.csv")
            # print("\n")
            # print("------------------------")
            # print(f"X: {x_name}")
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))
        for y_name in y_labels["function_name"]:
            try:
                y_data = pd.read_csv(f"./../experiments_data_/preprocessed-data/workers-mean-data/{y_name}/translated_scaled_smoothed_data.csv")
                # distance_scaled, path_scaled = fastdtw(x_data["scaled"], y_data["scaled"], dist=euclidean)
                # distance_one_sec, path_one_sec = fastdtw(x_data["one_sec"], y_data["one_sec"], dist=euclidean)
                # distance_two_sec, path_two_sec = fastdtw(x_data["two_sec"], y_data["two_sec"], dist=euclidean)
                distance_five_sec, path_five_sec = fastdtw(x_data["five_sec"], y_data["five_sec"], dist=euclidean)
                distances.append(distance_five_sec)
                # distance_ten_sec, path_ten_sec = fastdtw(x_data["ten_sec"], y_data["ten_sec"], dist=euclidean)
                # print(f"Y: {y_name}")
                # print(f"Scaled: {distance_scaled}")
                # print(f"1 sec: {distance_one_sec}")
                # print(f"2 sec: {distance_two_sec}")
                # print(f"5 sec: {distance_five_sec}")
                # print(f"10 sec: {distance_ten_sec}")
                # print()
            except OSError as e:
                print("Error: %s - %s." % (e.filename, e.strerror))
    distances.sort()
    distances = [i for i in distances if i != 0]
    return round(min(distances), 2), round(max(distances), 2), round(mean(distances), 2), round(pstdev(distances), 2)

def group_distance_statistics(labels_X, labels_Y, name_X, name_Y):
    min_value, max_value, mean_value, std_value = dtw_distance(labels_X, labels_Y)
    print("\n")
    print("----------------------")
    print(f"{name_X} - {name_Y}")
    print(f"MIN: {min_value}")
    print(f"MAX: {max_value}")
    print(f"MEAN: {mean_value}")
    print(f"STD: {std_value}")

In [36]:
group_distance_statistics(agg_labeled, agg_labeled, "aggregation", "aggregation")
group_distance_statistics(agg_labeled, fil_labeled, "aggregation", "filtration")
group_distance_statistics(fil_labeled, fil_labeled, "filtration", "filtration")
group_distance_statistics(agg_labeled, agg_fil_labeled, "aggregation", "aggregation-filtration")
group_distance_statistics(fil_labeled, agg_fil_labeled, "filtration", "aggregation-filtration")
group_distance_statistics(agg_fil_labeled, agg_fil_labeled, "aggregation-filtration", "aggregation-filtration")
group_distance_statistics(agg_labeled, agg_fil_join_labeled, "aggregation", "aggregation-filtration-join")
group_distance_statistics(fil_labeled, agg_fil_join_labeled, "filtration", "aggregation-filtration-join")
group_distance_statistics(agg_fil_join_labeled, agg_fil_join_labeled, "aggregation-filtration-join", "aggregation-filtration-join")



----------------------
aggregation - aggregation
MIN: 0.0
MAX: 172.97
MEAN: 48.21
STD: 41.91


----------------------
aggregation - filtration
MIN: 156.25
MAX: 398.46
MEAN: 260.63
STD: 48.48


----------------------
filtration - filtration
MIN: 21.35
MAX: 21.35
MEAN: 21.35
STD: 0.0


----------------------
aggregation - aggregation-filtration
MIN: 8.03
MAX: 193.18
MEAN: 74.5
STD: 36.86


----------------------
filtration - aggregation-filtration
MIN: 142.24
MAX: 253.81
MEAN: 210.54
STD: 42.33


----------------------
aggregation-filtration - aggregation-filtration
MIN: 6.07
MAX: 18.1
MEAN: 10.61
STD: 3.91
Error: ./../experiments_data_/preprocessed-data/workers-mean-data/avgNetProfitGroupedBySoldDateWhereYearAfter2000/translated_scaled_smoothed_data.csv - No such file or directory.
Error: ./../experiments_data_/preprocessed-data/workers-mean-data/countNetProfitGroupedBySoldDateWhereYearAfter2000/translated_scaled_smoothed_data.csv - No such file or directory.
Error: ./../experiments_d

ValueError: min() arg is an empty sequence

In [16]:
# IN dataframe: "CPU"
# OUT dataframe: "translated", "scaled", "one_sec", "two_sec", "five_sec", "ten_sec"

for name in function_names:
    file_path = f"./../experiments_data_/preprocessed-data/workers-mean-data{name}"
    original_data = pd.read_csv(f"{file_path}/mean_data.csv")
    original_data = original_data[["CPU"]]
    transformed_data = translate_scale(original_data)
    transformed_data["one_sec"] = transformed_data["scaled"].rolling(4, min_periods=1).mean()
    transformed_data["two_sec"] = transformed_data["scaled"].rolling(8, min_periods=1).mean()
    transformed_data["five_sec"] = transformed_data["scaled"].rolling(20, min_periods=1).mean()
    transformed_data["ten_sec"] = transformed_data["scaled"].rolling(40, min_periods=1).mean()
    transformed_data.to_csv(f"{file_path}/translated_scaled_smoothed_data.csv", index=False)

In [14]:
# Deleting created files

for function_name in function_names:
    try:
        os.remove(f'./../experiments_data_/preprocessed-data/workers-mean-data{function_name}/translated_scaled_smoothed_data.csv')
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))