## Imports

In [None]:
from gtd.input import CsvFullReader

from gtd.preprocessor import TimeConfigurator, Padder, TaskNormalizer, Cropper, OutlierHandler

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd

In [None]:
from pyclustering.cluster.kmeans import kmeans
from pyclustering.utils.metric import distance_metric
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.encoder import cluster_encoder

from gtd.comparator.calculators import l1_img, l2_img, dtwl2 

In [None]:
# Grey-scale palette and hatch patterns shared by the bar charts in this notebook.
colors = ['gainsboro', 'lightgray', 'grey']
hatches = ['///', '', '', '..', '\\', 'X', 'o', '+']

# Reset to matplotlib's defaults first, then apply all overrides in one call.
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams.update({
    'axes.prop_cycle': mpl.cycler(color=colors),
    # Font type 42 for the PDF/PostScript backends
    # (TrueType embedding according to the matplotlib rcParams docs).
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

## Data Preprocessing

### Read Input

In [None]:
# Load the two datasets with identical reader settings:
#   inp  <- './input-sim/'   (used later for clustering)
#   inp_ <- './input-dis/'   (compared against the cluster centers later)
# A fresh reader (and a fresh column list) is built per directory, in the
# same order as before: sim first, then dis.
inp, inp_ = (
    CsvFullReader(
        input_dir=directory,
        structured=True,
        columns=['time', 'avg_cpu_usage'],
    ).read_input()
    for directory in ('./input-sim/', './input-dis/')
)

### Preprocess Data

In [None]:
# Crop window used by the Cropper step below: rows [offset, offset + size).
offset, size = 0, 1024

In [None]:
# Same time configuration for both datasets; a separate TimeConfigurator
# instance is still created for each run.
time_settings = dict(time_col='time', time_unit='us', freq='5min')

inp_1 = TimeConfigurator(**time_settings).run(inp)
inp_1_ = TimeConfigurator(**time_settings).run(inp_)

In [None]:
# Pad both datasets on the same 5-minute frequency used by the time configuration.
pad_freq = '5min'

inp_2 = Padder(freq=pad_freq).run(inp_1)
inp_2_ = Padder(freq=pad_freq).run(inp_1_)

In [None]:
#inp_2_1 = OutlierHandler(col='avg_cpu_usage', llim=0.01, ulim=0.99).run(inp_2)
#inp_2_1_ = OutlierHandler(col='avg_cpu_usage', llim=0.01, ulim=0.99).run(inp_2_)

In [None]:
# Crop both datasets to the same window, defined by `offset` and `size` above.
crop_window = dict(llim=offset, ulim=offset + size)

inp_3 = Cropper(**crop_window).run(inp_2)
inp_3_ = Cropper(**crop_window).run(inp_2_)

In [None]:
# Final preprocessing step: normalize the CPU-usage column of every task.
# `inp_end` / `inp_end_` are the datasets consumed by the rest of the notebook.
usage_col = 'avg_cpu_usage'

inp_end = TaskNormalizer(col=usage_col).run(inp_3)
inp_end_ = TaskNormalizer(col=usage_col).run(inp_3_)

### Plot Tasks

In [None]:
# Plot the normalized CPU usage of every task in `inp_end`, one PDF per task.
#
# The series is sampled every 5 minutes (freq='5min' in the preprocessing
# steps), so one day spans 288 samples and day k starts at index 288 * k.
SAMPLES_PER_DAY = 24 * 60 // 5

for task in inp_end.get_tasks():
    # Fetch the first fraction once and reuse it for both axes.
    frame = task.get_fraction_by_idx(0).data
    x = frame.index
    y = frame['avg_cpu_usage']

    fig, ax = plt.subplots(figsize=(18, 3))

    plt.xticks(size = 30)
    plt.yticks(size = 30)

    # One tick at the first sample of each day that is present in the series.
    # Derived from the series length so that shorter or longer crops still work
    # (the previous hard-coded indices 287/575/863 were one sample early).
    x_ticks = [x[pos] for pos in range(0, len(x), SAMPLES_PER_DAY)]
    x_labels = list(range(len(x_ticks)))

    # The legend entry shows the first three digits of the job id.
    ax.plot(x, y, color='black', label=f"Job {str(task.job_id)[:3]}")

    ax.grid(False)

    plt.xticks(ticks=x_ticks, labels=x_labels)

    # Only this job's figure carries the x-axis label.
    if task.job_id == 399444405700:
        ax.set_xlabel('Time (days)',  size = 35)
    ax.set_ylabel('CPU Usage',  size = 35, loc='center')

    # Text-only legend (no handle) placed above the upper-right corner.
    ax.legend(loc='lower right', bbox_to_anchor=(1.01, 0.9), fontsize=35, handlelength=0, handletextpad=0, frameon=False)

    fig.savefig(f'./output/tasks-sim/{task.job_id}-{task.idx}.pdf', dpi=300, bbox_inches='tight', format='pdf')

    # Close this figure explicitly so figures do not accumulate across tasks.
    plt.close(fig)


In [None]:
# Plot the normalized CPU usage of every task in `inp_end_`, one PDF per task.
#
# The series is sampled every 5 minutes (freq='5min' in the preprocessing
# steps), so one day spans 288 samples and day k starts at index 288 * k.
SAMPLES_PER_DAY = 24 * 60 // 5

for task in inp_end_.get_tasks():
    # Fetch the first fraction once and reuse it for both axes.
    frame = task.get_fraction_by_idx(0).data
    x = frame.index
    y = frame['avg_cpu_usage']

    fig, ax = plt.subplots(figsize=(18, 3))

    plt.xticks(size = 30)
    plt.yticks(size = 30)

    # One tick at the first sample of each day that is present in the series.
    # Derived from the series length so that shorter or longer crops still work
    # (the previous hard-coded indices 287/575/863 were one sample early).
    x_ticks = [x[pos] for pos in range(0, len(x), SAMPLES_PER_DAY)]
    x_labels = list(range(len(x_ticks)))

    ax.plot(x, y, color='black', label=f"Task {str(task.idx)}")

    ax.grid(False)

    plt.xticks(ticks=x_ticks, labels=x_labels)

    # Only this task's figure carries the x-axis label.
    if task.idx == 61:
        ax.set_xlabel('Time (days)',  size = 35)
    ax.set_ylabel('CPU Usage',  size = 35, loc='center')

    # Text-only legend (no handle) placed above the upper-right corner.
    ax.legend(loc='lower right', bbox_to_anchor=(1.01, 0.9), fontsize=35, handlelength=0, handletextpad=0, frameon=False)

    fig.savefig(f'./output/tasks-dis/{task.job_id}-{task.idx}.pdf', dpi=300, bbox_inches='tight', format='pdf')

    # Close this figure explicitly so figures do not accumulate across tasks.
    plt.close(fig)

### Prepare Dataset

In [None]:
# Build the clustering input from the tasks in `inp_end`:
#   lst         - one copied NumPy array of 'avg_cpu_usage' per task
#   labels      - job id of each task (same order as `lst`)
#   labels_idxs - task index of each task (same order as `lst`)
sim_tasks = list(inp_end.get_tasks())

lst = [
    t.get_fraction_by_idx(0).data['avg_cpu_usage'].to_numpy(copy=True)
    for t in sim_tasks
]
labels = [t.job_id for t in sim_tasks]
labels_idxs = [t.idx for t in sim_tasks]

# One row per task; going through a DataFrame keeps the original stacking behavior.
df = pd.DataFrame(lst)
data = df.to_numpy()

## Clustering

In [None]:
# Metric name -> (clusters, centers) tuple, filled in by the clustering loop.
clusters_centers = dict()

In [None]:
# Run k-means once per distance function and, for each run, draw a stacked bar
# chart showing how the tasks of every job are spread over the clusters.
for metric, dist_func in (('L1', l1_img), ('L2', l2_img), ('DTW', dtwl2)):
    for n_clusters in range(6, 7):
        # k-means++ seeding with a fixed random state.
        seeds = kmeans_plusplus_initializer(data, n_clusters, random_state=42).initialize()

        # 1000 is presumably pyclustering's type_metric.USER_DEFINED, i.e. "use
        # the supplied func" - confirm against the pyclustering docs.
        custom_metric = distance_metric(1000, func=dist_func)
        instanceKm = kmeans(data, initial_centers=seeds, metric=custom_metric)
        instanceKm.process()

        pyClusters = instanceKm.get_clusters()
        pyCenters = instanceKm.get_centers()

        # Keep the result so later cells can reuse the centers of this metric.
        clusters_centers[metric] = (pyClusters, pyCenters)

        # Re-encode the clustering result so that pyLabels[i] is the cluster of
        # data row i (encoding 0 is presumably index labeling - confirm).
        encoder = cluster_encoder(instanceKm.get_cluster_encoding(), pyClusters, data)
        pyLabels = encoder.set_encoding(0).get_clusters()

        # Count tasks per (cluster, job), then flatten each cluster to a list of
        # counts ordered by ascending job id.
        job_ids = sorted(set(labels))
        per_cluster = {cid: dict.fromkeys(job_ids, 0) for cid in range(n_clusters)}
        for cid, job_id in zip(pyLabels, labels):
            per_cluster[cid][job_id] += 1
        clusters = {
            cid: [job_counts[job_id] for job_id in job_ids]
            for cid, job_counts in per_cluster.items()
        }

        fig, ax = plt.subplots(figsize=(10 / 2, 6 / 2))

        plt.xticks(size = 18)
        plt.yticks(size = 18)

        # The x axis shows the first three digits of every job id.
        jobs = [str(job_id)[:3] for job_id in job_ids]
        bottom = np.zeros(len(jobs))

        # One bar layer per cluster, stacked on top of the previous layers.
        lines, leg_labels = [], []
        for i, (cluster_id, cnts) in enumerate(clusters.items()):
            layer = ax.bar(jobs, cnts, 0.5, label=cluster_id, bottom=bottom, hatch=hatches[i])
            lines.append(layer)
            leg_labels.append(cluster_id)
            bottom += cnts

        ax.set_xlabel('Jobs', size = 20)
        ax.set_ylabel('Number of Tasks', size = 20)

        # ax.legend(loc=2, bbox_to_anchor=(1, 1), title="Cluster", fontsize=30, title_fontsize=30)
        ax.set_ylim(0, 10.5)
        ax.set_yticks(range(0, 11))

        ax.yaxis.grid()

        fig.savefig(f'./output/clusters-sim/numeric-{metric}-{n_clusters}.pdf', dpi=300, bbox_inches='tight', format='pdf')

        plt.show()
        plt.close()

        # The legend goes into its own figure; the file is overwritten on every
        # iteration because the path does not depend on the metric.
        legendFig = plt.figure()
        legendFig.legend(lines, leg_labels, loc='center', title="Cluster", fontsize=15, title_fontsize=18, ncols=6)
        legendFig.savefig('./output/numeric-legend.pdf', dpi=300, bbox_inches='tight', format='pdf')
        plt.close()

In [None]:
# Job id assigned to each cluster index, per metric. Position k in a tuple is
# the job id of cluster k for that metric.
_jobs_by_cluster_index = {
    "L1": (91724979887, 382417448240, 113812204462, 374877055556, 399444405700, 380263806889),
    "L2": (91724979887, 382417448240, 113812204462, 374877055556, 399444405700, 380263806889),
    "DTW": (91724979887, 382417448240, 113812204462, 374877055556, 399444405700, 380263806889),
}

# metric -> {cluster index -> job id}; every metric gets its own dict object.
cluster_to_job_map = {
    metric: dict(enumerate(job_ids))
    for metric, job_ids in _jobs_by_cluster_index.items()
}

### Dissimilar Tasks - Time Shifted Tasks (Job 113)

In [None]:
# For every metric, assign each task of `inp_end_` to its nearest cluster
# center (centers come from the clustering cell) and produce:
#   * one bar chart per task with its distance to every cluster center, and
#   * one aggregated stacked bar chart counting the tasks assigned per job.
# Tasks with idx < 100 are tracked individually (legend "Task <idx>"); all
# others are aggregated under the "Synthetic" legend entry.
for metric, dist_func in zip(['L1', 'L2', 'DTW'], [l1_img, l2_img, dtwl2]):
    centers = clusters_centers[metric][1]

    # Cluster ids ordered by the job id they map to. This fixes the bar order
    # of every chart for this metric, so it is computed once.
    ordered_map = sorted(cluster_to_job_map[metric].items(), key=lambda item: item[1])
    cluster_order = [cluster_id for cluster_id, _ in ordered_map]
    x_labels = [str(job_id)[:3] for _, job_id in ordered_map]

    cluster_cnts = [0] * len(centers)

    # One one-hot row per individually tracked task, appended as tasks are
    # found, so any number of such tasks is supported.
    cluster_cnts_real = []
    labels_real = []
    # Local hatch list: deliberately NOT named `hatches`, so the module-level
    # list used by the clustering cell is not overwritten.
    real_hatches = ['///', '', '..', '\\', 'x']

    for task in inp_end_.get_tasks():
        task_data = task.get_fraction_by_idx(0).data['avg_cpu_usage'].to_numpy(copy=True)

        distances = [dist_func(task_data, np.array(center)) for center in centers]

        # Task vs Cluster Centers
        fig, ax = plt.subplots(figsize=(10,6))

        x = np.arange(len(distances))
        y = [distances[cluster_id] for cluster_id in cluster_order]

        bars = ax.bar(x, y)

        # Highlight the bar of the nearest center, if that cluster is mapped.
        closest_cluster_id = int(np.argmin(distances))
        if closest_cluster_id in cluster_order:
            bars[cluster_order.index(closest_cluster_id)].set_color('grey')

        ax.set_xticks(x, x_labels)

        ax.set_xlabel("Jobs")
        ax.set_ylabel("Distance")
        ax.set_title(f"Task {task.idx} vs Cluster Center of each Job")

        fig.savefig(f'./output/clusters-distances/numeric-{metric}-{task.idx}.png', dpi=300, bbox_inches='tight')

        plt.close(fig)

        if task.idx < 100:
            print(task.idx, "->", cluster_to_job_map[metric][closest_cluster_id])

            labels_real.append(task.idx)
            one_hot = [0] * len(centers)
            one_hot[closest_cluster_id] = 1
            cluster_cnts_real.append(one_hot)
        else:
            cluster_cnts[closest_cluster_id] += 1

    # Aggregated Clustering of Dissimilar Tasks
    fig, ax = plt.subplots(figsize=(10/2,6/2))

    plt.xticks(size = 18)
    plt.yticks(size = 18)

    x = np.arange(len(cluster_cnts))
    y = [cluster_cnts[cluster_id] for cluster_id in cluster_order]

    line = ax.bar(x, y, color='grey', label="Synthetic")

    lines = [line]
    leg_labels = ["Synthetic"]

    # Stack one layer per individually tracked task on top of the aggregated bars.
    bottom = np.array(y)
    for i, (task_idx, cnts) in enumerate(zip(labels_real, cluster_cnts_real)):
        y_real = [cnts[cluster_id] for cluster_id in cluster_order]
        # Cycle through the hatch patterns if there are more tasks than patterns.
        hatch = real_hatches[i % len(real_hatches)]
        line = ax.bar(x, y_real, color='lightgray', bottom=bottom, label=f"Task {task_idx}", hatch=hatch)

        lines.append(line)
        leg_labels.append(f"Task {task_idx}")

        bottom = bottom + np.array(y_real)

    ax.set_xticks(x, x_labels)

    # ax.set_title("Clustering of Dissimilar Tasks of Job 113")
    ax.set_xlabel('Jobs', size = 20)
    ax.set_ylabel('Number of Tasks', size = 20)

    ax.set_ylim(0, 20.5)
    ax.set_yticks([2, 4, 6, 8, 10, 12, 14, 16, 18, 20])

    ax.yaxis.grid()

    # ax.legend(loc=2, bbox_to_anchor=(1, 1), fontsize=30)

    # NOTE: `n_clusters` is the loop variable left behind by the clustering
    # cell, so that cell must have been run before this one.
    fig.savefig(f'./output/clusters-dis/numeric-{metric}-{n_clusters}.pdf', dpi=300, bbox_inches='tight', format='pdf')

    plt.show()
    plt.close(fig)

    # The legend goes into its own figure; the file is overwritten for every
    # metric because the path does not depend on it.
    legendFig = plt.figure()
    legendFig.legend(lines, leg_labels, loc='center', fontsize=15, ncols=3)
    legendFig.savefig('./output/numeric-legend-dis.pdf', dpi=300, bbox_inches='tight', format='pdf')
    plt.close(legendFig)