In [1]:
import os
import matplotlib.pyplot as plt
import datetime
import json
import matplotlib.gridspec as gridspec

import numpy as np
import pandas as pd
import seaborn as sns

from itertools import compress

from helpers.expr_data_mem_cpu import ExprDataMemCPU
from helpers.scale_data import ScaleData
from helpers.similarity import Similarity
from helpers.feature_selection import FeatureSelection

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
SMALL_SIZE = 15
MEDIUM_SIZE = 18
BIGGER_SIZE = 22
SMALL_SMALL_SIZE = 12

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
# plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize

plt.rc('legend', fontsize=SMALL_SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [3]:
import random
np.random.seed(42)
random.seed(42)

In [4]:
# all_groups = ['10', '11', '12', '20', '21', '22']
# candidate_group = '22'

# from_sku = ('cpu2', 32)
# to_sku = ('cpu8', 64)

In [5]:
def plot_scores_diff_feature_num(simi_mtxs, simi_data, feature_groups, root_idx, method_type, colwise=False, note=''):
    fig, ax = plt.subplots(figsize=(3*len(feature_groups), 2.8))
    zipped = []
    for simi_mtx, feature_group in zip(simi_mtxs, feature_groups):       
        curr_zipped = list(zip(simi_data.wl_groups, simi_data.wl_names, [feature_group]*simi_data.get_num_exprs(), simi_mtx[root_idx]))
        del curr_zipped[root_idx]
        zipped += curr_zipped
    X_p = pd.DataFrame(zipped, columns=['run', 'workload','feature_group', 'distance'])
    X_p = X_p.astype({'distance': float})
    X_p.drop([root_idx],inplace=True)
    colors = sns.color_palette()[:X_p['workload'].nunique()+1]
    colors = colors[:-2]+colors[-1:]

    order=['tpcc', 'tpch', 'twitter']
    if X_p['workload'].nunique() == 4:
        order.append('ycsb')
    
    sns.barplot(y="distance", x="feature_group", hue_order=order,
                hue="workload", data=X_p, ax=ax, 
                palette=colors)
    plt.xlabel('')
    plt.ylabel('Normalized Distance')
    
    expr_idx = simi_data.wl_groups[root_idx]
    wl_name = simi_data.wl_names[root_idx] 
    cpu_num_val = simi_data.cpu_nums[root_idx]
    if colwise:
        colwise_label = 'Independent'
    else:
        colwise_label = 'Dependent'
    title = 'Base Workload: {}, Method: {} {}.{}'.format(
        wl_name, cpu_num_val, colwise_label, method_type, note)
    fname = '{}{}_cpu{}_{}_{}{}.pdf'.format(wl_name, expr_idx, cpu_num_val, colwise_label, method_type, note)
    # plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
    plt.legend(bbox_to_anchor=(0, 1, 1, 0), loc='lower left', ncol=4, mode="expand")
               #, borderaxespad=0., fontsize=9) # 
    print(title)
    plt.savefig(f'figs/ycsb_{fname}', bbox_inches = 'tight')
    plt.show()

In [6]:
def plot_scores_single(simi_mtx, simi_data, root_idx, data_type, method_type, note=''):
    # fig, ax = plt.subplots(figsize=(3.2,1.3))
    # fig, ax = plt.subplots(figsize=(2.4, 3.2))
    fig, ax = plt.subplots(figsize=(4.8, 1.6))
    

    zipped = list(zip(simi_data.wl_groups, simi_data.wl_names, simi_data.cpu_nums, simi_mtx[root_idx]))
    X_p = pd.DataFrame(zipped, columns=['run', 'workload','cpu_num', 'distance'])
    X_p = X_p.astype({'distance': float})
    X_p.drop([root_idx],inplace=True)
    colors = sns.color_palette()[:X_p['workload'].nunique()+1]
    colors = colors[:-2]+colors[-1:]
    
    order=['tpcc', 'tpch', 'twitter']
    if X_p['workload'].nunique() == 4:
        order.append('ycsb')

    sns.barplot(y="distance", x="workload", order=order,
                data=X_p, ax=ax, errorbar="sd",
                palette=colors)
    ax.set_xticklabels(['TPC-C', 'TPC-H', 'Twitter', 'YCSB'])

    ax.set_xlabel('')
    ax.set_ylabel('Distance')
    # ax.tick_params(axis='x', labelrotation=45)
    # handles, labels = ax.get_legend_handles_labels()
    # ax.get_legend().set_visible(False)

    expr_idx = simi_data.wl_groups[root_idx]
    wl_name = simi_data.wl_names[root_idx] 
    candid = f'Ycsb{candidate_group}'
    # plt.legend(bbox_to_anchor=(0, 1, 1, 0), loc='lower left', ncol=4, mode="expand")

    plt.savefig(f'figs/Test_Single_{candid}_2.pdf', bbox_inches = 'tight')
    plt.show()

In [7]:
def get_simis(simi_data, simi_mtx, candid_idx):
    print(simi_data.cpu_nums[candid_idx], len(simi_mtx[candid_idx]))
    arr = []
    true_idx = []
    wl_g = []
    for idx, val in enumerate(simi_mtx[candid_idx]):
        if simi_data.wl_names[idx] != simi_data.wl_names[candid_idx] and simi_data.cpu_nums[idx] == simi_data.cpu_nums[candid_idx]:
            arr.append(val)
            true_idx.append(idx)
            wl_g.append(simi_data.wl_groups[idx])
   
    df = pd.DataFrame({'dist': arr, 'true_idx': true_idx, 'wl_groups': wl_g} )
    df.sort_values(by=['dist'], ascending=True, inplace=True)
    wls = df['wl_groups'].to_list()
    tops = []
    for wln in wls:
        if wln not in tops:
            tops.append(wln)
    return tops

In [10]:
data = ExprDataMemCPU()
data.load_pickle()
data.fix_tpch()

data = data.remove_by_wlname(['xml', 'chbenchmark'])
# data = data.remove_by_group([g for g in all_groups if g != candidate_group])
all_ycsb_groups = [ # name terminal pair
    ('ycsb', 32), ('ycsb', 8), ('ycsb', 4)
]
candidate_group = ('ycsb', 8)

from_sku = ('cpu2', 32)
to_sku = ('cpu8', 64)

data = data.remove_by_config([g for g in all_ycsb_groups if g != candidate_group])
print(zip(data.wl_names, data.cpu_nums))

<zip object at 0x7fc5db401d40>


In [11]:
data_by_sku = data.split_by_sku()
known_data = data_by_sku[from_sku]
pred_data = data_by_sku[to_sku]

In [12]:
# similarity for all
scaler = ScaleData()
plan_mtxs, plan_col_ranges = scaler.scale(known_data.plan_mtxs)
# perf_mtxs, perf_col_ranges = scaler.scale(known_data.perf_mtxs)

# simi_calc = Similarity(known_data, plan_mtxs, plan_col_ranges, perf_mtxs, perf_col_ranges)
simi_calc = Similarity(known_data, plan_mtxs, plan_col_ranges, None, [])
simi_calc.calc_bined_mtx(plan_only=True) # all features

In [13]:
fs = FeatureSelection(simi_calc)

In [14]:
method = 'RFE' # 'fANOVA' #'Variance' #
est_name='LogisticRegression'

In [15]:
# use for later distinguish btw plan and system metrics
plan_features = known_data.plan_feature_cols

feature_group_plan = ['plan_3', 'plan_7', 'plan']

plan_top_3 = fs.select_features(3, method, est_name=est_name, direction=None, feature_type='plan')
plan_top_7 = fs.select_features(7, method, est_name=est_name, direction=None, feature_type='plan')

features_plan = [plan_top_3, plan_top_7, plan_features]

(24, 24, 22)
(24, 24, 22)


In [16]:
print(plan_top_7)

['EstimateIO', 'CachedPlanSize', 'AvgRowSize', 'SerialDesiredMemory', 'StatementSubTreeCost', 'SerialRequiredMemory', 'MaxCompileMemory']


In [17]:
simi_mtxs = []
for feature_group in features_plan:
    # simi_calc.calc_dist_simi_matrix(cumulative=True, feature_names=feature_group, norm_type='canb')#, normalize=False)
    simi_calc.calc_dist_simi_matrix(cumulative=True, feature_names=feature_group, norm_type='l21')#, normalize=False)
    simi_mtxs.append(simi_calc.simi_mtx)

In [19]:
candid_idx = len(known_data.wl_names) - 3
print(known_data.wl_names[candid_idx], known_data.cpu_nums[candid_idx])

ycsb cpu2


In [21]:
simi_calc.calc_dist_simi_matrix(cumulative=True, feature_names=plan_top_7, norm_type='l21', normalize=False)

In [23]:
tops = get_simis(simi_calc.data, simi_calc.simi_mtx, candid_idx)

cpu2 24


In [24]:
print(tops)
simi_data = simi_calc.data
for wl in tops:
    min_idx, = np.where(np.array(simi_data.wl_groups) == wl)
    print(np.unique(np.array(simi_data.wl_names)[min_idx]), simi_data.cpu_nums[candid_idx], 
          np.unique(np.array(simi_data.terminal_num)[min_idx]), 
          np.unique(np.array(simi_data.mem_sizes)[min_idx]),
          np.unique(np.array(simi_data.run_idx)[min_idx])) 

['13', '14', '15', '17', '19', '18', '16']
['tpcc'] cpu2 [32] [32] ['35' '36' '37']
['tpcc'] cpu2 [4] [32] ['38' '39' '40']
['tpcc'] cpu2 [8] [32] ['41' '42' '43']
['twitter'] cpu2 [32] [32] ['47' '48' '49']
['twitter'] cpu2 [8] [32] ['53' '54' '55']
['twitter'] cpu2 [4] [32] ['50' '51' '52']
['tpch'] cpu2 [32] [32] ['44' '45' '46']
