In [1]:
# for table 1
import os
import matplotlib.pyplot as plt
import datetime
import time
import json
import matplotlib.gridspec as gridspec

import numpy as np
import pandas as pd
import seaborn as sns

from itertools import compress

from helpers.expr_data_ch import ExprDataCh
from helpers.scale_data import ScaleData
from helpers.similarity import Similarity
from helpers.feature_selection import FeatureSelection

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error as rmse_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

import warnings
warnings.filterwarnings("ignore")
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
import random
np.random.seed(47906)
random.seed(47906)

In [3]:
fs_method = 'MutualInfoGain'
est_name = 'DecisionTree'

In [4]:
SMALL_SIZE = 15
MEDIUM_SIZE = 18
BIGGER_SIZE = 22
SMALL_SMALL_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
# plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize

plt.rc('legend', fontsize=SMALL_SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

### Data preparation

In [5]:
data_all = ExprDataCh()
data_all.load_pickle()

In [6]:
all_groups = ['13', '14', '15', '16'] # [wl_tpcc2_tpch2, query_tpcc2_tpch2, wl_tpcc3_tpch1, query_tpcc3_tpch1]
candidate_query_group = '16'
candidate_wl_group = '15'

In [7]:
chwl_data = data_all.remove_by_wlname([ 'tpcc', 'tpch', 'twitter', 'xml', 'ycsb']) 
chwl_data = chwl_data.remove_by_group([g for g in all_groups if g != candidate_wl_group])
print(chwl_data.run_idx)

['38', '38', '38', '38']


In [8]:
osq_data = data_all.remove_by_wlname([ 'tpcc', 'tpch', 'twitter', 'xml', 'ycsb']) # remove known data
osq_data = osq_data.remove_by_group([g for g in all_groups if g != candidate_query_group])
print(osq_data.run_idx)

# keep tpch as it is as it provides some other behavior pattern when using different cpu number (const)
known_data = data_all.remove_by_wlname(['xml', 'chbenchmark'])
sampled_data = known_data.sample_data()
data_by_type = sampled_data.split_by_type()

['39', '39', '39', '39']


In [9]:
# osq_data = osq_data.sample_data()
# X_label = 'SKU'
X_label = 'cpu_num'
y_label = 'latency'

expr_label = 'EXPR'
y_true_label = 'Y_TRUE'
y_pred_label = 'Y_PRED'
suffix_labels = ['_small', '_large']

In [10]:
for tl, tn, tt, tc, ts in zip(osq_data.wl_latency, osq_data.wl_names, osq_data.terminal_num, osq_data.cpu_nums, osq_data.wl_latency_samples):
    print(tl, tn, tt, tc[3:], tc)

87.357 chbenchmark 22 16 cpu16
12.980333333333334 chbenchmark 22 2 cpu2
306.635 chbenchmark 22 4 cpu4
15.603 chbenchmark 22 8 cpu8


In [11]:
def get_cpu_nums_as_X(l):
    return np.array([int(e[3:]) for e in l]).reshape(-1, 1)

In [12]:
osq_X = get_cpu_nums_as_X(osq_data.cpu_nums)
osq_y = np.array(osq_data.wl_latency)
osq_expr = np.array(osq_data.sampled_run_idx)

In [13]:
chwl_X = get_cpu_nums_as_X(chwl_data.cpu_nums)
chwl_y = np.array(chwl_data.wl_latency)
chwl_expr = np.array(chwl_data.sampled_run_idx)

### Feature selection

In [14]:
# Feature selection using ONLYknow data
scaler = ScaleData()
plan_mtxs, plan_col_ranges = scaler.scale(known_data.plan_mtxs)
# perf_mtxs, perf_col_ranges = scaler.scale(known_data.perf_mtxs)

ksimi_calc = Similarity(known_data, plan_mtxs, plan_col_ranges,[], [])
ksimi_calc.calc_bined_mtx(plan_only=True) # all features

In [15]:
fs = FeatureSelection(ksimi_calc)

In [16]:
top_7 = fs.select_features(7, 'Lasso', est_name=est_name, direction=None, feature_type='plan')
print(top_7)

(144, 144, 22)
['AvgRowSize', 'CachedPlanSize', 'EstimatedAvailableMemoryGrant', 'StatementSubTreeCost', 'EstimatedPagesCached', 'MaxCompileMemory']


# Same model different scope

In [74]:
def get_simis(simi_data, simi_mtx, candid_idx):
    print(simi_data.cpu_nums[candid_idx], len(simi_mtx[candid_idx]))
    arr = []
    true_idx = []
    wl_g = []
    for idx, val in enumerate(simi_mtx[candid_idx]):
        # if simi_data.wl_names[idx] == simi_data.wl_names[candid_idx]:
            # arr.append(100000)
            # true_idx.append(idx)
            # wl_g.append(simi_data.wl_groups[idx])
        if simi_data.wl_names[idx] != simi_data.wl_names[candid_idx] and simi_data.cpu_nums[idx] == simi_data.cpu_nums[candid_idx]:
            arr.append(val)
            true_idx.append(idx)
            wl_g.append(simi_data.wl_groups[idx])
   
    df = pd.DataFrame({'dist': arr, 'true_idx': true_idx, 'wl_groups': wl_g} )
    # nearest = np.argsort(arr) 
    # print(true_idx, arr, nearest)
    # idx_order = [true_idx[i] for i in nearest]
    # nearest_wls = np.array([simi_data.wl_groups[true_idx[i]] for i in nearest])
    # print([true_idx[i] for i in nearest], nearest_wls)
    df.sort_values(by=['dist'], ascending=True, inplace=True)
    wls = df['wl_groups'].to_list()
    tops = []
    for wln in wls:
        if wln not in tops:
            tops.append(wln)

    # _, idx = np.unique(wls, return_index=True)
    # print(idx)
    # print(wls)
    # tops = [wls[i] for i in idx]
    return tops

### Build models for existing workloads

In [18]:
def nrmse_score(y_true, y_pred):
    return rmse_score(y_true, y_pred)/(1+np.max(y_true)-np.min(y_true))

score_func = make_scorer(nrmse_score, greater_is_better=False)

In [116]:
'''
Use a dictionary of models
- key: (lower SKU, higher SKU)
- value: model
'''
def trend_predict(model_dicts, X, y_true, expr_idxs, method, sku_pair=None, show_fig=True):
    overall_data = pd.DataFrame(zip(X, expr_idxs, y_true), columns=[X_label, expr_label, y_true_label])
    # for each pair of SKU
    num_cpus = np.sort(np.unique(X))
    test_scores = []
    models, datas_big, datas_small = [], [], []
    dfs = []
        
    for i in range(len(num_cpus)):
        for j in range(i, len(num_cpus)):
            if i == j:
                continue
            if sku_pair is not None and (i != sku_pair[0] or j != sku_pair[1]):
                continue
            cpu_a = num_cpus[i] # smaller
            cpu_b = num_cpus[j] # larger
            
            curr_smaller = overall_data[overall_data[X_label] == cpu_a]
            curr_bigger = overall_data[overall_data[X_label] == cpu_b]
            
            new_preds = []
            
            for model_dict in model_dicts:            
                new_y_true, new_y_pred = [], []

                curr_smaller_pred = model_dict[(cpu_a, cpu_b)].predict(curr_smaller[X_label].to_numpy().reshape(-1, 1))
                curr_bigger_pred = model_dict[(cpu_a, cpu_b)].predict(curr_bigger[X_label].to_numpy().reshape(-1, 1))
                curr_smaller = curr_smaller.assign(Y_PRED=curr_smaller_pred)
                curr_bigger = curr_bigger.assign(Y_PRED=curr_bigger_pred)

                for _, smaller_row in curr_smaller.iterrows():
                    curr_expr_idx = smaller_row[expr_label]
                    curr_diff = smaller_row[y_true_label] - smaller_row[y_pred_label]
                    bigger_row = curr_bigger[curr_bigger[expr_label] == curr_expr_idx]
                    assert(bigger_row.shape[0] == 1)
                    bigger_row = bigger_row.iloc[0]
                    new_y_true.append(bigger_row[y_true_label])
                    new_y_pred.append(bigger_row[y_pred_label] + curr_diff)
                new_preds.append(new_y_pred)

            curr_bigger = curr_bigger.assign(Y_PRED=np.mean(np.array(new_preds), axis=0))
            df = pd.merge(curr_smaller, curr_bigger, on=[expr_label], suffixes=suffix_labels)
            dfs.append(df)
            
            if len(new_y_true) == 1:
                score = rmse_score(new_y_true, new_y_pred)/new_y_true[0]
            else:
                score = rmse_score(new_y_true, new_y_pred)/(1+np.max(new_y_true) - np.min(new_y_true))
            test_scores.append(score)
                
            models.append(model_dict[(cpu_a, cpu_b)])
            datas_small.append(curr_smaller)
            datas_big.append(curr_bigger)

    overall_score = np.mean(test_scores)
    return overall_score, dfs

In [21]:
def build_regression_model(X, y, wl_name, grouping_type, groupping_id, plot=False, show_fig=False):  
    train_rmses, test_rmses = [], []
    k = 5
    num_cpus = np.sort(np.unique(X))
    train_time = 0
    model_dict = {}
    models = []
    fold_test_rmses = []
    
    kf = KFold(n_splits=k, shuffle=True)
    for train_index, test_index in kf.split(X):
        start = time.time()

        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        reg = LinearRegression()
        reg.fit(X_train, y_train)
        test_pred = reg.predict(X_test)
        train_pred = reg.predict(X_train)
        end = time.time()
        train_time += end - start
        
        num_pairs = 0
        for i in range(len(num_cpus)):
            for j in range(i, len(num_cpus)):
                if i == j:
                    continue
                cpu_a = num_cpus[i] # smaller
                cpu_b = num_cpus[j] # larger
                curr_mask = [x_lab == cpu_a or x_lab == cpu_b for x_lab in X_test.flatten()]

                curr_y_true = y_test[curr_mask]
                # rmse = np.sqrt(((curr_y_true-test_pred[curr_mask])**2).mean())
                rmse = rmse_score(curr_y_true, test_pred[curr_mask])
                if len(curr_y_true) == 1:
                    n_rmse = rmse/curr_y_true[0]
                else:
                    n_rmse = rmse / (1+np.max(curr_y_true)-np.min(curr_y_true))
                # n_rmse = rmse / (1+np.max(curr_y_true)-np.min(curr_y_true))
                test_rmses.append(n_rmse)
                
                curr_mask = [x_lab == cpu_a or x_lab == cpu_b for x_lab in X_train.flatten()]
                
                curr_y_true = y_train[curr_mask]
                rmse = rmse_score(curr_y_true, train_pred[curr_mask])
                # rmse = np.sqrt(((curr_y_true-train_pred[curr_mask])**2).mean())
                if len(curr_y_true) == 1:
                    n_rmse = rmse/curr_y_true[0]
                else:
                    n_rmse = rmse / (1+np.max(curr_y_true)-np.min(curr_y_true))
                train_rmses.append(n_rmse)
                num_pairs == 1
                
        fold_test_rmses.append(np.mean(test_rmses[-num_pairs:]))
        models.append(reg)
    
    train_time /= k
    
    best_model = models[np.argmin(fold_test_rmses)]
    for i in range(len(num_cpus)):
        for j in range(i, len(num_cpus)):
            if i == j:
                continue
            cpu_a = num_cpus[i] # smaller
            cpu_b = num_cpus[j] # larger
            model_dict[(cpu_a, cpu_b)] = best_model

    overall_test_rmse = np.mean(test_rmses)
    overall_train_rmse = np.mean(train_rmses)
    return overall_test_rmse, overall_train_rmse, model_dict, train_time

In [22]:
def build_svm_model_indi(X, y, wl_name, grouping_type, groupping_id, show_fig=True):  
    metrics = [X_label, y_label]
    zipped = zip(X, y)

    # append fixed effect and random effect to data
    overall_data = pd.DataFrame(zipped, columns=metrics)
    
    train_scores, test_scores = [], []

    models, datas = [], []
    model_dict = {}
    
    # epsilon range: according to this paper: http://adrem.uantwerpen.be/bibrem/pubs/IJCNN2007.pdf
    # C range: https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
    
    # Cross validation grid search (best parameters) 
    # c_range = np.logspace(1, 100, 10) # 1 and 100
    c_range = np.logspace(-5, 5, base=2.0, num=5) # 1 and 100
    epsilon_range = np.linspace(10 ** -3, 1, 5) # 1-e3 and 1
    # gamma_range = np.logspace(-4, 0, 20) # 2^{-15} to 2^3
    degree_range = np.array(list(range(1, 5)))
    tuned_parameters = [{'kernel': ['rbf'],'C': c_range, 'epsilon': epsilon_range}, # 'gamma':gamma_range,
                        {'kernel': ['linear'], 'C': c_range, 'epsilon': epsilon_range}, # 'gamma':gamma_range,
                        {'kernel': ['poly'], 'C': c_range, 'degree': degree_range, 'epsilon': epsilon_range} # 'gamma':gamma_range,
                       ]

    num_cpus = np.sort(np.unique(X))
    prev_cpu = num_cpus[0]

    train_time = 0
    for i in range(len(num_cpus)):
        for j in range(i, len(num_cpus)):
            if i == j:
                continue
            cpu_a = num_cpus[i] # smaller
            cpu_b = num_cpus[j] # larger
            curr_data = overall_data[ (overall_data[X_label] == cpu_a) | (overall_data[X_label] == cpu_b) ]
            
            svr = SVR()
            datas.append(curr_data)
            clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=0, n_jobs=4,
                               cv=5, 
                               scoring=score_func, 
                               return_train_score=True)
            clf.fit(curr_data[[X_label]], curr_data[y_label])           
            best_params = clf.best_params_
            results = pd.DataFrame.from_dict(clf.cv_results_)
            results_best = results[results['params'] == best_params].reset_index()
            train_time += results_best['mean_fit_time'][0]

            test_scores.append(results_best.iloc[0]['mean_test_score'])
            train_scores.append(results_best.iloc[0]['mean_train_score'])
            models.append(clf)
            model_dict[(cpu_a, cpu_b)] = clf
    
    # plot the last one as example
    overall_test = np.mean(test_scores)
    overall_train = np.mean(train_scores)
    return overall_test, overall_train, model_dict, train_time

In [88]:
def chose_and_pred(group_to_model_dict, tops, k, name, X, y, expr):
    min_groups = tops[:k]
    model_dict = [group_to_model_dict[min_group] for min_group in min_groups]
    overall_score, _ = trend_predict(model_dict, X, y, expr, name, None, False)

    print('{} Overall ycsb nrmse: {}'.format(name, overall_score))
    for p in [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]:
        pair_score, _ = trend_predict(model_dict, X, y, expr, f'name-{p[0]}-{p[1]}',  p, False)
        print('{} pairwise ycsb nrmse: {}'.format(p, pair_score))

In [24]:
all_results = {}

In [25]:
all_tests, all_trains = [], []
svm_model_dict = {}

for ty, curr_data in data_by_type.items():
    # # if ty not in tops:
    # #     continue
    # min_idx, = np.where(np.array(known_data.wl_groups) == ty)
    # print(np.unique(np.array(known_data.wl_names)[min_idx]))
    X = get_cpu_nums_as_X(curr_data.cpu_nums)
    y = np.array(curr_data.wl_latency)
    _, _, model_dict, _ = build_svm_model_indi(X, y, curr_data.wl_names[0], 'group', 
                                               curr_data.wl_groups[0], show_fig=False)
    svm_model_dict[ty] = model_dict

# Use Query

### Similarity computation

In [111]:
# Similarity computation using known data and experiment data
simi_query_data = data_all.remove_by_wlname(['xml', 'chbenchmark'])
simi_query_data = simi_query_data.remove_by_group([g for g in all_groups if g != candidate_query_group])
scaler = ScaleData()
plan_mtxs, plan_col_ranges = scaler.scale(simi_query_data.plan_mtxs)

simi_calc = Similarity(simi_query_data, plan_mtxs, plan_col_ranges, [], [])
simi_calc.calc_bined_mtx(plan_only=True) 

In [112]:
simi_calc.calc_dist_simi_matrix(cumulative=True, feature_names=top_7, norm_type='l21', timeit=True)
simi_mtx = simi_calc.simi_mtx

Simi calculation time 0.09764814376831055


In [113]:
candid_idx = len(simi_query_data.wl_names) - 1
tops = get_simis(simi_query_data, simi_mtx, candid_idx)

cpu8 144


In [114]:
tops = tops[:5]
print(tops)

for wl in tops:
    min_idx, = np.where(np.array(known_data.wl_groups) == wl)
    print(np.unique(np.array(known_data.wl_names)[min_idx]), known_data.cpu_nums[candid_idx], 
          np.unique(np.array(known_data.terminal_num)[min_idx]), 
          np.unique(np.array(known_data.wl_groups)[min_idx])) 

['1', '7', '8', '3', '4']
['tpcc'] cpu8 [32] ['1']
['tpcc'] cpu8 [8] ['7']
['tpcc'] cpu8 [4] ['8']
['twitter'] cpu8 [32] ['3']
['twitter'] cpu8 [8] ['4']


### Prediction

In [117]:
for k in [1 ,2, 3]:
    chose_and_pred(svm_model_dict, tops, k, 'svm', osq_X, osq_y, osq_expr)

['1']
svm Overall ycsb nrmse: 4.001788753571488
(0, 1) pairwise ycsb nrmse: 0.9511344468510646
(0, 2) pairwise ycsb nrmse: 0.2451167301960221
(1, 2) pairwise ycsb nrmse: 18.751294266818242
(1, 3) pairwise ycsb nrmse: 2.5996479460467303
(2, 3) pairwise ycsb nrmse: 0.7531381444658894
['1', '7']
svm Overall ycsb nrmse: 4.032247453116313
(0, 1) pairwise ycsb nrmse: 0.9600723263101791
(0, 2) pairwise ycsb nrmse: 0.07372513617068317
(1, 2) pairwise ycsb nrmse: 19.004756920130003
(1, 3) pairwise ycsb nrmse: 2.64695415105039
(2, 3) pairwise ycsb nrmse: 0.7804096883729957
['1', '7', '8']
svm Overall ycsb nrmse: 4.036228015154044
(0, 1) pairwise ycsb nrmse: 0.9718184667685695
(0, 2) pairwise ycsb nrmse: 0.1602227854597051
(1, 2) pairwise ycsb nrmse: 18.878903666341905
(1, 3) pairwise ycsb nrmse: 2.631678948009263
(2, 3) pairwise ycsb nrmse: 0.7785802848445305


In [118]:
min_groups = tops[:2]
model_dict = [svm_model_dict[min_group] for min_group in min_groups]
print(min_groups)

pair_score, dfs = trend_predict(model_dict, osq_X, osq_y, osq_expr, f'svm13',  (1, 3), False)
curr_df = dfs[0]
curr_df['Diff'] = abs(curr_df['Y_TRUE_large']-curr_df['Y_PRED_large'])
curr_df['Perc_Diff'] = curr_df['Diff']/curr_df['Y_PRED_large']
temp = curr_df[['Y_TRUE_small', 'Y_TRUE_large', 'Y_PRED_large', 'Diff', 'Perc_Diff']]
print(curr_df[curr_df['Diff'] == curr_df['Diff'].min()])

['1', '7']
  cpu_num_small EXPR  Y_TRUE_small  Y_PRED_small cpu_num_large  Y_TRUE_large  \
0           [4]   39       306.635    390.758333          [16]        87.357   

   Y_PRED_large       Diff  Perc_Diff  
0     316.52071  229.16371   0.724009  


# Use workload

### Similarity computation

In [119]:
# Similarity computation using known data and experiment data
simi_wl_data = data_all.remove_by_wlname(['xml', 'chbenchmark'])
simi_wl_data = simi_wl_data.remove_by_group([g for g in all_groups if g != candidate_wl_group])
scaler = ScaleData()
plan_mtxs, plan_col_ranges = scaler.scale(simi_wl_data.plan_mtxs)

simi_calc = Similarity(simi_wl_data, plan_mtxs, plan_col_ranges, [], [])
simi_calc.calc_bined_mtx(plan_only=True) 

In [120]:
simi_calc.calc_dist_simi_matrix(cumulative=True, feature_names=top_7, norm_type='l21', timeit=True)
simi_mtx = simi_calc.simi_mtx

Simi calculation time 0.09092044830322266


In [121]:
candid_idx = len(simi_wl_data.wl_names) - 1
tops = get_simis(simi_wl_data, simi_mtx, candid_idx)

cpu8 144


In [122]:
tops = tops[:5]
print(tops)

for wl in tops:
    min_idx, = np.where(np.array(simi_wl_data.wl_groups) == wl)
    print(np.unique(np.array(simi_wl_data.wl_names)[min_idx]), simi_wl_data.cpu_nums[candid_idx], 
          np.unique(np.array(simi_wl_data.terminal_num)[min_idx]), 
          np.unique(np.array(simi_wl_data.wl_groups)[min_idx])) 

['1', '7', '8', '3', '4']
['tpcc'] cpu8 [32] ['1']
['tpcc'] cpu8 [8] ['7']
['tpcc'] cpu8 [4] ['8']
['twitter'] cpu8 [32] ['3']
['twitter'] cpu8 [8] ['4']


In [123]:
for k in [1 ,2, 3]:
    chose_and_pred(svm_model_dict, tops, k, 'svm', chwl_X, chwl_y, chwl_expr)

['1']
svm Overall ycsb nrmse: 16.226161618987845
(0, 1) pairwise ycsb nrmse: 0.31062742090184176
(0, 2) pairwise ycsb nrmse: 54.33577459654684
(1, 2) pairwise ycsb nrmse: 41.14297830072808
(1, 3) pairwise ycsb nrmse: 0.12555131981344478
(2, 3) pairwise ycsb nrmse: 0.9667855207642678
['1', '7']
svm Overall ycsb nrmse: 16.235639792093576
(0, 1) pairwise ycsb nrmse: 0.3078806894508524
(0, 2) pairwise ycsb nrmse: 54.2230000407514
(1, 2) pairwise ycsb nrmse: 41.30975508053761
(1, 3) pairwise ycsb nrmse: 0.13017673705010518
(2, 3) pairwise ycsb nrmse: 0.9694520265482028
['1', '7', '8']
svm Overall ycsb nrmse: 16.19418375895415
(0, 1) pairwise ycsb nrmse: 0.3042709413475817
(0, 2) pairwise ycsb nrmse: 54.06906382765314
(1, 2) pairwise ycsb nrmse: 41.226944456877355
(1, 3) pairwise ycsb nrmse: 0.12868318694532616
(2, 3) pairwise ycsb nrmse: 0.9692731545669752


In [124]:
min_groups = tops[:2]
model_dict = [svm_model_dict[min_group] for min_group in min_groups]
print(min_groups)

pair_score, dfs = trend_predict(model_dict, chwl_X, chwl_y, chwl_expr, f'svm13',  (1, 3), False)
curr_df = dfs[0]
curr_df['Diff'] = abs(curr_df['Y_TRUE_large']-curr_df['Y_PRED_large'])
curr_df['Perc_Diff'] = curr_df['Diff']/curr_df['Y_PRED_large']
temp = curr_df[['Y_TRUE_small', 'Y_TRUE_large', 'Y_PRED_large', 'Diff', 'Perc_Diff']]
print(curr_df[curr_df['Diff'] == curr_df['Diff'].min()])

['1', '7']
  cpu_num_small EXPR  Y_TRUE_small  Y_PRED_small cpu_num_large  Y_TRUE_large  \
0           [4]   38       997.792    390.758333          [16]       893.439   

   Y_PRED_large       Diff  Perc_Diff  
0    1007.67771  114.23871   0.113368  


# Other Strategy

In [125]:
top_3 = fs.select_features(3, 'Chi2', est_name=None, direction=None, feature_type='plan')
print(top_3)

(144, 144, 22)
['AvgRowSize', 'CachedPlanSize', 'MaxCompileMemory']


In [126]:
simi_calc.calc_plan_lcss_simi_matrix(plan_feature_names=None)
# simi_calc.calc_plan_simi_matrix(plan_feature_names=top_3,  norm_type='corr')

simi_mtx = simi_calc.simi_mtx
print(simi_mtx.shape)

(144, 144)


In [128]:
tops = get_simis(simi_wl_data, simi_mtx, candid_idx)

cpu8 144


In [130]:
tops = tops[:5]
print(tops)
for wl in tops:
    min_idx, = np.where(np.array(simi_wl_data.wl_groups) == wl)
    print(np.unique(np.array(simi_wl_data.wl_names)[min_idx]), simi_wl_data.cpu_nums[candid_idx], 
          np.unique(np.array(simi_wl_data.terminal_num)[min_idx]), 
          np.unique(np.array(simi_wl_data.wl_groups)[min_idx])) 

['1', '9', '4', '3', '6']
['tpcc'] cpu8 [32] ['1']
['twitter'] cpu8 [4] ['9']
['twitter'] cpu8 [8] ['4']
['twitter'] cpu8 [32] ['3']
['tpch'] cpu8 [4] ['6']


### Prediction

In [134]:
all_results = {}

In [135]:
all_tests, all_trains = [], []
reg_model_dict = {}
reg_results = {}

for ty, curr_data in data_by_type.items():
    if ty not in tops:
        continue
    X = get_cpu_nums_as_X(curr_data.cpu_nums)
    y = np.array(curr_data.wl_latency)
    _, _, model_dict, _ = build_regression_model(X, y, curr_data.wl_names[0], 'group', 
                                                 curr_data.wl_groups[0], show_fig=False)
    reg_model_dict[ty] = model_dict

In [137]:
for k in [1 ,2, 3]:
    chose_and_pred(reg_model_dict, tops, k, 'reg', chwl_X, chwl_y, chwl_expr)

['1']
reg Overall ycsb nrmse: 16.303640988634715
(0, 1) pairwise ycsb nrmse: 0.31232140574199324
(0, 2) pairwise ycsb nrmse: 54.53120230398282
(1, 2) pairwise ycsb nrmse: 41.38938979278159
(1, 3) pairwise ycsb nrmse: 0.14160541510801475
(2, 3) pairwise ycsb nrmse: 0.9569212891843095
['1', '9']
reg Overall ycsb nrmse: 16.575034921227353
(0, 1) pairwise ycsb nrmse: 0.3196749119962866
(0, 2) pairwise ycsb nrmse: 55.459461450335155
(1, 2) pairwise ycsb nrmse: 42.00822922368315
(1, 3) pairwise ycsb nrmse: 0.19087975647312785
(2, 3) pairwise ycsb nrmse: 0.9240717282742341
['1', '9', '4']
reg Overall ycsb nrmse: 16.03475950221707
(0, 1) pairwise ycsb nrmse: 0.3050359750282424
(0, 2) pairwise ycsb nrmse: 53.6115365742901
(1, 2) pairwise ycsb nrmse: 40.776279306319786
(1, 3) pairwise ycsb nrmse: 0.09278723401629249
(2, 3) pairwise ycsb nrmse: 0.9894667432454582


In [139]:
min_groups = tops[:2]
model_dict = [reg_model_dict[min_group] for min_group in min_groups]
print(min_groups)

pair_score, dfs = trend_predict(model_dict, chwl_X, chwl_y, chwl_expr, f'svm13', (1, 3), False)
curr_df = dfs[0]
curr_df['Diff'] = abs(curr_df['Y_TRUE_large']-curr_df['Y_PRED_large'])
curr_df['Perc_Diff'] = curr_df['Diff']/curr_df['Y_PRED_large']
temp = curr_df[['Y_TRUE_small', 'Y_TRUE_large', 'Y_PRED_large', 'Diff', 'Perc_Diff']]
print(curr_df[curr_df['Diff'] == curr_df['Diff'].min()])
# temp.describe()

['1', '9']
  cpu_num_small EXPR  Y_TRUE_small  Y_PRED_small cpu_num_large  Y_TRUE_large  \
0           [4]   38       997.792   2952.623365          [16]       893.439   

   Y_PRED_large       Diff  Perc_Diff  
0    1041.96661  148.52761   0.142545  
