In [1]:
# for table 1
import os
import matplotlib.pyplot as plt
import datetime
import time
import json
import matplotlib.gridspec as gridspec

import numpy as np
import pandas as pd
import seaborn as sns

from itertools import compress

from helpers.expr_data import ExprData
from helpers.scale_data import ScaleData
from helpers.similarity import Similarity
from helpers.feature_selection import FeatureSelection

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error as rmse_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

import warnings
warnings.filterwarnings("ignore")

In [2]:
import random
np.random.seed(47906)
random.seed(47906)

In [4]:
SMALL_SIZE = 15
MEDIUM_SIZE = 18
BIGGER_SIZE = 22
SMALL_SMALL_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
# plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize

plt.rc('legend', fontsize=SMALL_SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [5]:
all_groups = ['10', '11', '12']
candidate_group = '11'

In [6]:
data_all = ExprData()
data_all.load_pickle()
# keep tpch as it is as it provides some other behavior pattern when using different cpu number (const)
# data_all = data_all.merge_tpch()

known_data = data_all.remove_by_wlname(['xml', 'ycsb'])
ycsb_data = data_all.remove_by_wlname(['xml', 'tpcc', 'tpch', 'twitter'])

simi_data = data_all.remove_by_wlname(['xml'])
simi_data = simi_data.remove_by_group([g for g in all_groups if g != candidate_group])

In [7]:
ycsb_data = ycsb_data.remove_by_group([g for g in all_groups if g != candidate_group])
ycsb_data = ycsb_data.sample_data()
X_label = 'cpu_num'
y_label = 'throughput'

expr_label = 'EXPR'
y_true_label = 'Y_TRUE'
y_pred_label = 'Y_PRED'
suffix_labels = ['_small', '_large']
sampled_data = known_data.sample_data()
data_by_type = sampled_data.split_by_type()

In [8]:
# Similarity computation using known data and experiment data
scaler = ScaleData()
plan_mtxs, plan_col_ranges = scaler.scale(simi_data.plan_mtxs)
perf_mtxs, perf_col_ranges = scaler.scale(simi_data.perf_mtxs)

simi_calc = Similarity(simi_data, plan_mtxs, plan_col_ranges, perf_mtxs, perf_col_ranges)
simi_calc.calc_bined_mtx() 

In [46]:
def get_simis(simi_data, simi_mtx):
    print(simi_data.cpu_nums[candid_idx])
    arr = []
    true_idx = []
    for idx, val in enumerate(simi_mtx[candid_idx][:-12]):
        if simi_data.cpu_nums[idx] == simi_data.cpu_nums[candid_idx]:
            arr.append(val)
            true_idx.append(idx)
   
    nearest = np.argsort(arr)
    nearest_wls = np.array([simi_data.wl_groups[true_idx[i]] for i in nearest])
    _, idx = np.unique(nearest_wls, return_index=True)
    tops = nearest_wls[np.sort(idx)]
    return tops

In [10]:
def nrmse_score(y_true, y_pred):
    return rmse_score(y_true, y_pred)/(np.max(y_true)-np.min(y_true))

score_func = make_scorer(nrmse_score, greater_is_better=False)

In [11]:
def get_cpu_nums_as_X(l):
    return np.array([int(e[3:]) for e in l]).reshape(-1, 1)

In [12]:
'''
Use a dictionary of models
- key: (lower SKU, higher SKU)
- value: model
'''
def predict(model_dicts, X, y_true, expr_idxs, method, show_fig=True):
    overall_data = pd.DataFrame(zip(X, expr_idxs, y_true), columns=[X_label, expr_label, y_true_label])

    # for each pair of SKU
    num_cpus = np.sort(np.unique(X))
    test_scores = []
    models, datas_big, datas_small = [], [], []
    dfs = []
    
    for i in range(len(num_cpus)):
        for j in range(i, len(num_cpus)):
            if i == j:
                continue
            cpu_a = num_cpus[i] # smaller
            cpu_b = num_cpus[j] # larger
            
            curr_smaller = overall_data[overall_data[X_label] == cpu_a]
            curr_bigger = overall_data[overall_data[X_label] == cpu_b]
            
            new_preds = []
            
            for model_dict in model_dicts:            
                new_y_true, new_y_pred = [], []

                curr_smaller_pred = model_dict[(cpu_a, cpu_b)].predict(curr_smaller[X_label].to_numpy().reshape(-1, 1))
                curr_bigger_pred = model_dict[(cpu_a, cpu_b)].predict(curr_bigger[X_label].to_numpy().reshape(-1, 1))
                curr_smaller = curr_smaller.assign(Y_PRED=curr_smaller_pred)
                curr_bigger = curr_bigger.assign(Y_PRED=curr_bigger_pred)

                for _, smaller_row in curr_smaller.iterrows():
                    curr_expr_idx = smaller_row[expr_label]
                    curr_diff = smaller_row[y_true_label] - smaller_row[y_pred_label]
                    bigger_row = curr_bigger[curr_bigger[expr_label] == curr_expr_idx]
                    assert(bigger_row.shape[0] == 1)
                    bigger_row = bigger_row.iloc[0]
                    new_y_true.append(bigger_row[y_true_label])
                    new_y_pred.append(bigger_row[y_pred_label] + curr_diff)
                new_preds.append(new_y_pred)
            
            curr_bigger = curr_bigger.assign(Y_PRED=np.mean(np.array(new_preds), axis=0))
            df = pd.merge(curr_smaller, curr_bigger, on=[expr_label], suffixes=suffix_labels)
            dfs.append(df)
            
            new_y_true = np.array(new_y_true)
            new_y_pred = np.array(new_y_pred) 
            
            score = rmse_score(new_y_true, new_y_pred)/(np.max(new_y_true) - np.min(new_y_true))
            test_scores.append(score)
                
            models.append(model_dict[(cpu_a, cpu_b)])
            datas_small.append(curr_smaller)
            datas_big.append(curr_bigger)

    overall_score = np.mean(test_scores)
    return overall_score

In [87]:
'''
Use a dictionary of models
- key: (lower SKU, higher SKU)
- value: model
'''
def predict_pair(model_dicts, sku_pair, X, y_true, expr_idxs, method, plot=True, show_fig=True):
    overall_data = pd.DataFrame(zip(X, expr_idxs, y_true), columns=[X_label, expr_label, y_true_label])

    # for each pair of SKU
    num_cpus = np.sort(np.unique(X))
    test_scores = []
    models, datas_big, datas_small = [], [], []
    dfs = []
    
    for i in range(len(num_cpus)):
        for j in range(i, len(num_cpus)):
            if i != sku_pair[0] or j != sku_pair[1]:
                continue
            cpu_a = num_cpus[i] # smaller
            cpu_b = num_cpus[j] # larger
            
            curr_smaller = overall_data[overall_data[X_label] == cpu_a]
            curr_bigger = overall_data[overall_data[X_label] == cpu_b]
            
            new_preds = []
            
            for model_dict in model_dicts:            
                new_y_true, new_y_pred = [], []

                curr_smaller_pred = model_dict[(cpu_a, cpu_b)].predict(curr_smaller[X_label].to_numpy().reshape(-1, 1))
                curr_bigger_pred = model_dict[(cpu_a, cpu_b)].predict(curr_bigger[X_label].to_numpy().reshape(-1, 1))
                curr_smaller = curr_smaller.assign(Y_PRED=curr_smaller_pred)
                curr_bigger = curr_bigger.assign(Y_PRED=curr_bigger_pred)

                for _, smaller_row in curr_smaller.iterrows():
                    curr_expr_idx = smaller_row[expr_label]
                    curr_diff = smaller_row[y_true_label] - smaller_row[y_pred_label]
                    bigger_row = curr_bigger[curr_bigger[expr_label] == curr_expr_idx]
                    assert(bigger_row.shape[0] == 1)
                    bigger_row = bigger_row.iloc[0]
                    new_y_true.append(bigger_row[y_true_label])
                    new_y_pred.append(bigger_row[y_pred_label] + curr_diff)
                new_preds.append(new_y_pred)
            
            curr_bigger = curr_bigger.assign(Y_PRED=np.mean(np.array(new_preds), axis=0))
            df = pd.merge(curr_smaller, curr_bigger, on=[expr_label], suffixes=suffix_labels)
            dfs.append(df)
            
            new_y_true = np.array(new_y_true)
            new_y_pred = np.array(new_y_pred) 
            
            score = rmse_score(new_y_true, new_y_pred)/(np.max(new_y_true) - np.min(new_y_true))
            test_scores.append(score)
                
            models.append(model_dict[(cpu_a, cpu_b)])
            datas_small.append(curr_smaller)
            datas_big.append(curr_bigger)
    
    overall_score = np.mean(test_scores)
    return overall_score, dfs

In [14]:
def build_regression_model(X, y, wl_name, grouping_type, groupping_id, plot=False, show_fig=False):  
    train_rmses, test_rmses = [], []
    k = 5
    num_cpus = np.sort(np.unique(X))
    train_time = 0
    model_dict = {}
    models = []
    fold_test_rmses = []
    
    kf = KFold(n_splits=k, shuffle=True)
    for train_index, test_index in kf.split(X):
        start = time.time()

        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        reg = LinearRegression()
        reg.fit(X_train, y_train)
        test_pred = reg.predict(X_test)
        train_pred = reg.predict(X_train)
        end = time.time()
        train_time += end - start
        
        num_pairs = 0
        for i in range(len(num_cpus)):
            for j in range(i, len(num_cpus)):
                if i == j:
                    continue
                cpu_a = num_cpus[i] # smaller
                cpu_b = num_cpus[j] # larger
                curr_mask = [x_lab == cpu_a or x_lab == cpu_b for x_lab in X_test.flatten()]

                curr_y_true = y_test[curr_mask]
                rmse = np.sqrt(((curr_y_true-test_pred[curr_mask])**2).mean())
                n_rmse = rmse / (np.max(curr_y_true)-np.min(curr_y_true))
                test_rmses.append(n_rmse)
                
                curr_mask = [x_lab == cpu_a or x_lab == cpu_b for x_lab in X_train.flatten()]
                
                curr_y_true = y_train[curr_mask]
                rmse = np.sqrt(((curr_y_true-train_pred[curr_mask])**2).mean())
                n_rmse = rmse / (np.max(curr_y_true)-np.min(curr_y_true))
                train_rmses.append(n_rmse)
                num_pairs == 1
                
        fold_test_rmses.append(np.mean(test_rmses[-num_pairs:]))
        models.append(reg)
    
    train_time /= k
    
    best_model = models[np.argmin(fold_test_rmses)]
    for i in range(len(num_cpus)):
        for j in range(i, len(num_cpus)):
            if i == j:
                continue
            cpu_a = num_cpus[i] # smaller
            cpu_b = num_cpus[j] # larger
            model_dict[(cpu_a, cpu_b)] = best_model

    overall_test_rmse = np.mean(test_rmses)
    overall_train_rmse = np.mean(train_rmses)
    return overall_test_rmse, overall_train_rmse, model_dict, train_time

In [15]:
def build_svm_model_indi(X, y, wl_name, grouping_type, groupping_id, show_fig=True):  
    metrics = [X_label, y_label]
    zipped = zip(X, y)
    
    # append fixed effect and random effect to data
    overall_data = pd.DataFrame(zipped, columns=metrics)

    train_scores, test_scores = [], []

    models, datas = [], []
    model_dict = {}
    
    # epsilon range: according to this paper: http://adrem.uantwerpen.be/bibrem/pubs/IJCNN2007.pdf
    # C range: https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
    
    # Cross validation grid search (best parameters) 
    c_range = np.logspace(-5, 5, base=2.0, num=5) # 1 and 100
    epsilon_range = np.linspace(10 ** -3, 1, 5) # 1-e3 and 1
    degree_range = np.array(list(range(1, 5)))
    tuned_parameters = [{'kernel': ['rbf'],'C': c_range, 'epsilon': epsilon_range}, 
                        {'kernel': ['linear'], 'C': c_range, 'epsilon': epsilon_range}, 
                        {'kernel': ['poly'], 'C': c_range, 'degree': degree_range, 'epsilon': epsilon_range} 
                       ]

    num_cpus = np.sort(np.unique(X))
    prev_cpu = num_cpus[0]

    train_time = 0
    for i in range(len(num_cpus)):
        for j in range(i, len(num_cpus)):
            if i == j:
                continue
            cpu_a = num_cpus[i] # smaller
            cpu_b = num_cpus[j] # larger
            curr_data = overall_data[ (overall_data[X_label] == cpu_a) | (overall_data[X_label] == cpu_b) ]
            
            svr = SVR()
            datas.append(curr_data)
            clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=0, n_jobs=4,
                               cv=5, 
                               scoring=score_func,
                               return_train_score=True)
            clf.fit(curr_data[[X_label]], curr_data[y_label])           
            best_params = clf.best_params_
            results = pd.DataFrame.from_dict(clf.cv_results_)
            results_best = results[results['params'] == best_params].reset_index()
            train_time += results_best['mean_fit_time'][0]

            test_scores.append(results_best.iloc[0]['mean_test_score'])
            train_scores.append(results_best.iloc[0]['mean_train_score'])
            models.append(clf)
            model_dict[(cpu_a, cpu_b)] = clf
    
    # plot the last one as example
    overall_test = np.mean(test_scores)
    overall_train = np.mean(train_scores)
    return overall_test, overall_train, model_dict, train_time

In [16]:
def build_gb_model_indi(X, y, wl_name, grouping_type, groupping_id, plot=False, show_fig=True):  
    metrics = [X_label, y_label]
    zipped = zip(X, y)
    
    # append fixed effect and random effect to data
    overall_data = pd.DataFrame(zipped, columns=metrics)

    train_scores, test_scores = [], []

    models, datas = [], []
    model_dict = {}

    tuned_parameters = {
        "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
        "max_depth":[3,5,8],
        "max_features":["log2","sqrt"],
        "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
        "n_estimators":[10, 50, 100]
    }

    num_cpus = np.sort(np.unique(X))
    prev_cpu = num_cpus[0]
    
    train_time = 0

    for i in range(len(num_cpus)):
        for j in range(i, len(num_cpus)):
            if i == j:
                continue
            cpu_a = num_cpus[i] # smaller
            cpu_b = num_cpus[j] # larger
            curr_data = overall_data[ (overall_data[X_label] == cpu_a) | (overall_data[X_label] == cpu_b) ]
            
            datas.append(curr_data)
            model = GradientBoostingRegressor()
            clf = GridSearchCV(model, param_grid=tuned_parameters,verbose=0, n_jobs=-1,
                               cv=5, 
                               scoring=score_func,  
                               return_train_score=True)
            clf.fit(curr_data[[X_label]], curr_data[y_label])

            best_params = clf.best_params_
            results = pd.DataFrame.from_dict(clf.cv_results_)
            results_best = results[results['params'] == best_params].reset_index()
            train_time += results_best['mean_fit_time'][0]
            
            test_scores.append(results_best.iloc[0]['mean_test_score'])
            train_scores.append(results_best.iloc[0]['mean_train_score'])
            models.append(clf)
            model_dict[(cpu_a, cpu_b)] = clf

    overall_test = np.mean(test_scores)
    overall_train = np.mean(train_scores)
    return overall_test, overall_train, model_dict, train_time

In [17]:
ycsb_X = get_cpu_nums_as_X(ycsb_data.cpu_nums)
ycsb_y = np.array(ycsb_data.wl_throughput)
ycsb_expr = np.array(ycsb_data.sampled_run_idx)

In [18]:
candid_idx = len(simi_data.wl_names) - 1

In [96]:
def chose_and_pred(group_to_model_dict, tops, k, name):
    min_groups = tops[:k]
    model_dict = [group_to_model_dict[min_group] for min_group in min_groups]
    print(min_groups)
    overall_score = predict(model_dict, ycsb_X, ycsb_y, ycsb_expr, name, False)

    print('{} Overall ycsb nrmse: {}'.format(name, overall_score))
    for p in [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]:
        pair_score, _ = predict_pair(model_dict, p, ycsb_X, ycsb_y, ycsb_expr, f'name-{p[0]}-{p[1]}', False)
        print('{} pairwise ycsb nrmse: {}'.format(p, pair_score))

In [20]:
# Feature selection using know data
kscaler = ScaleData()
plan_mtxs, plan_col_ranges = scaler.scale(known_data.plan_mtxs)
perf_mtxs, perf_col_ranges = scaler.scale(known_data.perf_mtxs)

ksimi_calc = Similarity(known_data, plan_mtxs, plan_col_ranges, perf_mtxs, perf_col_ranges)
ksimi_calc.calc_bined_mtx() # all features

In [21]:
fs = FeatureSelection(ksimi_calc)

# Strategy 2

In [None]:
top_7 = fs.select_features(7, 'Lasso', est_name=None, direction=None, feature_type=None)

In [83]:
simi_calc.calc_dist_simi_matrix(cumulative=True, feature_names=top_7, norm_type='l21', timeit=True)
simi_mtx = simi_calc.simi_mtx

Simi calculation time 0.15011119842529297


In [84]:
tops = get_simis(simi_data, simi_mtx)

cpu8


In [85]:
print(tops)
for wl in tops:
    min_idx, = np.where(np.array(simi_data.wl_groups) == wl)
    print(np.unique(np.array(simi_data.wl_names)[min_idx]), simi_data.cpu_nums[candid_idx], 
          np.unique(np.array(simi_data.terminal_num)[min_idx]), 
          np.unique(np.array(simi_data.wl_groups)[min_idx])) 

['8' '9' '7' '3' '4' '1' '2' '6' '5']
['tpcc'] cpu8 [4] ['8']
['twitter'] cpu8 [4] ['9']
['tpcc'] cpu8 [8] ['7']
['twitter'] cpu8 [32] ['3']
['twitter'] cpu8 [8] ['4']
['tpcc'] cpu8 [32] ['1']
['tpch'] cpu8 [32] ['2']
['tpch'] cpu8 [4] ['6']
['tpch'] cpu8 [8] ['5']


In [81]:
print(simi_data.wl_names[candid_idx], simi_data.cpu_nums[candid_idx], 
      simi_data.terminal_num[candid_idx], 
      simi_data.wl_groups[candid_idx]) 

ycsb cpu8 8 11


### Prediction

In [25]:
all_results = {}

In [26]:
all_tests, all_trains = [], []
svm_model_dict = {}
svm_results = {}

for ty, curr_data in data_by_type.items():
    if ty not in tops:
        continue
    X = get_cpu_nums_as_X(curr_data.cpu_nums)
    y = np.array(curr_data.wl_throughput)
    test_r2_mean, train_r2_mean, model_dict, train_time = build_svm_model_indi(X, y, curr_data.wl_names[0], 'group', 
                                                                   curr_data.wl_groups[0], show_fig=False)
    svm_model_dict[ty] = model_dict
    all_tests.append(test_r2_mean)
    all_trains.append(train_r2_mean)
    svm_results[ty] = test_r2_mean

print('Overall test nrmse: {}, train {}'.format(np.mean(all_tests), np.mean(all_tests)))

Overall test nrmse: -0.2984995299208955, train -0.2984995299208955


In [27]:
for k in [1 ,2, 3]:
    chose_and_pred(svm_model_dict, tops, k, 'svm')

['8']
svm Overall ycsb nrmse: 0.17796459071596096
(0, 1) pairwise ycsb nrmse: 0.05787888360721371
(0, 2) pairwise ycsb nrmse: 0.1114068367745914
(1, 2) pairwise ycsb nrmse: 0.10456502811028062
(1, 3) pairwise ycsb nrmse: 0.2717916007760527
(2, 3) pairwise ycsb nrmse: 0.21350769960387353
['8' '9']
svm Overall ycsb nrmse: 0.1685264898283195
(0, 1) pairwise ycsb nrmse: 0.06339904392900461
(0, 2) pairwise ycsb nrmse: 0.1023289618827474
(1, 2) pairwise ycsb nrmse: 0.07532017623081813
(1, 3) pairwise ycsb nrmse: 0.25105570037101366
(2, 3) pairwise ycsb nrmse: 0.21450938407206369
['8' '9' '7']
svm Overall ycsb nrmse: 0.17649689519008852
(0, 1) pairwise ycsb nrmse: 0.05879547563886495
(0, 2) pairwise ycsb nrmse: 0.10936289450460114
(1, 2) pairwise ycsb nrmse: 0.10270361865882784
(1, 3) pairwise ycsb nrmse: 0.2688365830532106
(2, 3) pairwise ycsb nrmse: 0.21353628494357715


In [94]:
min_groups = tops[:2]
model_dict = [svm_model_dict[min_group] for min_group in min_groups]
print(min_groups)

pair_score, dfs = predict_pair(model_dict, (1, 3), ycsb_X, ycsb_y, ycsb_expr, f'svm13', False)
curr_df = dfs[0]
curr_df['Diff'] = abs(curr_df['Y_TRUE_large']-curr_df['Y_PRED_large'])
print(curr_df[curr_df['Diff'] == curr_df['Diff'].min()])
temp = curr_df[['Y_TRUE_small', 'Y_TRUE_large', 'Y_PRED_large', 'Diff']]
temp.describe()

['8' '9']
   cpu_num_small  EXPR  Y_TRUE_small  Y_PRED_small cpu_num_large  \
17           [4]    17   1325.623955   2895.011111          [16]   

    Y_TRUE_large  Y_PRED_large      Diff  
17   1368.144444   1368.875346  0.730902  


Unnamed: 0,Y_TRUE_small,Y_TRUE_large,Y_PRED_large,Diff
count,30.0,30.0,30.0,30.0
mean,1014.152739,1081.14,1057.40413,111.010977
std,266.040368,215.261745,266.040368,76.957344
min,664.172702,897.738889,707.424093,0.730902
25%,693.075209,922.231944,736.326599,28.859926
50%,1035.087744,946.151389,1078.339134,144.595189
75%,1308.350279,1350.628472,1351.601669,178.79985
max,1347.824513,1425.416667,1391.075903,224.267574


# Strategy 1

In [None]:
top_3 = fs.select_features(3, 'Chi2', est_name=None, direction=None, feature_type='perf')

In [None]:
simi_calc.calc_simi_matrix(perf_feature_names=top_3,  norm_type='corr')

simi_mtx = simi_calc.simi_mtx

In [100]:
tops = get_simis(simi_data, simi_mtx)

cpu8


In [101]:
print(tops)
for wl in tops:
    min_idx, = np.where(np.array(simi_data.wl_groups) == wl)
    print(np.unique(np.array(simi_data.wl_names)[min_idx]), simi_data.cpu_nums[candid_idx], 
          np.unique(np.array(simi_data.terminal_num)[min_idx]), 
          np.unique(np.array(simi_data.wl_groups)[min_idx])) 

['7' '1' '4' '3' '8' '2' '5' '6' '9']
['tpcc'] cpu8 [8] ['7']
['tpcc'] cpu8 [32] ['1']
['twitter'] cpu8 [8] ['4']
['twitter'] cpu8 [32] ['3']
['tpcc'] cpu8 [4] ['8']
['tpch'] cpu8 [32] ['2']
['tpch'] cpu8 [8] ['5']
['tpch'] cpu8 [4] ['6']
['twitter'] cpu8 [4] ['9']


### Prediction

In [31]:
all_results = {}

In [32]:
all_tests, all_trains = [], []
reg_model_dict = {}
reg_results = {}

for ty, curr_data in data_by_type.items():
    if ty not in tops:
        continue
    X = get_cpu_nums_as_X(curr_data.cpu_nums)
    y = np.array(curr_data.wl_throughput)
    test_r2_mean, train_r2_mean, model_dict, train_time = build_regression_model(X, y, curr_data.wl_names[0], 'group', 
                                                                   curr_data.wl_groups[0], show_fig=False)
    reg_model_dict[ty] = model_dict
    all_tests.append(test_r2_mean)
    all_trains.append(train_r2_mean)
    reg_results[ty] = test_r2_mean

print('Overall test nrmse: {}, train {}'.format(np.mean(all_tests), np.mean(all_tests)))

Overall test nrmse: 0.3308316949489202, train 0.3308316949489202


In [102]:
for k in [1 ,2, 3]:
    chose_and_pred(reg_model_dict, tops, k, 'reg')

['7']
reg Overall ycsb nrmse: 0.177388704527762
(0, 1) pairwise ycsb nrmse: 0.062036508812508676
(0, 2) pairwise ycsb nrmse: 0.10801164379654787
(1, 2) pairwise ycsb nrmse: 0.10375571023927778
(1, 3) pairwise ycsb nrmse: 0.2717728286714124
(2, 3) pairwise ycsb nrmse: 0.2132882158319074
['7' '1']
reg Overall ycsb nrmse: 0.17529740963386412
(0, 1) pairwise ycsb nrmse: 0.063039540038026
(0, 2) pairwise ycsb nrmse: 0.10596977836244141
(1, 2) pairwise ycsb nrmse: 0.10112366128523766
(1, 3) pairwise ycsb nrmse: 0.2663461127672552
(2, 3) pairwise ycsb nrmse: 0.21301835819851778
['7' '1' '4']
reg Overall ycsb nrmse: 0.19437631023008675
(0, 1) pairwise ycsb nrmse: 0.05760472378155589
(0, 2) pairwise ycsb nrmse: 0.1223464330070565
(1, 2) pairwise ycsb nrmse: 0.11838369316398559
(1, 3) pairwise ycsb nrmse: 0.30991269493798834
(2, 3) pairwise ycsb nrmse: 0.22191028297502186


In [103]:
min_groups = tops[:3]
model_dict = [svm_model_dict[min_group] for min_group in min_groups]
print(min_groups)

pair_score, dfs = predict_pair(model_dict, (1, 3), ycsb_X, ycsb_y, ycsb_expr, f'svm13', False)
curr_df = dfs[0]
curr_df['Diff'] = abs(curr_df['Y_TRUE_large']-curr_df['Y_PRED_large'])
print(curr_df[curr_df['Diff'] == curr_df['Diff'].min()])
temp = curr_df[['Y_TRUE_small', 'Y_TRUE_large', 'Y_PRED_large', 'Diff']]
temp.describe()

['7' '1' '4']
  cpu_num_small  EXPR  Y_TRUE_small  Y_PRED_small cpu_num_large  Y_TRUE_large  \
8           [4]     8    1311.16156   4676.772722          [16]   1328.644444   

   Y_PRED_large       Diff  
8    1318.06619  10.578254  


Unnamed: 0,Y_TRUE_small,Y_TRUE_large,Y_PRED_large,Diff
count,30.0,30.0,30.0,30.0
mean,1014.152739,1081.14,1021.05737,121.301543
std,266.040368,215.261745,266.040368,81.246254
min,664.172702,897.738889,671.077332,10.578254
25%,693.075209,922.231944,699.979839,48.149879
50%,1035.087744,946.151389,1041.992374,108.248429
75%,1308.350279,1350.628472,1315.254909,204.665537
max,1347.824513,1425.416667,1354.729143,260.614334
