In [1]:
import os
import matplotlib.pyplot as plt
import datetime
import time
import matplotlib.gridspec as gridspec

import pickle
import numpy as np
import pandas as pd
import seaborn as sns

from itertools import compress

from helpers.expr_data_mem_cpu import ExprDataMemCPU
from helpers.scale_data import ScaleData
from helpers.similarity import Similarity
from helpers.feature_selection import FeatureSelection

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error as rmse_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
# Shared font sizes for every figure in this notebook.
SMALL_SMALL_SIZE = 10   # only referenced by the alternative legend size noted below
SMALL_SIZE = 15
MEDIUM_SIZE = 18
BIGGER_SIZE = 22

plt.rcParams.update({
    'font.size': SMALL_SIZE,           # default text size
    'axes.titlesize': SMALL_SIZE,      # axes title
    'axes.labelsize': MEDIUM_SIZE,     # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,     # x tick labels
    'ytick.labelsize': SMALL_SIZE,     # y tick labels
    'legend.fontsize': MEDIUM_SIZE,    # legend (alternative: SMALL_SMALL_SIZE)
    'figure.titlesize': BIGGER_SIZE,   # figure title
})

In [3]:
import random
# Seed NumPy's global RNG and the stdlib RNG so repeated Restart & Run All gives the same numbers.
# NOTE(review): KFold(shuffle=True) further down is created without random_state, so it
# presumably draws from NumPy's global state seeded here — re-running cells out of order
# would then change the folds.
np.random.seed(42)
random.seed(42)

In [4]:
# Passed as `plot=` to build_regression_model_indi / build_svm_model_indi in the driver loops.
OVERALL_PLOT = False
# NOTE(review): the trailing comment below does not describe OVERALL_PLOT; it looks like a
# leftover from a removed statement.
# for measuring modeling time

In [5]:
def nrmse_score(y_true, y_pred):
    """Return the RMSE of the predictions divided by the range of ``y_true``.

    The normaliser is ``max(y_true) - min(y_true)``. Alternatives considered
    (relative error against ``y_pred``, normalising by ``mean(y_true)``) are
    not used.
    """
    error = rmse_score(y_true, y_pred)
    spread = np.max(y_true) - np.min(y_true)
    return error / spread


# Scorer built from nrmse_score and declared as lower-is-better.
score_func = make_scorer(nrmse_score, greater_is_better=False)

In [6]:
# read in the performance metrics for each experiment
# NOTE(review): load_pickle() presumably unpickles a project data file — only load trusted files.
data_all = ExprDataMemCPU()
data_all.load_pickle()
# The name is rebound to whatever merge_tpch() returns; the freshly loaded object is no longer reachable.
data_all = data_all.merge_tpch()

In [7]:
# `data`: data_all with the 'xml' and 'ycsb' workloads removed (the groups printed further
# down are tpcc / tpch / twitter).
data = data_all.remove_by_wlname(['xml', 'ycsb'])
# `ycsb_data`: data_all with xml, tpcc, tpch and twitter removed — presumably leaving only
# the ycsb runs; confirm against ExprDataMemCPU.remove_by_wlname.
ycsb_data = data_all.remove_by_wlname(['xml', 'tpcc', 'tpch', 'twitter'])

In [8]:
all_ycsb_groups = [ # name terminal pair
    ('ycsb', 32), ('ycsb', 8), ('ycsb', 4)
]
# The single ycsb group that is kept as the prediction target (the others are removed below).
candidate_group = ('ycsb', 8)

# (cpu, mem) SKU tuples; predict_shift compares them with (cpu_a, mem_a) / (cpu_b, mem_b)
# when a `sku_pair` is given.
from_sku = (4, 32)
to_sku = (8, 64)

In [9]:
# Pass every ycsb group except candidate_group to remove_by_config (presumably dropping
# them — confirm in the helper). Rebinding ycsb_data makes this cell depend on the cell
# that first created it.
ycsb_data = ycsb_data.remove_by_config([g for g in all_ycsb_groups if g != candidate_group])

In [10]:
# Column names for the frames built in predict_shift. After the small/large merge there,
# each name carries one of the suffixes in suffix_labels (e.g. 'Y_TRUE_large').
# X_label is also the feature column name in the model-building functions.
X_label = 'SKU'
expr_label = 'EXPR'
y_true_label = 'Y_TRUE'
y_pred_label = 'Y_PRED'
suffix_labels = ['_small', '_large']

In [11]:
# similarity for all
# new_data = data.keep_complete_exprs()

In [12]:
# NOTE(review): sample_data() is a project helper; presumably it sub-samples the runs of
# each experiment — confirm. All models below are trained on this sampled set.
sampled_data = data.sample_data()

In [13]:
# Mapping keyed by (workload name, terminal count), as the printout of the next cell shows.
data_by_type = sampled_data.split_by_config()

In [14]:
# Print each group key together with the length of that group's wl_throughput list.
for tempk, tempv in data_by_type.items():
    print(tempk, len(tempv.wl_throughput))

('tpcc', 4) 210
('tpcc', 8) 210
('tpcc', 32) 210
('tpch', 1) 450
('twitter', 4) 210
('twitter', 8) 210
('twitter', 32) 210


In [15]:
# del data_by_type[('tpch', 1)]

In [16]:
# Two-level split: first split_by_type(), then each value is replaced in place by the
# result of its own split_by_term(). Only values are reassigned during iteration (no keys
# are added or removed), so mutating the dict inside the loop is safe.
sampled_by_type = sampled_data.split_by_type()
for ty, expr_set in sampled_by_type.items():
    sub_by_term = expr_set.split_by_term()
    sampled_by_type[ty] = sub_by_term

In [17]:
from sklearn.metrics import mean_absolute_percentage_error
def get_mean_absolute_deviation(df):
    """Return the mean absolute percentage error of the large-SKU predictions.

    Reads the 'Y_TRUE_large' and 'Y_PRED_large' columns of ``df``. Despite
    the function name, the value is sklearn's mean_absolute_percentage_error.
    """
    actual = df['Y_TRUE_large']
    predicted = df['Y_PRED_large']
    return mean_absolute_percentage_error(actual, predicted)

In [18]:
def unroll_mtx(mtx):
    """Flatten ``mtx`` after transposing it, i.e. walk the matrix column by column."""
    transposed = mtx.transpose()
    return transposed.ravel()

In [19]:
def get_X(data):
    """Build one (cpu_count, mem_size) tuple per entry of ``data``.

    ``data.cpu_nums`` entries are strings whose first three characters are
    stripped before the remainder is parsed as an int (e.g. 'cpu8' -> 8);
    ``data.mem_sizes`` entries are passed through unchanged.
    """
    features = []
    for cpu_name, mem in zip(data.cpu_nums, data.mem_sizes):
        features.append((int(cpu_name[3:]), mem))
    return features

In [20]:
def get_model_keys(num_cpus, mem_sizes):
    """Enumerate the (cpu_a, cpu_b, mem_a, mem_b) keys, one per SKU pair.

    ``num_cpus`` holds strings such as 'cpu4' (the first three characters are
    dropped before parsing). Pairs are taken in input order with the second
    index never before the first, for CPUs and for memory sizes alike; the
    pair of a SKU with itself is left out, and any pair that involves a
    16-CPU entry is skipped. The two inputs are printed first.
    """
    print(num_cpus, mem_sizes)
    cpu_counts = [int(name[3:]) for name in num_cpus]
    n_mem = len(mem_sizes)

    keys = []
    for lo_idx, lo_cpu in enumerate(cpu_counts):
        if lo_cpu == 16:
            continue
        for hi_idx in range(lo_idx, len(cpu_counts)):
            hi_cpu = cpu_counts[hi_idx]
            if hi_cpu == 16:
                continue
            same_cpu = lo_idx == hi_idx
            keys.extend(
                (lo_cpu, hi_cpu, mem_sizes[lo_mem], mem_sizes[hi_mem])
                for lo_mem in range(n_mem)
                for hi_mem in range(lo_mem, n_mem)
                if not (same_cpu and lo_mem == hi_mem)
            )
    return keys

In [21]:
# Features, targets and run ids of the ycsb target group.
ycsb_X = get_X(ycsb_data)
ycsb_y = np.array(ycsb_data.wl_throughput)
ycsb_expr = np.array(ycsb_data.sampled_run_idx)
# model_keys is a notebook-wide global read by predict_shift and both model builders.
# It is derived from the training pool (sampled_data), not from ycsb_data.
model_keys = get_model_keys(np.unique(sampled_data.cpu_nums), np.unique(sampled_data.mem_sizes))

['cpu16' 'cpu2' 'cpu4' 'cpu8'] [32 64]


In [22]:
# Accumulators filled by the driver loops: method name ('Regression', 'SVM') -> workload
# name -> terminal count -> test score (all_results) or training time (all_times).
all_results = {}
all_times = {}

In [23]:
def plot_prediction(dfs, X, Y, expr_idxs, method, plot=True, figsize=(3.6,2.4)):
    """Plot true and predicted throughput for small->large SKU pairs and save the figure.

    Args:
        dfs: iterable of DataFrames whose columns are X_label / y_true_label /
            y_pred_label suffixed with the entries of suffix_labels (the layout
            produced by the merge in predict_shift).
        X, Y: values drawn as a scatter plot.
        expr_idxs: only its length is used, to cycle the scatter colours.
        method: tag inserted into the output PDF file name.
        plot: not read anywhere in this function.
        figsize: figure size handed to plt.subplots.

    Side effects:
        Writes figs/prediction/ycs_pred_{method}.pdf and shows the figure.

    NOTE(review): the x handling (flattening curr_x, the integer tick list)
    looks written for scalar CPU counts, whereas get_X in this notebook yields
    (cpu, mem) tuples — confirm before enabling plotting.
    """
    # plot all trends 
    colors = ['#1b9e77','#7570b3','#d95f02','#e7298a']
    fig, ax = plt.subplots(figsize=figsize)
        
    pred_label_add = False
    pred_color = colors[0]
    overall_expr_label_add = False
    expr_tracker = 0
    
    # x value -> list of every true y seen at that x (used for the mean line at the end)
    all_true_tracker = {}
        
    # colours 1..3 are cycled by position; colours[0] is reserved for predictions
    color_map = [colors[1+idx%3] for idx, _ in enumerate(expr_idxs)]
    ax.scatter(X, Y, color=color_map)
    
    # Pass 1: one line per row joining the true y at the small SKU and at the large SKU.
    for df in dfs:
        expr_c = 1
        expr_label_add = False
        for _, row in df.iterrows():
            run_color = colors[1+expr_tracker%3]
            curr_x = [row[f'{X_label}{suffix}'] for suffix in suffix_labels]
            expr_tracker += 1
            true_y = [row[f'{y_true_label}{suffix}'] for suffix in suffix_labels]
            
            for x, y in zip(np.array(curr_x).flatten(), true_y):
                if x not in all_true_tracker:
                    all_true_tracker[x] = []
                all_true_tracker[x].append(y)
                
            # Legend labels are attached only while processing the first frame.
            # Both inner branches draw the same labelled line; they differ only in
            # the (otherwise unused) `expr` handle and the expr_label_add flag.
            if not overall_expr_label_add:
                if not expr_label_add and expr_tracker % 3 == 2:
                    expr = ax.plot(curr_x, true_y, '-', color=run_color, alpha=0.5, linewidth=1.5, label=f'Expr{expr_c} True_y')
                    expr_label_add = True
                else:
                    ax.plot(curr_x, true_y, '-', color=run_color, alpha=0.5, linewidth=1.5, label=f'Expr{expr_c} True_y')
            else:
                ax.plot(curr_x, true_y, '-', color=run_color, alpha=0.5, linewidth=1.5)
            expr_c += 1
        overall_expr_label_add = True
        
    # Pass 2: one line per row from the TRUE small-SKU value to the PREDICTED large-SKU value.
    for df in dfs:
        for _, row in df.iterrows():
            curr_x = [row[f'{X_label}{suffix}'] for suffix in suffix_labels]
            pred_y = [row[f'{y_true_label}{suffix_labels[0]}'], row[f'{y_pred_label}{suffix_labels[1]}']]
            
            # only the first prediction line gets a legend entry
            if not pred_label_add:
                pred_label_add = True
                pred_l = ax.plot(curr_x, pred_y, '-o', color=pred_color, linewidth=1, label='Predict_y')
            else:
                ax.plot(curr_x, pred_y, '-o', color=pred_color, linewidth=1)
    
    # Thick translucent line through the mean true y at each distinct x.
    true_x = np.sort(list(all_true_tracker.keys()))
    mean_true_y = [np.mean(all_true_tracker[key]) for key in true_x]
    ax.plot(true_x, mean_true_y, '-', color='#fc8d62', linewidth=15, alpha=0.3, label='Mean_True_y')
    max_x = np.max(true_x)
    # ax.set_xlim(right=max_x)
    base_ticks= [2, 4, 8, 16]
    ax.set_xticks([xtick for xtick in base_ticks if xtick <=max_x])
    ax.margins(0.1)           # Default margin is 0.05, value 0 means fit
                
    ax.set_xlabel('Num CPU')
    ax.set_ylabel('throughput')
    
    # NOTE(review): axis('tight') is applied after margins(0.1); check that the margin survives.
    ax.axis('tight')
    plt.legend(bbox_to_anchor=(1, 0.1, 1, 1), loc='upper left', ncol=1)#, mode="expand")

    # NOTE(review): the figs/prediction directory must already exist; it is not created here.
    plt.savefig(f'figs/prediction/ycs_pred_{method}.pdf', bbox_inches = 'tight')
    plt.show()

In [24]:
def predict_shift(model_dicts, X, y_true, expr_idxs, method, sku_pair=None, plot=True):
    """Predict throughput on a larger SKU from observations on a smaller SKU.

    Each element of ``model_dicts`` is a dictionary of fitted models:
    - key: (cpu_a, cpu_b, mem_a, mem_b), i.e. (lower SKU, higher SKU)
    - value: model trained on that SKU pair

    For every key in the global ``model_keys`` (or only the one matching
    ``sku_pair``), each model predicts both SKUs. The large-SKU prediction is
    then rescaled per small-SKU row by ``true_small / predicted_small``, and
    the rescaled predictions are averaged over all model dictionaries.

    Args:
        model_dicts: non-empty sequence of model dictionaries (see above).
        X: sequence of (cpu, mem) SKU tuples, one per observation.
        y_true: observed throughput, aligned with ``X``.
        expr_idxs: experiment/run id per observation, used to pair a small-SKU
            row with its large-SKU counterpart.
        method: tag forwarded to plot_prediction (part of the figure file name).
        sku_pair: optional ((cpu, mem), (cpu, mem)) tuple restricting the
            evaluation to a single small->large pair.
        plot: when true, plot_prediction is called on the resulting frames.

    Returns:
        (overall_score, dfs): the mean of the per-pair NRMSE values and the list
        of merged small/large frames (also printed, one per evaluated pair).

    Raises:
        ValueError: if ``model_dicts`` is empty, or if a SKU pair has a
            different number of small-SKU and large-SKU rows.

    Notes:
        * When a small-SKU row has no unique large-SKU row with the same
          experiment id, the mean large-SKU model prediction is used instead.
        * The merged frame pairs small and large rows by position, so its
          large-SKU true values are positional, not matched by experiment id.
        * The small-SKU prediction column in the merged frame comes from the
          last model dictionary only.
    """
    if len(model_dicts) == 0:
        raise ValueError('predict_shift needs at least one model dictionary')

    overall_data = pd.DataFrame(zip(X, expr_idxs, y_true),
                                columns=[X_label, expr_label, y_true_label])
    large_true_col = f'{y_true_label}{suffix_labels[1]}'
    large_pred_col = f'{y_pred_label}{suffix_labels[1]}'

    test_scores = []
    dfs = []

    for cpu_a, cpu_b, mem_a, mem_b in model_keys:
        curr_smaller_sku = (cpu_a, mem_a)
        curr_bigger_sku = (cpu_b, mem_b)
        curr_mk = (cpu_a, cpu_b, mem_a, mem_b)
        if sku_pair is not None and (
            curr_smaller_sku != sku_pair[0] or curr_bigger_sku != sku_pair[1]
        ):
            continue

        # Positional (0..n-1) indices are needed for the index-based merge below.
        curr_smaller = overall_data[overall_data[X_label] == curr_smaller_sku].reset_index(drop=True)
        curr_bigger = overall_data[overall_data[X_label] == curr_bigger_sku].reset_index(drop=True)
        if len(curr_smaller) != len(curr_bigger):
            # The averaged predictions (one per small row) are stored on the
            # large-SKU frame, which only works for equally sized groups.
            raise ValueError(
                f'SKU pair {curr_smaller_sku} -> {curr_bigger_sku} has '
                f'{len(curr_smaller)} small rows but {len(curr_bigger)} large rows'
            )

        # The feature matrices do not depend on the model, so build them once.
        small_x = np.array([list(sku) for sku in curr_smaller[X_label]])
        big_x = np.array([list(sku) for sku in curr_bigger[X_label]])

        new_preds = []
        for model_dict in model_dicts:
            model = model_dict[curr_mk]
            curr_smaller = curr_smaller.assign(**{y_pred_label: model.predict(small_x)})
            curr_bigger = curr_bigger.assign(**{y_pred_label: model.predict(big_x)})

            new_y_pred = []
            for _, smaller_row in curr_smaller.iterrows():
                # Multiplicative correction: how far off the model is on the small SKU.
                curr_diff = smaller_row[y_true_label] / smaller_row[y_pred_label]
                bigger_row = curr_bigger[curr_bigger[expr_label] == smaller_row[expr_label]]

                if bigger_row.shape[0] == 1:
                    bigger_y_pred = bigger_row.iloc[0][y_pred_label]
                else:
                    # No unique counterpart run: fall back to the mean prediction.
                    bigger_y_pred = curr_bigger[y_pred_label].mean()
                new_y_pred.append(bigger_y_pred * curr_diff)
            new_preds.append(new_y_pred)

        # Average the corrected predictions across the model dictionaries.
        curr_bigger = curr_bigger.assign(**{y_pred_label: np.mean(np.array(new_preds), axis=0)})
        df = pd.merge(curr_smaller, curr_bigger, left_index=True, right_index=True, suffixes=suffix_labels)
        print(df)
        dfs.append(df)

        score = nrmse_score(df[large_true_col].to_numpy(), df[large_pred_col].to_numpy())
        test_scores.append(score)

    print(test_scores)

    if plot:
        plot_prediction(dfs, X, y_true, expr_idxs, method)

    overall_score = np.mean(test_scores)
    return overall_score, dfs

## Regression

Fit one linear regression per SKU pair, using the SKU (CPU count and memory size) as features and throughput as the target.

In [25]:
from sklearn.linear_model import LinearRegression
# X_label = 'cpu_num'
# Column name of the throughput target in the per-workload training frames.
y_label = 'throughput'

In [26]:
def plot_regeressions_indi(models, datas, workload_name, group_type, groupping_id, plot=True):
    """Plot each fitted regression line over its training points and save the figure.

    Args:
        models: fitted models, aligned with ``datas``.
        datas: DataFrames with the X_label and y_label columns.
        workload_name: selects the scatter colour and is part of the file name.
        group_type, groupping_id: only used in the output file name.
        plot: not read anywhere in this function.

    Side effects:
        Writes figs/prediction/regr_pair_{workload_name}_{group_type}_{groupping_id}.pdf
        and shows the figure.

    NOTE(review): x_new has a single column, whereas build_regression_model_indi
    fits on two-column [cpu, mem] rows — model.predict(x_new) presumably fails
    with the current models; confirm before setting OVERALL_PLOT to True.
    """
    colors = sns.color_palette()
    if workload_name == 'tpcc':
        color = colors[0]
    elif workload_name == 'tpch':
        color = colors[1]
    elif workload_name == 'twitter':
        color = colors[2]
    else:
        color = colors[3]
    # From here on `colors` is the 8-entry palette used for the fitted lines.
    colors = ['#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02','#a6761d','#666666']
        
    fig, ax = plt.subplots(figsize=(4,3.6))

    x_ticks = []
    # NOTE(review): colors[i] raises IndexError once there are more than 8 models.
    for model, data, i in zip(models, datas, list(range(len(models)))):
        # [0] picks the first element (the CPU count) of the smallest / largest SKU tuple
        x_new = np.linspace(np.min(data[X_label])[0], np.max(data[X_label])[0], num=100).reshape(-1, 1)
        y_new = model.predict(x_new)
        x_ticks+=data[X_label].to_list()
        ax.scatter(data[X_label], data[y_label], color=color)
        ax.plot(x_new, y_new, color=colors[i], linestyle='--', label='{}-{}'.format(np.min(data[X_label])[0], np.max(data[X_label])[0]))
    
    ax.set_xlabel('Num CPU')
    ax.set_ylabel('throughput')
 
    ax.set_xticks(np.unique(x_ticks))
    ax.axis('tight')
    # plt.legend(loc='best', ncol=2)
    plt.legend(bbox_to_anchor=(1, 0, 1, 1), loc='upper left', ncol=1)#, mode="expand")
    plt.savefig(f'figs/prediction/regr_pair_{workload_name}_{group_type}_{groupping_id}.pdf', bbox_inches = 'tight')
    plt.show() 

In [27]:
def build_regression_model_indi(X, y, expr_idxs, wl_name, grouping_type, groupping_id, plot=True):
    """Fit and cross-validate one LinearRegression per SKU pair in ``model_keys``.

    Args:
        X: sequence of (cpu, mem) SKU tuples, one per observation.
        y: throughput per observation.
        expr_idxs: run id per observation; folds are formed over unique run ids.
        wl_name, grouping_type, groupping_id: forwarded to the plotting helper.
        plot: when true, plot_regeressions_indi is called with the fitted models.

    Returns:
        (overall_test, overall_train, model_dict, train_time):
        mean test NRMSE and mean train NRMSE over all folds of all SKU pairs,
        a dict mapping (cpu_a, cpu_b, mem_a, mem_b) to a fitted model, and the
        summed per-pair average time of one fold (fit, predict and scoring).

    Notes:
        The model stored for a SKU pair is the one fitted in the LAST fold,
        i.e. it has seen the training runs of that fold only, not all runs.
    """
    run_label = 'RUN_IDX'
    overall_data = pd.DataFrame(zip(X, y, expr_idxs), columns=[X_label, y_label, run_label])

    k = 5
    # Split over unique run ids rather than rows so that all rows of one run
    # end up on the same side of a fold.
    kf = KFold(n_splits=k, shuffle=True)
    uq_idx = np.unique(expr_idxs)

    train_scores, test_scores = [], []
    models, datas = [], []
    model_dict = {}
    train_time = 0

    for cpu_a, cpu_b, mem_a, mem_b in model_keys:
        curr_smaller_sku = (cpu_a, mem_a)
        curr_bigger_sku = (cpu_b, mem_b)
        curr_mk = (cpu_a, cpu_b, mem_a, mem_b)

        curr_data = overall_data[ (overall_data[X_label] == curr_smaller_sku) | (overall_data[X_label] == curr_bigger_sku) ]
        datas.append(curr_data)
        run_ids = curr_data[run_label]

        start = time.time()
        for train_run_idxs, test_run_idxs in kf.split(uq_idx):
            # Row masks: rows whose run id belongs to the train / test runs of this fold.
            train_df = curr_data[run_ids.isin(uq_idx[train_run_idxs])]
            test_df = curr_data[run_ids.isin(uq_idx[test_run_idxs])]

            reg = LinearRegression()
            train_x = np.array([list(sku) for sku in train_df[X_label]])
            test_x = np.array([list(sku) for sku in test_df[X_label]])
            reg.fit(train_x, train_df[y_label])
            test_pred = reg.predict(test_x)
            train_pred = reg.predict(train_x)

            test_scores.append(nrmse_score(test_df[y_label], test_pred))
            train_scores.append(nrmse_score(train_df[y_label], train_pred))
        end = time.time()
        train_time += (end - start) / k

        # Keep the last fold's model for this SKU pair (see Notes).
        models.append(reg)
        model_dict[curr_mk] = reg

    if plot:
        plot_regeressions_indi(models, datas, wl_name, grouping_type, groupping_id)
    overall_test = np.mean(test_scores)
    overall_train = np.mean(train_scores)

    return overall_test, overall_train, model_dict, train_time

In [28]:
# Fit the per-SKU-pair regression models for every (workload, terminal) group and record
# the scores and timings under all_results / all_times['Regression'].
# NOTE: the *_r2_mean variables and the 'rmse' print label actually hold NRMSE values
# (build_regression_model_indi scores with nrmse_score).
all_tests, all_trains = [], []
reg_group_to_model_dict = {} 
all_results['Regression'] = {}
all_times['Regression'] = {}
for ty, curr_data in data_by_type.items():
    # skip groups without any rows
    if len(curr_data.wl_names) == 0:
        continue    
    name = curr_data.wl_names[0]
    term = curr_data.terminal_num[0]
    if name not in all_results['Regression']:
        all_results['Regression'][name] = {}    
        all_times['Regression'][name] = {}

    print(f'Group Id: {ty}, wl name {name}, terminal num {term}')
    X = get_X(curr_data)
    y = np.array(curr_data.wl_throughput)
    test_r2_mean, train_r2_mean, model_dict, train_time = build_regression_model_indi(X, y, np.array(curr_data.sampled_run_idx), name, 'group', curr_data.wl_groups[0], plot=OVERALL_PLOT)
    # keyed by the same (workload, terminal) tuple as data_by_type; read by the prediction cells below
    reg_group_to_model_dict[ty] = model_dict 
    print('Test rmse         = {}, Train rmse         = {}'.format(test_r2_mean, train_r2_mean)) 
    all_tests.append(test_r2_mean)
    all_trains.append(train_r2_mean)
    all_results['Regression'][name][term] = test_r2_mean
    all_times['Regression'][name][term] = train_time
    print(all_tests, all_trains)
# NOTE(review): a single nan group (tpch in the recorded output) makes both overall means nan.
print('Overall test nrmse: {}; train {}'.format(np.mean(all_tests), np.mean(all_trains)))

Group Id: ('tpcc', 4), wl name tpcc, terminal num 4
Test rmse         = 0.2315881591888596, Train rmse         = 0.18004579161477238
[0.2315881591888596] [0.18004579161477238]
Group Id: ('tpcc', 8), wl name tpcc, terminal num 8
Test rmse         = 0.3515035639390938, Train rmse         = 0.23606330134803805
[0.2315881591888596, 0.3515035639390938] [0.18004579161477238, 0.23606330134803805]
Group Id: ('tpcc', 32), wl name tpcc, terminal num 32
Test rmse         = 0.15818811292662974, Train rmse         = 0.12875243120087837
[0.2315881591888596, 0.3515035639390938, 0.15818811292662974] [0.18004579161477238, 0.23606330134803805, 0.12875243120087837]
Group Id: ('tpch', 1), wl name tpch, terminal num 1
Test rmse         = nan, Train rmse         = nan
[0.2315881591888596, 0.3515035639390938, 0.15818811292662974, nan] [0.18004579161477238, 0.23606330134803805, 0.12875243120087837, nan]
Group Id: ('twitter', 4), wl name twitter, terminal num 4
Test rmse         = 0.14874680662303724, Train rm

In [29]:
# most similar expr idx
# NOTE(review): min_run_idx is assigned but not read in the cells shown here.
min_run_idx = 0
# Source groups whose regression models are averaged to predict the ycsb target.
min_groups = [('tpcc', 32), ('tpcc', 4)] # '1' '2' '5' '6']
# Despite the singular name, model_dict is a LIST of per-group model dictionaries.
model_dict = [reg_group_to_model_dict[min_group] for min_group in min_groups]

# sku_pair=None evaluates every SKU pair in model_keys.
overall_score, dfs = predict_shift(model_dict, ycsb_X, ycsb_y, ycsb_expr, 'regression', sku_pair=None, plot=False)

print('Overall ycsb nrmse: {}'.format(overall_score))

  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (2, 32)         62    261.387129    206.189775   (2, 64)         29   
1   (2, 32)         63    244.731470    206.189775   (2, 64)         32   
2   (2, 32)         64    256.235506    206.189775   (2, 64)         35   

   Y_TRUE_large  Y_PRED_large  
0   1505.604868    565.566016  
1   1330.865721    529.527996  
2   1343.223288    554.419397  
  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (2, 32)         62    261.387129    205.585348   (4, 32)         62   
1   (2, 32)         63    244.731470    205.585348   (4, 32)         63   
2   (2, 32)         64    256.235506    205.585348   (4, 32)         64   

   Y_TRUE_large  Y_PRED_large  
0    564.803961    289.531597  
1    595.285507    271.082565  
2    587.266889    283.825282  
  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (2, 32)         62    261.387129    205.391204   (4, 64)       

In [30]:
# most similar expr idx
# TPC-C regression models -> ycsb, restricted to the single from_sku -> to_sku pair.
min_run_idx = 0
min_groups = [('tpcc', 32), ('tpcc', 4)] # '1' '2' '5' '6']

model_dict = [reg_group_to_model_dict[min_group] for min_group in min_groups]
pair_score, dfs = predict_shift(model_dict, ycsb_X, ycsb_y, ycsb_expr, 
                              'reg_tpcc-pred-pair', sku_pair=(from_sku, to_sku), plot=False)
# dfs[0] is the only frame because a single SKU pair was requested; the printed error is a
# mean absolute percentage error (see get_mean_absolute_deviation).
print('pair ycsb error: {}'.format(get_mean_absolute_deviation(dfs[0])))

  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (4, 32)         62    564.803961    206.983333   (8, 64)         29   
1   (4, 32)         63    595.285507    206.983333   (8, 64)         32   
2   (4, 32)         64    587.266889    206.983333   (8, 64)         35   

   Y_TRUE_large  Y_PRED_large  
0   1450.774047   1059.975401  
1   1407.425248   1117.180540  
2   1384.077155   1102.131889  
[4.869588895217871]
pair ycsb error: 0.2264342400103593


In [31]:
# TPC-H regression models -> ycsb, restricted to the single from_sku -> to_sku pair.
tpch_g = [('tpch', 1)]
model_dict = [reg_group_to_model_dict[min_group] for min_group in tpch_g]

pair_score, dfs = predict_shift(model_dict, ycsb_X, ycsb_y, ycsb_expr, 
                              'reg_tpch-pred-pair', sku_pair=(from_sku, to_sku), plot=False)
print('pair ycsb error: {}'.format(get_mean_absolute_deviation(dfs[0])))

  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (4, 32)         62    564.803961           1.0   (8, 64)         29   
1   (4, 32)         63    595.285507           1.0   (8, 64)         32   
2   (4, 32)         64    587.266889           1.0   (8, 64)         35   

   Y_TRUE_large  Y_PRED_large  
0   1450.774047      3.444750  
1   1407.425248      3.630658  
2   1384.077155      3.581752  
[21.15258519287704]
pair ycsb error: 0.9974860351393415


In [32]:
# Twitter regression models -> ycsb, restricted to the single from_sku -> to_sku pair.
twitter_g = [('twitter', 4), ('twitter', 32)]
model_dict = [reg_group_to_model_dict[min_group] for min_group in twitter_g]

pair_score, dfs = predict_shift(model_dict, ycsb_X, ycsb_y, ycsb_expr, 
                              'reg_twitter-pred-pair', sku_pair=(from_sku, to_sku), plot=False)
print('pair ycsb error: {}'.format(get_mean_absolute_deviation(dfs[0])))

  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (4, 32)         62    564.803961   5418.081061   (8, 64)         29   
1   (4, 32)         63    595.285507   5418.081061   (8, 64)         32   
2   (4, 32)         64    587.266889   5418.081061   (8, 64)         35   

   Y_TRUE_large  Y_PRED_large  
0   1450.774047    596.902078  
1   1407.425248    629.115907  
2   1384.077155    620.641587  
[11.987364973399979]
pair ycsb error: 0.5643832780987621


## SVM

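Fit one grid-searched support vector regressor (SVR) per SKU pair, using the SKU (CPU count and memory size) as features and throughput as the target.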

In [33]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [34]:
def plot_svrs_indi(models, datas, workload_name, group_type, groupping_id, plot=True, show_fig=True):
    """Plot each fitted SVR curve over its training points and save the figure.

    Args:
        models: fitted models, aligned with ``datas``.
        datas: DataFrames with the X_label and y_label columns.
        workload_name: selects the scatter colour and is part of the file name.
        group_type, groupping_id: only used in the output file name.
        plot, show_fig: not read anywhere in this function.

    Side effects:
        Writes figs/prediction/svr_pair_{workload_name}_{group_type}_{groupping_id}.pdf
        and shows the figure.

    NOTE(review): x_new is reshaped to a single column, whereas
    build_svm_model_indi fits on two-column [cpu, mem] rows — model.predict(x_new)
    presumably fails with the current models; confirm before enabling plotting.
    """
    colors = sns.color_palette()
    if workload_name == 'tpcc':
        color = colors[0]
    elif workload_name == 'tpch':
        color = colors[1]
    elif workload_name == 'twitter':
        color = colors[2]
    else:
        color = colors[3]
    # From here on `colors` is the 8-entry palette used for the fitted curves.
    colors = ['#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02','#a6761d','#666666']
    fig, ax = plt.subplots(figsize=(4,3.6))
    
    x_ticks = []
    # zip stops at the shortest input, so at most 8 models (one per palette colour) are drawn.
    for model, data, line_color in zip(models, datas, colors):
        # NOTE(review): unlike the legend label below, the linspace bounds are not indexed
        # with [0]; with tuple-valued SKUs this looks inconsistent — confirm.
        x_new = np.linspace(np.min(data[X_label]), np.max(data[X_label]), num=100).reshape(-1, 1)
        y_new = model.predict(x_new)
        x_ticks += data[X_label].to_list()
        ax.scatter(data[X_label], data[y_label], color=color)
        # ax.plot(x_new, y_new, color=color, linestyle='--')
        ax.plot(x_new, y_new, color=line_color, linestyle='--', label='{}-{}'.format(np.min(data[X_label])[0], np.max(data[X_label])[0]))

    ax.set_xlabel('Num CPU')
    ax.set_ylabel('throughput')
    ax.set_xticks(np.unique(x_ticks))
    
    ax.axis('tight')
    plt.legend(bbox_to_anchor=(1, 0, 1, 1), loc='upper left', ncol=1)#, mode="expand")
    plt.savefig(f'figs/prediction/svr_pair_{workload_name}_{group_type}_{groupping_id}.pdf', bbox_inches = 'tight')
    plt.show() 

In [35]:
def build_svm_model_indi(X, y, wl_name, grouping_type, groupping_id, plot=True):
    """Grid-search one SVR per SKU pair in ``model_keys``.

    Args:
        X: sequence of (cpu, mem) SKU tuples, one per observation.
        y: throughput per observation.
        wl_name, grouping_type, groupping_id: forwarded to the plotting helper.
        plot: when true, plot_svrs_indi is called with the fitted searches.

    Returns:
        (overall_test, overall_train, model_dict, train_time):
        the mean cross-validated test and train scores of the best parameter
        set over all SKU pairs, a dict mapping (cpu_a, cpu_b, mem_a, mem_b) to
        the fitted GridSearchCV object, and the summed mean fit time of the
        best parameter sets.

    Notes:
        Scores come from ``score_func``, which is built with
        ``greater_is_better=False``; the values returned and printed here are
        therefore NEGATED NRMSE (closer to zero is better), unlike the positive
        NRMSE reported by build_regression_model_indi.
    """
    overall_data = pd.DataFrame(zip(X, y), columns=[X_label, y_label])

    # epsilon range: according to this paper: http://adrem.uantwerpen.be/bibrem/pubs/IJCNN2007.pdf
    # C range: https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
    c_range = np.logspace(-5, 5, base=2.0, num=5)   # 5 values from 2**-5 to 2**5
    epsilon_range = np.linspace(10 ** -3, 1, 5)     # 5 values from 1e-3 to 1
    degree_range = np.array(list(range(1, 5)))      # polynomial degrees 1..4
    tuned_parameters = [{'kernel': ['rbf'], 'C': c_range, 'epsilon': epsilon_range},
                        {'kernel': ['linear'], 'C': c_range, 'epsilon': epsilon_range},
                        {'kernel': ['poly'], 'C': c_range, 'degree': degree_range, 'epsilon': epsilon_range}
                       ]

    train_scores, test_scores = [], []
    models, datas = [], []
    model_dict = {}
    train_time = 0

    for cpu_a, cpu_b, mem_a, mem_b in model_keys:
        curr_smaller_sku = (cpu_a, mem_a)
        curr_bigger_sku = (cpu_b, mem_b)
        curr_mk = (cpu_a, cpu_b, mem_a, mem_b)

        curr_data = overall_data[ (overall_data[X_label] == curr_smaller_sku) | (overall_data[X_label] == curr_bigger_sku) ]
        datas.append(curr_data)

        clf = GridSearchCV(SVR(), param_grid=tuned_parameters, verbose=0, n_jobs=4,
                           cv=5,
                           scoring=score_func,
                           return_train_score=True)
        train_x = np.array([list(sku) for sku in curr_data[X_label]])
        clf.fit(train_x, curr_data[y_label])

        # best_index_ addresses the cv_results_ row of best_params_ directly; no
        # need to search the results table for a matching parameter dict.
        best = clf.best_index_
        cv_results = clf.cv_results_
        train_time += cv_results['mean_fit_time'][best]
        test_scores.append(cv_results['mean_test_score'][best])
        train_scores.append(cv_results['mean_train_score'][best])

        models.append(clf)
        model_dict[curr_mk] = clf
        print(f'NRMSE from [cpu={cpu_a},mem={mem_a}] to [cpu={cpu_b},mem={mem_b}] is {test_scores[-1]}')

    if plot:
        plot_svrs_indi(models, datas, wl_name, grouping_type, groupping_id)

    overall_test = np.mean(test_scores)
    overall_train = np.mean(train_scores)
    return overall_test, overall_train, model_dict, train_time

In [36]:
# Fit the per-SKU-pair SVR models for every (workload, terminal) group and record the
# scores and timings under all_results / all_times['SVM'].
# NOTE: the *_r2_mean variables and the 'rmse' print label hold the scorer output of
# build_svm_model_indi, i.e. negated NRMSE values.
all_tests, all_trains = [], []
svm_group_to_model_dict = {}
all_results['SVM'] = {}
all_times['SVM'] = {}

for ty, curr_data in data_by_type.items():
    # Skip groups without any rows, as the Regression loop does.
    if len(curr_data.wl_names) == 0:
        continue
    name = curr_data.wl_names[0]
    term = curr_data.terminal_num[0]
    if name not in all_results['SVM']:
        all_results['SVM'][name] = {}
        all_times['SVM'][name] = {}
    print(f'Group Id: {ty}, wl name {name}, terminal num {term}')
    X = get_X(curr_data)
    y = np.array(curr_data.wl_throughput)
    test_r2_mean, train_r2_mean, model_dict, train_time = build_svm_model_indi(X, y, 
                                                                               curr_data.wl_names[0], 'group', 
                                                                               curr_data.wl_groups[0], 
                                                                               plot=OVERALL_PLOT)
    svm_group_to_model_dict[ty] = model_dict
    print('Test rmse = {}, Train rmse = {}'.format(test_r2_mean, train_r2_mean))     
    all_tests.append(test_r2_mean)
    all_trains.append(train_r2_mean)
    all_results['SVM'][name][term] = test_r2_mean
    all_times['SVM'][name][term] = train_time

# The train figure must come from all_trains (it previously repeated the test mean).
print('Overall test nrmse: {}, train {}'.format(np.mean(all_tests), np.mean(all_trains)))

Group Id: ('tpcc', 4), wl name tpcc, terminal num 4
NRMSE from [cpu=2,mem=32] to [cpu=2,mem=64] is -0.08383573393087697
NRMSE from [cpu=2,mem=32] to [cpu=4,mem=32] is -0.33530633233657037
NRMSE from [cpu=2,mem=32] to [cpu=4,mem=64] is -0.10565202194437687
NRMSE from [cpu=2,mem=64] to [cpu=4,mem=64] is -0.2872651393140071
NRMSE from [cpu=2,mem=32] to [cpu=8,mem=32] is -0.35133090409606693
NRMSE from [cpu=2,mem=32] to [cpu=8,mem=64] is -0.12129433726170122
NRMSE from [cpu=2,mem=64] to [cpu=8,mem=64] is -0.2780938108586297
NRMSE from [cpu=4,mem=32] to [cpu=4,mem=64] is -0.09437875339913615
NRMSE from [cpu=4,mem=32] to [cpu=8,mem=32] is -0.34511769337121856
NRMSE from [cpu=4,mem=32] to [cpu=8,mem=64] is -0.1178083076742003
NRMSE from [cpu=4,mem=64] to [cpu=8,mem=64] is -0.30373993215343276
NRMSE from [cpu=8,mem=32] to [cpu=8,mem=64] is -0.11853088474911537
Test rmse = -0.2118628209241111, Train rmse = -0.18697919445620692
Group Id: ('tpcc', 8), wl name tpcc, terminal num 8
NRMSE from [cpu=



NRMSE from [cpu=2,mem=32] to [cpu=4,mem=32] is nan
NRMSE from [cpu=2,mem=32] to [cpu=4,mem=64] is -0.0027652838177890603
NRMSE from [cpu=2,mem=64] to [cpu=4,mem=64] is -0.2372939807225952




NRMSE from [cpu=2,mem=32] to [cpu=8,mem=32] is nan
NRMSE from [cpu=2,mem=32] to [cpu=8,mem=64] is -0.0026256492105736578
NRMSE from [cpu=2,mem=64] to [cpu=8,mem=64] is -0.2456425087780995
NRMSE from [cpu=4,mem=32] to [cpu=4,mem=64] is -0.002760782520718861




NRMSE from [cpu=4,mem=32] to [cpu=8,mem=32] is nan
NRMSE from [cpu=4,mem=32] to [cpu=8,mem=64] is -0.00262565277818263
NRMSE from [cpu=4,mem=64] to [cpu=8,mem=64] is -0.23975697892769454
NRMSE from [cpu=8,mem=32] to [cpu=8,mem=64] is -0.0026256505802056173
Test rmse = nan, Train rmse = nan
Group Id: ('twitter', 4), wl name twitter, terminal num 4
NRMSE from [cpu=2,mem=32] to [cpu=2,mem=64] is -0.07142175699281025
NRMSE from [cpu=2,mem=32] to [cpu=4,mem=32] is -0.06288251348947857
NRMSE from [cpu=2,mem=32] to [cpu=4,mem=64] is -0.06255296529798914
NRMSE from [cpu=2,mem=64] to [cpu=4,mem=64] is -0.3353652577729823
NRMSE from [cpu=2,mem=32] to [cpu=8,mem=32] is -0.07684993135453994
NRMSE from [cpu=2,mem=32] to [cpu=8,mem=64] is -0.035744114211068975
NRMSE from [cpu=2,mem=64] to [cpu=8,mem=64] is -0.2905606846847427
NRMSE from [cpu=4,mem=32] to [cpu=4,mem=64] is -0.07334400451724322
NRMSE from [cpu=4,mem=32] to [cpu=8,mem=32] is -0.38234219109103545
NRMSE from [cpu=4,mem=32] to [cpu=8,mem=

In [37]:
# SVR counterpart of the all-pairs regression prediction.
# Relies on `min_groups` as assigned in the Regression section; it is only re-declared in
# the next cell, so this cell fails on a fresh kernel if those cells are skipped.
model_dict = [svm_group_to_model_dict[min_group] for min_group in min_groups]
overall_score, dfs = predict_shift(model_dict, ycsb_X, ycsb_y, ycsb_expr, 'svm', sku_pair=None, plot=False)
print('Overall ycsb nrmse: {}'.format(overall_score))

  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (2, 32)         62    261.387129    207.598885   (2, 64)         29   
1   (2, 32)         63    244.731470    207.598885   (2, 64)         32   
2   (2, 32)         64    256.235506    207.598885   (2, 64)         35   

   Y_TRUE_large  Y_PRED_large  
0   1505.604868    553.762151  
1   1330.865721    518.476276  
2   1343.223288    542.848172  
  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (2, 32)         62    261.387129    207.516667   (4, 32)         62   
1   (2, 32)         63    244.731470    207.516667   (4, 32)         63   
2   (2, 32)         64    256.235506    207.516667   (4, 32)         64   

   Y_TRUE_large  Y_PRED_large  
0    564.803961    280.112476  
1    595.285507    262.263633  
2    587.266889    274.591800  
  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (2, 32)         62    261.387129    207.534819   (4, 64)       

In [38]:
# most similar expr idx
# TPC-C SVR models -> ycsb, restricted to the single from_sku -> to_sku pair.
min_run_idx = 0
min_groups = [('tpcc', 32), ('tpcc', 4)] # '1' '2' '5' '6']

model_dict = [svm_group_to_model_dict[min_group] for min_group in min_groups]

# overall_score = predict_shift(model_dict, ycsb_X, ycsb_y, ycsb_expr, 'svm', sku_pair=None, plot=False)

# print('Overall ycsb nrmse: {}'.format(overall_score))

pair_score, dfs = predict_shift(model_dict, ycsb_X, ycsb_y, ycsb_expr, 
                              'svm-pred-pair', sku_pair=(from_sku, to_sku), plot=False)
# dfs[0] is the only frame because a single SKU pair was requested.
print('pair ycsb error: {}'.format(get_mean_absolute_deviation(dfs[0])))

  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (4, 32)         62    564.803961    207.888643   (8, 64)         29   
1   (4, 32)         63    595.285507    207.888643   (8, 64)         32   
2   (4, 32)         64    587.266889    207.888643   (8, 64)         35   

   Y_TRUE_large  Y_PRED_large  
0   1450.774047   1088.405365  
1   1407.425248   1147.144823  
2   1384.077155   1131.692547  
[4.437202303361239]
pair ycsb error: 0.2056861672614608


In [39]:
# Sanity check: list the (workload, terminal) groups that have fitted SVR model dictionaries.
print(svm_group_to_model_dict.keys())

dict_keys([('tpcc', 4), ('tpcc', 8), ('tpcc', 32), ('tpch', 1), ('twitter', 4), ('twitter', 8), ('twitter', 32)])


In [40]:
# TPC-H SVR models -> ycsb, restricted to the single from_sku -> to_sku pair.
tpch_g = [('tpch', 1)]
model_dict = [svm_group_to_model_dict[min_group] for min_group in tpch_g]

# Workload-specific method tag (mirrors 'reg_tpch-pred-pair'): the tag becomes part of the
# figure file name in plot_prediction, so sharing 'svm-pred-pair' with the TPC-C cell would
# overwrite that cell's figure once plotting is enabled.
pair_score, dfs = predict_shift(model_dict, ycsb_X, ycsb_y, ycsb_expr, 
                              'svm_tpch-pred-pair', sku_pair=(from_sku, to_sku), plot=False)
print('pair ycsb error: {}'.format(get_mean_absolute_deviation(dfs[0])))

  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (4, 32)         62    564.803961         0.999   (8, 64)         29   
1   (4, 32)         63    595.285507         0.999   (8, 64)         32   
2   (4, 32)         64    587.266889         0.999   (8, 64)         35   

   Y_TRUE_large  Y_PRED_large  
0   1450.774047      3.377252  
1   1407.425248      3.559516  
2   1384.077155      3.511569  
[21.153628262039824]
pair ycsb error: 0.997535295380501


In [41]:
# Twitter SVR models -> ycsb, restricted to the single from_sku -> to_sku pair.
twitter_g = [('twitter', 4), ('twitter', 32)]
model_dict = [svm_group_to_model_dict[min_group] for min_group in twitter_g]

# Workload-specific method tag (mirrors 'reg_twitter-pred-pair'): the tag becomes part of
# the figure file name in plot_prediction, so sharing 'svm-pred-pair' with the TPC-C cell
# would overwrite that cell's figure once plotting is enabled.
pair_score, dfs = predict_shift(model_dict, ycsb_X, ycsb_y, ycsb_expr, 
                              'svm_twitter-pred-pair', sku_pair=(from_sku, to_sku), plot=False)
print('pair ycsb error: {}'.format(get_mean_absolute_deviation(dfs[0])))

  SKU_small EXPR_small  Y_TRUE_small  Y_PRED_small SKU_large EXPR_large  \
0   (4, 32)         62    564.803961   5417.063639   (8, 64)         29   
1   (4, 32)         63    595.285507   5417.063639   (8, 64)         32   
2   (4, 32)         64    587.266889   5417.063639   (8, 64)         35   

   Y_TRUE_large  Y_PRED_large  
0   1450.774047    598.424159  
1   1407.425248    630.720133  
2   1384.077155    622.224203  
[11.963883681607749]
pair ycsb error: 0.563272468951804
