In [1]:
# Copy for plotting

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from itertools import compress
import time

from sklearn.linear_model import lasso_path, enet_path, LogisticRegression, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR, LinearSVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics.pairwise import rbf_kernel, polynomial_kernel
from sklearn.feature_selection import VarianceThreshold, SelectKBest, RFE, SequentialFeatureSelector, SelectFromModel
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif, r_regression
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import TruncatedSVD

import scipy.stats as ss

from helpers import expr_data
from helpers import scale_data
from helpers import similarity

import warnings
warnings.filterwarnings("ignore")

In [2]:
SMALL_SIZE = 15
MEDIUM_SIZE = 18
BIGGER_SIZE = 22
SMALL_SMALL_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
# plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize

plt.rc('legend', fontsize=SMALL_SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [3]:
# import random
# np.random.seed(47906)
# random.seed(15231)

In [4]:
import random
np.random.seed(42)
random.seed(42)

In [5]:
SKU='cpu4'

## Load all data

In [6]:
data = expr_data.ExprData()
data.load_pickle()
# data = data.remove_by_wlname(['xml'])
data = data.fix_tpch()
# data = data.sample_data()

#### Split by SKU

In [7]:
data_by_sku = data.split_by_sku()
curr_data = data_by_sku[SKU]

In [8]:
print(list(zip(curr_data.terminal_num, curr_data.wl_names)))

[(32, 'tpcc'), (32, 'tpcc'), (32, 'tpcc'), (8, 'tpcc'), (8, 'tpcc'), (8, 'tpcc'), (4, 'tpcc'), (4, 'tpcc'), (4, 'tpcc'), (1, 'tpch'), (1, 'tpch'), (1, 'tpch'), (1, 'tpch'), (1, 'tpch'), (1, 'tpch'), (1, 'tpch'), (1, 'tpch'), (1, 'tpch'), (32, 'twitter'), (32, 'twitter'), (32, 'twitter'), (8, 'twitter'), (8, 'twitter'), (8, 'twitter'), (4, 'twitter'), (4, 'twitter'), (4, 'twitter'), (32, 'ycsb'), (4, 'ycsb'), (8, 'ycsb'), (32, 'ycsb'), (4, 'ycsb'), (8, 'ycsb'), (32, 'ycsb'), (4, 'ycsb'), (8, 'ycsb')]


#### Calculate Distance

In [9]:
# the result sku_result is a dict with its key the SKU,
# the value a list, the classification accuracy for each f_num
data_dist = {}

for sku in data_by_sku.keys():
    curr_data = data_by_sku[sku]
    if 'ter' in sku:
        continue
    print(f'cpu_num={sku}')
    scaler = scale_data.ScaleData()
    plan_mtxs_splitted, plan_col_ranges = scaler.scale(curr_data.plan_mtxs)
    perf_mtxs_splitted, perf_col_ranges = scaler.scale(curr_data.perf_mtxs)
    simi_calc = similarity.Similarity(curr_data, plan_mtxs_splitted, plan_col_ranges, perf_mtxs_splitted, perf_col_ranges, num_bins=10)
    # simi_calc.num_bins=100
    # simi_calc.num_bins=50

    simi_calc.calc_bined_mtx() # all features
    simi_calc.calc_dist_simi_matrix(normalize=True)
    print(simi_calc.simi_mtx.shape)
    # feature wise distance
    simi_calc.calc_featurewise_dist_by_col()
    print(simi_calc.simi_col_mtx.shape)
    # calc_col_dist_simi_matrix(normalize=False) # all features
    
    data_dist[sku] = simi_calc

cpu_num=cpu16
(36, 36)
(36, 36, 29)
(36, 36, 29)
cpu_num=cpu2
(36, 36)
(36, 36, 29)
(36, 36, 29)
cpu_num=cpu4
(36, 36)
(36, 36, 29)
(36, 36, 29)
cpu_num=cpu8
(36, 36)
(36, 36, 29)
(36, 36, 29)


## Select Top K Features

In [10]:
# return non-zero index in descending order
def sparse_argsort(arr):
    arr = np.where(np.isnan(arr), 0, arr)
    arr = arr * -1
    indices = np.nonzero(arr)[0]
    result = indices[np.argsort(arr[indices])]
    return result

def all_argsort(arr):
    arr = np.where(np.isnan(arr), 0, arr)
    arr = arr * -1
    result = np.argsort(arr)
    return result

In [11]:
def rand_forest_orders(X, y):
    selector = RandomForestClassifier(random_state=42, 
                                      n_jobs=-1, 
                                      n_estimators=100, 
                                      max_depth=5)
    selector.fit(X,y)
    scores = np.abs(selector.feature_importances_)
    return sparse_argsort(scores)

### Feature select main function

In [12]:
def get_top_features(curr_data, expr_num, simi_calc, method, note=None):

    # create dict for all features
    num_features = len(curr_data.feature_cols)
    feature_importance = np.array([0]*num_features)
    
    for i in range(expr_num):
        # calculate label
        curr_name = curr_data.wl_names[i]
        y = [curr_name == name for name in curr_data.wl_names]
        # X = simi_calc.dist_by_col_cube[i]
        X = simi_calc.simi_col_mtx[i]
        
        mask = np.ones(X.shape[0], dtype=bool)  
        X = X[mask]#.reshape(-1, 1)
                
        orders = rand_forest_orders(X, y)

        for idx in range(len(orders)):
            # from 0 to last idx of orders
            # the score = num_features - idx
            #   for a entry with feature_idx important order idx idx
            # the higher the order, the more the score
            feature_importance[orders[idx]] += num_features-idx
    final_orders = all_argsort(feature_importance)
    top_features = [curr_data.feature_cols[j] for j in final_orders]
    return top_features

## Compare Feature Selection with Similarity Calculation

#### Experiment Setup

In [13]:
main_dict = {}
time_dict = {}

In [14]:
all_features = data_by_sku[list(data_by_sku.keys())[0]].feature_cols
feature_num = len(all_features)

knn_thresholds = [1, 2, 3]
direct_methods = ['RandomForest']
simi_method = 'KNN'

f_nums = [1, 3, 7, 15, feature_num]

In [15]:
for knn_threshold in knn_thresholds:
    print(knn_threshold)
    
    if knn_threshold not in main_dict:
        main_dict[knn_threshold] = {}
        time_dict[knn_threshold] = {}
    for fs_method in direct_methods:
        print(fs_method)
        curr_method = {}

        for f_num in f_nums:
            curr_method[f_num] = []
        elapsed = []
        for sku in data_by_sku.keys():    
            if 'ter' in sku:
                continue
            curr_data = data_by_sku[sku]
            curr_calc = data_dist[sku]
            expr_num = curr_data.get_num_exprs()
        
            all_accs = []
            # run 10 times to get the average
            num_repeats = 1
            for i in range(num_repeats):       
                curr_accs = []
                start_time = time.time()
                top_features = get_top_features(curr_data, expr_num, curr_calc, fs_method, None)
                f_features = [top_features[:n] for n in f_nums]
                elapsed.append(time.time() - start_time)

                for f_num, curr_f in zip(f_nums, f_features):
                    curr_calc.calc_dist_simi_matrix(feature_names=curr_f)
                    pen, pens = curr_calc.simi_penalty(n=knn_threshold, dependent=True)

                    acc = 1 - (np.sum(pens)/(len(pens)*10))
                    curr_accs.append(acc)
                all_accs.append(curr_accs)
            all_accs = np.average(np.array(all_accs), axis=0)
            for f_num, acc in zip(f_nums, all_accs):
                curr_method[f_num].append(acc)
        main_dict[knn_threshold][fs_method] = curr_method
        time_dict[knn_threshold][fs_method] = np.mean(elapsed)
        print(np.mean(elapsed))

1
RandomForest
8.980920195579529
2
RandomForest
9.456942200660706
3
RandomForest
9.626041829586029


In [30]:
def pretty_print_table(k=3, sku=None):
    name_trans_dict = {
        'Forward_SFS_Linear': 'Fw SFS Linear',
        'Backward_SFS_Linear': 'Bw SFS Linear',
        'Forward_SFS_DecisionTree': 'Fw SFS DecTree',
        'Backward_SFS_DecisionTree': 'Bw SFS DecTree',
        'Forward_SFS_LogisticRegression': 'Fw SFS LogReg',
        'Backward_SFS_LogisticRegression': 'Bw SFS LogReg',
        'MutualInfoGain': 'MIGain',
        'RFE_Linear': 'RFE Linear',
        'RFE_DecisionTree': 'RFE DecTree',
        'RFE_LogisticRegression': 'RFE LogReg',
        'RandomForest': 'RandForest'
    }
    
    sku_trans_dict = {
        'cpu2': 0,
        'cpu4': 1,
        'cpu8': 2,
        'cpu16': 3,
    }
    
    for method, subval in main_dict[k].items():
        outstr = '\\textcolor{}{'
        print_name = method if method not in name_trans_dict else name_trans_dict[method]
        outstr += print_name
        outstr += '} & '
        for fnum, subsubval in subval.items():
            if fnum == 29:
                continue
            outstr += '\hlrfive{'
            if sku is None:
                outstr += f'{np.mean(subsubval):.3f}'
            else:
                outstr += f'{subsubval[sku_trans_dict[sku]]:.3f}'
            outstr += '} & '
        if method == 'Variance':
            if sku is None:
                all_acc = np.mean(subval[29])
            else:
                all_acc = subval[29][sku_trans_dict[sku]]
            outstr += '\multirow{17}{*}{'
            outstr += '{:.3f}'.format(all_acc)
            outstr += '}'
        outstr += ' & \hlrfive{'        
        outstr += f'{time_dict[k][method]:.3f}'
        outstr += '} \\\\'
        print(outstr)

In [22]:
def calc_mtx_dist(curr_data, curr_calc, f_num, method):
    accs = []
    expr_num = curr_data.get_num_exprs()
    for expr in range(expr_num):
        # calculate label
        curr_name = curr_data.wl_names[expr]
        y = [curr_name == name for name in curr_data.wl_names]
        # X = simi_calc.dist_by_col_cube[i]
        X = simi_calc.simi_col_mtx[expr]
        mask = np.ones(X.shape[0], dtype=bool)
        if method == 'PCA':
            X = X[mask]#.reshape(-1, 1)
            model = PCA(n_components=f_num)
            model.fit(X)
        elif method == 'LDA':
            model = LinearDiscriminantAnalysis(n_components=f_num)
            model.fit(X, y)
        elif method == 'SVD':
            model = TruncatedSVD(n_components=f_num, random_state=42)
            model.fit(X)
            
        simi_mtx = np.zeros((expr_num, expr_num))
        
        # calculate distance
        for i in range(expr_num): # for each experiment with idx i
            for j in range(i+1): # for each experiment idx <- i
                if i == j:
                    continue
                else:
                    ndarri = model.transform(simi_calc.bined[i])
                    ndarrj = model.transform(simi_calc.bined[j])
                    simi_mtx[i][j] = np.linalg.norm(np.absolute(ndarri - ndarrj), ord='nuc')/expr_num

        for i in range(expr_num):
            for j in range(expr_num):
                if i < j:
                    simi_mtx[i][j] = simi_mtx[j][i]
        pen, pens = curr_calc.simi_penalty(n=knn_threshold, simi=simi_mtx)

        acc = 1 - (np.sum(pens)/(len(pens)*10))
        accs.append(acc)
    return accs

In [24]:
for knn_threshold in knn_thresholds:
    print(knn_threshold)
    
    if knn_threshold not in main_dict:
        main_dict[knn_threshold] = {}
        time_dict[knn_threshold] = {}
    for fs_method in ['PCA', 'SVD']:
        print(fs_method)
        curr_method = {}

        for f_num in f_nums:
            curr_method[f_num] = []
        elapsed = []
        for sku in data_by_sku.keys():    
            if 'ter' in sku:
                continue
            print(sku)
            curr_data = data_by_sku[sku]
            curr_calc = data_dist[sku]
            expr_num = curr_data.get_num_exprs()
        
            all_accs = []
            # run 10 times to get the average
            num_repeats = 1
            for i in range(num_repeats):       
                curr_accs = []
                
                for f_num in f_nums:
                    if f_num > 15:
                        curr_accs.append(0)
                        continue
                    start_time = time.time()
                    accs = calc_mtx_dist(curr_data, curr_calc, int(f_num), fs_method)
                    elapsed.append(time.time() - start_time)

                    curr_accs.append(np.mean(accs))
                all_accs.append(curr_accs)
            all_accs = np.average(np.array(all_accs), axis=0)
            for f_num, acc in zip(f_nums, all_accs):
                curr_method[f_num].append(acc)
        main_dict[knn_threshold][fs_method] = curr_method
        time_dict[knn_threshold][fs_method] = np.mean(elapsed)
        print(np.mean(elapsed))

1
PCA
cpu16
cpu2
cpu4
cpu8
11.218974813818932
SVD
cpu16
cpu2
cpu4
cpu8
11.7953981757164
2
PCA
cpu16
cpu2
cpu4
cpu8
9.216870456933975
SVD
cpu16
cpu2
cpu4
cpu8
86.3998896330595
3
PCA
cpu16
cpu2
cpu4
cpu8
10.632197946310043
SVD
cpu16
cpu2
cpu4
cpu8
10.247445732355118


In [31]:
for k in [1,2,3]:
    print(k, "-----overall-----")
    pretty_print_table(k=k)
    for sku in [f'cpu{num}' for num in [2, 4, 8, 16]]:
        print(f"-----{sku}-----")
        pretty_print_table(k=k, sku=sku)

1 -----overall-----
\textcolor{}{RandForest} & \hlrfive{0.979} & \hlrfive{0.990} & \hlrfive{0.993} & \hlrfive{0.992} &  & \hlrfive{8.981} \\
\textcolor{}{PCA} & \hlrfive{0.981} & \hlrfive{0.997} & \hlrfive{0.997} & \hlrfive{0.997} &  & \hlrfive{11.219} \\
\textcolor{}{SVD} & \hlrfive{0.997} & \hlrfive{0.997} & \hlrfive{0.997} & \hlrfive{0.997} &  & \hlrfive{11.795} \\
-----cpu2-----
\textcolor{}{RandForest} & \hlrfive{0.981} & \hlrfive{0.972} & \hlrfive{0.989} & \hlrfive{0.989} &  & \hlrfive{8.981} \\
\textcolor{}{PCA} & \hlrfive{0.981} & \hlrfive{0.997} & \hlrfive{0.997} & \hlrfive{0.997} &  & \hlrfive{11.219} \\
\textcolor{}{SVD} & \hlrfive{0.997} & \hlrfive{0.997} & \hlrfive{0.997} & \hlrfive{0.997} &  & \hlrfive{11.795} \\
-----cpu4-----
\textcolor{}{RandForest} & \hlrfive{0.994} & \hlrfive{0.994} & \hlrfive{0.994} & \hlrfive{0.994} &  & \hlrfive{8.981} \\
\textcolor{}{PCA} & \hlrfive{0.981} & \hlrfive{0.997} & \hlrfive{0.997} & \hlrfive{0.997} &  & \hlrfive{11.219} \\
\textcolor{}