In [1]:
# Copy for testing

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from itertools import compress
import time

from sklearn.linear_model import lasso_path, enet_path, LogisticRegression, ElasticNet, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR, LinearSVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics.pairwise import rbf_kernel, polynomial_kernel
from sklearn.feature_selection import VarianceThreshold, SelectKBest, RFE, SequentialFeatureSelector, SelectFromModel
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif, r_regression

import scipy.stats as ss

from helpers import expr_data
from helpers import scale_data
from helpers import similarity

import warnings
warnings.filterwarnings("ignore")

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
# Font sizes shared by every figure in this notebook, smallest first.
SMALL_SMALL_SIZE, SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 10, 15, 18, 22

# Default text size.
plt.rc('font', size=SMALL_SIZE)
# Axes: title size and x/y label size, set in a single call.
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)
# Tick labels on both axes.
plt.rc('xtick', labelsize=SMALL_SIZE)
plt.rc('ytick', labelsize=SMALL_SIZE)
# Legend entries and the figure-level title.
plt.rc('legend', fontsize=SMALL_SMALL_SIZE)
plt.rc('figure', titlesize=BIGGER_SIZE)

In [4]:
import random

# One seed shared by every random number generator seeded in this notebook,
# so changing it here changes both generators consistently.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

## Load all data

In [6]:
# Create the project's experiment-data container and load its contents.
# load_pickle() is called without arguments, so the pickle location is decided
# inside helpers.expr_data (not visible here).
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
data = expr_data.ExprData()
data.load_pickle()
# `data` is rebound to whatever fix_tpch() returns; all later cells use that object.
# NOTE(review): presumably a correction of the TPC-H entries - confirm in helpers.expr_data.
data = data.fix_tpch()

#### Split by SKU

In [8]:
# Partition the loaded data per SKU. Later cells treat the result as a mapping:
# they iterate over .keys() and index it with the SKU key to get that SKU's data.
data_by_sku = data.split_by_sku()

#### Calculate Distance

In [None]:
# data_dist maps each SKU key to the Similarity calculator built for that SKU.
# Later cells look the calculator up by SKU and recompute distances on it for
# different feature subsets.
data_dist = {}

for sku in data_by_sku.keys():
    curr_data = data_by_sku[sku]
    # SKUs whose key contains 'ter' are left out of the analysis.
    if 'ter' in sku:
        continue
    print(f'cpu_num={sku}')

    # Scale the plan and perf matrices with the same ScaleData instance.
    scaler = scale_data.ScaleData()
    plan_mtxs_splitted, plan_col_ranges = scaler.scale(curr_data.plan_mtxs)
    perf_mtxs_splitted, perf_col_ranges = scaler.scale(curr_data.perf_mtxs)

    simi_calc = similarity.Similarity(
        curr_data,
        plan_mtxs_splitted, plan_col_ranges,
        perf_mtxs_splitted, perf_col_ranges,
        num_bins=10,
    )

    # Distance over all features.
    simi_calc.calc_bined_mtx()
    simi_calc.calc_dist_simi_matrix(normalize=True)
    print(simi_calc.simi_mtx.shape)

    # Feature-wise distance.
    simi_calc.calc_featurewise_dist_by_col()
    print(simi_calc.simi_col_mtx.shape)

    data_dist[sku] = simi_calc

## Select Top K Features

In [11]:
# Return the indices of the non-zero entries, ordered by descending value
def sparse_argsort(arr):
    """Return the indices of the non-zero entries of ``arr``, largest value first.

    NaN entries are replaced by zero before ranking, so they are dropped
    together with the genuine zeros.
    """
    negated = np.where(np.isnan(arr), 0, arr) * -1
    nonzero_idx = np.nonzero(negated)[0]
    # Sorting the negated values ascending ranks the originals descending.
    ranking = np.argsort(negated[nonzero_idx])
    return nonzero_idx[ranking]

def all_argsort(arr):
    """Return every index of ``arr`` ordered by descending value.

    NaN entries are treated as zero. Unlike ``sparse_argsort``, zero entries
    are kept in the result.
    """
    cleaned = np.where(np.isnan(arr), 0, arr)
    # Ascending argsort of the negated values == descending order of the originals.
    return np.argsort(cleaned * -1)

### Filter Based

#### Variance threshold

In [12]:
def variance_threshold(X, y):
    """Rank the columns of ``X`` by the variance computed by ``VarianceThreshold``.

    ``y`` is not used; it is accepted so this ranker has the same signature as
    the supervised ones. Returns the indices of the non-zero-variance features,
    highest variance first.
    """
    fitted = VarianceThreshold().fit(X)
    return sparse_argsort(fitted.variances_)

#### fANOVA, Chi-Squared test, Mutual Information gain, Pearson correlation, Fisher score

In [13]:
# Univariate filters: fANOVA, Chi-Squared test, Mutual Information gain, Pearson correlation
def select_k_best(X, y, method):
    """Rank features with a univariate score computed through ``SelectKBest``.

    Parameters
    ----------
    X, y : feature matrix and labels, passed unchanged to ``SelectKBest.fit``.
    method : one of ``'fANOVA'``, ``'Chi2'``, ``'MutualInfoGain'``, ``'Pearson'``.

    Returns
    -------
    Indices of the features with a non-zero (and non-NaN) absolute score,
    highest score first.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. Previously this case
        fell through and failed later on an unbound local variable.
    """
    if method == 'fANOVA':
        score_func = f_classif
    elif method == 'Chi2':
        score_func = chi2
    elif method == 'MutualInfoGain':
        score_func = mutual_info_classif
    elif method == 'Pearson':
        # https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.r_regression.html
        score_func = r_regression
    else:
        raise ValueError(
            f"Unknown method {method!r}; expected one of "
            "'fANOVA', 'Chi2', 'MutualInfoGain', 'Pearson'"
        )

    # k='all' keeps every feature: the selector is only used to obtain scores.
    selector = SelectKBest(score_func, k='all')
    selector.fit(X, y)
    # Rank by magnitude so strongly negative scores (e.g. Pearson r) count as important.
    scores = np.absolute(selector.scores_)
    return sparse_argsort(scores)

In [14]:
# Not used: this method ran into a divide-by-zero error
def fisher_orders(X, y):
    """Rank features by Fisher score, most important first.

    NOTE(review): ``fisher_score`` is not imported in the visible part of this
    notebook, so calling this function may raise ``NameError`` - confirm the
    import before use.
    """
    # The returned ranks use 1 for the most important feature; negating them
    # lets sparse_argsort (descending order) put rank 1 first.
    return sparse_argsort(-1 * fisher_score.fisher_score(X, y, mode='rank'))

### Wrapper method

#### RFE (Recursive feature elimination)

In [15]:
def get_est(est_name):
    """Build the base estimator used by the wrapper feature-selection methods.

    ``'DecisionTree'`` and ``'LogisticRegression'`` select the matching
    classifier; any other name (including ``'Linear'``) yields a linear-kernel
    ``SVR``.
    """
    if est_name == 'DecisionTree':
        return DecisionTreeClassifier(criterion='entropy', max_depth=None)
    if est_name == 'LogisticRegression':
        return LogisticRegression(n_jobs=-1, C=0.01)
    # Fallback for 'Linear' and every unrecognised name.
    return SVR(kernel="linear", C=0.05)

In [16]:
# To plug in a custom base estimator, see: https://stackoverflow.com/questions/51679173/using-sklearn-rfe-with-an-estimator-from-another-package
def rfe_orders(X, y, est_name):
    """Rank features with recursive feature elimination (RFE).

    The base estimator is chosen by ``get_est(est_name)``. Features are removed
    one at a time until a single one is left, which gives every feature its
    own rank. Returns feature indices, most important first.
    """
    eliminator = RFE(get_est(est_name), n_features_to_select=1, step=1).fit(X, y)
    # ranking_ uses 1 for the most important feature; negate it so that
    # sparse_argsort (descending order) returns rank 1 first.
    return sparse_argsort(-1 * eliminator.ranking_)

#### SFS (Sequential Feature Selection)

In [17]:
def sfs_orders(curr_data, direction, est_name, n, simi_calc=None):
    """Pick the ``n`` features chosen most often by sequential feature selection.

    For every experiment ``i`` a one-vs-rest label vector is built (True where
    the workload name equals that of experiment ``i``) and a
    ``SequentialFeatureSelector`` is fitted on ``simi_calc.simi_col_mtx[i]``.
    Each feature is counted once per experiment in which it is selected.

    Parameters
    ----------
    curr_data : object providing ``feature_cols``, ``wl_names`` and
        ``get_num_exprs()``.
    direction : ``'Forward'`` or ``'Backward'`` (lower-cased before use).
    est_name : estimator name understood by ``get_est``.
    n : number of features to select per experiment and to return.
    simi_calc : similarity calculator belonging to ``curr_data``. Pass it
        explicitly. When omitted, the notebook-level variable ``simi_calc`` is
        used as before; that variable is whatever the distance-calculation
        loop processed last, which need not match ``curr_data``.

    Returns
    -------
    list of feature names, most frequently selected first (at most ``n``).

    Raises
    ------
    ValueError
        If no calculator is passed and no notebook-level ``simi_calc`` exists.
    """
    if simi_calc is None:
        # Legacy behaviour: fall back to the notebook-global calculator.
        simi_calc = globals().get('simi_calc')
        if simi_calc is None:
            raise ValueError(
                "sfs_orders needs a similarity calculator: pass simi_calc=..."
            )

    num_features = len(curr_data.feature_cols)
    feature_importance = np.zeros(num_features, dtype=int)
    expr_num = curr_data.get_num_exprs()

    for i in range(expr_num):
        # One-vs-rest label: experiments sharing experiment i's workload name.
        curr_name = curr_data.wl_names[i]
        y = [curr_name == name for name in curr_data.wl_names]
        X = simi_calc.simi_col_mtx[i]
        estimator = get_est(est_name)

        selector = SequentialFeatureSelector(estimator, direction=direction.lower(), n_features_to_select=n, n_jobs=-1, cv=3)
        selector = selector.fit(X, y)
        # get_support() is a boolean mask over the features; add it as 0/1 votes.
        feature_importance += np.asarray(selector.get_support(), dtype=int)

    final_orders = sparse_argsort(feature_importance)[:n]
    return [curr_data.feature_cols[j] for j in final_orders]

### Embedded method

#### Lasso

In [18]:
def lasso_weights_orders(X, y):
    """Rank features by the magnitude of their Lasso coefficients.

    A ``Lasso`` model with ``alpha=0.1`` is fitted on ``(X, y)``. Features
    whose coefficient is exactly zero are left out; the rest are returned
    largest absolute weight first.
    """
    fitted = Lasso(alpha=0.1).fit(X, y)
    coef_magnitudes = np.abs(fitted.coef_)
    return sparse_argsort(coef_magnitudes)

#### Elastic Net

In [19]:
def enet_weights_orders(X, y):
    """Rank features by the magnitude of their Elastic Net coefficients.

    An ``ElasticNet`` model with ``alpha=0.1`` and ``l1_ratio=0.4`` is fitted
    on ``(X, y)``. Features whose coefficient is exactly zero are left out;
    the rest are returned largest absolute weight first.
    """
    fitted = ElasticNet(alpha=0.1, l1_ratio=0.4).fit(X, y)
    coef_magnitudes = np.abs(fitted.coef_)
    return sparse_argsort(coef_magnitudes)

### Feature select main function

In [20]:
def get_top_features(curr_data, expr_num, simi_calc, method, note=None):
    """Rank all features by aggregating per-experiment rankings.

    For each experiment ``i`` in ``range(expr_num)`` a one-vs-rest label vector
    is built (True where the workload name equals that of experiment ``i``)
    and ``method`` ranks the features of ``simi_calc.simi_col_mtx[i]``. A
    feature at position ``idx`` of that ranking earns ``num_features - idx``
    points; features the ranker leaves out earn nothing. The points are summed
    over all experiments.

    Parameters
    ----------
    curr_data : object providing ``feature_cols`` and ``wl_names``.
    expr_num : number of experiments to iterate over.
    simi_calc : object providing ``simi_col_mtx``, indexable by experiment.
    method : ``'Lasso'``, ``'ENet'``, ``'Variance'``, ``'fANOVA'``, ``'Chi2'``,
        ``'MutualInfoGain'``, ``'Pearson'``, ``'Fisher'`` or ``'RFE'``.
    note : estimator name forwarded to ``rfe_orders``; only used by ``'RFE'``.

    Returns
    -------
    list of all feature names, highest total score first.

    Raises
    ------
    ValueError
        If ``method`` is not supported. Previously an unknown method failed on
        an unbound variable, or silently reused the previous experiment's
        ranking.
    """
    supported = ('Lasso', 'ENet', 'Variance', 'fANOVA', 'Chi2',
                 'MutualInfoGain', 'Pearson', 'Fisher', 'RFE')
    if method not in supported:
        raise ValueError(
            f"Unknown feature-selection method {method!r}; expected one of {supported}"
        )

    num_features = len(curr_data.feature_cols)
    feature_importance = np.zeros(num_features, dtype=int)

    for i in range(expr_num):
        # One-vs-rest label: experiments sharing experiment i's workload name.
        curr_name = curr_data.wl_names[i]
        y = [curr_name == name for name in curr_data.wl_names]
        X = simi_calc.simi_col_mtx[i]

        if method == 'Lasso':
            orders = lasso_weights_orders(X, y)
        elif method == 'ENet':
            orders = enet_weights_orders(X, y)
        elif method == 'Variance':
            orders = variance_threshold(X, y)
        elif method in ('fANOVA', 'Chi2', 'MutualInfoGain', 'Pearson'):
            orders = select_k_best(X, y, method=method)
        elif method == 'Fisher':
            orders = fisher_orders(X, y)
        else:  # 'RFE' - the only remaining supported method
            orders = rfe_orders(X, y, note)

        # Position idx in the ranking is worth num_features - idx points.
        # Rankers return each feature index at most once, so the fancy-indexed
        # in-place add touches every entry at most once.
        orders = np.asarray(orders, dtype=int)
        feature_importance[orders] += num_features - np.arange(len(orders))

    final_orders = all_argsort(feature_importance)
    return [curr_data.feature_cols[j] for j in final_orders]

## Compare Feature Selection with Similarity Calculation

#### Experiment Setup

In [21]:
# Result containers filled by the experiment loop in a later cell:
#   main_dict[knn_threshold][fs_method][f_num] -> list of accuracies, one per SKU
#   time_dict[knn_threshold][fs_method]        -> mean feature-selection time in seconds
main_dict = {}
time_dict = {}

In [22]:
# Feature columns are read from the first SKU's data only.
all_features = data_by_sku[list(data_by_sku.keys())[0]].feature_cols
feature_num = len(all_features)

# Similarity settings: values passed as n= to simi_penalty, and the tag used
# in the saved figure names.
knn_thresholds = [1, 2, 3]
simi_method = 'KNN'

# Feature-selection methods, grouped by how they are invoked.
direct_methods = [
    'Variance', 'fANOVA', 'MutualInfoGain', 'Pearson', 'Lasso', 'ENet',
]
wrapper_methods = ['RFE']
estimator_names = ['Linear', 'DecisionTree', 'LogisticRegression']
other_methods = ['SFS']

# Numbers of top-ranked features to evaluate; the last entry means "all features".
f_nums = [1, 3, 7, 15, feature_num]

In [24]:
# Evaluate every direct feature-selection method for every KNN threshold.
# For each (threshold, method) pair this fills
#   main_dict[threshold][method][f_num] with one accuracy per SKU (in
#   data_by_sku key order, skipping keys containing 'ter'), and
#   time_dict[threshold][method] with the mean feature-ranking time in seconds.
for knn_threshold in knn_thresholds:
    print(knn_threshold)
    
    # Keep results from earlier runs of this cell for thresholds already present.
    if knn_threshold not in main_dict:
        main_dict[knn_threshold] = {}
        time_dict[knn_threshold] = {}
    for fs_method in direct_methods:
        print(fs_method)
        # curr_method[f_num] collects one accuracy per SKU.
        curr_method = {}

        for f_num in f_nums:
            curr_method[f_num] = []
        # One timing sample per (SKU, repeat).
        elapsed = []
        for sku in data_by_sku.keys():    
            if 'ter' in sku:
                continue
            curr_data = data_by_sku[sku]
            curr_calc = data_dist[sku]
            expr_num = curr_data.get_num_exprs()
        
            all_accs = []
            num_repeats = 1
            for i in range(num_repeats):       
                curr_accs = []
                # Only ranking the features and slicing the top-n lists is timed.
                # The ranking call does not take knn_threshold, so the same work
                # is repeated for every threshold.
                start_time = time.time()
                top_features = get_top_features(curr_data, expr_num, curr_calc, fs_method, None)
                f_features = [top_features[:n] for n in f_nums]
                elapsed.append(time.time() - start_time)

                for f_num, curr_f in zip(f_nums, f_features):
                    # This call is made on the calculator object stored in data_dist.
                    # NOTE(review): presumably it overwrites that object's similarity
                    # matrix with one restricted to curr_f - confirm in helpers.similarity.
                    curr_calc.calc_dist_simi_matrix(feature_names=curr_f)
                    pen, pens = curr_calc.simi_penalty(n=knn_threshold, dependent=True)

                    # NOTE(review): 10 is presumably the maximum penalty per entry,
                    # making acc a value in [0, 1] - confirm against simi_penalty.
                    acc = 1 - (np.sum(pens)/(len(pens)*10))
                    curr_accs.append(acc)
                all_accs.append(curr_accs)
            # Average over repeats: one accuracy per f_num for this SKU.
            all_accs = np.average(np.array(all_accs), axis=0)
            for f_num, acc in zip(f_nums, all_accs):
                curr_method[f_num].append(acc)
        main_dict[knn_threshold][fs_method] = curr_method
        time_dict[knn_threshold][fs_method] = np.mean(elapsed)
        print(np.mean(elapsed))

1
Variance
0.03309154510498047
fANOVA
0.05154687166213989
MutualInfoGain
3.2459532618522644
Pearson
0.03456294536590576
Lasso
0.051907360553741455
ENet
0.09475409984588623
2
Variance
0.024421095848083496
fANOVA
0.04363507032394409
MutualInfoGain
2.6107080578804016
Pearson
0.0350375771522522
Lasso
0.05205315351486206
ENet
0.10130876302719116
3
Variance
0.0242387056350708
fANOVA
0.04818713665008545
MutualInfoGain
2.53963840007782
Pearson
0.03474068641662598
Lasso
0.05609309673309326
ENet
0.10970205068588257


In [25]:
def pretty_print_table(k=3, sku=None, all_fnum=None):
    """Print the results for KNN threshold ``k`` as LaTeX table rows.

    One row is printed per feature-selection method in ``main_dict[k]``: the
    method name, one accuracy per feature count, and the mean selection time
    from ``time_dict[k]``. The accuracy obtained with all features is printed
    once, as a ``\\multirow`` cell on the ``'Variance'`` row.

    Parameters
    ----------
    k : KNN threshold whose results are printed.
    sku : SKU key to report; ``None`` reports the mean over all SKUs.
    all_fnum : feature count that stands for "all features". Defaults to the
        notebook-level ``feature_num`` (this used to be hard-coded as 29).
    """
    name_trans_dict = {
        'Forward_SFS_Linear': 'Fw SFS Linear',
        'Backward_SFS_Linear': 'Bw SFS Linear',
        'Forward_SFS_DecisionTree': 'Fw SFS DecTree',
        'Backward_SFS_DecisionTree': 'Bw SFS DecTree',
        'Forward_SFS_LogisticRegression': 'Fw SFS LogReg',
        'Backward_SFS_LogisticRegression': 'Bw SFS LogReg',
        'MutualInfoGain': 'MIGain',
        'RFE_Linear': 'RFE Linear',
        'RFE_DecisionTree': 'RFE DecTree',
        'RFE_LogisticRegression': 'RFE LogReg',
    }

    # Legacy positions, used only when the SKU is not a key of data_by_sku.
    sku_trans_dict = {
        'cpu2': 0,
        'cpu4': 1,
        'cpu8': 2,
        'cpu16': 3,
    }

    if all_fnum is None:
        all_fnum = feature_num

    sku_idx = None
    if sku is not None:
        # The accuracy lists are filled in data_by_sku key order, skipping keys
        # that contain 'ter'; use that same order to locate the SKU's entry.
        sku_order = [s for s in data_by_sku.keys() if 'ter' not in s]
        sku_idx = sku_order.index(sku) if sku in sku_order else sku_trans_dict[sku]

    def _cell(accs):
        # Mean over SKUs, or the single requested SKU's accuracy.
        value = np.mean(accs) if sku_idx is None else accs[sku_idx]
        return f'{value:.3f}'

    for method, subval in main_dict[k].items():
        outstr = '\\textcolor{}{' + name_trans_dict.get(method, method) + '} & '
        for fnum, subsubval in subval.items():
            if fnum == all_fnum:
                continue
            outstr += _cell(subsubval) + ' & '
        if method == 'Variance':
            # 17 is the row count of the target LaTeX table. The backslash is
            # escaped explicitly: '\m' is an invalid escape sequence.
            outstr += '\\multirow{17}{*}{' + _cell(subval[all_fnum]) + '}'
        outstr += f' & {time_dict[k][method]:.3f} \\\\'
        print(outstr)

In [26]:
# For each KNN threshold, print the table averaged over SKUs, then one table per SKU.
for k in (1, 2, 3):
    print(k, "-----overall-----")
    pretty_print_table(k=k)
    for num in (2, 4, 8, 16):
        sku = f'cpu{num}'
        print(f"-----{sku}-----")
        pretty_print_table(k=k, sku=sku)

1 -----overall-----
\textcolor{}{Variance} & 0.307 & 0.702 & 0.997 & 0.997 & \multirow{17}{*}{0.997} & 0.033 \\
\textcolor{}{fANOVA} & 0.961 & 0.969 & 0.978 & 0.988 &  & 0.052 \\
\textcolor{}{MIGain} & 0.964 & 0.963 & 0.990 & 0.991 &  & 3.246 \\
\textcolor{}{Pearson} & 0.961 & 0.969 & 0.978 & 0.990 &  & 0.035 \\
\textcolor{}{Lasso} & 0.481 & 0.963 & 0.982 & 0.994 &  & 0.052 \\
\textcolor{}{ENet} & 0.481 & 0.958 & 0.992 & 0.994 &  & 0.095 \\
-----cpu2-----
\textcolor{}{Variance} & 0.483 & 0.717 & 0.997 & 0.997 & \multirow{17}{*}{0.994} & 0.033 \\
\textcolor{}{fANOVA} & 0.969 & 0.983 & 0.986 & 0.989 &  & 0.052 \\
\textcolor{}{MIGain} & 0.981 & 0.972 & 0.986 & 0.986 &  & 3.246 \\
\textcolor{}{Pearson} & 0.969 & 0.983 & 0.986 & 0.989 &  & 0.035 \\
\textcolor{}{Lasso} & 0.467 & 0.969 & 0.989 & 0.989 &  & 0.052 \\
\textcolor{}{ENet} & 0.467 & 0.969 & 0.992 & 0.989 &  & 0.095 \\
-----cpu4-----
\textcolor{}{Variance} & 0.247 & 0.786 & 0.994 & 0.994 & \multirow{17}{*}{0.997} & 0.033 \\
\textcol

#### Present Results

In [28]:
def plot_bar(fs_method, knn_threshold, lowery=0):
    """Draw and save a grouped bar chart of accuracy per SKU and feature count.

    One group of bars is drawn per SKU (keys of ``data_by_sku`` not containing
    ``'ter'``) and one bar per feature count in
    ``main_dict[knn_threshold][fs_method]``. The figure is saved as a PDF under
    ``../figs/`` and then shown.

    The SKU labels are displayed with the first key moved to the end. The
    accuracy lists are stored in ``data_by_sku`` key order, so the same
    rotation is applied to them; previously only the labels were rotated and
    every bar group sat under the wrong SKU label.

    Parameters
    ----------
    fs_method : feature-selection method whose results are plotted.
    knn_threshold : KNN threshold whose results are plotted.
    lowery : lower y-axis limit. ``0`` uses the range ``(0, 1.2)``; any other
        value uses ``(lowery, 1.1)`` and is appended to the file name.
    """
    def _first_to_last(seq):
        # Move the first element to the end (the display order of the SKUs).
        seq = list(seq)
        return seq[1:] + seq[:1]

    colors = sns.color_palette("colorblind", len(f_nums))

    sku_keys = [e for e in data_by_sku.keys() if 'ter' not in e]
    X_lab = _first_to_last(sku_keys)
    print(X_lab)

    x = np.arange(len(X_lab))  # the label locations
    width = 0.125  # the width of the bars

    fig, ax = plt.subplots(figsize=(4.8,3), constrained_layout=True)

    per_fnum_accs = main_dict[knn_threshold][fs_method]
    for multiplier, (feature_num, accs) in enumerate(per_fnum_accs.items()):
        noun = 'Features' if feature_num > 1 else 'Feature'
        ax.bar(
            x + width * multiplier,
            _first_to_last(accs),  # keep the bars aligned with the rotated labels
            width,
            label=f'{feature_num} {noun}',
            color=colors[multiplier],
            edgecolor='black',
        )

    ax.set_ylabel('Classification Accuracy')
    ax.set_xlabel('SKU')

    ax.set_xticks(x + width, X_lab)
    ax.legend(bbox_to_anchor=(0, 1, 1, 0), loc='lower left', ncol=3, mode='expand')
    if lowery == 0:
        ax.set_ylim(0, 1.2)
        plt.savefig('../figs/n_{}_{}fs_{}.pdf'.format(fs_method, knn_threshold, simi_method))
    else:
        ax.set_ylim(lowery, 1.1)
        plt.savefig('../figs/n_{}_{}fs_{}_{}.pdf'.format(fs_method, knn_threshold, simi_method, lowery))

    plt.show()