### L2S

>@inproceedings{jamshidi2018,
    title={Learning to sample: exploiting similarities across environments to learn performance models for configurable systems}, 
    author={Jamshidi, Pooyan and Velez, Miguel and K{\"a}stner, Christian and Siegmund, Norbert},
    booktitle={Proceedings of the 2018 26th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering},
    pages={71--82},
    year={2018},
    organization={ACM},
    url={https://dl.acm.org/doi/pdf/10.1145/3236024.3236074},
}

**Learning to Sample (L2S)** is a transfer learning approach defined by Jamshidi et al. 
First, it exploits the source input and selects configurations that leverage influential (interactions of) features for this input. 
Then, it explores the similarities between the source and the target, thus adding configurations having similar performances for the source and the target. 
Finally, it uses the configurations selected in previous steps to efficiently train a model on the target input. 

#### Libraries

In [52]:
# for arrays
import numpy as np

# for dataframes
import pandas as pd

# plots
import matplotlib.pyplot as plt
%matplotlib inline
# high-level plots
import seaborn as sns

import statsmodels.api as sm

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

import statsmodels.api as sm

# machine learning library
# Principal Component Analysis - determine new axis for representing data
from sklearn.decomposition import PCA
# Random Forests -> vote between decision trees
# Gradient boosting -> instead of a vote, upgrade the same tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures
# Elasticnet is an hybrid method between ridge and Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
# To separate the data into training and test
from sklearn.model_selection import train_test_split, GridSearchCV
# Simple clustering (iterative steps)
from sklearn.cluster import KMeans
# Support vector machine - support vector regressor
from sklearn.svm import SVR
# decision trees
from sklearn.tree import DecisionTreeRegressor, plot_tree
# mean squared error
from sklearn.metrics import mean_squared_error

# gradient boosting trees
from xgboost import XGBRegressor

# we use it to interact with the file system
import os
# compute time
from time import time

# Neural network high level framework
import keras
# Sequential is a sequence of blocs
# Input deals with the data fed to the network
from keras.models import Sequential,Input,Model
# Dense is a feedforward layer with fully connected nodes
# Dropout keeps part of the data and "drops out" the rest
# Flatten makes the data "flat", i.e. in one dimension
from keras.layers import Dense, Dropout, Flatten
# Conv -> convolution, MaxPooling is relative to Pooling
# Activation is the function applied to the output of a layer
from keras.layers import Conv2D, MaxPooling2D, Activation


from learner.mlearner import learn_with_interactions, learn_without_interactions, sample_random, stepwise_feature_selection
from learner.model import genModelTermsfromString, Model, genModelfromCoeff

import warnings
warnings.filterwarnings("ignore")

#### implementation of the approach

In [66]:
class L2S:
    
    def __init__(self):
        #self.pct_test = pct_test
        #self.ratio_exploitation = ratio_exploitation
        
        # the data folder, see the markdown there for additional explanations
        res_dir = "../../../data/ugc/res_ugc/"
        
        # the list of videos names, e.g. Animation_360P-3e40
        # we sort the list so we keep the same ids between two launches
        v_names = sorted(os.listdir(res_dir)) 

        self.predDimension = "kbs"
        
        # the list of measurements
        listVideo = []

        # we add each dataset in the list, converting the time to the right format
        # third line asserts that the measures are complete
        for v in v_names:
            data = pd.read_table(res_dir+v, delimiter = ',')
            inter = pd.get_dummies(data)
            inter[self.predDimension] = data[self.predDimension]
            listVideo.append(inter)
        
        self.listVideo = listVideo
        
        
        # to sample the source and the target using the same seed
        self.random_state = np.random.randint(0,1000)
        
        self.features = ['cabac', '8x8dct', 'mbtree', 'rc_lookahead', 'analyse', 'me', 'subme', 'mixed_ref', 'me_range', 
                 'qpmax', 'aq-mode', 'trellis','fast_pskip', 'chroma_qp_offset', 'bframes', 'b_pyramid', 
                 'b_adapt', 'direct', 'ref', 'deblock', 'weightb', 'open_gop', 'weightp', 'scenecut']
    
    def mse(self, y_true, y_pred):
        return np.mean((y_true-y_pred)**2)
    
    ### Step 1: Extraction Process of Performance Models
    
    #Select a good model for predicting the performance of the source video
    
    #Original files:
    #https://github.com/cmu-mars/model-learner/blob/tutorial/learner/mlearner.py for the stepwise selection
    #https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html for the interactions
    
    # @PooyanJamshidi:
    # We just change slightly some functions from the original repository,
    # mainly because we don't want to add a constant in the model
    # + steps 2 and 3 were implemented in matlab but we did not find them in python
    def stepwise_selection(self, X, y,
                           initial_list = [], 
                           threshold_in = 0.01, 
                           threshold_out = 0.05, 
                           verbose=False):
        """Forward-backward stepwise feature selection based on OLS p-values.

        No constant is added to the regressions by this method.

        Parameters
        ----------
        X : pandas.DataFrame
            Candidate features. The columns are addressed with the labels
            0 .. X.shape[1]-1, so X must be labelled that way.
        y : array-like
            The performance values to explain.
        initial_list : list, optional
            Column labels included from the start (copied, never mutated).
        threshold_in : float
            A feature is added when its p-value is below this threshold.
        threshold_out : float
            A feature is dropped when its p-value is above this threshold.
        verbose : bool
            Print each addition / removal.

        Returns
        -------
        list
            The labels of the selected columns, in order of inclusion.
        """
        ndim = X.shape[1]
        features = list(range(ndim))
        included = list(initial_list)

        while True:
            changed = False

            # forward step (removed a constant)
            # candidates are kept in column order so ties are broken deterministically
            already_in = set(included)
            excluded = [feat for feat in features if feat not in already_in]
            # explicit float dtype: an untyped empty Series is object-typed
            # on recent pandas, which breaks min()/idxmin()
            new_pval = pd.Series(index=excluded, dtype='float64')
            for new_column in excluded:
                model = sm.OLS(y, pd.DataFrame(X[included+[new_column]])).fit()
                new_pval[new_column] = model.pvalues[new_column]
            # min() skips NaN; it is NaN (hence never below the threshold)
            # when there is no usable candidate
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                changed=True
                if verbose:
                    print('Add {:30} with p-value {:.5}'.format(best_feature, best_pval))

            # backward step
            # nothing to drop (and no model to fit) while no feature is included
            if included:
                model = sm.OLS(y, pd.DataFrame(X[included])).fit()
                pvalues = model.pvalues
                worst_pval = pvalues.max()
                if worst_pval > threshold_out:
                    changed = True
                    worst_feature = pvalues.idxmax()
                    included.remove(worst_feature)
                    if verbose:
                        print('Drop {:30} with p-value {:.5}'.format(worst_feature, worst_pval))

            if not changed:
                if verbose:
                    print("Construction of the model completed!")
                break
        return included
    
    ### Step 2: Active Sampling
    
    #### A - ] Exploitation : use the source's prediction model
    
    ##### (i) Sort the coefficients of the previous constructed model
    ##### (ii) Choose the coefficient with the highest value
    ##### (iii) Select the configurations with this feature activated

    # I assumed it was recursive, with a decreasing influence in the selection 
    # for a decreasing importance in the regression.
    
    def select_exploitation(self, df, sc, config_selected):
        
        self.nb_config = int(self.nb_config_exploitation - len(config_selected))
        
        if self.nb_config == 0:
            #print("Done!\n")
            return config_selected

        # if we don't have any important coefficient left to help us choose configs
        # we take the nb_config first configurations
        if len(sc) == 0:
            #print("Selecting " + str(self.nb_config) + " configurations from the rest of the dataset!")
            for conf in df.index[0:self.nb_config]:
                config_selected.append(conf)
            return config_selected

        # otherwise we just use the best coef to choose configs
        else:

            # we choose the best features coef (biggest absolute value)
            most_important_coef = sc[0]

            #print("Feature : " + str(most_important_coef))

            # configs with this feature activated
            imp_index = np.where(df[most_important_coef]==1)[0]

            # number of configs with this feature activated
            nb_imp_index = len(imp_index)

            # if we have more values to choose 
            # than the number of configurations with the best feature activated
            # we add all the configuration to the selected set
            # and we select the rest of the configuration based on other coefficients
            if nb_imp_index <= self.nb_config:
                for conf in df.iloc[imp_index].index:
                    config_selected.append(conf)
                #if nb_imp_index > 0:
                #    print("Added "+str(nb_imp_index)+ " values, "+
                #          str(self.nb_config-nb_imp_index)+" left to choose \n")
                # then we apply recursively this method to the rest of the dataframe
                return self.select_exploitation(df.iloc[np.where(df[most_important_coef]==0)[0]], 
                                              sc[1:len(sc)],
                                              config_selected)

            # otherwise we have enough values with this features activated
            # to select all the remaining configurations
            # so we apply the method to the dataframe containing all the feature activated
            # and we select the configuration by using the followings features
            else:
                return self.select_exploitation(df.iloc[imp_index], 
                                     sc[1:len(sc)], 
                                     config_selected)
    
    
    
    #### B-] Exploration : Select specific configurations, similar between the source and the target
    
    # I chose to select the whole group in one step:
    # selecting the configurations one by one may lead to a local optimum
    
    def select_exploration(self, exploitation_conf, ratio_exploitation, number_group = 10):
        
        nb_exploration = int(np.round(self.config_tot*(1-ratio_exploitation)))

        #target = self.listVideo[id_target]

        # all the config left for exploration
        # total minus those chosen for exploitation
        explor_conf = np.setdiff1d(self.source.index, exploitation_conf)

        # initialization : we take the first nb_exploration config
        best_explor = explor_conf[0:nb_exploration]

        # we group it with the exploitation configurations
        conf = np.concatenate((exploitation_conf, best_explor), axis=0)
        
        # for the moment, it's our best entropy
        best_entropy = sc.entropy(self.target.iloc[conf][self.predDimension], 
                                  self.source.iloc[conf][self.predDimension])

        # then we incrementally select the configurations to diminish the entropy 
        group_counter = 0

        while group_counter < number_group:

            group_counter +=1

            # current group to 'challenge' the best result
            np.random.shuffle(explor_conf)
            current_explor = explor_conf[0:nb_exploration]

            # we group it with the exploitation configurations
            conf = np.concatenate((exploitation_conf, current_explor), axis=0)

            # we compute the Kullback Leibler divergence between the source and the target
            current_entropy = sc.entropy(self.target.iloc[conf][self.predDimension], 
                                         self.source.iloc[conf][self.predDimension])

            # we finally take the group giving the lowest entropy
            # if this group is better than the best group, we replace it by the new one
            if current_entropy > best_entropy:
                #print("Entropy gained : "+str(current_entropy-best_entropy))
                best_entropy = current_entropy
                best_explor = current_explor

        return best_explor
    
    
    
    def learn(self, source_id, target_id, ratio_exploitation = 0.3, 
              l2s_tr_ratio = 0.8, 
              train_size = 20,
              learning_algorithm = RandomForestRegressor):
        """Train a model on the target with L2S-sampled configurations and score it.

        Parameters
        ----------
        source_id, target_id : int
            Positions of the source and target videos in ``self.listVideo``.
        ratio_exploitation : float
            Share of the training budget given to the exploitation step.
        l2s_tr_ratio : float
            Unused; kept so that existing calls keep working.
        train_size : int or float
            Number of configurations used for training; a value <= 1 is read
            as a fraction of the number of configurations of the source.
        learning_algorithm : class
            Regressor class, instantiated without argument, offering
            ``fit`` and ``predict``.

        Returns
        -------
        float
            The mean squared error between the measured and the predicted
            performances of ALL the configurations of the target
            (the sampled training configurations are included).
        """
        y_tgt, y_tgt_pred = self._sample_and_predict(source_id, target_id,
                                                     ratio_exploitation,
                                                     train_size,
                                                     learning_algorithm)
        return self.mse(y_tgt, y_tgt_pred)

    def predict_conf(self, source_id, target_id, ratio_exploitation = 0.3, 
              l2s_tr_ratio = 0.8, 
              train_size = 20,
              learning_algorithm = XGBRegressor):
        """Predict which configuration of the target has the lowest performance value.

        The parameters are the same as those of ``learn``
        (``l2s_tr_ratio`` is unused as well).

        Returns
        -------
        int
            The position, among the configurations of the target, of the
            configuration with the lowest predicted value.
        """
        _, y_tgt_pred = self._sample_and_predict(source_id, target_id,
                                                 ratio_exploitation,
                                                 train_size,
                                                 learning_algorithm)
        return np.argmin(y_tgt_pred)

    def _sample_and_predict(self, source_id, target_id, ratio_exploitation,
                            train_size, learning_algorithm):
        """Run the three L2S steps; return (measured, predicted) target performances."""

        # the source and the target videos
        self.source = self.listVideo[source_id]
        self.target = self.listVideo[target_id]

        # the number of config used in the training
        if train_size <= 1:
            # a fraction of the configurations (rows) of the source
            self.config_tot = int(train_size*self.source.shape[0])
        else:
            self.config_tot = int(train_size)

        if self.config_tot < 1:
            raise ValueError("train_size must select at least one configuration")

        ############### I- Knowledge extraction #################

        # the configuration options of the source, as an integer matrix
        # whose columns are labelled 0, 1, 2, ...
        X_src = pd.DataFrame(np.array(self.source.drop([self.predDimension], axis = 1), dtype=int))

        # degree 2 take too much time + it will not scale for large configuration spaces...
        # NOTE(review): include_bias = True adds a column of ones, although the
        # comments of this class state that no constant is wanted - to confirm
        poly = PolynomialFeatures(degree = 1, interaction_only = True, include_bias = True)
        X_interact = pd.DataFrame(np.array(poly.fit_transform(X_src), int))

        # performance variable, to predict
        y_src = self.source[self.predDimension]

        selected_features = self.stepwise_selection(X_interact, y_src)

        ############### II- Sampling #################

        if len(selected_features) > 0:
            # the selected features, sorted by decreasing absolute coefficient
            reg = LinearRegression()
            reg.fit(X_interact[selected_features], y_src)
            sorted_coefs = pd.Series(np.abs(reg.coef_), 
                                     selected_features, 
                                     dtype='float64').sort_values(ascending=False).index
        else:
            # no influential feature found: the exploitation step then
            # falls back on the first configurations
            sorted_coefs = []

        # A- EXPLOITATION
        self.nb_config_exploitation = int(ratio_exploitation*self.config_tot)
        exploitation_conf = self.select_exploitation(X_interact, sorted_coefs, [])

        # B- EXPLORATION
        exploration_conf = self.select_exploration(exploitation_conf, ratio_exploitation)

        # integer positions, even when one of the two parts is empty
        sampled_conf = np.concatenate((np.asarray(exploitation_conf, dtype=int),
                                       np.asarray(exploration_conf, dtype=int)), axis=0)

        ############### III- Performance Model Learning #################

        # we build a performance model for the target
        # instead of using all the configurations, we use the sampled configurations
        X_tgt = self.target.drop([self.predDimension], axis = 1)
        y_tgt = self.target[self.predDimension]

        # positional indexing for both the inputs and the performances
        X_tgt_tr = X_tgt.iloc[sampled_conf]
        y_tgt_tr = y_tgt.iloc[sampled_conf]

        # the model is trained on the sampled configurations only,
        # then predicts the performance of every configuration of the target
        lf = learning_algorithm()
        lf.fit(X_tgt_tr, y_tgt_tr)
        y_tgt_pred = lf.predict(X_tgt)

        return y_tgt, y_tgt_pred

#### Learning algorithm

In [60]:
l2s = L2S()

# the random ids are drawn among the videos actually loaded,
# so they are always valid positions in l2s.listVideo
nb_videos = len(l2s.listVideo)

# candidate learning algorithms, compared on the same (source, target) pairs
LAs = [DecisionTreeRegressor, RandomForestRegressor, XGBRegressor, SVR]
for i in range(5):
    source_id = np.random.randint(0, nb_videos)
    target_id = np.random.randint(0, nb_videos)
    for la in LAs:
        # mean squared error of the algorithm on the target video
        print(la, l2s.learn(source_id = source_id, 
                            target_id = target_id, 
                            train_size = 20, 
                            learning_algorithm=la))

<class 'sklearn.tree._classes.DecisionTreeRegressor'> 3281685.2859895527
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 2042005.232342937
<class 'xgboost.sklearn.XGBRegressor'> 3062540.786840061
<class 'sklearn.svm._classes.SVR'> 9576159.427733865
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 27405524.146231342
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 6519916.737222477
<class 'xgboost.sklearn.XGBRegressor'> 7263449.448959386
<class 'sklearn.svm._classes.SVR'> 71663105.14875856
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 10962.400734825867
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 6764.398971992097
<class 'xgboost.sklearn.XGBRegressor'> 2998.9533698724767
<class 'sklearn.svm._classes.SVR'> 61580.4388545609
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 1613.7505716417907
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 2706.459407840895
<class 'xgboost.sklearn.XGBRegressor'> 5054.611589117543
<class 'sklearn.svm._

#### Chosen algorithm: XGBRegressor (however, it may depend on the choice of videos)

**TODO:** possible bug with Linear Regression — the MSE looks too low; to be analyzed.

Again, it depends on the video we consider

#### Predictions

We predict the configurations for each video of the test set, for 5 configs, 10 configs, ..., 30 configs in the training set.

In [67]:
# folder holding one measurement file per video
res_dir = "../../../data/ugc/res_ugc/"

# video names (e.g. Animation_360P-3e40), sorted so that
# the ids stay the same from one launch to the next
v_names = sorted(os.listdir(res_dir)) 

# names of the videos of the training set and of the test set
v_names_train = np.loadtxt("../../../results/raw_data/train_names.csv", dtype= str)
v_names_test = np.loadtxt("../../../results/raw_data/test_names.csv", dtype= str)

# positions, in v_names, of the training videos and of the test videos
index_train = [position for position, name in enumerate(v_names) if name in v_names_train]
index_test = [position for position, name in enumerate(v_names) if name in v_names_test]

# training set sizes: 5, 10, ..., 30 configurations
train_sizes = np.arange(5,31,5)

In [68]:
l2s = L2S()

# (position of the test video, training size) -> predicted configuration
l2s_confs = dict()
for i, it in enumerate(index_test):
    # the source is drawn among the training videos that were found in res_dir:
    # the bound is the length of index_train, the list actually indexed
    source_index_train = np.random.randint(0, len(index_train))
    source_id = index_train[source_index_train]
    for ts in train_sizes:
        l2s_confs[(i, ts)] = l2s.predict_conf(source_id = source_id, target_id = it, train_size=ts,
                                      learning_algorithm = XGBRegressor)

In [69]:
# for each (position of the test video, training size):
# the value returned by predict_conf, i.e. the position of the
# configuration with the lowest predicted value
l2s_confs

{(0, 5): 1,
 (0, 10): 34,
 (0, 15): 153,
 (0, 20): 123,
 (0, 25): 166,
 (0, 30): 123,
 (1, 5): 57,
 (1, 10): 169,
 (1, 15): 93,
 (1, 20): 189,
 (1, 25): 100,
 (1, 30): 89,
 (2, 5): 111,
 (2, 10): 163,
 (2, 15): 176,
 (2, 20): 190,
 (2, 25): 190,
 (2, 30): 161,
 (3, 5): 85,
 (3, 10): 4,
 (3, 15): 32,
 (3, 20): 161,
 (3, 25): 194,
 (3, 30): 155,
 (4, 5): 76,
 (4, 10): 91,
 (4, 15): 10,
 (4, 20): 175,
 (4, 25): 42,
 (4, 30): 79,
 (5, 5): 8,
 (5, 10): 87,
 (5, 15): 103,
 (5, 20): 100,
 (5, 25): 97,
 (5, 30): 189,
 (6, 5): 49,
 (6, 10): 168,
 (6, 15): 166,
 (6, 20): 100,
 (6, 25): 159,
 (6, 30): 168,
 (7, 5): 48,
 (7, 10): 189,
 (7, 15): 3,
 (7, 20): 98,
 (7, 25): 92,
 (7, 30): 92,
 (8, 5): 19,
 (8, 10): 85,
 (8, 15): 181,
 (8, 20): 170,
 (8, 25): 179,
 (8, 30): 181,
 (9, 5): 0,
 (9, 10): 62,
 (9, 15): 178,
 (9, 20): 96,
 (9, 25): 109,
 (9, 30): 189,
 (10, 5): 8,
 (10, 10): 62,
 (10, 15): 123,
 (10, 20): 91,
 (10, 25): 91,
 (10, 30): 96,
 (11, 5): 4,
 (11, 10): 178,
 (11, 15): 123,
 (11, 20

In [71]:
# one row per test video, one "conf<size>" column per training size
nb_test_videos = len(index_test)
l2s_data = pd.DataFrame({"id_video" : list(range(nb_test_videos))})
for ts in train_sizes:
    column_name = "conf"+str(ts)
    l2s_data[column_name] = [l2s_confs[(video, ts)] for video in range(nb_test_videos)]

In [72]:
# save the predicted configurations, indexed by the id of the test video
l2s_results = l2s_data.set_index("id_video")
l2s_results.to_csv("../../../results/raw_data/L2S_results.csv")