Import libraries

In [49]:
import os

import pandas as pd

import numpy as np

from scipy import stats
import scipy.stats as sc
from scipy.cluster.hierarchy import linkage, leaves_list

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

from learner.mlearner import learn_with_interactions, learn_without_interactions, sample_random, stepwise_feature_selection
from learner.model import genModelTermsfromString, Model, genModelfromCoeff


from import_data import load_data

Import data

In [52]:
listVideo = load_data(drop_default=True)

nbVideos = len(listVideo)

def mape(y_true, y_pred):
    return np.mean(np.abs((y_true-y_pred)/y_true))

predDimension="etime"

   Unnamed: 0  configurationID no_8x8dct no_asm no_cabac no_deblock  \
0           0                1      True  False    False       True   
1           1             1001     False  False     True       True   
2           2             1002      True  False    False      False   
3           3             1003     False  False    False       True   
4           4             1004     False  False     True      False   

  no_fast_pskip no_mbtree no_mixed_refs no_weightb  rc_lookahead  ref  frames  \
0          True     False          True       True          20.0  9.0    1374   
1          True      True          True       True          20.0  1.0    1374   
2          True     False          True       True          60.0  5.0    1374   
3          True      True         False      False          20.0  9.0    1374   
4         False      True          True      False          20.0  9.0    1374   

     cpu       fps     kbs  etime       size  
0  703.2  1315.615  225.03  1.052  1289

# L2s implementation

### Step 1: Extraction Process of Performance Models

Select a good model for predicting the performance of the source video

Original files:
- https://github.com/cmu-mars/model-learner/blob/tutorial/learner/mlearner.py for the stepwise selection
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html for the interactions

In [54]:
# @PooyanJamshidi:
# We just change slightly some functions from the original repository,
# mainly because we don't want to add a constant in the model
# + steps 2 and 3 were implemented in matlab but we did not find them in python

def stepwise_selection(X, y,
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    
    ndim = X.shape[1]
    features = [i for i in range(ndim)]
    included = list(initial_list)
    
    while True:
        changed=False
        
        # forward step (removed a constant)
        excluded = list(set(features)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, pd.DataFrame(X[included+[new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add {:30} with p-value {:.5}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, pd.DataFrame(X[included])).fit()
        pvalues = model.pvalues
        worst_pval = pvalues.max()
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.5}'.format(worst_feature, worst_pval))
        if not changed:
            if verbose:
                print("Construction of the model completed!")
            break
    return included

In [94]:
# to sample the source and the target using the same seed
random_state = np.random.randint(0,1000)

# a list of features to keep as explicative variables
keep_features = ['no_8x8dct','no_asm', 'no_cabac','no_deblock','no_fast_pskip', 
                 'no_mbtree','no_mixed_refs','no_weightb','rc_lookahead','ref']

# ordinal data to convert into dummies
to_dummy_features = ['rc_lookahead','ref']


# percentage of configuration used for test
pct_test = 0.7

# the source video
source = listVideo[1]


# transform some variables into dummies, to fit the orginal paper
# since we don't want to introduce a meaningless constant in the model, 
# we have to keep all columns

dummies = pd.get_dummies(source[keep_features], 
                   drop_first = False,
                   columns=to_dummy_features)

X_src = pd.DataFrame(np.array(dummies, dtype=int))


# add interactions
poly = PolynomialFeatures(degree=2, interaction_only = True, include_bias = True)
X_interact = pd.DataFrame(np.array(poly.fit_transform(X_src),int))

# performance variable, to predict
y_src = np.array(source[predDimension], dtype=float)

# split train test
X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_interact, 
                                                                    y_src, 
                                                                    test_size=pct_test, 
                                                                    random_state=random_state)

# the index of the selected features
selected_features = stepwise_selection(X_interact, y_src)

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Add                              0 with p-value 2.3451e-201
Add                             12 with p-value 9.3285e-33
Add                              7 with p-value 2.748e-45
Add                             82 with p-value 8.7723e-45
Add                              6 with p-value 1.2362e-23
Add                             13 with p-value 1.352e-23
Add                             14 with p-value 7.5903e-206
Add                              3 with p-value 1.0257e-18
Add                             83 with p-value 1.1616e-23
Add                             84 with p-value 6.8461e-97
Add                              1 with p-value 1.1333e-23
Add                             75 with p-value 1.6628e-23
Add                             70 with p-value 7.8824e-22
Add                              9 with p-value 4.1825e-21
Add                             72 with p-value 3.3327e-29
Add                             42 with p-value 8.6591e-22
Add                             80 with p-value 1.6946e-

### Step 2: Active Sampling

#### A - ] Exploitation : use the source's prediction model

##### (i) Sort the coefficients of the previous constructed model

##### (ii) Choose the coefficient with the highest value

##### (iii) Select the configurations with this feature activated



I assumed it was recursive, with a decreasing influence in the selection for a decreasing importance in the regression.

In [103]:
ratio_exploitation = 0.3
config_tot = 200

reg = LinearRegression()

reg.fit(X_interact[selected_features], y_src)

sorted_coefs = pd.Series(np.abs(reg.coef_), selected_features).sort_values(ascending=False).index

nb_config_exploitation = np.round(ratio_exploitation*config_tot)

nb_config_selected = 0

assert X_interact.shape[0] >= nb_config_exploitation ; " Too many configurations to select ! "

def select_exploitation(df, sc, config_selected):
    
    #number of config left to choose
    nb_config = int(nb_config_exploitation - len(config_selected))
    
    if nb_config == 0:
        print("Done!\n")
        return config_selected
    
    # if we don't have any important coefficient left to help us choose configs
    # we take the nb_config first configurations
    if len(sc) == 0:
        print("Selecting " + str(nb_config) + " configurations from the rest of the dataset!")
        for conf in df.index[0:nb_config]:
            config_selected.append(conf)
        return config_selected
    
    # otherwise we just use the best coef to choose configs
    else:
        
        # we choose the best features coef (biggest absolute value)
        most_important_coef = sc[0]
        
        print("Feature : " + str(most_important_coef))
        
        # configs with this feature activated
        imp_index = np.where(df[most_important_coef]==1)[0]

        # number of configs with this feature activated
        nb_imp_index = len(imp_index)

        # if we have more values to choose 
        # than the number of configurations with the best feature activated
        # we add all the configuration to the selected set
        # and we select the rest of the configuration based on other coefficients
        if nb_imp_index <= nb_config:
            for conf in df.iloc[imp_index].index:
                config_selected.append(conf)
            if nb_imp_index > 0:
                print("Added "+str(nb_imp_index)+ " values, "+str(nb_config-nb_imp_index)+" left to choose \n")
            # then we apply recursively this method to the rest of the dataframe
            return select_config(df.iloc[np.where(df[most_important_coef]==0)[0]], 
                                          sc[1:len(sc)],
                                          config_selected)
        
        # otherwise we have enough values with this features activated
        # to select all the remaining configurations
        # so we apply the method to the dataframe containing all the feature activated
        # and we select the configuration by using the followings features
        else:
            return select_config(df.iloc[imp_index], 
                                 sc[1:len(sc)], 
                                 config_selected)

exploitation_conf = select_exploitation(X_interact, sorted_coefs, [])

print("Selected : " + str(exploitation_conf))

Feature : 9
Feature : 10
Feature : 11
Feature : 6
Feature : 72
Feature : 73
Feature : 74
Feature : 84
Added 32 values, 28 left to choose 

Feature : 83
Feature : 82
Feature : 7
Feature : 12
Feature : 13
Feature : 14
Feature : 0
Feature : 3
Added 16 values, 12 left to choose 

Feature : 75
Feature : 70
Feature : 1
Added 8 values, 4 left to choose 

Feature : 42
Feature : 50
Feature : 16
Feature : 19
Feature : 43
Feature : 80
Feature : 4
Added 4 values, 0 left to choose 

Done!

Selected : [4, 28, 54, 104, 109, 135, 146, 213, 235, 284, 368, 386, 408, 437, 461, 463, 502, 585, 697, 743, 751, 772, 784, 797, 820, 842, 863, 869, 925, 1011, 1013, 1021, 131, 150, 369, 390, 452, 579, 680, 755, 773, 800, 836, 861, 862, 1008, 1143, 1146, 209, 243, 282, 406, 434, 464, 515, 865, 113, 770, 813, 867]


#### B-] Exploration : Select specific configurations, similar between the source and the target

In [109]:
ratio_exploration = 1-ratio_exploitation
nb_exploration = int(config_tot*ratio_exploitation)

# I choose to select the group in one step:
# if you select config per config, you may choose a local optimal

def select_exploration(df, exploitation_conf, id_target, number_group = 100):
    
    target = listVideo[id_target]
    
    # all the config left for exploration
    # total minus those chosen for exploitation
    explor_conf = np.setdiff1d(df.index, exploitation_conf)
    
    # initialization : we take the first nb_exploration config
    best_explor = explor_conf[0:nb_exploration]
    
    # we group it with the exploitation configurations
    conf = np.concatenate((exploitation_conf, best_explor), axis=0)
    # for the moment, it's our best entropy
    best_entropy  = sc.entropy(target.iloc[conf][predDimension], source.iloc[conf][predDimension])
    
    # then we incrementally select the configurations to diminish the entropy 
    group_counter = 0
    
    while group_counter < number_group:
        
        group_counter +=1
        
        # current group to 'challenge' the best result
        np.random.shuffle(explor_conf)
        current_explor = explor_conf[0:nb_exploration]
        
        # we group it with the exploitation configurations
        conf = np.concatenate((exploitation_conf, current_explor), axis=0)
        
        # we compute the Kullback Leibler divergence between the source and the target
        current_entropy = sc.entropy(target.iloc[conf][predDimension], source.iloc[conf][predDimension])
        
        # we finally take the group giving the lowest entropy
        # if this group is better than the best group, we replace it by the new one
        if current_entropy > best_entropy:
            print("Entropy gained : "+str(current_entropy-best_entropy))
            best_entropy = current_entropy
            best_explor = current_explor
    
    return best_explor

print("\nConfigurations kept for exploration : \n" + 
      str(select_exploration(X_interact, exploitation_conf, 0, 1000)))

Entropy gained : 0.000606207058237323
Entropy gained : 2.6231712971008878e-05
Entropy gained : 9.222420139338907e-05
Entropy gained : 0.00010228298178902506
Entropy gained : 0.0005347582695323372
Entropy gained : 0.00010852132709060405

Configurations kept for exploration : 
[  44  969  488  273 1135  337  285  807  566  717  193 1103  582  482
 1149  429  132  841  774  397  734  322 1104  277  663 1134  543 1122
  475  831 1131  881  251 1145  248  341  625   66 1049  765  907  265
  468   29  641  334  478  598  699  720 1151  832  658   51  580  703
 1100  333  934  909]


### Step 3 : Transfer the knowledge

In [123]:
def l2s_transfer(i, j, ratio_exploitation = 0.3, l2s_tr_ratio = 0.5, pct_test = 0.7):
    
    # to sample the source and the target using the same seed
    random_state = np.random.randint(0,1000)

    # a list of features to keep as explicative variables
    keep_features = ['no_8x8dct','no_asm', 'no_cabac','no_deblock','no_fast_pskip', 
                     'no_mbtree','no_mixed_refs','no_weightb','rc_lookahead','ref']

    # ordinal data to convert into dummies
    to_dummy_features = ['rc_lookahead','ref']

    # the source video
    source = listVideo[i]
    
    # the number of config used in the training
    config_tot = int(l2s_tr_ratio*(1-pct_test)*source.shape[1])

    # transform some variables into dummies, to fit the orginal paper
    # since we don't want to introduce a meaningless constant in the model, 
    # we have to keep all columns

    dummies = pd.get_dummies(source[keep_features], 
                       drop_first = False,
                       columns=to_dummy_features)

    X_src = pd.DataFrame(np.array(dummies, dtype=int))


    # add interactions
    poly = PolynomialFeatures(degree=2, interaction_only = True, include_bias = True)
    X_interact = pd.DataFrame(np.array(poly.fit_transform(X_src),int))

    # performance variable, to predict
    y_src = np.array(source[predDimension], dtype=float)

    # split train test (-> we only use X_src_train to sample l2s)
    X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_interact, 
                                                                        y_src, 
                                                                        test_size=pct_test, 
                                                                        random_state=random_state)

    # we train the model with the training data
    print("\n############### I- Stepwise regression #################\n")
    
    selected_features = stepwise_selection(X_src_train, y_src_train)
    
    print("\n############### II- Sampling #################\n")
    
    reg = LinearRegression()

    reg.fit(X_src_train[selected_features], y_src_train)

    sorted_coefs = pd.Series(np.abs(reg.coef_), selected_features).sort_values(ascending=False).index

    nb_config_exploitation = np.round(ratio_exploitation*config_tot)
    
    print("A- EXPLOITATION\n")
    
    exploitation_conf = select_exploitation(X_src_train, sorted_coefs, [])
    
    print("\nB- EXPLORATION\n")
    
    exploration_conf = select_exploration(X_src_train, exploitation_conf, j, 1000)
    
    sampled_conf=np.concatenate((exploitation_conf,exploration_conf), axis=0)
    
    print(sampled_conf)
    
    print("\n############### III- Transfer #################\n")
    
    # we split the source and the target
    
    target = listVideo[j]
    
    _, X_src_te, _, y_src_te = train_test_split(source[keep_features], 
                                                                    source[predDimension], 
                                                                    test_size=pct_test, 
                                                                    random_state=random_state)
    
        
    _, X_tgt_te, _, y_tgt_te = train_test_split(target[keep_features], 
                                                                    target[predDimension],  
                                                                    test_size=pct_test, 
                                                                    random_state=random_state)
    X_src_tr = source[keep_features].iloc[sampled_conf]
    y_src_tr = source[predDimension].iloc[sampled_conf]
    
    X_tgt_tr = target[keep_features].iloc[sampled_conf]
    y_tgt_tr = target[predDimension].iloc[sampled_conf]
    
    lf = LinearRegression()
    lf.fit(X_src_tr, y_src_tr)
    y_src_pred_te = np.array(lf.predict(X_src_te)).reshape(-1,1)
    
    # The shift function, to transfer the prediction from the source to the target
    shift = LinearRegression()
    shift.fit(np.array(y_src_tr).reshape(-1,1), y_tgt_tr)
    y_tgt_pred_te = shift.predict(y_src_pred_te)
    
    # We return the mean average percentage error 
    # between the real values of y_test from target 
    # and the predictions shifted 
    return min(mape(y_tgt_te, y_tgt_pred_te),1)
    
    
l2s_transfer(1,2)


############### I- Stepwise regression #################

Add                              0 with p-value 1.1648e-206


  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Add                             12 with p-value 4.4722e-31
Add                              7 with p-value 2.3656e-45
Add                             82 with p-value 1.1016e-36
Add                             75 with p-value 6.2839e-21
Add                             76 with p-value 2.2796e-24
Add                              3 with p-value 2.1261e-22
Add                              1 with p-value 3.6486e-12
Add                             14 with p-value 2.7675e-11
Add                             13 with p-value 1.0823e-153
Add                             83 with p-value 2.8069e-32
Add                             84 with p-value 1.859e-109
Add                              9 with p-value 3.2819e-17
Add                             42 with p-value 1.9082e-18
Add                             72 with p-value 1.2842e-16
Add                             77 with p-value 3.3292e-18
Add                              6 with p-value 3.7972e-59
Drop                             76 with p-value 0.6149

0.13990661995599477