Import libraries

In [11]:
# for arrays
import numpy as np

# for dataframes
import pandas as pd

# plots
import matplotlib.pyplot as plt
%matplotlib inline
# high-level plots
import seaborn as sns

import statsmodels.api as sm

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# machine learning library
# Principal Component Analysis - determine new axis for representing data
from sklearn.decomposition import PCA
# Random Forests -> vote between decision trees
# Gradient boosting -> instead of a vote, upgrade the same tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures
# Elasticnet is an hybrid method between ridge and Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
# To separate the data into training and test
from sklearn.model_selection import train_test_split
# Simple clustering (iterative steps)
from sklearn.cluster import KMeans

# we use it to interact with the file system
import os
# compute time
from time import time

# Neural network high level framework
import keras
# Sequential is a sequence of blocs
# Input deals with the data fed to the network
from keras.models import Sequential,Input,Model
# Dense is a feedforward layer with fully connected nodes
# Dropout keeps part of the data and "drops out" the rest
# Flatten makes the data "flat", i.e. in one dimension
from keras.layers import Dense, Dropout, Flatten
# Conv -> convolution, MaxPooling is relative to Pooling
# Activation is the function applied to the output of a layer
from keras.layers import Conv2D, MaxPooling2D, Activation


from learner.mlearner import learn_with_interactions, learn_without_interactions, sample_random, stepwise_feature_selection
from learner.model import genModelTermsfromString, Model, genModelfromCoeff

Import data

In [18]:
def elapsedtime_to_sec(el):
    """Convert a colon-separated elapsed-time string into seconds.

    The measurements report elapsed time as "m:s" (minutes, then seconds,
    possibly fractional). Each field to the left is worth 60 times the one
    to its right, so "h:m:s" and a bare number of seconds are accepted too.

    Parameters
    ----------
    el : str
        Elapsed time such as "1:30.5", "2:00:01" or "7".

    Returns
    -------
    float
        The elapsed time in seconds.

    Raises
    ------
    ValueError
        If a field cannot be parsed as a number.
    """
    seconds = 0.0
    # base-60 accumulation: shift what has been read so far, then add the field
    for field in el.split(":"):
        seconds = seconds * 60 + float(field)
    return seconds

# folder holding one measurement file per video
# (see the markdown in the data folder for additional explanations)
res_dir = "../../../data/ugc/res_ugc/"

# video names, e.g. Animation_360P-3e40; sorting keeps the ids stable
# from one launch to the next
v_names = sorted(os.listdir(res_dir))

# one dataframe of measurements per video, in the order of v_names
listVideo = []

for v in v_names:
    data = pd.read_table(os.path.join(res_dir, v), delimiter=',')
    # elapsed time converted to seconds, stored in a new 'etime' column
    data['etime'] = [elapsedtime_to_sec(el) for el in data['elapsedtime']]
    # every file must hold the complete set of measures
    assert data.shape == (201, 34), v
    listVideo.append(data)

nbVideos = len(listVideo)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction.

    Both inputs are converted to float arrays first, so plain lists, numpy
    arrays and pandas Series are all accepted and are compared position by
    position.

    Parameters
    ----------
    y_true : array-like
        Observed values; used as the denominator of each relative error.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of ``|y_true - y_pred| / |y_true|`` (not multiplied by 100).
        A zero in ``y_true`` yields ``inf`` or ``nan``, as numpy division does.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true))

# name of the performance column to predict
predDimension = "size"

# one seed shared by the source and the target so both are sampled alike
random_state = np.random.randint(0, 1000)

# configuration options kept as explanatory variables
keep_features = [
    'cabac', '8x8dct', 'mbtree',
    'rc_lookahead', 'analyse', 'me', 'subme', 'mixed_ref', 'me_range',
    'qpmax', 'aq-mode', 'trellis', 'fast_pskip', 'chroma_qp_offset',
    'bframes', 'b_pyramid', 'b_adapt', 'direct', 'ref', 'deblock',
    'weightb', 'open_gop', 'weightp', 'scenecut',
]

# ordinal data to convert into dummies:
# every kept option except 'cabac', '8x8dct' and 'mbtree'
to_dummy_features = [
    option for option in keep_features
    if option not in ('cabac', '8x8dct', 'mbtree')
]

config_tot = 200

# L2s implementation

### Step 1: Extraction Process of Performance Models

Select a good model for predicting the performance of the source video

Original files:
- https://github.com/cmu-mars/model-learner/blob/tutorial/learner/mlearner.py for the stepwise selection
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html for the interactions

In [19]:
# @PooyanJamshidi:
# We just change slightly some functions from the original repository,
# mainly because we don't want to add a constant in the model
# + steps 2 and 3 were implemented in matlab but we did not find them in python

def stepwise_selection(X, y,
                       initial_list=(),
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """Select regression features by forward-backward stepwise OLS.

    Each round first tries to add the excluded feature whose OLS p-value is
    the smallest (if below ``threshold_in``), then tries to drop the included
    feature whose p-value is the largest (if above ``threshold_out``). The
    models are fitted without an added constant. The loop stops when a round
    neither adds nor drops anything.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features. Columns are addressed by the labels
        ``0 .. X.shape[1] - 1``, so X is expected to carry default integer
        column labels.
    y : array-like
        Response variable, one value per row of X.
    initial_list : sequence, optional
        Feature labels to start with; it is copied, never modified.
    threshold_in : float, optional
        A feature is added when its p-value is below this value.
    threshold_out : float, optional
        A feature is dropped when its p-value is above this value.
    verbose : bool, optional
        Print every addition and removal.

    Returns
    -------
    list
        Labels of the selected features, in the order they were added.
    """
    features = list(range(X.shape[1]))
    included = list(initial_list)

    while True:
        changed = False

        # forward step (no constant is added to the model)
        already_in = set(included)
        excluded = [feature for feature in features if feature not in already_in]
        # explicit float dtype: an index-only Series would otherwise be created
        # with object dtype on recent pandas, which breaks idxmin()
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, pd.DataFrame(X[included + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        if excluded:
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                changed = True
                if verbose:
                    print('Add {:30} with p-value {:.5}'.format(best_feature, best_pval))

        # backward step; skipped while nothing is included, because an OLS
        # model cannot be fitted on zero columns
        if included:
            model = sm.OLS(y, pd.DataFrame(X[included])).fit()
            pvalues = model.pvalues
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.5}'.format(worst_feature, worst_pval))

        if not changed:
            if verbose:
                print("Construction of the model completed!")
            break
    return included

In [25]:
# percentage of configuration used for test
#pct_test = 0.7

# the source video (second entry of listVideo)
# NOTE: select_exploration reads this module-level `source` when it compares
# the source and target distributions
source = listVideo[1]


# transform some variables into dummies, to fit the original paper
# since we don't want to introduce a meaningless constant in the model, 
# we have to keep all columns

#dummies = pd.get_dummies(source[keep_features], 
#                   drop_first = False,
#                   columns=to_dummy_features)

# X_src = pd.DataFrame(np.array(dummies, dtype=int))


# add interactions
# poly = PolynomialFeatures(degree=2, interaction_only = True, include_bias = True)
# X_interact = pd.DataFrame(np.array(poly.fit_transform(X_src),int))

# performance variable, to predict
# y_src = np.array(source[predDimension], dtype=float)

# split train test
# X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_interact, 
#                                                                     y_src, 
#                                                                     test_size=pct_test, 
#                                                                     random_state=random_state)

# the index of the selected features
# selected_features = stepwise_selection(X_interact, y_src)

### Step 2: Active Sampling

#### A - ] Exploitation : use the source's prediction model

##### (i) Sort the coefficients of the previous constructed model

##### (ii) Choose the coefficient with the highest value

##### (iii) Select the configurations with this feature activated



I assumed the selection is recursive: the less important a feature is in the regression, the less influence it has on which configurations are selected.

In [26]:
# share of the sampling budget given to exploitation;
# select_exploration uses the remaining share (1 - ratio_exploitation)
ratio_exploitation = 0.3
# total number of configurations to sample (exploitation + exploration)
config_tot = 200

#reg = LinearRegression()

#reg.fit(X_interact[selected_features], y_src)

#sorted_coefs = pd.Series(np.abs(reg.coef_), selected_features).sort_values(ascending=False).index

#nb_config_exploitation = np.round(ratio_exploitation*config_tot)

#nb_config_selected = 0

#assert X_interact.shape[0] >= nb_config_exploitation ; " Too many configurations to select ! "

def select_exploitation(df, sc, config_selected, nb_config_exploitation=None):
    """Pick configurations by following the most influential features.

    Features are visited in the order given by ``sc``. For each feature:

    * if no more configurations have it activated (value 1) than are still
      needed, all of them are selected and the search continues, with the
      next feature, among the configurations where it equals 0;
    * otherwise the search is narrowed to the configurations that have it
      activated and continues with the next feature.

    When the features run out before the budget is met, the first remaining
    candidates (in dataframe order) fill the gap.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per configuration; the entries of ``sc`` are column labels.
    sc : sequence
        Feature labels sorted by decreasing importance. (Inside this
        function the name hides the module-level ``sc`` alias.)
    config_selected : list
        Labels already selected; extended in place and returned.
    nb_config_exploitation : number, optional
        Total number of configurations to select. When omitted, the
        module-level ``nb_config_exploitation`` is used if it exists,
        otherwise ``np.round(ratio_exploitation * config_tot)`` computed
        from the module-level values.

    Returns
    -------
    list
        ``config_selected`` with the newly selected index labels appended.
    """
    if nb_config_exploitation is None:
        module_scope = globals()
        if "nb_config_exploitation" in module_scope:
            nb_config_exploitation = module_scope["nb_config_exploitation"]
        else:
            nb_config_exploitation = np.round(ratio_exploitation * config_tot)
    budget = int(nb_config_exploitation)

    candidates = df
    for feature in sc:
        # number of configurations left to choose
        nb_config = budget - len(config_selected)
        if nb_config <= 0:
            print("Done!\n")
            return config_selected

        print("Feature : " + str(feature))

        # positions of the candidates having this feature activated
        imp_index = np.where(candidates[feature] == 1)[0]
        nb_imp_index = len(imp_index)

        if nb_imp_index <= nb_config:
            # not enough of them to exhaust the budget: take them all...
            config_selected.extend(candidates.index[imp_index])
            if nb_imp_index > 0:
                print("Added " + str(nb_imp_index) + " values, "
                      + str(nb_config - nb_imp_index) + " left to choose \n")
            # ...and keep looking among the configurations without the feature
            candidates = candidates.iloc[np.where(candidates[feature] == 0)[0]]
        else:
            # more than needed: let the following features decide among them
            candidates = candidates.iloc[imp_index]

    nb_config = budget - len(config_selected)
    if nb_config <= 0:
        print("Done!\n")
        return config_selected

    # no important coefficient left to guide the choice:
    # take the first nb_config remaining configurations
    print("Selecting " + str(nb_config) + " configurations from the rest of the dataset!")
    config_selected.extend(candidates.index[0:nb_config])
    return config_selected

# exploitation_conf = select_exploitation(X_interact, sorted_coefs, [])

# print("Selected : " + str(exploitation_conf))

#### B-] Exploration : Select specific configurations, similar between the source and the target

In [27]:


# I chose to select the whole group in one step:
# selecting configuration by configuration may get stuck in a local optimum

def select_exploration(df, exploitation_conf, id_target, number_group = 100):
    """Pick the exploration configurations by random search.

    ``number_group`` random groups of candidate configurations are drawn.
    Each group is joined with the exploitation configurations and scored with
    ``sc.entropy(target, source)``, the relative entropy (Kullback-Leibler
    divergence) between the target's and the source's performance values on
    those configurations. The group with the lowest divergence is kept, i.e.
    the one on which source and target look most alike.

    The function reads the module-level ``source``, ``listVideo``,
    ``predDimension``, ``ratio_exploitation`` and ``config_tot``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose index lists the configurations that may be sampled.
    exploitation_conf : sequence of int
        Configurations already chosen by exploitation; excluded from the
        candidates and included in every scored group.
    id_target : int
        Position of the target video in ``listVideo``.
    number_group : int, optional
        Number of random groups to evaluate.

    Returns
    -------
    numpy.ndarray
        The selected exploration configurations.
    """
    ratio_exploration = 1-ratio_exploitation
    nb_exploration = int(config_tot*ratio_exploration)

    target = listVideo[id_target]

    # integer array even when empty, so it can be concatenated and fed to iloc
    exploitation_conf = np.asarray(exploitation_conf, dtype=int)

    # all the configurations left for exploration:
    # total minus those chosen for exploitation
    explor_conf = np.setdiff1d(df.index, exploitation_conf)

    def _divergence(group):
        # score a candidate group together with the exploitation configurations
        conf = np.concatenate((exploitation_conf, group), axis=0)
        return sc.entropy(target.iloc[conf][predDimension],
                          source.iloc[conf][predDimension])

    # initialization: the first nb_exploration candidates
    best_explor = explor_conf[0:nb_exploration].copy()
    best_entropy = _divergence(best_explor)

    for _ in range(number_group):
        # permutation() returns a fresh array, so a group kept as the best one
        # is never altered by later draws (an in-place shuffle of explor_conf
        # would silently rewrite every slice taken from it)
        current_explor = np.random.permutation(explor_conf)[0:nb_exploration]
        current_entropy = _divergence(current_explor)

        # keep the group giving the lowest divergence
        if current_entropy < best_entropy:
            print("Entropy reduced : "+str(best_entropy-current_entropy))
            best_entropy = current_entropy
            best_explor = current_explor

    return best_explor

#print("\nConfigurations kept for exploration : \n" + 
#      str(select_exploration(X_interact, exploitation_conf, 0, 1000)))

### Step 3 : Transfer the knowledge

In [28]:
def l2s_transfer(source_id, target_id, ratio_exploitation = 0.4, l2s_tr_ratio = 0.7, pct_test = 0.5):
    """Run the three L2S steps from a source video to a target video.

    1. Stepwise regression on the source's training split selects features
       (dummies plus pairwise interactions).
    2. Sampling: ``select_exploitation`` then ``select_exploration`` choose
       configurations among the training split.
    3. Transfer: a random forest learns the source performance on the sampled
       configurations, a second one learns the shift from source performance
       to target performance, and the shifted predictions are scored on the
       test split of the target.

    Parameters
    ----------
    source_id : int
        Position of the source video in ``listVideo``.
    target_id : int
        Position of the target video in ``listVideo``.
    ratio_exploitation : float, optional
        Share of the training budget given to exploitation.
    l2s_tr_ratio : float, optional
        Share of the training split that L2S is allowed to sample.
    pct_test : float, optional
        Share of the configurations held out for testing.

    Returns
    -------
    numpy.float64
        ``mape`` between the target's test values and the shifted predictions.

    Raises
    ------
    ValueError
        If the stepwise regression keeps no feature.
    """
    # select_exploitation reads its budget from module scope rather than from
    # an argument of this call, so the value computed below is published there
    global nb_config_exploitation

    # to sample the source and the target using the same seed
    random_state = np.random.randint(0, 1000)

    source = listVideo[source_id]
    target = listVideo[target_id]

    # number of configurations used for the training:
    # a share of the rows (configurations), not of the columns
    config_tot = int(l2s_tr_ratio*(1-pct_test)*source.shape[0])

    # transform some variables into dummies, to fit the original paper;
    # all the dummy columns are kept (drop_first = False).
    # keep_features and to_dummy_features are the module-level lists.
    dummies = pd.get_dummies(source[keep_features],
                             drop_first = False,
                             columns=to_dummy_features)
    X_src = pd.DataFrame(np.array(dummies, dtype=int))

    # add the pairwise interactions
    poly = PolynomialFeatures(degree=2, interaction_only = True, include_bias = True)
    X_interact = pd.DataFrame(np.array(poly.fit_transform(X_src), int))

    # performance variable, to predict
    y_src = np.array(source[predDimension], dtype=float)

    # split train test (-> only the training rows are sampled by l2s)
    X_src_train, X_src_test, y_src_train, _ = train_test_split(X_interact,
                                                               y_src,
                                                               test_size=pct_test,
                                                               random_state=random_state)

    print("\n############### I- Stepwise regression #################\n")

    selected_features = stepwise_selection(X_src_train, y_src_train)
    if not selected_features:
        raise ValueError("stepwise_selection kept no feature, "
                         "the source model cannot be fitted")

    print("\n############### II- Sampling #################\n")

    reg = LinearRegression()
    reg.fit(X_src_train[selected_features], y_src_train)

    # features sorted by decreasing absolute coefficient
    sorted_coefs = pd.Series(np.abs(reg.coef_), selected_features).sort_values(ascending=False).index

    nb_config_exploitation = np.round(ratio_exploitation*config_tot)

    print("A- EXPLOITATION\n")

    exploitation_conf = select_exploitation(X_src_train, sorted_coefs, [])

    print("\nB- EXPLORATION\n")

    # only configurations of the training set are sampled, which removes the
    # threat of using configurations of the testing set during the training
    # NOTE(review): select_exploration reads the module-level `source`,
    # `ratio_exploitation` and `config_tot`, not the values local to this
    # call; confirm this is intended
    exploration_conf = select_exploration(X_src_train, exploitation_conf, target_id, 1000)

    # integer positions, even if one of the two parts is empty
    sampled_conf = np.concatenate((exploitation_conf, exploration_conf), axis=0).astype(int)

    print(sampled_conf)

    print("\n############### III- Transfer #################\n")

    X_selected = X_interact[selected_features]

    # test rows of the source: the split above, restricted to the kept features
    X_src_te = X_src_test[selected_features]

    # same seed and same number of rows as the split above,
    # hence the same test rows for the target
    _, _, _, y_tgt_te = train_test_split(X_selected,
                                         target[predDimension],
                                         test_size=pct_test,
                                         random_state=random_state)

    # instead of using all the configurations, only the sampled ones are used
    X_src_tr = X_selected.iloc[sampled_conf]
    y_src_tr = source[predDimension].iloc[sampled_conf]
    y_tgt_tr = target[predDimension].iloc[sampled_conf]

    lf = RandomForestRegressor()
    lf.fit(X_src_tr, y_src_tr)
    y_src_pred_te = np.array(lf.predict(X_src_te)).reshape(-1, 1)

    # the shift function, to transfer the prediction from the source to the target
    shift = RandomForestRegressor()
    shift.fit(np.array(y_src_tr).reshape(-1, 1), y_tgt_tr)
    y_tgt_pred_te = shift.predict(y_src_pred_te)

    # error between the real test values of the target and the shifted predictions
    return mape(y_tgt_te, y_tgt_pred_te)
    
    
# run the whole pipeline with video 1 as the source and video 2 as the target
l2s_transfer(source_id = 1, target_id = 2)


############### I- Stepwise regression #################



  return self.params / self.bse


Add                              0 with p-value 3.5793e-55


KeyboardInterrupt: 