#### Model Shift (MS) is a transfer learning defined by Valov et al. 
First, it trains a performance model on the source input and predicts the performance distribution of the source input. 
Then, it trains a shifting function, predicting the performances of the target input based on the performances of the source. 
Finally, it applies the shifting function to the predictions of the source. 

In [31]:
# for arrays
import numpy as np

# for dataframes
import pandas as pd

# plots
import matplotlib.pyplot as plt
# high-level plots
import seaborn as sns

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

import statsmodels.api as sm

# machine learning library
# Principal Component Analysis - determine new axis for representing data
from sklearn.decomposition import PCA
# Random Forests -> vote between decision trees
# Gradient boosting -> instead of a vote, upgrade the same tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures
# Elasticnet is an hybrid method between ridge and Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
# To separate the data into training and test
from sklearn.model_selection import train_test_split, GridSearchCV
# Simple clustering (iterative steps)
from sklearn.cluster import KMeans
# Support vector machine - support vector regressor
from sklearn.svm import SVR
# decision trees
from sklearn.tree import DecisionTreeRegressor, plot_tree
# mean squared error
from sklearn.metrics import mean_squared_error

# gradient boosting trees
from xgboost import XGBRegressor

# we use it to interact with the file system
import os
# compute time
from time import time

# Neural network high level framework
import keras
# Sequential is a sequence of blocs
# Input deals with the data fed to the network
from keras.models import Sequential,Input,Model
# Dense is a feedforward layer with fully connected nodes
# Dropout allows to keep part of data, and to "drop out" a the rest
# Flatten makes the data "flat", i.e. in one dimension
from keras.layers import Dense, Dropout, Flatten
# Conv -> convolution, MaxPooling is relative to Pooling
# Activation if the function composing the data in output of a layer
from keras.layers import Conv2D, MaxPooling2D, Activation

In [59]:
class MS:
    
    def __init__(self):
        #self.pct_test = pct_test
        #self.ratio_exploitation = ratio_exploitation
        
        # the data folder, see the markdown there for additional explanations
        res_dir = "../../../data/ugc/res_ugc/"
        
        # the list of videos names, e.g. Animation_360P-3e40
        # we sort the list so we keep the same ids between two launches
        v_names = sorted(os.listdir(res_dir)) 

        self.predDimension = "kbs"
        
        # the list of measurements
        listVideo = []

        # we add each dataset in the list, converting the time to the right format
        # third line asserts that the measures are complete
        for v in v_names:
            data = pd.read_table(res_dir+v, delimiter = ',')
            inter = pd.get_dummies(data)
            inter[self.predDimension] = data[self.predDimension]
            listVideo.append(inter)
        
        self.listVideo = listVideo
        
        
        # to sample the source and the target using the same seed
        self.random_state = np.random.randint(0,1000)
        
        self.features = ['cabac', '8x8dct', 'mbtree', 'rc_lookahead', 'analyse', 'me', 'subme', 'mixed_ref', 'me_range', 
                 'qpmax', 'aq-mode', 'trellis','fast_pskip', 'chroma_qp_offset', 'bframes', 'b_pyramid', 
                 'b_adapt', 'direct', 'ref', 'deblock', 'weightb', 'open_gop', 'weightp', 'scenecut']
    
    def mse(self, y_true, y_pred):
        return np.mean((y_true-y_pred)**2)
    
    def learn(self, source_id, target_id, train_size, 
                    learning_algorithm = RandomForestRegressor, 
                    shift_function = RandomForestRegressor):
    
        # the percentage (proportion) of configurations used for the test
        # pct_test = 1-nb_config_target_training/len(listVideo[target_id].index)
        # print(pct_test)

        # random state , i.e. a seed to split the source and the target datasets
        # by using the same set of configurations for training and testing
        random_state = np.random.randint(0,1000)

        # We define the source video, and split it into train-test
        source = self.listVideo[source_id]
        X_src = source.drop([self.predDimension], axis = 1)
        y_src = np.array(source[self.predDimension], dtype=float)
        X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_src, 
                                                                            y_src, 
                                                                            train_size=train_size,
                                                                            random_state=random_state)
        #print(X_src_train.shape)
        # We define the target video, and split it into train-test
        target = self.listVideo[target_id]
        X_tgt = target.drop([self.predDimension], axis = 1)
        y_tgt = np.array(target[self.predDimension], dtype=float)
        X_tgt_train, X_tgt_test, y_tgt_train, y_tgt_test = train_test_split(X_tgt, 
                                                                            y_tgt, 
                                                                            train_size=train_size, 
                                                                            random_state=random_state)

        # The learning algorithm, training on the source video
        X_src_train2, _, y_src_train2, _ = train_test_split(X_src, y_src, 
                                                            test_size=0.7)
        
        lf = learning_algorithm()
        lf.fit(X_src_train2, y_src_train2)
        y_src_pred_test = np.array(lf.predict(X_src_test)).reshape(-1,1)

        # The shift function, to transfer the prediction from the source to the target
        shift = shift_function()
        shift.fit(np.array(y_src_train).reshape(-1,1), y_tgt_train)
        y_tgt_pred_test = shift.predict(y_src.reshape(-1,1))

        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 
        return self.mse(y_tgt_pred_test, y_tgt)
    
    def predict_conf(self, source_id, target_id, train_size, 
                    learning_algorithm = RandomForestRegressor, 
                    shift_function = RandomForestRegressor):
    
        # the percentage (proportion) of configurations used for the test
        # pct_test = 1-nb_config_target_training/len(listVideo[target_id].index)
        # print(pct_test)

        # random state , i.e. a seed to split the source and the target datasets
        # by using the same set of configurations for training and testing
        random_state = np.random.randint(0,1000)

        # We define the source video, and split it into train-test
        source = self.listVideo[source_id]
        X_src = source.drop([self.predDimension], axis = 1)
        y_src = np.array(source[self.predDimension], dtype=float)
        X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_src, 
                                                                            y_src, 
                                                                            train_size=train_size,
                                                                            random_state=random_state)
        #print(X_src_train.shape)
        # We define the target video, and split it into train-test
        target = self.listVideo[target_id]
        X_tgt = target.drop([self.predDimension], axis = 1)
        y_tgt = np.array(target[self.predDimension], dtype=float)
        X_tgt_train, X_tgt_test, y_tgt_train, y_tgt_test = train_test_split(X_tgt, 
                                                                            y_tgt, 
                                                                            train_size=train_size, 
                                                                            random_state=random_state)

        # The learning algorithm, training on the source video
        X_src_train2, _, y_src_train2, _ = train_test_split(X_src, y_src, 
                                                            test_size=0.7)
        
        lf = learning_algorithm()
        lf.fit(X_src_train2, y_src_train2)
        y_src_pred_test = np.array(lf.predict(X_src_test)).reshape(-1,1)

        # The shift function, to transfer the prediction from the source to the target
        shift = shift_function()
        shift.fit(np.array(y_src_train).reshape(-1,1), y_tgt_train)
        y_tgt_pred_test = shift.predict(y_src.reshape(-1,1))

        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 
        return np.argmin(y_tgt_pred_test)

In [60]:
ms = MS()

for ts in np.arange(5,31,5):
    print(pct_test, ms.learn(source_id = 2, target_id = 6, train_size=ts))

0.9 1752906.0207466183
0.9 1180212.96931652
0.9 1897579.8515903938
0.9 3533435.359147508
0.9 1167846.1500647946
0.9 980115.2062252816


#### Learning algorithm

In [61]:
LAs = [LinearRegression, DecisionTreeRegressor, RandomForestRegressor, XGBRegressor, SVR]
for i in range(5):
    source_id = np.random.randint(0,1000)
    target_id = np.random.randint(0,1000)
    for la in LAs:
        print(la, ms.learn(source_id = source_id, target_id = target_id, 
                           train_size=20, learning_algorithm=la))

<class 'sklearn.linear_model._base.LinearRegression'> 1402725.3941713842
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 1593177.9965469814
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 1771066.38976192
<class 'xgboost.sklearn.XGBRegressor'> 1062298.221570041
<class 'sklearn.svm._classes.SVR'> 1693598.6880435469
<class 'sklearn.linear_model._base.LinearRegression'> 477024.9797989845
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 433654.11879539635
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 416416.2724485392
<class 'xgboost.sklearn.XGBRegressor'> 532741.3921451636
<class 'sklearn.svm._classes.SVR'> 473014.2127069929
<class 'sklearn.linear_model._base.LinearRegression'> 11725.935165688516
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 5476.942739595304
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 5515.880780477623
<class 'xgboost.sklearn.XGBRegressor'> 12031.106470507546
<class 'sklearn.svm._classes.SVR'> 4631.240308183757
<class '

#### Chosen algorithm :  SVR (however it may depends on the choice of videos)

#### Shifting function

In [63]:
LAs = [LinearRegression, DecisionTreeRegressor, RandomForestRegressor, XGBRegressor, SVR]
for i in range(5):
    source_id = np.random.randint(0,1000)
    target_id = np.random.randint(0,1000)
    for la in LAs:
        print(la, ms.learn(source_id = source_id, target_id = target_id, 
                           train_size=20, shift_function=la))

<class 'sklearn.linear_model._base.LinearRegression'> 456217.51119152317
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 670142.3770900498
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 554727.6716164041
<class 'xgboost.sklearn.XGBRegressor'> 628226.4760653581
<class 'sklearn.svm._classes.SVR'> 4850385.808003911
<class 'sklearn.linear_model._base.LinearRegression'> 6398.924680545176
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 11170.511540796018
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 5605.233614417313
<class 'xgboost.sklearn.XGBRegressor'> 16088.985135634259
<class 'sklearn.svm._classes.SVR'> 8618.201187216728
<class 'sklearn.linear_model._base.LinearRegression'> 9593810.98738373
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 10670635.162215425
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 4684757.708281321
<class 'xgboost.sklearn.XGBRegressor'> 3545923.388881826
<class 'sklearn.svm._classes.SVR'> 24912526.982189137
<class 's

#### Chosen algorithm  for shifting function:  RandomForestRegressor (however it may depends on the choice of videos)

We predict the configurations for each video of the test set, for 5 configs, 10 configs, ..., 30 configs in the training set.

In [64]:
# the data folder, see the markdown there for additional explanations
res_dir = "../../../data/ugc/res_ugc/"

# the list of videos names, e.g. Animation_360P-3e40
# we sort the list so we keep the same ids between two launches
v_names = sorted(os.listdir(res_dir)) 

v_names_train = np.loadtxt("../../../results/raw_data/train_names.csv", dtype= str)
v_names_test = np.loadtxt("../../../results/raw_data/test_names.csv", dtype= str)
index_train = [i for i in range(len(v_names)) if v_names[i] in v_names_train]
index_test = [i for i in range(len(v_names)) if v_names[i] in v_names_test]

train_sizes = np.arange(5,31,5)

In [65]:
ms = MS()
ms_confs = dict()
for i in range(len(index_test)):
    it = index_test[i]
    source_index_train = np.random.randint(0, len(v_names_train))
    source_id = index_train[source_index_train]
    for ts in train_sizes:
        ms_confs[(i, ts)] = ms.predict_conf(source_id = source_id, target_id = it, train_size=ts,
                                      learning_algorithm = SVR, shift_function = RandomForestRegressor)

In [66]:
ms_confs

{(0, 5): 8,
 (0, 10): 8,
 (0, 15): 8,
 (0, 20): 67,
 (0, 25): 177,
 (0, 30): 104,
 (1, 5): 8,
 (1, 10): 41,
 (1, 15): 123,
 (1, 20): 168,
 (1, 25): 168,
 (1, 30): 168,
 (2, 5): 4,
 (2, 10): 106,
 (2, 15): 4,
 (2, 20): 32,
 (2, 25): 32,
 (2, 30): 4,
 (3, 5): 1,
 (3, 10): 60,
 (3, 15): 19,
 (3, 20): 104,
 (3, 25): 4,
 (3, 30): 91,
 (4, 5): 0,
 (4, 10): 99,
 (4, 15): 2,
 (4, 20): 6,
 (4, 25): 6,
 (4, 30): 10,
 (5, 5): 2,
 (5, 10): 2,
 (5, 15): 2,
 (5, 20): 9,
 (5, 25): 100,
 (5, 30): 20,
 (6, 5): 1,
 (6, 10): 100,
 (6, 15): 91,
 (6, 20): 105,
 (6, 25): 1,
 (6, 30): 80,
 (7, 5): 0,
 (7, 10): 62,
 (7, 15): 1,
 (7, 20): 2,
 (7, 25): 16,
 (7, 30): 31,
 (8, 5): 1,
 (8, 10): 198,
 (8, 15): 123,
 (8, 20): 123,
 (8, 25): 168,
 (8, 30): 21,
 (9, 5): 1,
 (9, 10): 1,
 (9, 15): 9,
 (9, 20): 73,
 (9, 25): 8,
 (9, 30): 100,
 (10, 5): 5,
 (10, 10): 12,
 (10, 15): 192,
 (10, 20): 4,
 (10, 25): 46,
 (10, 30): 85,
 (11, 5): 7,
 (11, 10): 89,
 (11, 15): 123,
 (11, 20): 179,
 (11, 25): 153,
 (11, 30): 89,
 (

In [70]:
ms_data = pd.DataFrame({"id_video" : [i for i in range(len(index_test))]})
for ts in train_sizes:
    ms_data["conf"+str(ts)] = [ms_confs[(i, ts)] for i in range(len(index_test))]

In [73]:
# ms_data.set_index("id_video").to_csv("../../../results/raw_data/MS_results.csv")