#### No Transfer (NT) is a Simple Learning approach, acting as a control approach to state whether transfer learning is suited to solve this problem. 
It trains a performance model directly on the target input, without using any source. 
We expect to outperform No Transfer with transfer learning approaches. 

In [1]:
# for arrays
import numpy as np

# for dataframes
import pandas as pd

# plots
import matplotlib.pyplot as plt
# high-level plots
import seaborn as sns

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# machine learning library
# Principal Component Analysis - determine new axis for representing data
from sklearn.decomposition import PCA
# Random Forests -> vote between decision trees
# Gradient boosting -> instead of a vote, upgrade the same tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures
# Elasticnet is an hybrid method between ridge and Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
# To separate the data into training and test
from sklearn.model_selection import train_test_split, GridSearchCV
# Simple clustering (iterative steps)
from sklearn.cluster import KMeans
# Support vector machine - support vector regressor
from sklearn.svm import SVR
# decision trees
from sklearn.tree import DecisionTreeRegressor, plot_tree
# mean squared error
from sklearn.metrics import mean_squared_error

# gradient boosting trees
from xgboost import XGBRegressor

# we use it to interact with the file system
import os
# compute time
from time import time

# Neural network high level framework
import keras
# Sequential is a sequence of blocs
# Input deals with the data fed to the network
from keras.models import Sequential,Input,Model
# Dense is a feedforward layer with fully connected nodes
# Dropout allows to keep part of data, and to "drop out" a the rest
# Flatten makes the data "flat", i.e. in one dimension
from keras.layers import Dense, Dropout, Flatten
# Conv -> convolution, MaxPooling is relative to Pooling
# Activation if the function composing the data in output of a layer
from keras.layers import Conv2D, MaxPooling2D, Activation

Using TensorFlow backend.


In [28]:
predDimension = "size"

def elapsedtime_to_sec(el):
    tab = el.split(":")
    return float(tab[0])*60+float(tab[1])

In [29]:
class NT:
    
    def __init__(self):
        #self.pct_test = pct_test
        #self.ratio_exploitation = ratio_exploitation
        
        # the data folder, see the markdown there for additional explanations
        res_dir = "../../../data/ugc/res_ugc/"
        
        # the list of videos names, e.g. Animation_360P-3e40
        # we sort the list so we keep the same ids between two launches
        v_names = sorted(os.listdir(res_dir)) 

        self.predDimension = predDimension
        
        # the list of measurements
        listVideo = []

        # we add each dataset in the list, converting the time to the right format
        # third line asserts that the measures are complete
        for v in v_names:
            data = pd.read_table(res_dir+v, delimiter = ',')
            data['etime'] = [*map(elapsedtime_to_sec, data['elapsedtime'])]
            inter = pd.get_dummies(data)
            inter[self.predDimension] = data[self.predDimension]
            listVideo.append(inter)
        
        self.listVideo = listVideo
        
        
        # to sample the source and the target using the same seed
        self.random_state = np.random.randint(0,1000)
        
        self.features = ['cabac', '8x8dct', 'mbtree', 'rc_lookahead', 'analyse', 'me', 'subme', 'mixed_ref', 'me_range', 
                 'qpmax', 'aq-mode', 'trellis','fast_pskip', 'chroma_qp_offset', 'bframes', 'b_pyramid', 
                 'b_adapt', 'direct', 'ref', 'deblock', 'weightb', 'open_gop', 'weightp', 'scenecut']
    
    def mse(self, y_true, y_pred):
        return np.mean((y_true-y_pred)**2)
    
    def learn(self, target_id, train_size, 
                    learning_algorithm = RandomForestRegressor):
        
        # random state , i.e. a seed to split the source and the target datasets
        # by using the same set of configurations for training and testing
        random_state = np.random.randint(0,1000)
        
        #print(X_src_train.shape)
        # We define the target video, and split it into train-test
        target = self.listVideo[target_id]
        X_tgt = target.drop([self.predDimension], axis = 1)
        y_tgt = np.array(target[self.predDimension], dtype=float)
        X_tgt_train, X_tgt_test, y_tgt_train, y_tgt_test = train_test_split(X_tgt, 
                                                                            y_tgt, 
                                                                            train_size=train_size, 
                                                                            random_state=random_state)
        
        lf = learning_algorithm()
        lf.fit(X_tgt_train, y_tgt_train)
        y_tgt_pred_test = np.array(lf.predict(X_tgt)).reshape(-1,1)

        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 
        return self.mse(y_tgt_pred_test, y_tgt)
    
    def predict_conf(self, target_id, train_size, 
                    learning_algorithm = RandomForestRegressor):
    
        # the percentage (proportion) of configurations used for the test
        # pct_test = 1-nb_config_target_training/len(listVideo[target_id].index)
        # print(pct_test)

        # random state , i.e. a seed to split the source and the target datasets
        # by using the same set of configurations for training and testing
        random_state = np.random.randint(0,1000)
        
        #print(X_src_train.shape)
        # We define the target video, and split it into train-test
        target = self.listVideo[target_id]
        X_tgt = target.drop([self.predDimension], axis = 1)
        y_tgt = np.array(target[self.predDimension], dtype=float)
        X_tgt_train, X_tgt_test, y_tgt_train, y_tgt_test = train_test_split(X_tgt, 
                                                                            y_tgt, 
                                                                            train_size=train_size, 
                                                                            random_state=random_state)
        
        lf = learning_algorithm()
        lf.fit(X_tgt_train, y_tgt_train)
        y_tgt_pred_test = np.array(lf.predict(X_tgt)).reshape(-1,1)

        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 

        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 
        return np.argmin(y_tgt_pred_test)

In [30]:
nt = NT()

#for ts in np.arange(5,31,5):
#    print(ts, nt.learn(target_id = 6, train_size=ts))

#### Learning algorithm

In [9]:
LAs = [LinearRegression, DecisionTreeRegressor, RandomForestRegressor, XGBRegressor, SVR]
for i in range(5):
    target_id = np.random.randint(0,1000)
    for la in LAs:
        print(la, nt.learn(target_id = target_id, 
                           train_size=20, learning_algorithm=la))

<class 'sklearn.linear_model._base.LinearRegression'> 90636.724876731
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 96131.63230777207
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 68576.87464658367
<class 'xgboost.sklearn.XGBRegressor'> 100141.08316194612
<class 'sklearn.svm._classes.SVR'> 50436.971072850356
<class 'sklearn.linear_model._base.LinearRegression'> 122000.82175214507
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 89863.02603694216
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 111890.67569140258
<class 'xgboost.sklearn.XGBRegressor'> 126003.63378314463
<class 'sklearn.svm._classes.SVR'> 67815.88225900207
<class 'sklearn.linear_model._base.LinearRegression'> 58610860.38614221
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 59860116.173036136
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 39829093.04593233
<class 'xgboost.sklearn.XGBRegressor'> 68805679.42634857
<class 'sklearn.svm._classes.SVR'> 33061617.61326263
<class 's

#### Chosen algorithm :  DecisionTreeRegressor (however it may depends on the choice of videos)

We predict the configurations for each video of the test set, for 5 configs, 10 configs, ..., 30 configs in the training set.

In [31]:
# the data folder, see the markdown there for additional explanations
res_dir = "../../../data/ugc/res_ugc/"

# the list of videos names, e.g. Animation_360P-3e40
# we sort the list so we keep the same ids between two launches
v_names = sorted(os.listdir(res_dir)) 

v_names_train = np.loadtxt("../../../results/raw_data/train_names.csv", dtype= str)
v_names_test = np.loadtxt("../../../results/raw_data/test_names.csv", dtype= str)
index_train = [i for i in range(len(v_names)) if v_names[i] in v_names_train]
index_test = [i for i in range(len(v_names)) if v_names[i] in v_names_test]

train_sizes = np.arange(5,31,5)

In [32]:
nt_confs = dict()
for i in range(len(index_test)):
    it = index_test[i]
    for ts in train_sizes:
        nt_confs[(i, ts)] = nt.predict_conf(target_id = it, train_size=ts,
                                      learning_algorithm = DecisionTreeRegressor)

In [33]:
nt_confs

{(0, 5): 69,
 (0, 10): 21,
 (0, 15): 89,
 (0, 20): 123,
 (0, 25): 123,
 (0, 30): 89,
 (1, 5): 1,
 (1, 10): 25,
 (1, 15): 65,
 (1, 20): 85,
 (1, 25): 89,
 (1, 30): 89,
 (2, 5): 1,
 (2, 10): 1,
 (2, 15): 38,
 (2, 20): 32,
 (2, 25): 36,
 (2, 30): 130,
 (3, 5): 20,
 (3, 10): 57,
 (3, 15): 25,
 (3, 20): 4,
 (3, 25): 160,
 (3, 30): 5,
 (4, 5): 1,
 (4, 10): 1,
 (4, 15): 83,
 (4, 20): 16,
 (4, 25): 112,
 (4, 30): 79,
 (5, 5): 1,
 (5, 10): 1,
 (5, 15): 100,
 (5, 20): 100,
 (5, 25): 94,
 (5, 30): 85,
 (6, 5): 80,
 (6, 10): 8,
 (6, 15): 3,
 (6, 20): 123,
 (6, 25): 30,
 (6, 30): 123,
 (7, 5): 16,
 (7, 10): 39,
 (7, 15): 100,
 (7, 20): 45,
 (7, 25): 2,
 (7, 30): 42,
 (8, 5): 19,
 (8, 10): 20,
 (8, 15): 13,
 (8, 20): 25,
 (8, 25): 85,
 (8, 30): 177,
 (9, 5): 1,
 (9, 10): 71,
 (9, 15): 123,
 (9, 20): 123,
 (9, 25): 100,
 (9, 30): 62,
 (10, 5): 1,
 (10, 10): 8,
 (10, 15): 5,
 (10, 20): 4,
 (10, 25): 123,
 (10, 30): 38,
 (11, 5): 1,
 (11, 10): 123,
 (11, 15): 123,
 (11, 20): 165,
 (11, 25): 14,
 (11, 3

In [34]:
nt_data = pd.DataFrame({"id_video" : [i for i in range(len(index_test))]})
for ts in train_sizes:
    nt_data["conf"+str(ts)] = [nt_confs[(i, ts)] for i in range(len(index_test))]

In [35]:
nt_data.set_index("id_video").to_csv("../../../results/raw_data/"+predDimension+"/NT_results.csv")