In [1]:
# for arrays
import numpy as np

# for dataframes
import pandas as pd

# plots
import matplotlib.pyplot as plt
%matplotlib inline
# high-level plots
import seaborn as sns

import statsmodels.api as sm

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# machine learning library
# Principal Component Analysis - determine new axis for representing data
from sklearn.decomposition import PCA
# Random Forests -> vote between decision trees
# Gradient boosting -> instead of a vote, upgrade the same tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures
# Elasticnet is an hybrid method between ridge and Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
# To separate the data into training and test
from sklearn.model_selection import train_test_split
# Simple clustering (iterative steps)
from sklearn.cluster import KMeans

# we use it to interact with the file system
import os
# compute time
from time import time

# Neural network high level framework
import keras
# Sequential is a sequence of blocs
# Input deals with the data fed to the network
from keras.models import Sequential,Input,Model
# Dense is a feedforward layer with fully connected nodes
# Dropout allows to keep part of data, and to "drop out" a the rest
# Flatten makes the data "flat", i.e. in one dimension
from keras.layers import Dense, Dropout, Flatten
# Conv -> convolution, MaxPooling is relative to Pooling
# Activation if the function composing the data in output of a layer
from keras.layers import Conv2D, MaxPooling2D, Activation


from learner.mlearner import learn_with_interactions, learn_without_interactions, sample_random, stepwise_feature_selection
from learner.model import genModelTermsfromString, Model, genModelfromCoeff

Using TensorFlow backend.


In [12]:
class NT:
    
    def __init__(self):
        
        # the data folder, see the markdown there for additional explanations
        res_dir = "../../../data/ugc/res_ugc/"
        
        # the list of videos names, e.g. Animation_360P-3e40
        # we sort the list so we keep the same ids between two launches
        v_names = sorted(os.listdir(res_dir)) 

        # the list of measurements
        listVideo = []

        # we add each dataset in the list, converting the time to the right format
        # third line asserts that the measures are complete
        for v in sorted(v_names):
            data = pd.read_table(res_dir+v, delimiter = ',')
            listVideo.append(data)
        
        self.listVideo = listVideo
        
        self.predDimension = "kbs"
        
        # to sample the source and the target using the same seed
        self.random_state = np.random.randint(0,1000)
        
        self.features = ['cabac', '8x8dct', 'mbtree', 'rc_lookahead', 'analyse', 'me', 'subme', 'mixed_ref', 'me_range', 
                 'qpmax', 'aq-mode', 'trellis','fast_pskip', 'chroma_qp_offset', 'bframes', 'b_pyramid', 
                 'b_adapt', 'direct', 'ref', 'deblock', 'weightb', 'open_gop', 'weightp', 'scenecut']
        
        self.to_dummy_features = ['cabac', '8x8dct', 'mbtree', 'rc_lookahead', 'analyse', 'me', 'subme', 'mixed_ref', 'me_range', 
                 'qpmax', 'aq-mode', 'trellis','fast_pskip', 'chroma_qp_offset', 'bframes', 'b_pyramid', 
                 'b_adapt', 'direct', 'ref', 'deblock', 'weightb', 'open_gop', 'weightp', 'scenecut']
    
    
    def mape(self, y_true, y_pred):
        return np.mean(np.abs((y_true-y_pred)/y_true))
    
    def learn(self, target_id, pct_train):

        # random state , i.e. a seed to split the source and the target datasets
        # by using the same set of configurations for training and testing
        
        # We define the target video, and split it into train-test
        target = self.listVideo[target_id]
        #print(target.shape)
        dummies = pd.get_dummies(target[self.features], drop_first = False, columns=self.features)
        X_tgt = pd.DataFrame(np.array(dummies, dtype=int))
        #X_tgt = target[['cabac', 'mbtree','ref','subme']]
        y_tgt = np.array(target[self.predDimension], dtype=float)
        X_tgt_train, X_tgt_test, y_tgt_train, y_tgt_test = train_test_split(X_tgt, 
                                                                            y_tgt, 
                                                                            train_size=pct_train)
        lf = RandomForestRegressor(n_estimators=100)
        lf.fit(X_tgt_train, y_tgt_train)
        y_tgt_pred_test = np.array(lf.predict(X_tgt_test)).reshape(-1,1)
        #print(y_tgt_pred_test)

        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 
        return self.mape(y_tgt_test, y_tgt_pred_test)

In [13]:
nt = NT()

for pct_train in np.round(np.arange(0.1, 1, 0.1),2):
    print(pct_train, np.mean([nt.learn(target_id = 6, pct_train = pct_train) for i in range(5)]))

0.1 0.14749832459861906
0.2 0.17886076587994545
0.3 0.18425768743702525
0.4 0.18093377724380116
0.5 0.1732045768508626
0.6 0.1776159748556177
0.7 0.17588752935903565
0.8 0.17780601959250406
0.9 0.17775161673340825
