In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from scipy import stats
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
%matplotlib inline

In [3]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        Square root of ``sklearn.metrics.mean_squared_error(y_true, y_pred)``.
    """
    mean_sq_err = metrics.mean_squared_error(y_true, y_pred)
    return np.sqrt(mean_sq_err)

def lcc(y_true, y_pred):
    """Return the Pearson linear correlation coefficient of two sequences.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        The correlation statistic from ``scipy.stats.pearsonr``; the
        accompanying p-value is discarded.
    """
    # pearsonr returns (statistic, p-value); only the statistic is needed.
    return stats.pearsonr(y_true, y_pred)[0]

def srocc(y_true, y_pred):
    """Return the Spearman rank-order correlation coefficient of two sequences.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        The correlation statistic from ``scipy.stats.spearmanr``; the
        accompanying p-value is discarded.
    """
    # spearmanr returns (statistic, p-value); only the statistic is needed.
    return stats.spearmanr(y_true, y_pred)[0]

In [4]:
def accuracy(y_true, y_pred):
    """Return one minus the mean absolute relative error.

    The relative error of each sample is ``|y_true - y_pred| / |y_true|``.
    The magnitude of ``y_true`` is used in the denominator so that negative
    targets contribute a positive error instead of cancelling out positive
    ones in the mean.

    Parameters
    ----------
    y_true : numpy.ndarray or pandas.Series
        Ground-truth target values. A zero target yields an infinite (or NaN)
        relative error; no special handling is applied.
    y_pred : numpy.ndarray or pandas.Series
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``1 - mean(relative_error)``; 1.0 for a perfect prediction.
    """
    relative_error = np.abs(y_true - y_pred) / np.abs(y_true)
    return 1 - np.mean(relative_error)

In [5]:
# Multi-metric scoring table handed to GridSearchCV.
# RMSE is an error (lower is better), so it is registered with
# greater_is_better=False; the two correlation metrics are higher-is-better.
scorer = {
    'rmse': metrics.make_scorer(rmse, greater_is_better=False),
    'lcc': metrics.make_scorer(lcc, greater_is_better=True),
    'srocc': metrics.make_scorer(srocc, greater_is_better=True),
}
scorer

{'rmse': make_scorer(rmse, greater_is_better=False),
 'lcc': make_scorer(lcc),
 'srocc': make_scorer(srocc)}

In [6]:
# Load the precomputed feature table and the matching target values.
# NOTE(review): unpickling executes arbitrary code -- only load these files
# from a trusted source.
features, labels = (
    pd.read_pickle('features.pkl'),
    pd.read_pickle('labels.pkl'),
)

In [7]:
# Inspect the row labels of `labels` (the captured output shows video file names).
labels.index

Index(['G10BoatInPark_ERP_4096x2048_fps30_qp27_14547k.mp4',
       'G10BoatInPark_ERP_4096x2048_fps30_qp37_3270k.mp4',
       'G10BoatInPark_ERP_4096x2048_fps30_qp42_1507k.mp4',
       'G10BodybuildingWorkout_ERP_7680x3840_fps29.97_qp27_6105k.mp4',
       'G10BodybuildingWorkout_ERP_7680x3840_fps29.97_qp37_913k.mp4',
       'G10BodybuildingWorkout_ERP_7680x3840_fps29.97_qp42_697k.mp4',
       'G10BuddhaCave_ERP_4096x2048_fps30_qp27_1289k.mp4',
       'G10BuddhaCave_ERP_4096x2048_fps30_qp37_236k.mp4',
       'G10BuddhaCave_ERP_4096x2048_fps30_qp42_170k.mp4',
       'G10DrivingInCountry_ERP_3840x1920_fps30_qp27_27706k.mp4',
       ...
       'G9DivingWithJellyfish_ERP_7680x3840_fps25_qp42_876k.mp4',
       'G9DrivingInCity_ERP_3840x1920_fps30_qp27_11315k.mp4',
       'G9DrivingInCity_ERP_3840x1920_fps30_qp37_2350k.mp4',
       'G9DrivingInCity_ERP_3840x1920_fps30_qp42_1069k.mp4',
       'G9FootballMatch_ERP_4096x2048_fps30_qp27_1529k.mp4',
       'G9FootballMatch_ERP_4096x2048_fps30_qp37

In [8]:
# Preview the first few target values (the captured output names the series "DMOS").
labels.head()

G10BoatInPark_ERP_4096x2048_fps30_qp27_14547k.mp4               34.140535
G10BoatInPark_ERP_4096x2048_fps30_qp37_3270k.mp4                41.399333
G10BoatInPark_ERP_4096x2048_fps30_qp42_1507k.mp4                54.386335
G10BodybuildingWorkout_ERP_7680x3840_fps29.97_qp27_6105k.mp4    35.192198
G10BodybuildingWorkout_ERP_7680x3840_fps29.97_qp37_913k.mp4     56.533862
Name: DMOS, dtype: float64

In [9]:
# Build one group id per block of three consecutive rows so that
# GroupShuffleSplit never puts rows from the same block in both train and test.
# NOTE(review): this assumes the rows are ordered so that each consecutive
# triplet is the same source video at three QP levels (as the label index
# printed above suggests) -- confirm against how features.pkl was built.
#
# Integer floor-division yields exactly one label per row, with the same
# ordering as before ([0, 0, 0, 1, 1, 1, ...]) and the same (n_rows, 1) column
# shape, without relying on the un-imported `numpy.matlib` submodule.
VERSIONS_PER_VIDEO = 3
group_label = (np.arange(len(features.index)) // VERSIONS_PER_VIDEO).reshape(-1, 1)

In [10]:
# Base estimator for the grid search; fixed seed, single worker.
Reg_video = RandomForestRegressor(
    n_jobs=1,
    random_state=8,
)

In [11]:
def CV_Generator(features, labels, group_label, n=8, test_ratio=0.2):
    """Yield (train, test) index pairs from a seeded ``GroupShuffleSplit``.

    Parameters
    ----------
    features : array-like
        Feature matrix passed to the splitter as ``X``.
    labels : array-like
        Targets passed to the splitter as ``y``.
    group_label : array-like
        Group id per sample; samples sharing an id stay on the same side of
        every split.
    n : int, default 8
        Number of shuffle-split iterations.
    test_ratio : float, default 0.2
        Value forwarded as ``test_size`` to the splitter.

    Yields
    ------
    tuple
        ``(train, test)`` as produced by ``GroupShuffleSplit.split``.
    """
    splitter = GroupShuffleSplit(n_splits=n, test_size=test_ratio, random_state=8)
    yield from splitter.split(features, labels, groups=group_label)

In [12]:
# Hyper-parameter grid for the random-forest grid search (filled in below).
parameters_grid_GCV_3MET = dict()

In [13]:
# Only the number of trees is actually searched; every other entry is a
# single-value list that pins the setting.
# NOTE(review): 'mse' and 'auto' are spellings accepted by older scikit-learn
# releases -- revisit them if the environment is upgraded.
parameters_grid_GCV_3MET.update({
    'n_estimators': [411, 729],
    'criterion': ['mse'],
    'max_depth': [8],
    'min_samples_split': [3],
    'min_samples_leaf': [3],
    'max_features': ['auto'],
    'bootstrap': [True],
    'verbose': [0],
    'oob_score': [True],
})

In [14]:
# Display the assembled hyper-parameter grid for a visual check.
parameters_grid_GCV_3MET

{'n_estimators': [411, 729],
 'criterion': ['mse'],
 'max_depth': [8],
 'min_samples_split': [3],
 'min_samples_leaf': [3],
 'max_features': ['auto'],
 'bootstrap': [True],
 'verbose': [0],
 'oob_score': [True]}

In [17]:
# Grid search over the random-forest grid, scored with RMSE / LCC / SROCC and
# refitted on the candidate with the best RMSE.
parameters_grid_search_GCV_3MET = GridSearchCV(
    estimator=Reg_video,
    param_grid=parameters_grid_GCV_3MET,
    # Materialise the splits: a bare generator is exhausted after the first
    # .fit(), so re-running the fit cell without re-creating this object would
    # find no folds. A list of (train, test) pairs can be reused.
    cv=list(CV_Generator(features, labels, group_label)),
    n_jobs=1,
    verbose=1,
    return_train_score=True,
    error_score=np.nan,
    scoring=scorer,
    refit='rmse',
    # NOTE(review): `iid` was deprecated and later removed from scikit-learn;
    # it is kept to match the version this notebook was executed with.
    iid=False,
)

In [18]:
# Run the grid search on the full feature table and targets.
parameters_grid_search_GCV_3MET.fit(X=features, y=labels)

Fitting 8 folds for each of 2 candidates, totalling 16 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:   15.6s finished


GridSearchCV(cv=<generator object CV_Generator at 0x1096cd938>,
       error_score=nan,
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
           oob_score=False, random_state=8, verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=1,
       param_grid={'n_estimators': [411, 729], 'criterion': ['mse'], 'max_depth': [8], 'min_samples_split': [3], 'min_samples_leaf': [3], 'max_features': ['auto'], 'bootstrap': [True], 'verbose': [0], 'oob_score': [True]},
       pre_dispatch='2*n_jobs', refit='rmse', return_train_score=True,
       scoring={'rmse': make_scorer(rmse, greater_is_better=False), 'lcc': make_scorer(lcc), 'srocc': make_scorer(srocc)},
       verbose=1)

In [None]:
# Show the parameter combination selected by the refit metric ('rmse').
parameters_grid_search_GCV_3MET.best_params_

In [None]:
# Re-evaluate the selected configuration on 16 fresh group-aware splits and
# keep the model trained on one of them for the feature-importance analysis.
#
# Each fold fits a *clone* of the best estimator. Calling .fit() directly on
# `best_estimator_` returns that same object, so the previously "saved" model
# was silently refitted on every later fold and the grid search's own refit
# model was overwritten as well.
SAVED_FOLD = 5  # index of the fold whose fitted model is kept as `final_model`
for idx, (train_idx, test_idx) in enumerate(
        CV_Generator(features, labels, group_label, 16, test_ratio=0.2)):
    train_features = features.iloc[train_idx]
    train_labels = labels.iloc[train_idx]
    test_features = features.iloc[test_idx]
    test_labels = labels.iloc[test_idx]

    # Same hyper-parameters and random_state, but an independent estimator.
    best_model = clone(parameters_grid_search_GCV_3MET.best_estimator_)
    best_model.fit(train_features, train_labels)

    train_pred = best_model.predict(train_features)
    test_pred = best_model.predict(test_features)

    if idx == SAVED_FOLD:
        # Only a reference is kept in memory; nothing is written to disk here.
        final_model = best_model
        print('Model saved')

    # Each line prints the train metric followed by the test metric.
    print('RMSE:', rmse(train_labels, train_pred), rmse(test_labels, test_pred))
    print('LCC:', lcc(train_labels, train_pred), lcc(test_labels, test_pred))
    print('SROCC:', srocc(train_labels, train_pred), srocc(test_labels, test_pred))
    #print('Accuracy:', accuracy(train_labels, train_pred), accuracy(test_labels, test_pred))
    print('\n')

In [None]:
# One-column table of the kept model's feature importances, indexed by feature name.
feature_importance = pd.DataFrame(
    {'Weight': final_model.feature_importances_},
    index=features.columns,
)

In [None]:
# Order features from most to least important.
feature_importance = feature_importance.sort_values('Weight', ascending=False)

In [None]:
# Display the importance table (sorted in descending order by the previous cell).
feature_importance

In [None]:
# Bar chart of feature importances, in the order of the importance table.
fig, ax = plt.subplots(figsize=(24, 9))
ax.bar(feature_importance.index, feature_importance['Weight'])
ax.set_xlabel('Feature')
ax.set_ylabel('Weight')
# tight_layout must be *called*; the bare attribute reference did nothing
# except echo the function object as the cell output.
fig.tight_layout()

In [None]:
# Subset of feature columns selected for augmentation.
augment_columns = ['msssim', 'TA', 'psnrhvsm']
features_augment = features[augment_columns]