In [1]:
import torch
import joblib
from tqdm import tqdm
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr, pearsonr

In [18]:
x_train = torch.load('../data/X_tensor_APSIPA.pt')
y_train = torch.load('../data/y_tensor_APSIPA.pt')
ref_names = joblib.load('../data/ref_names_APSIPA.pkl')
codec_names = joblib.load('../data/codecs_APSIPA.pkl')

In [19]:
X_train = []
for x in x_train:
    vector = [v.detach().numpy() for v in x]
    X_train.append(vector)

In [20]:
refs = list(set(ref_names))
codecs = list(set(codec_names))

In [5]:
refs

['romanoillamp_vox10',
 'loot_vox10_1200',
 'head_00039_vox9',
 'the20smaria_00600_vox10',
 'soldier_vox10_0690',
 'amphoriskos_vox10',
 'longdress_vox10_1300',
 'biplane_vox10']

In [21]:
codecs

['TEXT', 'TRISOUP_PREDLIFT', 'OCTREE_PRED_LIFT', 'TRISOUP_RAHT', 'OCTREE_RAHT']

In [22]:
# the key is the reference that is excluded from the group
groups_by_ref = {}
for ref in refs:
    xtrain, ytrain = [], []
    xtest, ytest = [], []
    for i, ref_name in enumerate(ref_names):
        if ref_name == ref:
            xtest.append(X_train[i])
            ytest.append(y_train[i])
            continue
        xtrain.append(X_train[i])
        ytrain.append(y_train[i])
    groups_by_ref[ref] = [xtrain, ytrain, xtest, ytest]

In [30]:
# the key is the reference that is excluded from the group
groups_by_codec = {}
for codec in codecs:
    xtrain, ytrain = [], []
    xtest, ytest = [], []
    for i, codec_name in enumerate(codec_names):
        if codec_name == codec:
            xtest.append(X_train[i])
            ytest.append(y_train[i])
            continue
        xtrain.append(X_train[i])
        ytrain.append(y_train[i])
    groups_by_codec[codec] = [xtrain, ytrain, xtest, ytest]

In [24]:
def get_etr_model():
    return ExtraTreesRegressor(
    n_estimators=37,
    min_samples_split=15,
    min_samples_leaf=4,
    max_features='log2',  # type: ignore
    max_depth=7
)

In [25]:
def get_svr_model():
    return SVR(
        kernel='rbf',
        gamma=1,  # type: ignore
        epsilon=0.01,
        degree=2,
        C=5
    )

In [26]:
def get_lgbm_model():
    return lgb.LGBMRegressor(
        subsample_for_bin=140000,
        reg_lambda=0.1,
        reg_alpha=1.0,
        num_leaves=100,
        n_estimators=166,
        min_split_gain=1,
        min_child_weight=0.0001,
        min_child_samples=20,
        learning_rate=0.1,
        colsample_bytree=1.0,
        boosting_type='dart'
    )

In [27]:
models = ['lgbm', 'svr', 'etr']

In [28]:
results_by_ref = []
for ref_out, xy in tqdm(groups_by_ref.items()):
    result = {'group_out': ref_out}
    xtrain, ytrain = xy[0], xy[1]
    xtest, ytest = xy[2], xy[3]
    for model_name in models:
        if model_name == 'lgbm':
            model = get_lgbm_model()
        if model_name == 'svr':
            model = get_svr_model()
        if model_name == 'etr':
            model = get_etr_model()
        model.fit(xtrain, ytrain)
        ypred = model.predict(xtest)
        result[f'{model_name}-pearson'] = pearsonr(ytest, ypred)[0]
        result[f'{model_name}-spearman'] = spearmanr(ytest, ypred)[0]
        result[f'{model_name}-mse'] = mean_squared_error(ytest, ypred)
    results_by_ref.append(result)

100%|██████████| 8/8 [00:01<00:00,  4.15it/s]


In [31]:
results_by_codec = []
for codec_out, xy in tqdm(groups_by_codec.items()):
    result = {'group_out': codec_out}
    xtrain, ytrain = xy[0], xy[1]
    xtest, ytest = xy[2], xy[3]
    for model_name in models:
        if model_name == 'lgbm':
            model = get_lgbm_model()
        if model_name == 'svr':
            model = get_svr_model()
        if model_name == 'etr':
            model = get_etr_model()
        model.fit(xtrain, ytrain)
        ypred = model.predict(xtest)
        result[f'{model_name}-pearson'] = pearsonr(ytest, ypred)[0]
        result[f'{model_name}-spearman'] = spearmanr(ytest, ypred)[0]
        result[f'{model_name}-mse'] = mean_squared_error(ytest, ypred)
    results_by_codec.append(result)

100%|██████████| 5/5 [00:01<00:00,  3.30it/s]


In [37]:
df_results_by_ref = pd.DataFrame(results_by_ref)
df_results_by_codec = pd.DataFrame(results_by_codec)

In [38]:
to_concat_by_ref = {
    'group_out': 'mean',
    'lgbm-pearson': df_results_by_ref.loc[:, 'lgbm-pearson'].mean(),
    'lgbm-spearman': df_results_by_ref.loc[:, 'lgbm-spearman'].mean(),
    'lgbm-mse': df_results_by_ref.loc[:, 'lgbm-mse'].mean(),
    'svr-pearson': df_results_by_ref.loc[:, 'svr-pearson'].mean(),
    'svr-spearman': df_results_by_ref.loc[:, 'svr-spearman'].mean(),
    'svr-mse': df_results_by_ref.loc[:, 'svr-mse'].mean(),
    'etr-pearson': df_results_by_ref.loc[:, 'etr-pearson'].mean(),
    'etr-spearman': df_results_by_ref.loc[:, 'etr-spearman'].mean(),
    'etr-mse': df_results_by_ref.loc[:, 'etr-mse'].mean()
}

In [39]:
to_concat_by_codec = {
    'group_out': 'mean',
    'lgbm-pearson': df_results_by_codec.loc[:, 'lgbm-pearson'].mean(),
    'lgbm-spearman': df_results_by_codec.loc[:, 'lgbm-spearman'].mean(),
    'lgbm-mse': df_results_by_codec.loc[:, 'lgbm-mse'].mean(),
    'svr-pearson': df_results_by_codec.loc[:, 'svr-pearson'].mean(),
    'svr-spearman': df_results_by_codec.loc[:, 'svr-spearman'].mean(),
    'svr-mse': df_results_by_codec.loc[:, 'svr-mse'].mean(),
    'etr-pearson': df_results_by_codec.loc[:, 'etr-pearson'].mean(),
    'etr-spearman': df_results_by_codec.loc[:, 'etr-spearman'].mean(),
    'etr-mse': df_results_by_codec.loc[:, 'etr-mse'].mean()
}

In [40]:
df_conc_by_ref = pd.DataFrame([to_concat_by_ref])
df_results_by_ref = pd.concat([df_results_by_ref, df_conc_by_ref])

In [41]:
df_conc_by_codec = pd.DataFrame([to_concat_by_codec])
df_results_by_codec = pd.concat([df_results_by_codec, df_conc_by_codec])

In [36]:
df_results_by_ref

Unnamed: 0,group_out,lgbm-pearson,lgbm-spearman,lgbm-mse,svr-pearson,svr-spearman,svr-mse,etr-pearson,etr-spearman,etr-mse
0,romanoillamp_vox10,0.954732,0.964774,0.65014,0.943044,0.94968,0.457908,0.943577,0.9482,0.429481
1,loot_vox10_1200,0.938353,0.972065,0.194798,0.940321,0.972716,0.307531,0.93576,0.971234,0.353658
2,head_00039_vox9,0.954444,0.973553,0.693719,0.958956,0.972627,0.486603,0.946296,0.981012,0.512762
3,the20smaria_00600_vox10,0.95141,0.97028,0.153863,0.954043,0.977645,0.234751,0.957994,0.972704,0.240833
4,soldier_vox10_0690,0.956258,0.973613,0.144662,0.950766,0.973577,0.22772,0.953819,0.975553,0.24819
5,amphoriskos_vox10,0.90541,0.941721,0.317693,0.924268,0.952076,0.282869,0.918189,0.95109,0.320457
6,longdress_vox10_1300,0.886687,0.961844,0.467367,0.933535,0.974565,0.40905,0.919799,0.972215,0.42916
7,biplane_vox10,0.949477,0.981732,0.951615,0.939075,0.965874,0.696461,0.943555,0.977701,0.652686
0,mean,0.937096,0.967448,0.446732,0.943001,0.967345,0.387862,0.939873,0.968714,0.398403


In [42]:
df_results_by_codec

Unnamed: 0,group_out,lgbm-pearson,lgbm-spearman,lgbm-mse,svr-pearson,svr-spearman,svr-mse,etr-pearson,etr-spearman,etr-mse
0,TEXT,0.758858,0.778513,0.747033,0.746872,0.748968,0.534426,0.727012,0.742586,0.597294
1,TRISOUP_PREDLIFT,0.890865,0.923112,0.367507,0.917649,0.924389,0.334415,0.911918,0.917246,0.363575
2,OCTREE_PRED_LIFT,0.967148,0.970886,0.270933,0.970171,0.96504,0.203554,0.963848,0.966187,0.224962
3,TRISOUP_RAHT,0.903553,0.936039,0.359598,0.898263,0.933291,0.392223,0.901169,0.936055,0.377531
4,OCTREE_RAHT,0.967294,0.961921,0.236708,0.958332,0.902742,0.223918,0.96796,0.956304,0.188979
0,mean,0.897544,0.914095,0.396356,0.898257,0.894886,0.337707,0.894382,0.903676,0.350468
