# Testing regressors

Goal: Finding the best possible regressor for traversing from the latent space of z to the parameters space.

In [2]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.decomposition import PCA

## Trying the naive approach

### Import data


In [3]:
permute_idxs = np.load("permuted_100000.npy")
naive_input = np.load("/media/pawel/DATA/tmp/freddie_mercuries/naive_input.npy")[permute_idxs]
out_vec = np.load("./data/outvec.npy")[permute_idxs]

permute_idxs.shape, naive_input.shape, out_vec.shape

((100000,), (100000, 65536), (100000, 5))

### Test on 100 samples



In [4]:
def scale(X):
    skaler = StandardScaler()
    skaler.fit(X)
    X = skaler.transform(X)
    return X

def train_and_eval(model, naive_input, out_vec):
    naive_input, out_vec = scale(naive_input), scale(out_vec)
    X_train, X_test, y_train, y_test = train_test_split(naive_input, out_vec, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    
    print(f"Train: {model.score(X_train, y_train)}")
    print(f"Test: {model.score(X_test, y_test)}")


In [18]:

X_naive_1000, Y_naive_1000 = naive_input[:1000], out_vec[:1000]
train_and_eval(RandomForestRegressor(n_estimators=100, max_depth=5), X_naive_1000, Y_naive_1000)

Train: 0.80412245582356
Test: 0.7221271433352514


In [17]:
train_and_eval(MLPRegressor(hidden_layer_sizes=(100, 100)), X_naive_1000, Y_naive_1000)

Train: -3.2906412949528616
Test: -5.574728534802301



### Apply PCA



In [5]:
naive_input_pca = PCA(256)
naive_input_pca.fit(naive_input[:10000])

naive_pca_list = [naive_input_pca.transform(naive_input[100*i:100*(i+1)]) for i in range(1000)]
X_naive_pca = np.concatenate(naive_pca_list)

Transformed


[array([[ 1.20763813e+03, -1.08051557e+03,  3.04430358e+03, ...,
         -3.09754797e+01,  3.88684082e+01,  8.22950794e+00],
        [-6.71025508e+02, -4.40738254e+02, -1.62452211e+03, ...,
         -3.89359942e+01,  2.97974854e+01,  4.80424879e+00],
        [ 4.45635325e+02, -1.12845585e+03, -1.46950884e+03, ...,
         -8.23127012e+00, -1.58790755e+01, -2.88604277e+01],
        ...,
        [-5.35552247e+02, -1.28890509e+03, -6.12754490e+02, ...,
          2.26186314e+01, -2.22616573e+00,  2.53132852e+01],
        [ 5.48730048e+02, -1.26607614e+03, -5.26281368e+02, ...,
          7.88257838e+00, -4.68544583e+00,  3.45274452e+01],
        [ 2.99270025e+03,  1.95853566e+02,  2.25585735e+02, ...,
         -5.71490077e+01,  2.57314951e+01,  2.20450775e+01]]),
 array([[-5.46285806e+02, -1.43217277e+03, -1.58733687e+02, ...,
         -2.16891917e+01,  5.57108264e+01,  1.66080469e+01],
        [-2.26913844e+03, -2.27299389e+01,  1.04268664e+03, ...,
         -1.39480924e+01,  2.34575170e

In [10]:
Y = out_vec
train_and_eval(RandomForestRegressor(n_estimators=100, max_depth=5), X_naive_pca, Y)

Train: 0.5042843052116812
Test: 0.5032633631352539
