In [1]:
import pickle
from os import listdir
import warnings
warnings.filterwarnings('ignore') # nao quero warning de convergência

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
# set dos nomes do diretório
fnames = {f.split('-train')[0].split('-test')[0] for f in listdir(path='./datasets') if 'ipynb' not in f and 'keijzer' not in f}
print(fnames)

folds  = list(range(5))
algoritmos = {'Ridge':RidgeCV,'Lasso':LassoCV,'Lars':LassoLarsCV,'ElasticNet':ElasticNetCV,
              'XGBoost':GradientBoostingRegressor,'MLP':MLPRegressor}

paramsXG = {'n_estimators': (10, 50, 100), 'max_depth': (2, 3, 4), 
          'learning_rate': (0.01, 0.1, 1)}

paramsMLP = {'hidden_layer_sizes': ( (50,), (100,), (500,), (50,50,), (100,50,) ), 
             'activation':('tanh', 'relu')}

# listas com o que queremos comparar
dataset   = []
algoritmo = []
rmse_l    = []

for D in fnames:
    print(D)
    for algname, model_fn in algoritmos.items():
        rmse = 0.0
        for fold in folds:
            X_train = np.loadtxt(f'datasets/{D}-train-{fold}.dat', delimiter=',')
            X_test  = np.loadtxt(f'datasets/{D}-test-{fold}.dat', delimiter=',')
            
            X_train, y_train = X_train[:, :-1], X_train[:,-1]
            X_test, y_test   = X_test[:, :-1], X_test[:,-1]
            
            if algname == 'XGBoost':
                reg = model_fn()
                model = GridSearchCV(reg, paramsXG)
            elif algname == 'MLP':
                scaler = StandardScaler()
                scaler.fit(X_train)
                X_train = scaler.transform(X_train)
                X_test  = scaler.transform(X_test)
                reg = model_fn()
                model = GridSearchCV(reg, paramsMLP)
            else:
                model = model_fn()

            model.fit(X_train, y_train)                
            y_hat = model.predict(X_test)
            rmse += np.sqrt(np.square(y_hat - y_test).mean())
        rmse /= 5.0
        dataset.append(D)
        algoritmo.append(algname)
        rmse_l.append(rmse)

{'yacht', 'airfoil', 'energyHeating', 'wineRed', 'towerData', 'concrete', 'energyCooling', 'ppb', 'wineWhite', 'cpu', 'bioavailability', 'forestfires'}
yacht
airfoil
energyHeating
wineRed
towerData
concrete
energyCooling
ppb
wineWhite
cpu
bioavailability
forestfires


In [9]:
data = {'dataset':dataset, 'algoritmo':algoritmo, 'rmse':rmse_l}
df = pd.DataFrame(data)

In [13]:
# salvar o df no arquivo
fw = open('df_GS.pkl', 'wb')
pickle.dump(df, fw)
fw.close()

In [None]:
# carregar o df do arquivo (pra não ter q rodar as células acima novamente)
fw = open('df_GS.pkl', 'rb')
df = pickle.load(fw)
fw.close()

In [12]:
for D in fnames:
    print('\n', D)
    print(df.loc[df['dataset'] == D])


 yacht
    algoritmo dataset      rmse
0       Ridge   yacht  8.996498
1       Lasso   yacht  8.972195
2        Lars   yacht  8.983248
3  ElasticNet   yacht  9.067633
4     XGBoost   yacht  0.868652
5         MLP   yacht  3.330309

 airfoil
     algoritmo  dataset      rmse
6        Ridge  airfoil  4.862221
7        Lasso  airfoil  6.142515
8         Lars  airfoil  4.819300
9   ElasticNet  airfoil  6.169792
10     XGBoost  airfoil  2.186777
11         MLP  airfoil  4.836487

 energyHeating
     algoritmo        dataset      rmse
12       Ridge  energyHeating  2.970699
13       Lasso  energyHeating  4.235185
14        Lars  energyHeating  2.942069
15  ElasticNet  energyHeating  4.502519
16     XGBoost  energyHeating  0.382823
17         MLP  energyHeating  2.842564

 wineRed
     algoritmo  dataset      rmse
18       Ridge  wineRed  0.650684
19       Lasso  wineRed  0.656002
20        Lars  wineRed  0.650573
21  ElasticNet  wineRed  0.657719
22     XGBoost  wineRed  0.607601
23        