In [None]:
import csv
import os
from getdata import dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.metrics import mean_squared_error

# Directory whose entries are the regression datasets to benchmark.
path = "./dataset"

# One seed shared by the train/test split and every stochastic estimator, so
# that a fresh "Restart & Run All" selects the same model and prints the same
# scores.
SEED = 0

# Sort the listing so datasets are always processed in the same order, and skip
# entries that are not CSV files (e.g. notebook checkpoint directories).
dataset_files = sorted(
    name for name in os.listdir(path) if name.lower().endswith(".csv"))

for input_file in dataset_files:
    # NOTE(review): only the bare file name is passed on; presumably `dataset`
    # resolves it against the dataset directory itself -- confirm in getdata.
    data, target = dataset(input_file)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=SEED)
    print("====="+input_file+"==========")
    print("X_train:", X_train.shape)
    print("y_train:", y_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_test shape:", y_test.shape)
    print("=============================")

    # Two-step pipeline: an optional preprocessing step followed by the
    # regressor. The random forest is the default model; the second grid below
    # swaps in an MLP. (A SelectFromModel feature-selection step was tried
    # earlier and is currently disabled.)
    pipe = Pipeline(
        [('preprocessing', None),
         ('model', RandomForestRegressor(random_state=SEED))],
        memory="cache_folder")

    param_grid = [
        # Random forest: a single fixed configuration, no preprocessing.
        {'preprocessing': [None],
         'model__n_estimators': [100],
         'model__max_depth': [15],
         'model__min_samples_leaf': [2],
         'model__max_leaf_nodes': [None]},
        # MLP: with and without standardisation, three activations, and a
        # range of single-hidden-layer widths.
        {'model': [MLPRegressor(solver='lbfgs', random_state=SEED)],
         'preprocessing': [StandardScaler(), None],
         'model__activation': ['relu', 'logistic', 'tanh'],
         'model__hidden_layer_sizes': [(2,), (3,), (5,), (7,), (9,), (13,), (17,), (23,), (29,)]},
    ]

    # cv=5 is the splitter actually in effect; the previously constructed (and
    # never used) StratifiedKFold object was removed because it had no effect
    # on the search and stratified splitting targets class labels, not
    # continuous regression targets.
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)

    # The scorer returns negative MSE, so negate before taking the square root
    # to report RMSE.
    cv_rmse = (-grid.best_score_) ** 0.5
    test_rmse = (-grid.score(X_test, y_test)) ** 0.5

    print("Best params:\n{}\n".format(grid.best_params_))
    print("Best cross-validation score(root mean squared error): {:.2f}".format(cv_rmse))
    print("Test-set score(root mean squared error): {:.2f}".format(test_rmse))
    print("\n======================================================================================")
    print("======================================================================================\n\n")

In [None]:
=====winequality-white.csv==========
X_train: (3918, 11)
y_train: (3918,)
X_test shape: (980, 11)
y_test shape: (980,)
=============================
Best params:
{'model__max_depth': 15, 'model__max_leaf_nodes': None, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'preprocessing': None}

Best cross-validation score(root mean squared error): 0.71
Test-set score(root mean squared error): 0.79

======================================================================================
======================================================================================


=====winequality-red.csv==========
X_train: (1279, 11)
y_train: (1279,)
X_test shape: (320, 11)
y_test shape: (320,)
=============================
Best params:
{'model__max_depth': 15, 'model__max_leaf_nodes': None, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'preprocessing': None}

Best cross-validation score(root mean squared error): 0.75
Test-set score(root mean squared error): 0.71

======================================================================================
======================================================================================


=====appliancesenergy.csv==========
X_train: (15788, 25)
y_train: (15788,)
X_test shape: (3947, 25)
y_test shape: (3947,)
=============================
Best params:
{'model__max_depth': 15, 'model__max_leaf_nodes': None, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'preprocessing': None}

Best cross-validation score(root mean squared error): 74.03
Test-set score(root mean squared error): 79.14

======================================================================================
======================================================================================


=====airfoil.csv==========
X_train: (1202, 5)
y_train: (1202,)
X_test shape: (301, 5)
y_test shape: (301,)
=============================
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
Best params:
{'model__max_depth': 15, 'model__max_leaf_nodes': None, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'preprocessing': None}

Best cross-validation score(root mean squared error): 0.31
Test-set score(root mean squared error): 0.28

======================================================================================
======================================================================================


=====mg.csv==========
X_train: (1108, 6)
y_train: (1108,)
X_test shape: (277, 6)
y_test shape: (277,)
=============================
Best params:
{'model': MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(29,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False), 'model__activation': 'logistic', 'model__hidden_layer_sizes': (29,), 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

Best cross-validation score(root mean squared error): 0.53
Test-set score(root mean squared error): 0.56

======================================================================================
======================================================================================


=====cadata.csv==========
X_train: (16512, 8)
y_train: (16512,)
X_test shape: (4128, 8)
y_test shape: (4128,)
=============================
Best params:
{'model__max_depth': 15, 'model__max_leaf_nodes': None, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'preprocessing': None}

Best cross-validation score(root mean squared error): 0.44
Test-set score(root mean squared error): 0.43

======================================================================================
======================================================================================


=====skillcraft.csv==========
X_train: (2670, 18)
y_train: (2670,)
X_test shape: (668, 18)
y_test shape: (668,)
=============================
Best params:
{'model': MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(2,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False), 'model__activation': 'logistic', 'model__hidden_layer_sizes': (2,), 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

Best cross-validation score(root mean squared error): 0.92
Test-set score(root mean squared error): 0.96

======================================================================================
======================================================================================


=====cpusmall.csv==========
X_train: (6553, 12)
y_train: (6553,)
X_test shape: (1639, 12)
y_test shape: (1639,)
=============================
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
Best params:
{'model__max_depth': 15, 'model__max_leaf_nodes': None, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'preprocessing': None}

Best cross-validation score(root mean squared error): 2.88
Test-set score(root mean squared error): 2.81

======================================================================================
======================================================================================


=====telemonitoring.csv==========
X_train: (4700, 16)
y_train: (4700,)
X_test shape: (1175, 16)
y_test shape: (1175,)
=============================
Best params:
{'model__max_depth': 15, 'model__max_leaf_nodes': None, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'preprocessing': None}

Best cross-validation score(root mean squared error): 8.50
Test-set score(root mean squared error): 8.51

======================================================================================
======================================================================================


=====spacega.csv==========
X_train: (2485, 6)
y_train: (2485,)
X_test shape: (622, 6)
y_test shape: (622,)
=============================
Best params:
{'model': MLPRegressor(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(23,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False), 'model__activation': 'tanh', 'model__hidden_layer_sizes': (23,), 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

Best cross-validation score(root mean squared error): 0.51
Test-set score(root mean squared error): 0.55

======================================================================================
======================================================================================


=====bikesharing.csv==========
X_train: (13903, 14)
y_train: (13903,)
X_test shape: (3476, 14)
y_test shape: (3476,)
=============================
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
Best params:
{'model': MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(3,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False), 'model__activation': 'relu', 'model__hidden_layer_sizes': (3,), 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

Best cross-validation score(root mean squared error): 0.00
Test-set score(root mean squared error): 0.01

======================================================================================
======================================================================================


=====concretecs.csv==========
X_train: (824, 8)
y_train: (824,)
X_test shape: (206, 8)
y_test shape: (206,)
=============================
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
Best params:
{'model': MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(29,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False), 'model__activation': 'logistic', 'model__hidden_layer_sizes': (29,), 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

Best cross-validation score(root mean squared error): 0.32
Test-set score(root mean squared error): 0.34

======================================================================================
======================================================================================

