### GOAL
Here I explore training and validation results for a few algorithms, trying to predict DOC with one of the two preprocessing pipelines designed in
the notebook "DOC_5_Pipeline_Design_for_PreProcessing.ipynb".

In [267]:
import pandas as pd
import numpy as np
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor, export_graphviz
import graphviz
from scipy.stats import distributions as SSD
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.externals import joblib
from seaborn import heatmap
from IPython.core.display import HTML, display
import matplotlib.pyplot as pl

In [60]:
from sklearn import __version__ as skl_version

In [61]:
print(skl_version)

0.19.0


In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Widen the notebook container to 90% of the browser window.
display(HTML("<style>.container {width: 90% !important}</style>"))

In [47]:
def GetRrsIdx(df, label_list=None):
    """Return the integer column positions of selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column positions are looked up.
    label_list : list of str, optional
        Column labels to locate. When omitted (or empty), every column whose
        name contains 'Rrs' is used instead.

    Returns
    -------
    list
        One ``df.columns.get_loc`` result per requested label, in request order.
    """
    wanted = label_list if label_list else df.filter(like='Rrs', axis=1).columns
    return [df.columns.get_loc(name) for name in wanted]


class BandRatioAdder(BaseEstimator, TransformerMixin):
    """Append two Rrs band-ratio columns to a 2-D feature array.

    The ratios Rrs531/Rrs443 and Rrs555/Rrs443 are appended, in that order,
    as the last two columns of the output.

    Parameters
    ----------
    rrs443_ix, rrs531_ix, rrs555_ix : int, default 3, 4, 5
        Column positions of the Rrs443, Rrs531 and Rrs555 bands in ``X``.
        The defaults reproduce the positions that used to be hard-coded
        inside ``transform``.
    """
    def __init__(self, rrs443_ix=3, rrs531_ix=4, rrs555_ix=5):
        # Stored under the same names as the arguments so that
        # BaseEstimator.get_params()/clone() can round-trip them.
        self.rrs443_ix = rrs443_ix
        self.rrs531_ix = rrs531_ix
        self.rrs555_ix = rrs555_ix

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the two band-ratio columns appended on the right."""
        rrs443 = X[:, self.rrs443_ix]
        band_ratio_531_443 = X[:, self.rrs531_ix] / rrs443
        band_ratio_555_443 = X[:, self.rrs555_ix] / rrs443
        return np.c_[X, band_ratio_531_443, band_ratio_555_443]


class RrsLogTransformer(BaseEstimator, TransformerMixin):
    """Apply a natural-log transform to selected columns of a 2-D array.

    Parameters
    ----------
    rrs_ix_list : sequence of int or None, default None
        Column positions to log-transform. ``None`` selects columns 2..8,
        the positions that used to be hard-coded inside ``transform``.
    """
    def __init__(self, rrs_ix_list=None):
        self.rrs_ix_list = rrs_ix_list

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a float copy of ``X`` with the selected columns replaced by their log."""
        if self.rrs_ix_list is None:
            cols = [2, 3, 4, 5, 6, 7, 8]
        else:
            cols = list(self.rrs_ix_list)
        # Work on a float copy: the previous version overwrote the caller's
        # array, so transforming the same data twice logged it twice, and an
        # integer input would have truncated the log values on assignment.
        X_out = np.array(X, dtype=float)
        X_out[:, cols] = np.log(X_out[:, cols])
        return X_out

def AssessModel(regressor, feats, labels, model_name=''):
    """Print the RMSE of ``regressor``'s predictions on ``feats`` against ``labels``.

    Parameters
    ----------
    regressor : object exposing ``predict(feats)``
    feats : features passed to ``regressor.predict``
    labels : reference values the predictions are compared with
    model_name : str, optional
        Prefix used in the printed line.
    """
    rmse = np.sqrt(mean_squared_error(labels, regressor.predict(feats)))
    print("%s rmse: %.3f" % (model_name, rmse))

def display_scores(scores):
    """Print per-fold RMSE values with their mean and standard deviation.

    ``scores`` holds negative-MSE values (as produced with
    ``scoring='neg_mean_squared_error'``); each is negated and square-rooted.
    """
    rmse_per_fold = np.sqrt(-scores)
    summary = (
        ("Scores: ", rmse_per_fold),
        ("Mean: ", rmse_per_fold.mean()),
        ("Standard deviation", rmse_per_fold.std()),
    )
    for label, value in summary:
        print(label, value)

# Preprocessing pipeline: median-impute missing values, append the two
# band-ratio columns, then standardize every column.
doc_ag412_pipeline = Pipeline([
    ('imputer', Imputer(strategy='median')),
    ('br_adder', BandRatioAdder()),
    ('std_scaler', StandardScaler()),
])

In [4]:
# Load the pickled training set; the pickle holds a mapping with
# 'features' and 'labels' entries.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('./PklJar/TrainSet.pkl', 'rb') as fb:
    trainFrames = pickle.load(fb)
dfTrainFeatures = trainFrames['features']
dfTrainLabels = trainFrames['labels']

**CAUTION:** functions implemented in the pipeline must be in the namespace.

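In this case, the transformers (`BandRatioAdder`, `RrsLogTransformer`) are defined in the cell above; note that `Helpers.py` itself is not imported in this notebook.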

In [5]:
# Look up the column positions of the three bands used for the band ratios.
rrs443_ix, rrs531_ix, rrs555_ix = GetRrsIdx(dfTrainFeatures,
                                            label_list=['Rrs443', 'Rrs531', 'Rrs555'])
# Positions of every column whose name contains 'Rrs'.
rrs_ix_list = GetRrsIdx(dfTrainFeatures)
# Extend with the two positions right after the last Rrs column - presumably the
# two band-ratio columns appended by BandRatioAdder (TODO confirm).
rrs_ix_list = rrs_ix_list + [rrs_ix_list[-1] + 1, rrs_ix_list[-1] + 2]

In [195]:
features_names

['SST',
 'SSS',
 'Rrs412',
 'Rrs443',
 'Rrs531',
 'Rrs555',
 'Rrs667',
 'br_531_443',
 'br_555_443']

In [274]:
# Hard-coded column positions; this rebinds the names that an earlier cell
# filled in by calling GetRrsIdx on the training features.
rrs443_ix, rrs531_ix, rrs555_ix = 3, 4, 5

In [275]:
rrs443_ix, rrs531_ix, rrs555_ix 

(3, 4, 5)

Training & evaluation on the Training Set

In [253]:
# Fit the preprocessing pipeline on the training features and transform them.
doc_features_preprocessed = doc_ag412_pipeline.fit_transform(dfTrainFeatures.values)

In [7]:
# Extract the 'doc' label column as a 2-D (n_samples, 1) NumPy array.
# `.values` on a one-column selection replaces the deprecated
# DataFrame.as_matrix(columns=[...]) and matches the `.values` use elsewhere here.
doc_labels = dfTrainLabels[['doc']].values

In [8]:
# Baseline: ordinary least-squares regression on the preprocessed training features.
lin_reg_doc = LinearRegression()
lin_reg_doc.fit(doc_features_preprocessed, doc_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [30]:
AssessModel(lin_reg_doc, doc_features_preprocessed, doc_labels, model_name='lin_reg' )

lin_reg rmse: 21.584


In [34]:
# Linear model fitted by stochastic gradient descent (up to 1000 passes).
# The labels are flattened to 1-D before being passed to fit().
# NOTE(review): no random_state is set, so results can differ between runs.
sgd_doc = SGDRegressor(max_iter=1000)
sgd_doc.fit(doc_features_preprocessed, doc_labels.reshape((-1,)))

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [35]:
AssessModel(sgd_doc, doc_features_preprocessed, doc_labels, model_name='sgd')

sgd rmse: 21.601


In [44]:
# 10-fold cross-validation of lin_reg and sgd, scored with negative MSE
# (display_scores converts these scores to RMSE).
lin_reg_scores = cross_val_score(lin_reg_doc, doc_features_preprocessed, doc_labels,
                                scoring='neg_mean_squared_error', cv=10)
sgd_scores = cross_val_score(sgd_doc, doc_features_preprocessed, doc_labels.reshape((-1,)),
                            scoring='neg_mean_squared_error', cv=10)

In [49]:
display_scores(lin_reg_scores)

Scores:  [ 20.73787638  20.15781999  21.87362437  15.42237922  17.06222598
  28.24074133  28.27423931  42.74516779  22.1058328   21.69912084]
Mean:  23.8319028007
Standard deviation 7.40180538491


In [50]:
display_scores(sgd_scores)

Scores:  [ 20.88940831  20.22516235  21.73665881  15.24791377  16.48080812
  27.34215976  28.45216958  42.36234314  22.00228579  21.64682123]
Mean:  23.6385730858
Standard deviation 7.33876462849


In [180]:
# Ridge regression with default settings; RMSE is reported on the training data itself.
ridge_doc = Ridge()
ridge_doc.fit(doc_features_preprocessed, doc_labels)
AssessModel(ridge_doc, doc_features_preprocessed, doc_labels, model_name='ridge')

ridge rmse: 21.623


In [187]:
# Lasso regression (alpha=0.05); RMSE is reported on the training data itself.
lasso_doc = Lasso(alpha=0.05)
lasso_doc.fit(doc_features_preprocessed, doc_labels)
AssessModel(lasso_doc, doc_features_preprocessed, doc_labels, model_name='lasso')

lasso rmse: 21.607


In [193]:
# Elastic net (alpha=0.1, l1_ratio=0.9); RMSE is reported on the training data itself.
e_net_doc = ElasticNet(alpha=0.1, l1_ratio=0.9)
e_net_doc.fit(doc_features_preprocessed, doc_labels)
AssessModel(e_net_doc, doc_features_preprocessed, doc_labels, model_name='ElasticNet')

ElasticNet rmse: 21.816


In [36]:
# Decision tree with default hyperparameters (no depth or leaf-size limit set here).
tree_doc = DecisionTreeRegressor()
tree_doc.fit(doc_features_preprocessed, doc_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [37]:
AssessModel(tree_doc, doc_features_preprocessed, doc_labels, model_name='d-tree')

d-tree rmse: 0.000


Assessing the decision tree's overfit using k-fold cross-validation

In [46]:
# 10-fold cross-validation of the decision tree, scored with negative MSE.
tree_scores = cross_val_score(tree_doc, doc_features_preprocessed, doc_labels,
                        scoring='neg_mean_squared_error', cv=10)

In [48]:
display_scores(tree_scores)

Scores:  [ 27.35474036  31.00433296  16.81202746  25.80543546  20.4533907
  25.46563619  36.15110518  42.27280109  32.35785155  30.10684743]
Mean:  28.7784168402
Standard deviation 7.00233708537


In [64]:
# Names for the columns of the preprocessed array: the original feature columns
# followed by labels for the two band-ratio columns appended by the pipeline.
features_names = dfTrainFeatures.columns.tolist() + ['br_531_443', 'br_555_443']

In [87]:
# Pair each feature name with the decision tree's importance score for it.
feature_scores = list(zip(features_names, tree_doc.feature_importances_))

In [88]:
feature_scores

[('SST', 0.023060026520039169),
 ('SSS', 0.86001633157142854),
 ('Rrs412', 0.0073552344742274459),
 ('Rrs443', 0.00087997498170493474),
 ('Rrs531', 0.028769476761792284),
 ('Rrs555', 0.021440038049392696),
 ('Rrs667', 0.015271590344997988),
 ('br_531_443', 0.033028757552767518),
 ('br_555_443', 0.010178569743649491)]

In [91]:
# Rank the (name, score) pairs by decreasing importance score.
sorted_feature_scores = sorted(feature_scores, key=lambda x: x[1], reverse=True)

In [92]:
sorted_feature_scores

[('SSS', 0.86001633157142854),
 ('br_531_443', 0.033028757552767518),
 ('Rrs531', 0.028769476761792284),
 ('SST', 0.023060026520039169),
 ('Rrs555', 0.021440038049392696),
 ('Rrs667', 0.015271590344997988),
 ('br_555_443', 0.010178569743649491),
 ('Rrs412', 0.0073552344742274459),
 ('Rrs443', 0.00087997498170493474)]

In [219]:
# Write the fitted tree to a Graphviz .dot file, labelling splits with the feature names.
export_graphviz(tree_doc, out_file='./FigJar/tree_example.dot', feature_names=features_names)

In [94]:
# Random forest with default hyperparameters; labels are flattened to 1-D for fit().
forest_doc = RandomForestRegressor()
forest_doc.fit(doc_features_preprocessed, doc_labels.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [96]:
# 10-fold cross-validation of the default random forest, scored with negative MSE.
forest_scores = cross_val_score(forest_doc, doc_features_preprocessed,
                                doc_labels.ravel(), 
                                scoring='neg_mean_squared_error', cv=10)

In [97]:
display_scores(forest_scores)

Scores:  [ 17.24582528  20.42944063  18.27214692  14.75014539  16.02748236
  21.93918614  29.58377162  31.15810535  17.67575827  24.96909436]
Mean:  21.2050956332
Standard deviation 5.38313220924


**Fine Tuning HyperParameters** of the random forest regressor with:
* scikit-learn's GridSearchCV
* scikit-learn's RandomizedSearchCV (scikit-optimize's BayesSearchCV is not used in the cells shown here)

In [99]:
# First pickle all models trained so far...
# Each fitted estimator is written with joblib to its own file under ./PklJar/Models/.
joblib.dump(lin_reg_doc, './PklJar/Models/doc_6_lin_reg.pkl')
joblib.dump(sgd_doc, './PklJar/Models/doc_6_sgd.pkl')
joblib.dump(tree_doc, './PklJar/Models/doc_6_tree.pkl')
joblib.dump(forest_doc, './PklJar/Models/doc_6_forest.pkl')

['./PklJar/Models/doc_6_forest.pkl']

### Hyper Parameter tuning using Grid Search and Randomized Search
#### <u>Random Forest</u> trained using Grid Search and K-fold Cross-Validation (GridSearchCV)

In [200]:
# Grid of random-forest settings to try exhaustively: 7 values of n_estimators
# crossed with 6 values of max_features.
param_grid_forest = [
    {'n_estimators': [80, 100, 200, 300, 400, 500, 600], 'max_features': [3, 5, 7, 'auto', 'sqrt', 'log2']},
    #{'bootstrap': [False], 'n_estimators': [30, 50, 70, 100], 'max_features': [2, 5, 8]}
]

In [201]:
# Exhaustive grid search over param_grid_forest with 5-fold CV and negative-MSE scoring.
forest_doc_2 = RandomForestRegressor()
grid_search_forest = GridSearchCV(forest_doc_2, param_grid_forest, cv=5, scoring='neg_mean_squared_error')
grid_search_forest.fit(doc_features_preprocessed, doc_labels.ravel())

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [80, 100, 200, 300, 400, 500, 600], 'max_features': [3, 5, 7, 'auto', 'sqrt', 'log2']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [276]:
# Keep the best estimator found by the forest grid search.
grid_forest = grid_search_forest.best_estimator_

In [202]:
grid_search_forest.best_params_

{'max_features': 5, 'n_estimators': 100}

In [285]:
cvres = grid_search_forest.cv_results_
# Print the five best parameter settings as (RMSE, params).
# Sort on the score alone: sorting the bare (score, params) tuples falls back to
# comparing the params dicts when two scores tie, which raises TypeError in Python 3.
for mean_score, params in sorted(zip(cvres['mean_test_score'], cvres['params']),
                                 key=lambda pair: pair[0], reverse=True)[:5]:
    print(np.sqrt(-mean_score), params)

21.0156381051 {'max_features': 5, 'n_estimators': 100}
21.0361471697 {'max_features': 7, 'n_estimators': 600}
21.0594815815 {'max_features': 7, 'n_estimators': 400}
21.161166091 {'max_features': 7, 'n_estimators': 500}
21.180933087 {'max_features': 7, 'n_estimators': 200}


In [283]:
# Features ranked by decreasing importance
for name, importance in sorted(zip(features_names, grid_forest.feature_importances_),
                               key=lambda x: x[1], reverse=True):
    print(f'feature: {name}, score: {importance}')

feature: SSS, score: 0.5803666664300997
feature: br_531_443, score: 0.11047150257024325
feature: Rrs667, score: 0.09414732559067467
feature: br_555_443, score: 0.08178003301941139
feature: SST, score: 0.03844657927731698
feature: Rrs555, score: 0.03689808041478406
feature: Rrs531, score: 0.021684805869839684
feature: Rrs412, score: 0.018398289009892897
feature: Rrs443, score: 0.01780671781773745


#### <u>Random Forest</u> using Randomized Search and K-fold Cross-Validation (RandomizedSearchCV)

In [297]:
# Sampling distributions and candidate lists for the randomized forest search.
# NOTE(review): `param_dist` is a second name bound to the same dict; it is not
# used in the cells shown here.
param_grid_forest_2 = param_dist = {'n_estimators': SSD.poisson(500),
                                    'max_depth': [3, None],
                                    'max_features': SSD.randint(low=1, high=9),
                                    'min_samples_split': SSD.randint(2, 11),
                                    'min_samples_leaf': SSD.randint(1, 11),
                                    'bootstrap': [True, False],
                                    'criterion': ['mse', 'mae']}

In [298]:
# Randomized search: 100 sampled parameter settings, 5-fold CV, negative-MSE scoring.
# NOTE(review): this rebinds `forest_doc`, the name used above for the fitted
# default forest; no random_state is set, so the sampled settings vary between runs.
forest_doc = RandomForestRegressor()
rand_search_forest = RandomizedSearchCV(forest_doc, param_distributions=param_grid_forest_2,
                                       scoring='neg_mean_squared_error', cv=5, n_iter=100)
rand_search_forest.fit(doc_features_preprocessed, doc_labels.ravel())
# Keep the best estimator found by the randomized search.
rand_forest = rand_search_forest.best_estimator_

In [299]:
# Features ranked by decreasing importance
for name, importance in sorted(zip(features_names, rand_forest.feature_importances_),
                               key=lambda x: x[1], reverse=True):
    print(f'feature: {name}, score: {importance}')

feature: SSS, score: 0.6783578920795299
feature: Rrs667, score: 0.08616714125285456
feature: br_531_443, score: 0.07813575197105779
feature: br_555_443, score: 0.06299479822945551
feature: SST, score: 0.03210372368305944
feature: Rrs555, score: 0.020615114657153454
feature: Rrs531, score: 0.017374194561803443
feature: Rrs412, score: 0.012876467182423405
feature: Rrs443, score: 0.011374916382662133


In [302]:
rand_search_forest.best_params_

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 6,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 456}

#### <u>Support Vector Regressor (SVR)</u> trained using GridSearchCV

In [213]:
# Log-spaced search ranges for the SVR hyperparameters.
C_start, C_end = 0.001, 1000
gamma_start, gamma_end = 0.001, 10
# One grid point per decade for C, two per decade (plus two) for gamma.
# round() before int() guards against log10 returning e.g. 5.999..., which a
# bare int() would truncate and silently drop a grid point.
C_range = np.geomspace(C_start, C_end, num=int(round(np.log10(C_end/C_start))) + 1)
gamma_range = np.geomspace(gamma_start, gamma_end,
                           num=(int(round(np.log10(gamma_end/gamma_start))) + 1)*2)
# Linear kernel is searched over C only; RBF kernel over C and gamma.
param_grid_svr = [{'kernel': ['linear'], 'C': C_range },
                  { 'kernel': ['rbf'], 'C': C_range, 'gamma': gamma_range}]

In [214]:
# Exhaustive grid search over param_grid_svr with 5-fold CV and negative-MSE scoring.
svr_doc = SVR()
grid_search_svr = GridSearchCV(svr_doc, param_grid_svr, cv=5, scoring='neg_mean_squared_error')
grid_search_svr.fit(doc_features_preprocessed, doc_labels.ravel())

GridSearchCV(cv=5, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['linear'], 'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03])}, {'kernel': ['rbf'], 'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03]), 'gamma': array([  1.00000e-03,   2.78256e-03,   7.74264e-03,   2.15443e-02,
         5.99484e-02,   1.66810e-01,   4.64159e-01,   1.29155e+00,
         3.59381e+00,   1.00000e+01])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [262]:
# Keep the best estimator found by the SVR grid search.
grid_svr = grid_search_svr.best_estimator_

In [281]:
cvres = grid_search_svr.cv_results_
# Number of best-scoring parameter settings to print.
show_result_num = 15
# Sort on the score alone: sorting the bare (score, params) tuples falls back to
# comparing the params dicts when two scores tie, which raises TypeError in Python 3.
for mean_score, params in sorted(zip(cvres['mean_test_score'], cvres['params']),
                                 key=lambda pair: pair[0], reverse=True)[:show_result_num]:
    print(np.sqrt(-mean_score), params)

22.4761618359 {'C': 100.0, 'gamma': 0.021544346900318832, 'kernel': 'rbf'}
22.6515631321 {'C': 1000.0, 'gamma': 0.0027825594022071257, 'kernel': 'rbf'}
22.805247019 {'C': 1000.0, 'gamma': 0.0077426368268112694, 'kernel': 'rbf'}
23.3099925444 {'C': 100.0, 'kernel': 'linear'}
23.3296119807 {'C': 1000.0, 'kernel': 'linear'}
23.3618165906 {'C': 10.0, 'kernel': 'linear'}
23.4599317984 {'C': 100.0, 'gamma': 0.0077426368268112694, 'kernel': 'rbf'}
23.8058195368 {'C': 1000.0, 'gamma': 0.021544346900318832, 'kernel': 'rbf'}
23.8505184092 {'C': 1000.0, 'gamma': 0.001, 'kernel': 'rbf'}
24.0038912559 {'C': 100.0, 'gamma': 0.059948425031894091, 'kernel': 'rbf'}
24.7495684643 {'C': 1.0, 'kernel': 'linear'}
24.9114453444 {'C': 100.0, 'gamma': 0.1668100537200059, 'kernel': 'rbf'}
25.3843400094 {'C': 100.0, 'gamma': 0.0027825594022071257, 'kernel': 'rbf'}
26.0344213987 {'C': 10.0, 'gamma': 0.059948425031894091, 'kernel': 'rbf'}
26.0599326271 {'C': 1000.0, 'gamma': 0.059948425031894091, 'kernel': 'rbf'}

#### <u>SVR</u> using RandomizedSearchCV

In [243]:
# Randomized search over an RBF-kernel SVR: C and gamma are drawn from exponential
# distributions (scale 500 and 0.1 respectively); 100 draws, 5-fold CV.
# NOTE(review): this rebinds `svr_doc` from the grid-search cell, and no
# random_state is set, so the sampled settings vary between runs.
param_grid_svr_2 = {'kernel': ['rbf'], 'C': SSD.expon(scale=500), 'gamma': SSD.expon(scale=0.1)}
svr_doc = SVR()
rand_search_svr = RandomizedSearchCV(svr_doc, param_distributions=param_grid_svr_2,
                                     scoring='neg_mean_squared_error', cv=5, n_iter=100)
rand_search_svr.fit(doc_features_preprocessed, doc_labels.ravel())

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'kernel': ['rbf'], 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb5e95d5e10>, 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb5e9e71f98>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='neg_mean_squared_error',
          verbose=0)

In [245]:
rcvres = rand_search_svr.cv_results_
# Print the 12 best parameter settings as (RMSE, params) - the same count the
# former `if i>10: break` loop produced, written as a slice like the grid-search cell.
# Sort on the score alone: sorting the bare (score, params) tuples falls back to
# comparing the params dicts when two scores tie, which raises TypeError in Python 3.
for mean_score, params in sorted(zip(rcvres['mean_test_score'], rcvres['params']),
                                 key=lambda pair: pair[0], reverse=True)[:12]:
    print(np.sqrt(-mean_score), params)

22.4243664554 {'C': 303.10365744231819, 'gamma': 0.012308071293193517, 'kernel': 'rbf'}
22.4319030822 {'C': 190.33225287110199, 'gamma': 0.014624728871265612, 'kernel': 'rbf'}
22.4856000102 {'C': 374.11590000424223, 'gamma': 0.010671094542824233, 'kernel': 'rbf'}
22.5008121606 {'C': 138.17526037645769, 'gamma': 0.016249706755990221, 'kernel': 'rbf'}
22.5296812884 {'C': 344.72573881537357, 'gamma': 0.015026724458630148, 'kernel': 'rbf'}
22.552098151 {'C': 158.95344647544314, 'gamma': 0.014001796701789419, 'kernel': 'rbf'}
22.6080641154 {'C': 484.49042027379954, 'gamma': 0.0048955759138413803, 'kernel': 'rbf'}
22.725245491 {'C': 73.085092672714637, 'gamma': 0.023536091249112399, 'kernel': 'rbf'}
22.7871294257 {'C': 822.53236779158658, 'gamma': 0.006347432810124099, 'kernel': 'rbf'}
22.890990654 {'C': 428.74047046334908, 'gamma': 0.016010496500786606, 'kernel': 'rbf'}
22.9618315898 {'C': 346.76053049553104, 'gamma': 0.019325958442163324, 'kernel': 'rbf'}
22.980374442 {'C': 3058.9221350668

In [246]:
# Keep the best estimator found by the randomized SVR search.
rand_svr = rand_search_svr.best_estimator_

### Comparing results on test set

In [254]:
# Load Test set:
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('./PklJar/TestSet.pkl', 'rb') as f:
    test_dict = pickle.load(f)
    dfTest_features = test_dict['features']
    dfTest_labels = test_dict['labels']
    # The container is no longer needed once both frames are bound.
    del test_dict
# Run test set through preprocessing pipeline
# (transform only: the pipeline is not refitted on the test data).
test_features_processed =  doc_ag412_pipeline.transform(dfTest_features.values)
# 1-D array of the 'doc' labels.
doc_test_labels = dfTest_labels['doc'].values

##### Random Forest with  Grid Search CV

In [303]:
# Test-set RMSE of the grid-searched random forest.
test_pred_forest_grid = grid_forest.predict(test_features_processed)
forest_grid_final_mse = mean_squared_error(doc_test_labels, test_pred_forest_grid)
print("forest_grid final rmse: %.3f" % np.sqrt(forest_grid_final_mse))

forest_grid final rmse: 18.558


##### Random Forest with Randomized Search CV

In [300]:
# Test-set RMSE of the randomized-search random forest.
test_pred_forest_rand = rand_forest.predict(test_features_processed)
forest_rand_final_mse = mean_squared_error(doc_test_labels, test_pred_forest_rand)
# The label previously read "svr_rand" (copy-paste slip); this result belongs to the forest.
print("forest_rand final rmse: %.3f" % np.sqrt(forest_rand_final_mse))

svr_rand final rmse: 19.346


##### SVR with  Grid Search CV

In [264]:
# Test-set RMSE of the grid-searched SVR.
test_pred_svr_grid = grid_svr.predict(test_features_processed)
svr_grid_final_mse = mean_squared_error(doc_test_labels, test_pred_svr_grid)
print("svr_grid final rmse: %.3f" % np.sqrt(svr_grid_final_mse))

svr_grid final rmse: 20.115


##### SVR with  Randomized Search CV

In [260]:
# Test-set RMSE of the randomized-search SVR.
test_pred_svr_rand = rand_svr.predict(test_features_processed)
svr_rand_final_mse = mean_squared_error(doc_test_labels, test_pred_svr_rand)
print("svr_rand final rmse: %.3f" % np.sqrt(svr_rand_final_mse))

svr_rand final rmse: 19.911


### Testing on field data

In [306]:
# Read the 'ForML' sheet of the field-data workbook.
# NOTE(review): the path is absolute and machine-specific.
# NOTE(review): `sheetname` is the older spelling of this keyword; newer pandas
# releases use `sheet_name` - confirm against the installed version.
dfAntonio = pd.read_excel('/accounts/ekarakoy/DATA/SergioDOC/Algorithm_Sept_2011_AVG_MERGED_30min_MATCH_Nov2011_data_compilation_for-Erdem-Sergio.xlsx',
                     sheetname='ForML')

In [310]:
# Drop every row that has a missing value (modifies dfAntonio in place),
# then pull out the DOC column as the labels.
dfAntonio.dropna(inplace=True)
dfA_labels = dfAntonio['DOC']

In [312]:
# Every column after the first is used as a feature.
# assumes the first column is the DOC label - TODO confirm against the sheet layout
dfA_features = dfAntonio.iloc[:, 1:]
dfA_features.head()

Unnamed: 0,Temp,Salinity,Rrs412,Rrs443,Rrs532,Rrs555,Rrs665
0,6.4368,32.73,0.001985,0.002234,0.001967,0.00155,0.000176
1,6.7481,32.643,0.0017,0.001818,0.002168,0.001858,0.000229
2,6.4288,32.7849,0.001666,0.001981,0.002592,0.002248,0.000262
3,6.3396,30.087,0.002146,0.002986,0.005287,0.005039,0.000792
4,5.9519,31.053,0.002645,0.003534,0.005942,0.005494,0.000837


In [314]:
# Run the field data through the preprocessing pipeline (transform only, no refit).
antonio_features_processed =  doc_ag412_pipeline.transform(dfA_features.values)
doc_antonio_labels = dfA_labels.values

In [315]:
# RMSE of the grid-searched random forest on the field data.
antonio_pred_forest_grid = grid_forest.predict(antonio_features_processed)
forest_grid_final_mse_antonio = mean_squared_error(doc_antonio_labels, antonio_pred_forest_grid)
print("forest_grid final rmse: %.3f" % np.sqrt(forest_grid_final_mse_antonio))

forest_grid final rmse: 8.327
