In [1]:
# Import libraries necessary for this project
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
import os
import HyperSpectrumDataSet as hsd
import sys
import datetime
import copy
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_squared_error
# Import supplementary visualization code visuals.py

# Pretty display for notebooks
%matplotlib inline

In [2]:
# Every trait name used in this notebook, in the order the results are reported.
all_traits = [
    "LMA_O",
    "Narea_O",
    "SPAD_O",
    "Nmass_O",
    "Pmass_O",
    "Vcmax",
    "Vcmax25",
    "J",
    "Photo_O",
    "Cond_O",
]




In [3]:
# Tuned hyper-parameters per trait.
# Each value is the triple [learning_rate, max_depth, colsample_bytree],
# read positionally by the training cell of this notebook.
best_paras = {
    "LMA_O":   [0.01, 3, 0.8],
    "Narea_O": [0.01, 3, 0.8],
    "SPAD_O":  [0.05, 3, 0.8],
    "Nmass_O": [0.1, 7, 0.5],
    "Pmass_O": [0.1, 7, 0.8],
    "Vcmax":   [0.01, 3, 0.8],
    "Vcmax25": [0.1, 7, 0.8],
    "J":       [0.01, 3, 0.8],
    "Photo_O": [0.1, 9, 0.5],
    "Cond_O":  [0.01, 3, 0.8],
}


In [4]:
# Train one XGBRegressor per trait and report R^2 on the train/val/test splits.

# When True, each trait uses its tuned [learning_rate, max_depth,
# colsample_bytree] triple from `best_paras`; otherwise fixed baseline values.
use_best_params = True

# Forwarded to the training HyperSpectrumDataSet as `drop_percent`
# (presumably the fraction of training data dropped -- confirm in that class).
drop_percent = 0.2
exp_desc = "XGBRegressor_optimized_drop_"+str(drop_percent)

run_traits = all_traits

# Per-trait R^2 scores, keyed by trait name.
train_results_r2 = {}
val_results_r2 = {}
test_results_r2 = {}

# Running totals used for the averages printed after the loop.
sum_train = 0.0
sum_val = 0.0
sum_test = 0.0
print(exp_desc)
for target_trait in run_traits:
    train_ds = hsd.HyperSpectrumDataSet(target_trait, "train",drop_percent=drop_percent)
    val_ds = hsd.HyperSpectrumDataSet(target_trait, "val")
    test_ds = hsd.HyperSpectrumDataSet(target_trait, "test")

    X_test, y_test = test_ds.data_values, test_ds.data_labels
    X_val, y_val = val_ds.data_values, val_ds.data_labels
    X_train, y_train = train_ds.data_values, train_ds.data_labels

    if use_best_params:
        learning_rate = best_paras[target_trait][0]
        max_depth = best_paras[target_trait][1]
        colsample_bytree = best_paras[target_trait][2]
        subsample = 0.8
        n_estimators = 1000
    else:
        learning_rate = 0.1
        max_depth = 3
        colsample_bytree = 1
        n_estimators = 100
        subsample = 1

    model = XGBRegressor(n_estimators = n_estimators, subsample = subsample, learning_rate=learning_rate,
                          max_depth=max_depth, colsample_bytree=colsample_bytree)

    # The validation split is the early-stopping monitor (RMSE, patience of 50 rounds).
    eval_set = [(X_val, y_val)]

    # NOTE(review): newer XGBoost releases expect early_stopping_rounds and
    # eval_metric on the estimator constructor rather than on fit() -- confirm
    # against the installed version before upgrading.
    model.fit(X_train, y_train, early_stopping_rounds=50, eval_metric="rmse", eval_set=eval_set, verbose=False)

    # Predict on every split with the fitted model.
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)
    test_preds = model.predict(X_test)

    # Score each split exactly once and reuse the value everywhere below.
    train_r2 = r2_score(y_train, train_preds)
    val_r2 = r2_score(y_val, val_preds)
    test_r2 = r2_score(y_test, test_preds)

    train_results_r2[target_trait] = train_r2
    val_results_r2[target_trait] = val_r2
    test_results_r2[target_trait] = test_r2

    sum_train += train_r2
    sum_val += val_r2
    sum_test += test_r2

    # Report
    print("Trait %s -  train set: %0.4f, val set: %0.4f, test set: %0.4f" % (target_trait, train_r2, val_r2, test_r2))

# Guard the averages so an empty `run_traits` does not raise ZeroDivisionError.
if train_results_r2:
    print ("Average: train %.4f, val %.4f, test %.4f"% (sum_train/len(train_results_r2), sum_val/len(val_results_r2), sum_test/len(test_results_r2)))
else:
    print("Average: no traits were evaluated")

XGBRegressor_optimized_drop_0.2
Trait LMA_O -  train set: 0.9770, val set: 0.7806, test set: 0.7615
Trait Narea_O -  train set: 0.9782, val set: 0.8717, test set: 0.8192
Trait SPAD_O -  train set: 0.9605, val set: 0.8236, test set: 0.8318
Trait Nmass_O -  train set: 0.9961, val set: 0.6677, test set: 0.6374
Trait Pmass_O -  train set: 0.9998, val set: 0.5036, test set: 0.4402
Trait Vcmax -  train set: 0.9363, val set: 0.6864, test set: 0.5702
Trait Vcmax25 -  train set: 0.9998, val set: 0.7381, test set: 0.5078
Trait J -  train set: 0.8588, val set: 0.7832, test set: 0.7412
Trait Photo_O -  train set: 1.0000, val set: 0.4335, test set: 0.6474
Trait Cond_O -  train set: 0.6199, val set: 0.1723, test set: 0.4155
Average: train 0.9326, val 0.6461, test 0.6372


XGBRegressor_unoptimized
Trait LMA_O -  train set: 0.9721, val set: 0.7757, test set: 0.7264
Trait Narea_O -  train set: 0.9742, val set: 0.8804, test set: 0.8147
Trait SPAD_O -  train set: 0.9599, val set: 0.8217, test set: 0.8183
Trait Nmass_O -  train set: 0.9404, val set: 0.6645, test set: 0.6611
Trait Pmass_O -  train set: 0.9586, val set: 0.5775, test set: 0.4504
Trait Vcmax -  train set: 0.9681, val set: 0.6685, test set: 0.5407
Trait Vcmax25 -  train set: 0.9485, val set: 0.7328, test set: 0.4625
Trait J -  train set: 0.8779, val set: 0.7772, test set: 0.7415
Trait Photo_O -  train set: 0.9247, val set: 0.5073, test set: 0.6643
Trait Cond_O -  train set: 0.9013, val set: 0.1967, test set: 0.5175
Average: train 0.9425, val 0.6602, test 0.6397


#### Optimized 

Trait LMA_O -  train set: 0.9767, val set: 0.7803, test set: 0.7642
Trait Narea_O -  train set: 0.9734, val set: 0.8741, test set: 0.8169
Trait SPAD_O -  train set: 0.9959, val set: 0.8347, test set: 0.8322
Trait Nmass_O -  train set: 0.9923, val set: 0.6643, test set: 0.6658
Trait Pmass_O -  train set: 0.9998, val set: 0.5241, test set: 0.4366
Trait Vcmax -  train set: 0.9354, val set: 0.6864, test set: 0.5688
Trait Vcmax25 -  train set: 1.0000, val set: 0.7060, test set: 0.4727
Trait J -  train set: 0.8982, val set: 0.7880, test set: 0.7427
Trait Photo_O -  train set: 1.0000, val set: 0.4411, test set: 0.6544
Trait Cond_O -  train set: 0.6199, val set: 0.1710, test set: 0.4184
Average: train 0.9392, val 0.6470, test 0.6439

In [None]:
# Grid search

In [None]:
# Hyper-parameter grid explored by the grid search; every value is a list of
# candidates, so single-element lists pin that parameter.
params = dict(
    learning_rate=[0.01, 0.05, 0.1],     # default=0.1
    n_estimators=[1000],                 # default=100
    max_depth=[3, 5, 7, 9],
    subsample=[0.8],
    colsample_bytree=[0.3, 0.5, 0.8],
)

In [None]:
def train_model(model, params, X_train, y_train, X_val, y_val):
    """Grid-search `params` for `model` and report R^2 on the train and val data.

    Candidates are ranked by cross-validated mean squared error on shuffled
    80/20 splits of the training data (10 splits, fixed seed). The validation
    data is passed to every fit as the early-stopping evaluation set and is
    also used for the validation R^2 reported here.

    Args:
        model: Estimator handed to GridSearchCV (the notebook passes an
            XGBRegressor).
        params: Parameter grid mapping parameter names to candidate lists.
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation features and targets.

    Returns:
        Tuple ``(best_estimator, best_params, train_r2, val_r2)``.
    """
    # mean_squared_error is a loss, so it must be wrapped with
    # greater_is_better=False: GridSearchCV maximises the score, and without
    # the sign flip it would select the configuration with the LARGEST error.
    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    eval_set = [(X_val, y_val)]

    # Exhaustive search over `params`, scored on 10 random 80/20 splits.
    cv = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 30)
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv = cv, n_jobs=-1, scoring=scorer, verbose=10)

    # Extra keyword arguments are forwarded to the estimator's fit() for every
    # candidate and for the final refit.
    grid_search.fit(X_train, y_train, early_stopping_rounds=50, eval_metric="rmse", eval_set=eval_set, verbose=False)

    best_params = grid_search.best_params_
    print("Best parameters found: ",best_params)

    # Estimator refit with the best parameter combination.
    best_xgb = grid_search.best_estimator_

    train_preds = best_xgb.predict(X_train)
    val_preds = best_xgb.predict(X_val)

    train_r2_score = r2_score(y_train, train_preds)
    val_r2_score = r2_score(y_val, val_preds)

    print("Best params %s -  train set: %0.4f, val set: %0.4f" % (str(best_params),
                                                                  train_r2_score, val_r2_score))

    return (best_xgb, best_params, train_r2_score, val_r2_score)

In [None]:
# Run the grid search for every trait and collect the tuned parameters together
# with the R^2 of the best model on each split (all dicts keyed by trait name).
train_results, val_results, test_results = {}, {}, {}
final_parameters = {}

for target_trait in run_traits:
    # Load the three data splits for the current trait.
    train_ds = hsd.HyperSpectrumDataSet(target_trait, "train")
    val_ds = hsd.HyperSpectrumDataSet(target_trait, "val")
    test_ds = hsd.HyperSpectrumDataSet(target_trait, "test")

    X_test, y_test = test_ds.data_values, test_ds.data_labels
    X_val, y_val = val_ds.data_values, val_ds.data_labels
    X_train, y_train = train_ds.data_values, train_ds.data_labels

    # Start from a default regressor; the grid supplies the hyper-parameters.
    model = XGBRegressor()
    best_model, b_params, train_r2, val_r2 = train_model(
        model, params, X_train, y_train, X_val, y_val
    )
    final_parameters[target_trait] = b_params

    # The test split is only touched here, after model selection.
    test_preds = best_model.predict(X_test)
    test_r2 = r2_score(y_test, test_preds)

    train_results[target_trait] = train_r2
    val_results[target_trait] = val_r2
    test_results[target_trait] = test_r2

    # Report
    summary = (target_trait, train_r2, val_r2, test_r2)
    print("Trait %s - best model -  train set: %0.4f, val set: %0.4f, test set: %0.4f" % summary)

    

In [None]:
# Write the grid-search results to disk: a human-readable text report and a
# one-table CSV summary, both under ./results.

#best_params[Cond_O] = "learning_rate: 0.1, max_depth: 3, min_child_weight: 3, n_estimators: 1000}"

_SEPARATOR = "-------------------------------------------------\n"


def _as_float(score):
    """Return `score` as a plain Python float (accepts NumPy or Python scalars)."""
    # np.asscalar has been removed from NumPy; float() covers both scalar kinds.
    return float(score)


def _mean_score(results):
    """Return the mean of the scores in `results`, or NaN when it is empty."""
    if not results:
        return float("nan")
    return sum(_as_float(score) for score in results.values()) / len(results)


def _write_text_section(out, heading, results):
    """Write one split's per-trait scores, their average and a separator to `out`."""
    out.write(heading)
    for trait, score in results.items():
        out.write("trait: " + str(trait) + ", R square: " + str(_as_float(score)) + "\n")
    # Each section is averaged over its own dictionary.
    out.write("Average R square: " + str(_mean_score(results)) + "\n")
    out.write(_SEPARATOR)


def _csv_row(model_name, split_name, results, traits):
    """Build one CSV line: model, split, one cell per trait (blank if missing), average."""
    cells = [model_name, split_name]
    for trait in traits:
        cells.append(str(_as_float(results[trait])) if trait in results else "")
    cells.append(str(_mean_score(results)))
    return ", ".join(cells) + "\n"


now = datetime.datetime.now()

# A notebook has no __file__; the previous os.path.dirname("__file__") always
# evaluated to "", i.e. the current working directory, which is made explicit here.
script_dir = os.getcwd()
results_dir = os.path.join(script_dir, "results")
os.makedirs(results_dir, exist_ok=True)

report_stem = os.path.join(results_dir, exp_desc + "_" + now.strftime("%Y-%m-%d-%H-%M") + "_results")
result_file_name = report_stem + ".txt"
csv_file_name = report_stem + ".csv"

# Text report; the context manager closes the file even if a write fails.
with open(result_file_name, "w") as result_file:
    result_file.write(_SEPARATOR)
    result_file.write("best parameters\n")
    for target_trait in run_traits:
        # Include the trait name so each parameter set can be attributed.
        result_file.write("trait: " + str(target_trait) + ", parameters: "
                          + str(final_parameters[target_trait]) + "\n")
    result_file.write(_SEPARATOR)

    _write_text_section(result_file, "Here are the training data set results:\n", train_results)
    _write_text_section(result_file, "Here are the validation data set results:\n", val_results)
    _write_text_section(result_file, "Here are the test data set results: \n", test_results)

# CSV summary; the header is derived from `all_traits` so that its columns
# always line up with the data rows.
with open(csv_file_name, "w") as result_file:
    result_file.write(", ".join(["Model", "Dataset"] + list(all_traits) + ["Average"]) + "\n")
    result_file.write(_csv_row(exp_desc, "train", train_results, all_traits))
    result_file.write(_csv_row(exp_desc, "val", val_results, all_traits))
    result_file.write(_csv_row(exp_desc, "test", test_results, all_traits))