# Testing Random Forest Model on Hold-out Data

In [None]:
import ee
# Reuse stored Earth Engine credentials when they are available, and only fall
# back to the interactive OAuth flow when initialization fails. This keeps
# "Restart & Run All" non-interactive on a machine that is already authorized
# and avoids writing a fresh verification code into the saved cell output.
try:
  ee.Initialize()
except Exception:  # broad on purpose: the failure type for missing credentials varies by client version
  ee.Authenticate()
  ee.Initialize()

To authorize access needed by Earth Engine, open the following URL in a web browser and follow the instructions. If the web browser does not start automatically, please manually browse the URL below.

    https://code.earthengine.google.com/client-auth?scopes=https%3A//www.googleapis.com/auth/earthengine%20https%3A//www.googleapis.com/auth/devstorage.full_control&request_id=IgzttPa6HKxhBIgpl9Tm_kB2soOO4CTS7L1JdmPlDw8&tc=tze6_NmGi0D4-Ss3WtVOFc5w9Q0OLZNUpy_pelsHnVQ&cc=FzduK6XDPFvkSJE94OOQntnFTM7lrJ1Ov7uElldSESk

The authorization workflow will generate a code, which you should paste in the box below.
Enter verification code: [REDACTED]

Successfully saved authorization token.


### Import held-out data

In [None]:
# Hold-out collection (the last 10 percent of the data), used only to check
# how well the model generalizes.
testing = ee.FeatureCollection('users/aspenjkmorgan/k_folds/test')

# Show one feature so the available properties are visible.
first_test_feature = testing.first()
first_test_feature.getInfo()

{'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-114.189259, 48.363709]},
 'id': '00000000000000000000',
 'properties': {'DPT': 5.050012111663818,
  'PBLH': 903.87353515625,
  'PRES': 91148.5,
  'RH': 59.955150648997744,
  'TMP': 12.624994277954102,
  'WDIR': 201.3000030517578,
  'WIND': 1.7549999952316284,
  'aod': 0.217,
  'pm25': 3.103142857,
  'random': 0.9806150715932024,
  'station': 'Flathead Valley',
  'system:time_start': 1339092000000}}

### Use best split of data for training the model



In [None]:
# Training fold selected as the best split (fold 9).
training = ee.FeatureCollection('users/aspenjkmorgan/k_folds/training_fold_9')

# Show one feature so the available properties are visible.
first_training_feature = training.first()
first_training_feature.getInfo()

{'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-114.189259, 48.363709]},
 'id': '00000000000000000000',
 'properties': {'DPT': -4.599997043609619,
  'PBLH': 66.51596069335938,
  'PRES': 91613.5,
  'RH': 67.44328718390516,
  'TMP': 0.7449890375137329,
  'WDIR': 234.10000610351562,
  'WIND': 0.5600000023841858,
  'aod': 0.008,
  'pm25': 5.5785,
  'random': 0.006469227008691791,
  'station': 'Flathead Valley',
  'system:time_start': 1325534400000}}

### Helper functions

In [None]:
# Compute the difference between computed pm25 and actual
def difference(feature):
  """Attach the squared prediction error to a feature.

  Reads the predicted value from 'pm_cal' and the observed value from 'pm25',
  then stores (pm_cal - pm25)^2 under the 'sq_diff' property.

  Args:
    feature: a feature carrying both 'pm_cal' and 'pm25' properties.

  Returns:
    Whatever `feature.set` returns, i.e. the feature with 'sq_diff' set.
  """
  predicted = ee.Number(feature.get('pm_cal'))
  observed = ee.Number(feature.get('pm25'))
  squared_error = predicted.subtract(observed).pow(2)
  return feature.set('sq_diff', squared_error)

def getTss(feature):
  """Attach the squared deviation of the observed value from the mean.

  Stores (pm25 - mean_actual)^2 under the 'tss' property. `mean_actual` is
  not a parameter: it is a notebook-level name that must already be defined
  when this function is called.

  Args:
    feature: a feature carrying a 'pm25' property.

  Returns:
    Whatever `feature.set` returns, i.e. the feature with 'tss' set.
  """
  observed = ee.Number(feature.get('pm25'))
  centre = ee.Number(mean_actual)
  deviation = observed.subtract(centre)
  return feature.set('tss', deviation.pow(2))

### Use best hyperparameters in model



In [None]:
# Random forest in regression mode, built with the selected hyperparameters
# and fit on the training fold to predict 'pm25'.
# NOTE(review): the features also carry a 'TMP' property that is not listed
# as a predictor here - confirm that the omission is intentional.
input_properties = ['PRES', 'aod', 'RH', 'PBLH', 'DPT', 'WIND', 'WDIR']

rf_model = (
    ee.Classifier.smileRandomForest(
        numberOfTrees=100,
        variablesPerSplit=2,
        minLeafPopulation=4,
        bagFraction=0.7,
        maxNodes=None,
        seed=0,
    )
    .setOutputMode('REGRESSION')
    .train(
        features=training,
        classProperty='pm25',
        inputProperties=input_properties,
    )
)

In [None]:
# Run the trained model over the hold-out features; each prediction is
# written to the 'pm_cal' property.
testing_applied = testing.classify(
    classifier=rf_model,
    outputName='pm_cal',
)

In [None]:
# Show one classified feature to confirm that 'pm_cal' was added.
first_prediction = testing_applied.first()
first_prediction.getInfo()

{'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-114.189259, 48.363709]},
 'id': '00000000000000000000',
 'properties': {'DPT': 5.050012111663818,
  'PBLH': 903.87353515625,
  'PRES': 91148.5,
  'RH': 59.955150648997744,
  'TMP': 12.624994277954102,
  'WDIR': 201.3000030517578,
  'WIND': 1.7549999952316284,
  'aod': 0.217,
  'pm25': 3.103142857,
  'pm_cal': 8.952964286752378,
  'random': 0.9806150715932024,
  'station': 'Flathead Valley',
  'system:time_start': 1339092000000}}

### Calculate Metrics (RMSE and $R^2$)



In [None]:
# Mean observed PM2.5 over the hold-out set; getTss reads this name.
mean_actual = testing_applied.aggregate_mean('pm25')

# Add the per-feature squared error ('sq_diff') and the squared deviation
# from the mean ('tss').
testing_applied = testing_applied.map(difference).map(getTss)

# RMSE on the hold-out data: square root of the mean squared error.
mean_squared_error = testing_applied.reduceColumns(
    ee.Reducer.mean(), ['sq_diff']
).get('mean')
testing_rmse = ee.Number(mean_squared_error).sqrt()

In [None]:
# Fetch the computed RMSE from Earth Engine and report it.
rmse_value = testing_rmse.getInfo()
print(f'RMSE: {rmse_value}')

RMSE: 11.048054111273144


In [None]:
# R^2 = 1 - RSS / TSS, where RSS is the sum of 'sq_diff' and TSS is the sum
# of 'tss' over the hold-out features.
residual_sum = testing_applied.reduceColumns(ee.Reducer.sum(), ['sq_diff']).get('sum')
total_sum = testing_applied.reduceColumns(ee.Reducer.sum(), ['tss']).get('sum')

rss = ee.Number(residual_sum)
tss = ee.Number(total_sum)

unexplained_fraction = rss.divide(tss)
r2 = ee.Number(1).subtract(unexplained_fraction)

In [None]:
# Fetch the computed R^2 from Earth Engine and report it.
r2_value = r2.getInfo()
print(f'R2: {r2_value}')

R2: 0.48374696741112233
