# Testing Random Forest Model on Hold-out Data

In [1]:
import ee
# Authenticate with Earth Engine first, then initialise the client library
# against the project passed via `project=`. Every later cell relies on this
# initialised `ee` session.
ee.Authenticate()
ee.Initialize(project='ee-aspenjkmorgan')

### Import held-out data

In [2]:
# Hold-out split: the last 10 percent of the data, kept aside to check how
# well the model generalizes.
test_asset_id = 'users/aspenjkmorgan/k_folds/test_v2'
testing = ee.FeatureCollection(test_asset_id)

# Display the first feature as a quick look at the available properties.
testing.first().getInfo()

{'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-114.189259, 48.363709]},
 'id': '00000000000000000000',
 'properties': {'DPT': 7.049996852874756,
  'PBLH': 176.98858642578125,
  'PRES': 91151.5,
  'RH': 45.5619720135636,
  'TMP': 19.07999610900879,
  'WDIR': 157.40000915527344,
  'WIND': 2.009999990463257,
  'aod': 0.0965,
  'pm25': 8.4275,
  'random': 0.7443328619811324,
  'station': 'Flathead Valley',
  'system:time_start': 1335124800000}}

### Use best split of data for training the model



In [3]:
# Training split: the best-performing fold of the training data.
train_asset_id = 'users/aspenjkmorgan/k_folds/train6_v2'
training = ee.FeatureCollection(train_asset_id)

# Display the first feature as a quick look at the available properties.
training.first().getInfo()

{'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-114.189259, 48.363709]},
 'id': '00000000000000000000',
 'properties': {'DPT': 9.450006484985352,
  'PBLH': 1632.96826171875,
  'PRES': 91517.5,
  'RH': 47.9038797554143,
  'TMP': 20.899988174438477,
  'WDIR': 131.14999389648438,
  'WIND': 1.184999942779541,
  'aod': 0.133,
  'pm25': 4.04225,
  'random': 0.09903436499566254,
  'station': 'Flathead Valley',
  'system:time_start': 1344103200000}}

### Helper functions

In [4]:
def difference(feature):
  """Attach the squared prediction error to a feature.

  Reads the predicted value from 'pm_cal' and the observed value from
  'pm25', and returns the feature with (pm_cal - pm25)^2 stored in the
  'sq_diff' property.
  """
  predicted = ee.Number(feature.get('pm_cal'))
  observed = ee.Number(feature.get('pm25'))
  residual = predicted.subtract(observed)
  return feature.set('sq_diff', residual.pow(2))

def getTss(feature):
  """Attach the squared deviation from the mean observation to a feature.

  Reads the observed value from 'pm25', subtracts the module-level
  `mean_actual` (which must be defined before this function is called),
  and returns the feature with the squared result stored in 'tss'.
  """
  observed = ee.Number(feature.get('pm25'))
  deviation = observed.subtract(ee.Number(mean_actual))
  return feature.set('tss', deviation.pow(2))

### Use best hyperparameters in model



In [5]:
# Random forest with the best hyperparameters, switched to regression output
# and fit on the training fold to predict 'pm25' from the listed inputs.
rf_model = (
    ee.Classifier.smileRandomForest(
        numberOfTrees=100,
        variablesPerSplit=3,
        minLeafPopulation=2,
        bagFraction=0.5,
        maxNodes=None,
        seed=0,
    )
    .setOutputMode('REGRESSION')
    .train(
        features=training,
        classProperty='pm25',
        inputProperties=['PRES', 'aod', 'RH', 'PBLH', 'DPT', 'WIND', 'WDIR', 'TMP'],
    )
)

In [None]:
# Export the trained model to an Earth Engine asset so the app can use it.
export_task = ee.batch.Export.classifier.toAsset(
    rf_model,
    'Saved random forest',
    'users/aspenjkmorgan/random_forest_2023-12-20',
)
export_task.start()

In [6]:
# Run the trained model over the hold-out features; each prediction is
# written to the 'pm_cal' property.
testing_applied = testing.classify(classifier=rf_model, outputName='pm_cal')

In [7]:
# Display the first feature of the classified collection to inspect its properties.
testing_applied.first().getInfo()

{'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-114.189259, 48.363709]},
 'id': '00000000000000000000',
 'properties': {'DPT': 7.049996852874756,
  'PBLH': 176.98858642578125,
  'PRES': 91151.5,
  'RH': 45.5619720135636,
  'TMP': 19.07999610900879,
  'WDIR': 157.40000915527344,
  'WIND': 2.009999990463257,
  'aod': 0.0965,
  'pm25': 8.4275,
  'pm_cal': 5.905838605201665,
  'random': 0.7443328619811324,
  'station': 'Flathead Valley',
  'system:time_start': 1335124800000}}

### Calculate Metrics (RMSE and $R^2$)



In [8]:
# Mean observed PM2.5 over the hold-out set; getTss reads this global.
mean_actual = testing_applied.aggregate_mean('pm25')

# Add the squared residual ('sq_diff') and the squared deviation from the
# mean ('tss') to every feature.
testing_applied = testing_applied.map(difference).map(getTss)

# RMSE on the hold-out data: square root of the mean squared residual.
mean_sq_error = testing_applied.reduceColumns(ee.Reducer.mean(), ['sq_diff']).get('mean')
testing_rmse = ee.Number(mean_sq_error).sqrt()

In [9]:
# Fetch the RMSE value from the server and print it.
print(f'RMSE: {testing_rmse.getInfo()}')

RMSE: 10.39414272387502


In [10]:
# R^2 = 1 - RSS / TSS, where RSS sums the squared residuals ('sq_diff') and
# TSS sums the squared deviations from the mean observation ('tss').
sum_reducer = ee.Reducer.sum()
rss = ee.Number(testing_applied.reduceColumns(sum_reducer, ['sq_diff']).get('sum'))
tss = ee.Number(testing_applied.reduceColumns(sum_reducer, ['tss']).get('sum'))
unexplained_fraction = rss.divide(tss)
r2 = ee.Number(1).subtract(unexplained_fraction)

In [11]:
# Fetch the R^2 value from the server and print it.
print(f'R2: {r2.getInfo()}')

R2: 0.5626536142143996
