# Testing Multiple Linear Regression Model on Hold-out Data

In [1]:
import ee
# Authenticate this session, then initialize the Earth Engine client
# against the Cloud project named below. Both calls must run before any
# other `ee` call in this notebook.
ee.Authenticate()
ee.Initialize(project='ee-aspenjkmorgan')

### Import held out data

In [2]:
# Load the held-out test split (the last 10 percent of the data), which is
# kept aside to check how well the model generalizes.
testing = ee.FeatureCollection('users/aspenjkmorgan/k_folds/test_v2')
# Fetch the first feature so its properties can be inspected in the output.
testing.first().getInfo()

{'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-114.189259, 48.363709]},
 'id': '00000000000000000000',
 'properties': {'DPT': 7.049996852874756,
  'PBLH': 176.98858642578125,
  'PRES': 91151.5,
  'RH': 45.5619720135636,
  'TMP': 19.07999610900879,
  'WDIR': 157.40000915527344,
  'WIND': 2.009999990463257,
  'aod': 0.0965,
  'pm25': 8.4275,
  'random': 0.7443328619811324,
  'station': 'Flathead Valley',
  'system:time_start': 1335124800000}}

### Use best split of data for training the model

In [3]:
# Load the best-performing fold of the training data; the model is fit on it.
training = ee.FeatureCollection('users/aspenjkmorgan/k_folds/train8_v2')
# Fetch the first feature so its properties can be inspected in the output.
training.first().getInfo()

{'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-114.189259, 48.363709]},
 'id': '00000000000000000001',
 'properties': {'DPT': 7.849999904632568,
  'PBLH': 1735.4619140625,
  'PRES': 90981,
  'RH': 34.764906845648156,
  'TMP': 24.399988174438477,
  'WDIR': 126.55000305175781,
  'WIND': 2.1999998092651367,
  'aod': 0.365,
  'pm25': 11.91625,
  'random': 0.1234366235718175,
  'station': 'Flathead Valley',
  'system:time_start': 1345399200000}}

### Helper functions

In [None]:
# Squared residual between predicted and observed PM2.5.
def difference(feature):
    """Return `feature` with its squared prediction error stored as 'sq_diff'.

    Reads the 'pm_cal' (predicted) and 'pm25' (observed) properties of the
    feature and sets 'sq_diff' to (pm_cal - pm25) ** 2.
    """
    predicted = ee.Number(feature.get('pm_cal'))
    observed = ee.Number(feature.get('pm25'))
    residual = predicted.subtract(observed)
    return feature.set('sq_diff', residual.pow(2))

def getTss(feature):
    """Return `feature` with its squared deviation from the mean stored as 'tss'.

    Sets 'tss' to (pm25 - mean_actual) ** 2. `mean_actual` is a module-level
    name that this function reads when it is called, so it must be defined
    before this function is mapped over a collection.
    """
    observed = ee.Number(feature.get('pm25'))
    deviation = observed.subtract(ee.Number(mean_actual))
    return feature.set('tss', deviation.pow(2))

### Multiple Linear Regression Model

In [None]:
# Add a constant column to the training set for the bias (intercept) term.
def addConstant(feature):
    """Return `feature` with a 'constant' property set to 1."""
    with_bias = feature.set('constant', 1)
    return with_bias
# Give every training feature the 'constant' property used as the bias input.
training = training.map(addConstant)

# Fit the regression: the first six selectors are the inputs (numX=6) and
# the last selector, 'pm25', is the single response (numY=1).
mlr_model = training.reduceColumns(
    reducer=ee.Reducer.linearRegression(numX=6, numY=1),
    selectors=['aod', 'WIND', 'RH', 'DPT', 'PBLH', 'constant', 'pm25'],
)

# Convert the coefficient array to a list of rows so that each coefficient
# can be read with .get(row) followed by .get(0).
coeffs = ee.Array(mlr_model.get('coefficients')).toList()

def applyLinearRegModel(feature):
    """Return `feature` with the model's PM2.5 prediction stored as 'pm_cal'.

    The prediction is the sum of each predictor property multiplied by its
    coefficient, plus the intercept. Coefficients are read from the
    module-level `coeffs` list: row i holds the coefficient for the i-th
    predictor below, and the row after the last predictor holds the
    intercept.
    """
    predictors = ('aod', 'WIND', 'RH', 'DPT', 'PBLH')

    # One weighted term per predictor, in the same order as the rows of coeffs.
    terms = [
        ee.Number(feature.get(name)).multiply(ee.List(coeffs.get(row)).get(0))
        for row, name in enumerate(predictors)
    ]
    intercept = ee.Number(ee.List(coeffs.get(len(predictors))).get(0))

    # Accumulate left to right, finishing with the intercept.
    prediction = terms[0]
    for term in terms[1:]:
        prediction = prediction.add(term)
    return feature.set('pm_cal', prediction.add(intercept))

In [None]:
# Get predictions: apply the fitted model to every feature of the held-out
# test set, which adds a 'pm_cal' property to each one.
testing_applied = testing.map(applyLinearRegModel)

In [None]:
# Inspect the first test feature to confirm the 'pm_cal' prediction is present.
testing_applied.first().getInfo()

{'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-114.189259, 48.363709]},
 'id': '00000000000000000000',
 'properties': {'DPT': 7.049996852874756,
  'PBLH': 176.98858642578125,
  'PRES': 91151.5,
  'RH': 45.5619720135636,
  'TMP': 19.07999610900879,
  'WDIR': 157.40000915527344,
  'WIND': 2.009999990463257,
  'aod': 0.0965,
  'pm25': 8.4275,
  'pm_cal': 7.612772639114636,
  'random': 0.7443328619811324,
  'station': 'Flathead Valley',
  'system:time_start': 1335124800000}}

### Calculate Metrics (RMSE and $R^2$)

In [None]:
# Mean observed PM2.5 over the test set; getTss reads this module-level name.
mean_actual = testing_applied.aggregate_mean('pm25')

# Attach the squared residual ('sq_diff') and the squared deviation from the
# mean ('tss') to every test feature.
testing_applied = testing_applied.map(difference).map(getTss)

# RMSE on the held-out test data: square root of the mean squared residual.
mean_sq_diff = testing_applied.reduceColumns(
    reducer=ee.Reducer.mean(),
    selectors=['sq_diff'],
).get('mean')
testing_rmse = ee.Number(mean_sq_diff).sqrt()

In [None]:
# Evaluate the RMSE and report it.
print(f'RMSE: {testing_rmse.getInfo()}')

RMSE: 10.559041793796531


In [None]:
# Coefficient of determination on the test set: R2 = 1 - RSS / TSS.
# RSS is the sum of squared residuals; TSS is the sum of squared deviations
# of the observed values from their mean.
rss = ee.Number(
    testing_applied.reduceColumns(
        reducer=ee.Reducer.sum(),
        selectors=['sq_diff'],
    ).get('sum')
)
tss = ee.Number(
    testing_applied.reduceColumns(
        reducer=ee.Reducer.sum(),
        selectors=['tss'],
    ).get('sum')
)
unexplained = rss.divide(tss)
r2 = ee.Number(1).subtract(unexplained)

In [None]:
# Evaluate R2 and report it.
print(f'R2: {r2.getInfo()}')

R2: 0.5486668764100632
