In [1]:
import ee
import geemap
import pandas as pd
import numpy as np
import os

In [2]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [3]:
from sklearn.model_selection import RepeatedKFold, train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPRegressor

In [4]:
ee.Initialize()

In [5]:
# Create a common region of interest (San Francisco)
roi = ee.Geometry.Polygon([[[-123.28736502403991,36.53106212747138],
        [-120.57374197716491,36.53106212747138],
        [-120.57374197716491,38.629116592353306],
        [-123.28736502403991,38.629116592353306],
        [-123.28736502403991,36.53106212747138]]])

In [6]:
# Data COllection for July
image_S5P = ee.ImageCollection("COPERNICUS/S5P/NRTI/L3_NO2") \
    .filterBounds(roi) \
    .filterDate('2018-07-01', '2018-07-31') \
    .sort('ALGORITHM_VERSION') \
    .select('NO2_column_number_density') \
    .median() \
    .clip(roi)


image_LANDSAT8 = ee.ImageCollection('LANDSAT/LC08/C01/T1_SR') \
    .filterBounds(roi) \
    .filterDate('2018-07-01', '2018-07-31') \
    .sort('CLOUD_COVER') \
    .select('B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B10', 'B11', 'sr_aerosol') \
    .median() \
    .clip(roi)


image_S2_SR = ee.ImageCollection("COPERNICUS/S2") \
  .filterBounds(roi) \
  .filterDate('2018-07-01', '2018-07-31'); 

# This tells us what images are inside the collection
# These are the bands that we want to be displayed
S2_bands = ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7','B8','B8A','B9', 'B11', 'B12']
bands=['B4', 'B3', 'B2']

# This turns the whole S2 collection into one image, finding the middle value for each pixel
image_S2_SR = image_S2_SR.median().select(S2_bands).clip(roi)


In [8]:
#traing data for July
ground_truth= image_LANDSAT8.addBands(image_S5P).addBands(image_S2_SR)

work_dir = os.path.expanduser('C:\\Users\\Lizzy\\OneDrive - PARC\\Documents\\iem_groundsensor\\q2_model_files_plots')
in_shp = os.path.join(work_dir, 'random_points_sf_lc.shp')

in_fc = geemap.shp_to_ee(in_shp)

proj = ee.Projection('EPSG:4326')
reproj_image = ground_truth.resample('bilinear').reproject(crs=proj, scale=30)


out_csv = os.path.join(work_dir, 'Combined_ESRI_S2_LandSat8_S5_San_Francisco_TPOT July.csv')
geemap.extract_values_to_points(in_fc, reproj_image, out_csv)


The input shapefile could not be found.
expected str, bytes or os.PathLike object, not NoneType


EEException: Parameter 'collection' is required.

In [None]:
df=pd.read_csv('Combined_ESRI_S2_LandSat8_S5_San_Francisco_TPOT July.csv')

df=df.drop('system:index', axis=1)

In [None]:
df = df.iloc[: , 1:]

In [None]:
df['L'] = le.fit_transform(df['L'].values)

In [None]:
df.columns

In [None]:
y = df['NO2_column_number_density']
X=df[['L', 'B8A', 'B10', 'B11', 'B12', 'B11_1', 'B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B9', 'sr_aerosol',
       'B6_1', 'B7_1', 'B4_1', 'B5_1', 'B2_1', 'B3_1', 'B1_1']]

In [None]:
from sklearn.model_selection import RepeatedKFold, train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPRegressor

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
model = GradientBoostingRegressor(alpha=0.85, learning_rate=0.2, loss='ls', max_depth=5, max_features=1.0, 
                          min_samples_leaf=12, min_samples_split=7, 
                          n_estimators=50, subsample=0.6500000000000001)

In [None]:
model.fit(X_train, y_train)

In [None]:
print('Training Accuracy : %.3f'% model.score(X_train,y_train))

print('Test Accuracy : %.3f'%model.score(X_test,y_test))

In [None]:
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.title('Feature Importance')
plt.yticks(pos, np.array(X.columns)[sorted_idx])
result = permutation_importance(model, X_train, y_train, n_repeats=10,
                                random_state=42, n_jobs=2)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(result.importances[sorted_idx].T,
            vert=False, labels=np.array(X.columns)[sorted_idx])
plt.title("Permutation Importance")
fig.tight_layout()
plt.show()

In [None]:
# define model
model = DecisionTreeRegressor(max_depth=8,
                           min_samples_split=5,
                           max_leaf_nodes=20)
# fit model
model.fit(X_train,y_train)
print('Training Accuracy : %.3f'% model.score(X_train,y_train))

print('Test Accuracy : %.3f'%model.score(X_test,y_test))

In [None]:
model = RandomForestRegressor(bootstrap=False, max_features=0.35000000000000003, min_samples_leaf=7, min_samples_split=20, n_estimators=100)
model.fit(X_train, y_train)
print('Training Accuracy : %.3f'% model.score(X_train,y_train))

print('Test Accuracy : %.3f'%model.score(X_test,y_test))

In [None]:
testdf = pd.read_csv('dense_pts_lc3.csv')

In [None]:
testdf.columns

In [None]:
col_names = ['OID_', 'Id', 'L', 'B1', 'B2', 'B3', 'B4', 'B5', 'B6',
       'B7', 'B8', 'B8A', 'B9', 'B10', 'B11', 'B12',
       'b14_S2', 'b15_S2', 'b16_S2', 'B1_1', 'B2_1', 'B3_1', 'B4_1',
       'B5_1', 'B6_1', 'B7_1', 'B10_1', 'B11_1', 'sr_aerosol', 'b11_L8',
       'b12_L8', 'Lat', 'Lon']

In [None]:
testdf.columns = col_names

In [None]:
mod_df = testdf[['L', 'B8A', 'B10', 'B11', 'B12', 'B11_1', 'B1', 'B2', 'B3', 'B4', 'B5',
        'B6', 'B7', 'B8', 'B9', 'sr_aerosol',
       'B6_1', 'B7_1', 'B4_1', 'B5_1', 'B2_1', 'B3_1', 'B1_1']]

In [None]:
df.head()

In [None]:
pred = model.predict(mod_df)

In [None]:
pred_df = pd.DataFrame(pred)

In [None]:
pred_file = pd.concat([testdf, pred_df], axis=1)

In [None]:
pred_file

In [None]:
pred_file.to_csv('dense_pts_lc_results.csv')

In [None]:
dense_train = pd.read_csv('lc_sf_training.csv')

dense_train['L'] = le.fit_transform(dense_train['L'].values)

sample_df = dense_train.sample(frac=0.1)
dense_test_df = dense_train.drop(sample_df.index)

len(dense_test_df)

dense_train.columns

y = sample_df['no2']
X=sample_df[['L', 'b1_S2', 'b2_S2', 'b3_S2', 'b4_S2', 'b5_S2', 'b6_S2',
       'b7_S2', 'b8_S2', 'b9_S2', 'b10_S2', 'b11_S2', 'b12_S2', 'b13_S2',
       'b14_S2', 'b15_S2', 'b16_S2', 'b1_L8', 'b2_L8', 'b3_L8', 'b4_L8',
       'b5_L8', 'b6_L8', 'b7_L8', 'b8_L8', 'b9_L8', 'b10_L8', 'b11_L8',
       'b12_L8', 'Lat', 'Lon']]

In [None]:
y = sample_df['no2']
X=sample_df[['L', 'b1_S2', 'b2_S2', 'b3_S2', 'b4_S2', 'b5_S2', 'b6_S2',
       'b7_S2', 'b8_S2', 'b9_S2', 'b10_S2', 'b11_S2', 'b12_S2', 'b13_S2',
       'b14_S2', 'b15_S2', 'b16_S2', 'Lat', 'Lon']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

model = GradientBoostingRegressor(alpha=0.85, learning_rate=0.2, loss='ls', max_depth=5, max_features=1.0, 
                          min_samples_leaf=12, min_samples_split=7, 
                          n_estimators=50, subsample=0.6500000000000001)

model.fit(X_train, y_train)

print('Training Accuracy : %.3f'% model.score(X_train,y_train))

print('Test Accuracy : %.3f'%model.score(X_test,y_test))

In [None]:
model = RandomForestRegressor(bootstrap=False, max_features=0.35000000000000003, min_samples_leaf=7, min_samples_split=20, n_estimators=100)
model.fit(X_train, y_train)
print('Training Accuracy : %.3f'% model.score(X_train,y_train))

print('Test Accuracy : %.3f'%model.score(X_test,y_test))

In [None]:
# define model
model = DecisionTreeRegressor(max_depth=8,
                           min_samples_split=5,
                           max_leaf_nodes=20)
# fit model
model.fit(X_train,y_train)
print('Training Accuracy : %.3f'% model.score(X_train,y_train))

print('Test Accuracy : %.3f'%model.score(X_test,y_test))

In [None]:
y_dense_test = dense_test_df['no2']
X_dense_test=dense_test_df[['L', 'b1_S2', 'b2_S2', 'b3_S2', 'b4_S2', 'b5_S2', 'b6_S2',
       'b7_S2', 'b8_S2', 'b9_S2', 'b10_S2', 'b11_S2', 'b12_S2', 'b13_S2',
       'b14_S2', 'b15_S2', 'b16_S2', 'b1_L8', 'b2_L8', 'b3_L8', 'b4_L8',
       'b5_L8', 'b6_L8', 'b7_L8', 'b8_L8', 'b9_L8', 'b10_L8', 'b11_L8',
       'b12_L8', 'Lat', 'Lon']]

In [None]:
y_dense_test.shape

In [None]:
model.score(X_dense_test, y_dense_test)

In [None]:
pred_dense_test = model.predict(X_dense_test)

In [None]:
pred = pd.DataFrame(pred_dense_test)

In [None]:
X_dense_test = X_dense_test.reset_index()
pred = pred.reset_index()

In [None]:
pred_dense_test_df = pd.concat([X_dense_test, pred], axis=1, ignore_index=True)

In [None]:
pred_dense_test_df.shape

In [None]:
pred_dense_test_df.head()

In [None]:
pred_dense_test_df.to_csv("dense_test_sf_0718_V3.csv")

In [None]:
ddf = pd.read_csv('la_dense_pts.csv')

In [None]:
ddf = ddf[['L', 'b1_S2', 'b2_S2', 'b3_S2', 'b4_S2', 'b5_S2', 'b6_S2',
       'b7_S2', 'b8_S2', 'b9_S2', 'b10_S2', 'b11_S2', 'b12_S2', 'b13_S2',
       'b14_S2', 'b15_S2', 'b16_S2', 'b1_l8', 'b2_l8', 'b3_l8', 'b4_l8',
       'b5_l8', 'b6_l8', 'b7_l8', 'b8_l8', 'b9_l8', 'b10_l8', 'b11_l8',
       'b12_l8', 'Lat', 'Lon']]

In [None]:
ddf.columns = ['L', 'b1_S2', 'b2_S2', 'b3_S2', 'b4_S2', 'b5_S2', 'b6_S2',
       'b7_S2', 'b8_S2', 'b9_S2', 'b10_S2', 'b11_S2', 'b12_S2', 'b13_S2',
       'b14_S2', 'b15_S2', 'b16_S2', 'b1_L8', 'b2_L8', 'b3_L8', 'b4_L8',
       'b5_L8', 'b6_L8', 'b7_L8', 'b8_L8', 'b9_L8', 'b10_L8', 'b11_L8',
       'b12_L8', 'Lat', 'Lon']

In [None]:
no2_dense_la = model.predict(ddf)

In [None]:
no2_la = pd.DataFrame(no2_dense_la)

In [None]:
no2_la_pred = pd.concat([ddf, no2_la], axis=1)

In [None]:
no2_la_pred.to_csv('dense_la_results.csv')

In [None]:
june_df = pd.read_csv('dense_june18_sf2.csv')

In [None]:
june_df.columns = ['ID', 'L', 'b1_S2', 'b2_S2', 'b3_S2', 'b4_S2', 'b5_S2', 'b6_S2',
       'b7_S2', 'b8_S2', 'b9_S2', 'b10_S2', 'b11_S2', 'b12_S2', 'b13_S2',
       'b14_S2', 'b15_S2', 'b16_S2',  'Lat', 'Lon']

In [None]:
june_df = june_df[['L', 'b1_S2', 'b2_S2', 'b3_S2', 'b4_S2', 'b5_S2', 'b6_S2',
       'b7_S2', 'b8_S2', 'b9_S2', 'b10_S2', 'b11_S2', 'b12_S2', 'b13_S2',
       'b14_S2', 'b15_S2', 'b16_S2',  'Lat', 'Lon']]

In [None]:
mod_june_df = june_df.dropna()

In [None]:
june18_pred = model.predict(mod_june_df)

In [None]:
june18_pred_df = pd.DataFrame(june18_pred)

In [None]:
mod_june_df = mod_june_df.reset_index()
june18_pred_df = june18_pred_df.reset_index()

In [None]:
pred_sf_june_df = pd.concat([mod_june_df, june18_pred_df], axis=1, ignore_index=True)

In [None]:
pred_sf_june_df.to_csv('pred_june_sf.csv')