<a href="https://colab.research.google.com/github/nunocesarsa/Examples/blob/main/AutoML/AutoGluon/Tutorial_AutoGluon_RemoteSensing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AutoGluon

AutoGluon is an AutoML framework recently proposed by an AWS team: https://auto.gluon.ai/stable/index.html


*   https://hangzhang.org/CVPR2020/
*   https://jwmueller.github.io/KDD20-tutorial/
*   Reference paper: https://arxiv.org/abs/2003.06505

# Choose Runtime - GPU


In [None]:
# Here we assume CUDA 10.0 is installed.  You should change the number
# according to your own CUDA version (e.g. mxnet-cu101 for CUDA 10.1).
!pip uninstall -y mkl
!pip install --upgrade mxnet-cu100
!pip install autogluon

!pip install -U ipykernel

!pip install -U dask

# Testing AutoGluon

In [None]:
from autogluon import TabularPrediction as task

# Smoke test: fit on the AutoGluon example train/test CSVs to check that
# the installation works before moving on to the satellite data.
train_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
)
test_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'
)

# 'class' is the label column of the example dataset
predictor = task.fit(
    train_data=train_data,
    label='class',
)

# Evaluate the fitted predictor on the held-out example table
performance = predictor.evaluate(test_data)

# Fetching the data

## Downloading

The data is "stolen" from this excellent tutorial on how to do a classification with R:

*   https://urbanspatial.github.io/classifying_satellite_imagery_in_R/
*   https://github.com/urbanSpatial/classifying_satellite_imagery_in_R

In [2]:
!git clone https://github.com/urbanSpatial/classifying_satellite_imagery_in_R

Cloning into 'classifying_satellite_imagery_in_R'...
remote: Enumerating objects: 26, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 106 (delta 2), reused 23 (delta 2), pack-reused 80[K
Receiving objects: 100% (106/106), 42.79 MiB | 31.23 MiB/s, done.
Resolving deltas: 100% (30/30), done.


## Exploring

In [None]:
!pip install rasterio
!pip install geopandas
!pip install earthpy
!pip install rasterstats

In [11]:
import os
from glob import glob
import matplotlib.pyplot as plt
import rasterio as rio
from rasterio.plot import plotting_extent
import geopandas as gpd
import earthpy as et
import earthpy.spatial as es
import earthpy.plot as ep

import rasterstats


import sklearn


### Plotting the RGB image

In [None]:
# Adapted from: https://earthpy.readthedocs.io/en/latest/gallery_vignettes/plot_rgb.html

# Fetch the 30 m band images, excluding the panchromatic and cirrus bands.
# The pattern can also match other files whose number contains a digit 1-7
# (presumably band10/band11), so bands are filtered by number below.
landsat_bands_data_path = "/content/classifying_satellite_imagery_in_R/data/band*[1-7]*.tif"


def _band_number(band_path):
    """Return the integer band number from a ``band<N>.tif`` file path."""
    file_name = os.path.basename(band_path)
    return int(file_name[len("band"):-len(".tif")])


stack_band_paths = glob(landsat_bands_data_path)
stack_band_paths.sort()

# Select bands 1-7 by their number and order them numerically, instead of
# relying on hard-coded list positions that only hold for one exact listing.
stack_band_paths_sorted = sorted(
    (band_path for band_path in stack_band_paths if 1 <= _band_number(band_path) <= 7),
    key=_band_number,
)
if len(stack_band_paths_sorted) != 7:
    raise FileNotFoundError(
        "Expected band1.tif ... band7.tif, found: " + str(stack_band_paths_sorted)
    )

# Create the image stack and apply the nodata value for Landsat
arr_st, meta = es.stack(stack_band_paths_sorted, nodata=-9999)

# Create a figure with one plot
fig, ax = plt.subplots(figsize=(12, 12))

# rgb holds layer indices into arr_st; with bands 1-7 stacked in order,
# (3, 2, 1) picks bands 4, 3 and 2.
ep.plot_rgb(arr_st, rgb=(3, 2, 1), ax=ax, title="Landsat 8 RGB Image")
plt.show()

### Loading the points and plotting the overlay

In [6]:
# Read the training points, then reproject them to EPSG:32612
# (presumably the CRS of the Landsat bands - confirm against the raster metadata).
training_points_path = '/content/classifying_satellite_imagery_in_R/data/calgary_trainingPoints.shp'
sample_shp = gpd.read_file(training_points_path)
shp_prj = sample_shp.to_crs(epsg=32612)

# Show the reprojected table as the cell output
shp_prj

Unable to open EPSG support file gcs.csv.  Try setting the GDAL_DATA environment variable to point to the directory containing EPSG csv files.


Unnamed: 0,class,id,geometry
0,clouds,1.0,POINT (273026.189 5668459.275)
1,clouds,1.0,POINT (274910.786 5668286.479)
2,clouds,1.0,POINT (275346.879 5668903.405)
3,clouds,1.0,POINT (275993.389 5668347.157)
4,clouds,1.0,POINT (275833.212 5668744.188)
...,...,...,...
696,water,4.0,POINT (289879.993 5639054.215)
697,water,4.0,POINT (290748.279 5637522.819)
698,water,4.0,POINT (290663.551 5637559.526)
699,water,4.0,POINT (290709.630 5637615.015)


In [None]:
# The plotting extent is taken from the opened raster via rasterio.
# The pixel values are not needed here, so the band is not read into memory.
with rio.open('/content/classifying_satellite_imagery_in_R/data/band1.tif') as image_src:
  img_extent = plotting_extent(image_src)

fig, ax = plt.subplots(figsize=(12, 12))

# Passing the extent places the image in map coordinates, so the projected
# training points drawn on the same axes line up with it.
ep.plot_rgb(arr_st, rgb=(3, 2, 1), ax=ax, title="Landsat 8 RGB Image", extent=img_extent)
shp_prj.plot(ax=ax)
plt.show()

## Extracting the values

In [9]:
import os

from rasterstats import point_query

# First save everything (the stacked raster and the projected shapefile) to a
# directory. exist_ok=True keeps the cell re-runnable (a plain `mkdir` errors
# once the folder exists), and the absolute path matches the paths used below.
os.makedirs('/content/outputs', exist_ok=True)

#saving
shp_prj.to_file('/content/outputs/sample_loc.shp')
es.stack(stack_band_paths_sorted, out_path='/content/outputs/Landsat.tif') #fails because of uint16 if we add the thermals

#other alternatives here: https://gis.stackexchange.com/questions/223910/using-rasterio-or-gdal-to-stack-multiple-bands-without-using-subprocess-commands

# Sample each of the 7 stacked bands at the training points and store the
# values as columns B1..B7 on the points table.
for i in range(1,8):
  #querying the data
  pt_query = point_query('/content/outputs/sample_loc.shp','/content/outputs/Landsat.tif',band=i)

  band_nr = "B"+str(i)
  print('Processing:',band_nr)

  #adding to the pandas
  shp_prj[band_nr]=pt_query

mkdir: cannot create directory ‘outputs’: File exists


Unable to open EPSG support file gcs.csv.  Try setting the GDAL_DATA environment variable to point to the directory containing EPSG csv files.
Unable to open EPSG support file gcs.csv.  Try setting the GDAL_DATA environment variable to point to the directory containing EPSG csv files.


Processing: B1


Unable to open EPSG support file gcs.csv.  Try setting the GDAL_DATA environment variable to point to the directory containing EPSG csv files.


Processing: B2


Unable to open EPSG support file gcs.csv.  Try setting the GDAL_DATA environment variable to point to the directory containing EPSG csv files.


Processing: B3


Unable to open EPSG support file gcs.csv.  Try setting the GDAL_DATA environment variable to point to the directory containing EPSG csv files.


Processing: B4


Unable to open EPSG support file gcs.csv.  Try setting the GDAL_DATA environment variable to point to the directory containing EPSG csv files.


Processing: B5


Unable to open EPSG support file gcs.csv.  Try setting the GDAL_DATA environment variable to point to the directory containing EPSG csv files.


Processing: B6
Processing: B7


# Setting up for AutoGluon

In [15]:
import pandas as pd

In [25]:
# Import the splitter explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.model_selection` submodule is loaded.
from sklearn.model_selection import train_test_split

# Remove rows with NA values (there is one row somewhere with an NA that causes
# errors). `axis` is passed by keyword because recent pandas releases no longer
# accept it positionally.
test_data = shp_prj.dropna(axis=0)

# Select the band columns as features and convert the label to a category
df_x = test_data[["B1","B2","B3","B4","B5","B6","B7"]]
df_y = test_data[['class']].astype('category')

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, random_state=42, test_size=0.3)

# Put features and label back into one table per split
df_train = pd.concat([X_train,y_train],axis=1)
df_test = pd.concat([X_test,y_test],axis=1)

In [31]:
# Options for the AutoGluon run; time_limits is in seconds (the training log
# reports "Time limit = 180s").
fit_settings = dict(
    label='class',
    time_limits=180,
    num_bagging_folds=5,
    num_bagging_sets=1,
    hyperparameters='light',
    random_seed=42,
)

# Train on the training table built from the sampled band values
auto_glu = task.fit(train_data=df_train, **fit_settings)

No output_directory specified. Models will be saved in: AutogluonModels/ag-20210217_234259/
Beginning AutoGluon training ... Time limit = 180s
AutoGluon will save models to AutogluonModels/ag-20210217_234259/
AutoGluon Version:  0.0.15
Train Data Rows:    490
Train Data Columns: 7
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == category).
	4 unique label values:  ['undeveloped', 'water', 'developed', 'clouds']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data Class Count: 4
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    12473.82 MB
	Train Data (Original)  Memory Usage: 0.03 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in t

In [32]:
from sklearn import metrics

# Predict the held-out table once and reuse the result for both scores,
# instead of running the whole ensemble twice.
y_test_pred = auto_glu.predict(df_test)

print("Validation accuracy score", metrics.accuracy_score(y_test, y_test_pred))

print("Validation kappa accuracy score", metrics.cohen_kappa_score(y_test, y_test_pred))

Validation accuracy score 0.9523809523809523
Validation kappa accuracy score 0.9365252085600291


# Predicting to the raster

- In general: convert the multi-layer raster object to a "table format", apply the model, and reconstruct the raster.

- (Optional) The procedure above also makes it possible to process the data in chunks, which avoids running out of memory on Google Colab.

PS: probably someone could do this better but i don't care

In [33]:
!pip install pyrsgis
from pyrsgis.convert import changeDimension
from pyrsgis import raster
import rasterio
import pyproj
import numpy as np
import pandas as pd


#lading the landsat data 
ds1, bands = raster.read('/content/outputs/Landsat.tif')

print(ds1)
print(bands.shape) 

Collecting pyrsgis
  Downloading https://files.pythonhosted.org/packages/ef/73/1d57263ce7780cec74cf2e7c6dbe53e840521724a131900c5b28ad05d61f/pyrsgis-0.3.3-py3-none-any.whl
Installing collected packages: pyrsgis
Successfully installed pyrsgis-0.3.3
<pyrsgis.raster.createDS object at 0x7fc81e6aca58>
(7, 1413, 1121)




In [34]:
# Flatten the band stack into a table with one row per pixel and one column
# per band (7 columns, N x M rows); the transposed copy is built as well.
# NOTE(review): the original note says the values should be divided by 10000
# to obtain reflectances, but no scaling is applied in this cell.
bandByPixel = changeDimension(bands)
bandByPixel_t = bandByPixel.T

print(bandByPixel.shape)
print(bandByPixel_t.shape)

# Name the columns B1..B7, matching the feature columns used for training
band_columns = ["B" + str(band_index) for band_index in range(1, 8)]
pd_bandByPixel = pd.DataFrame(bandByPixel, columns=band_columns)

(1583973, 7)
(7, 1583973)


## Prediction time 

In [35]:
# Classify every pixel row of the raster table with the trained predictor
y_pred = auto_glu.predict(pd_bandByPixel)
y_pred

# The predictions are class-name strings, so they still have to be converted to integers

array(['water', 'water', 'water', ..., 'water', 'water', 'water'],
      dtype=object)

In [36]:
# List the distinct class labels present in the training points
shp_prj['class'].unique()

array(['clouds', 'developed', 'undeveloped', 'water'], dtype=object)

In [37]:
# Integer code assigned to each class label in the output raster
CLASS_CODES = {'water': 0, 'undeveloped': 1, 'developed': 2, 'clouds': 3}

# Build a real integer array (chained np.where on the string predictions gives
# an object-dtype array and silently keeps any unmapped label as a string).
predicted_labels = np.asarray(y_pred)
y_pred_num = np.full(predicted_labels.shape, -1, dtype=np.int32)
for class_name, class_code in CLASS_CODES.items():
    y_pred_num[predicted_labels == class_name] = class_code

# -1 is the fill value, so any remaining -1 marks a label without a code
if (y_pred_num < 0).any():
    unknown_labels = sorted(set(predicted_labels[y_pred_num < 0].tolist()))
    raise ValueError("Predicted labels without an integer code: " + str(unknown_labels))

In [38]:
# Fold the flat per-pixel predictions back into the raster grid, using the
# row and column counts of the raster that was read earlier.
raster_shape = (ds1.RasterYSize, ds1.RasterXSize)
y_pred_np = np.asarray(y_pred_num).reshape(raster_shape)
y_pred_np.shape


(1413, 1121)

In [40]:
# Export the classified grid as a GeoTIFF in the outputs folder. ds1 is the
# data-source object returned by raster.read for the Landsat stack
# (presumably it supplies the georeferencing for the output - confirm in the pyrsgis docs).
raster.export(y_pred_np, ds1, '/content/outputs/Landsat_Class_AutoGluon.tif', dtype='int') 