# Classification

In [5]:
import glob, os, time
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio import features

import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestClassifier
#from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix

from pathlib import Path
from IPython.display import display

# Reaching this point means every import above succeeded.
# The scikit-learn version is reported because its API changes between releases.
print('All libraries successfully imported!')
print(f'Scikit-learn : {sklearn.__version__}')

All libraries successfully imported!
Scikit-learn : 0.24.2


## Set directory

In [20]:
# --- Root locations -------------------------------------------------------
# NOTE(review): absolute, machine-specific root; adapt it when running elsewhere.
computer_path = '/export/miro/ndeffense/LBRAT2104/'
grp_letter    = 'X'

# Look-up tables are shared between groups, work files are per group
lut_path  = f'{computer_path}data/LUT/'
work_path = f'{computer_path}GROUP_{grp_letter}/WORK/'

# --- Inputs (sub-folders of the work directory) ---------------------------
in_situ_path = f'{work_path}IN_SITU/'
im_path      = f'{work_path}3_L2A_MASKED/'
ndvi_path    = f'{work_path}NDVI/'

# --- Output ---------------------------------------------------------------
classif_path = f'{work_path}CLASSIF/'

# Create the output folder (and any missing parent); no error if it already exists
os.makedirs(classif_path, exist_ok=True)

print(f'Classification path is set to : {classif_path}')

Classification path is set to : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/CLASSIF/


## Set parameters

In [7]:
# Study site and year, used to build every input/output file name
site, year = 'NAMUR', '2020'

# Value used for "no data" pixels in the rasterized in situ data
no_data = 0

# Number of trees grown by the Random Forest
trees_nb = 500

# Attribute holding the class used for the classification (numeric code, readable name)
field_classif_code, field_classif_name = 'sub_nb', 'sub'

# Attribute holding the class used for the re-classification (numeric code, readable name)
field_reclassif_code, field_reclassif_name = 'grp_A_nb', 'grp_A'

## Set filenames

In [21]:
# Calibration in situ data: the vector file and its rasterized version share one stem
_cal_stem = f'{in_situ_path}{site}_{year}_IN_SITU_ROI_CAL'
in_situ_cal_shp = _cal_stem + '.shp'
in_situ_cal_tif = _cal_stem + '.tif'

# Classification products: the re-classified map reuses the classification stem
# and appends the name of the re-classification field
_classif_stem = f'{classif_path}{site}_{year}_CLASSIF_RF_with_NDVI'
classif_tif   = _classif_stem + '.tif'
reclassif_tif = _classif_stem + f'_{field_reclassif_name}.tif'

# Look-up table linking classification codes to re-classification codes
s4s_lut_xlsx = f'{lut_path}crop_dictionary_new.xlsx'


## 1. Prepare classification features associated to *in situ* data

### 1.1 Rasterize *in situ* data calibration shapefile

In [9]:
# Pick the raster whose grid (size, transform, CRS) the in situ polygons are burned onto.
# The list is sorted so the same template is chosen on every run (glob returns files
# in arbitrary order), and an empty folder gives an explicit error.
template_candidates = sorted(glob.glob(f'{im_path}*.tif'))

if not template_candidates:
    raise FileNotFoundError(f'No GeoTIFF found in {im_path} to use as raster template')

img_temp_tif = template_candidates[0]

print(f'Raster template file : {img_temp_tif}')

# Open the shapefile with GeoPandas
in_situ_gdf = gpd.read_file(in_situ_cal_shp)

# Read everything needed from the template inside a context manager,
# so the file handle is closed whatever happens afterwards
with rasterio.open(img_temp_tif, "r") as src:
    out_meta = src.meta.copy()
    template_crs = src.crs
    template_transform = src.transform
    template_shape = (src.height, src.width)

if template_crs is None:
    raise ValueError(f'Raster template has no CRS : {img_temp_tif}')

# Only one band (the class code) is written, whatever the template contains.
# NOTE(review): the data type is inherited from the template - confirm it can hold
# the largest class code.
out_meta.update(nodata=no_data, count=1)

# Compare EPSG codes obtained from the CRS objects instead of parsing their string form
crs_shp = in_situ_gdf.crs.to_epsg() if in_situ_gdf.crs is not None else None
crs_tif = template_crs.to_epsg()

print(f'The CRS of in situ data is    : {crs_shp}')
print(f'The CRS of raster template is : {crs_tif}')

if crs_shp is not None and crs_shp == crs_tif:
    print("CRS are the same")
else:
    # Reproject a copy in memory (the shapefile on disk is not modified) so the
    # rasterization always happens on the template grid
    print('CRS are different --> in situ data reprojected in memory with "to_crs"')
    in_situ_gdf = in_situ_gdf.to_crs(template_crs.to_wkt())

print(f'Rasterize starts : {in_situ_cal_shp}')

# (geometry, value) pairs: each polygon is burned with its class code
geom_col = in_situ_gdf.geometry
code_col = in_situ_gdf[field_classif_code].astype(int)

shapes = zip(geom_col, code_col)

# Pixels not covered by any polygon receive the no-data value
in_situ_arr = features.rasterize(shapes=shapes,
                                 out_shape=template_shape,
                                 fill=no_data,
                                 transform=template_transform,
                                 dtype=out_meta['dtype'])

with rasterio.open(in_situ_cal_tif, 'w', **out_meta) as dst:
    dst.write(in_situ_arr, 1)

print(f'Rasterize is done : {in_situ_cal_tif}')



Raster template file : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/3_L2A_MASKED/T31UFS_20200417T104021_B11_10m_ROI_SCL.tif
The CRS of in situ data is    : 32631
The CRS of raster template is : 32631
CRS are the same
Rasterize starts : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/IN_SITU/NAMUR_2020_IN_SITU_ROI_CAL.shp
Rasterize is done : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/IN_SITU/NAMUR_2020_IN_SITU_ROI_CAL.tif


### 1.2 List all the classification features

In this case, one NDVI per month

In [10]:
# Get list of all files containing the features
# Get the list of all files containing the features.
# Sorted, so the order of the features in the stack is the same on every run.
list_im = sorted(glob.glob(f'{ndvi_path}*.tif'))

if not list_im:
    raise FileNotFoundError(f'No feature raster (*.tif) found in {ndvi_path}')

# Read the first band of every feature raster; the context manager closes
# each file even if reading fails
list_src_arr = []

for im_file in list_im:
    with rasterio.open(im_file, "r") as src:
        list_src_arr.append(src.read(1))

# Merge all the 2D matrices from the list into one 3D matrix (rows, columns, features)
feat_arr = np.dstack(list_src_arr).astype(np.float32)

print(feat_arr.shape)
print(f'There are {feat_arr.shape[2]} features')
print(f'The features type is : {feat_arr.dtype}')

(570, 986, 12)
There are 12 features
The features type is : float32


### 1.3 Pairing *in situ* data (Y) with EO classification features (X)

Now that we have the image we want to classify (our X feature inputs), and the ROI with the land cover labels (our Y labeled data), we need to pair them up in NumPy arrays so we may feed them to Random Forest.

In [11]:
# Open in-situ used for calibration

# Read the rasterized in situ data used for calibration.
# The context manager closes the file even if reading fails.
with rasterio.open(in_situ_cal_tif, "r") as src:
    cal_arr = src.read(1)

# Every pixel with a value above zero carries a class code:
# counting them gives the number of training samples
n_samples = (cal_arr > 0).sum()

print(f'We have {n_samples} samples (= calibration pixels)')

We have 32585 samples (= calibration pixels)


What are our classification labels?

In [12]:
# Keep only the labelled pixels (value above zero), then list the distinct class codes
labelled_pixels = cal_arr[cal_arr > 0]
labels = np.unique(labelled_pixels)

print(f'The training data include {labels.size} classes: {labels}')

The training data include 16 classes: [1111 1121 1152 1171 1192 1435 1511 1771 1811 1911 1923 3199 4111 6999
 8111 8411]


We need :
- **"X" 2D matrix** containing classification features
- **"y" 1D matrix** containing our labels

These will have `n_samples` rows.

In [13]:
# Boolean mask of the calibration pixels, shared by the features and the labels
# so that row i of X and element i of y describe the same pixel
cal_mask = cal_arr > 0

# Labels: one class code per calibration pixel
y = cal_arr[cal_mask]

# Features: one row per calibration pixel, one column per feature.
# NaN values are replaced by -10.
X = np.nan_to_num(feat_arr[cal_mask, :], nan=-10)

print(f'Our X matrix is sized: {X.shape}')
print(f'Our y array is sized: {y.shape}')

Our X matrix is sized: (32585, 12)
Our y array is sized: (32585,)


## 2. Train the Random Forest

Now that we have our X 2D-matrix of feature inputs and our y 1D-matrix, we can train our model.

Visit this <a href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html" target="_blank">web page</a>  to find the usage of RandomForestClassifier from scikit-learn.

In [14]:
start_training = time.time()

# Initialize our model
# - random_state fixes the bootstrap samples and the feature draws, so the OOB score,
#   the feature importances and the final map are identical from one run to the next
# - n_jobs=-1 grows the trees on all available cores (same result, shorter training)
rf = RandomForestClassifier(n_estimators=trees_nb,
                            oob_score=True,
                            random_state=42,
                            n_jobs=-1)

# Fit our model to training data
rf = rf.fit(X, y)

end_training = time.time()

# Get time elapsed during the Random Forest training, formatted as hh:mm:ss.ss
hours, rem = divmod(end_training-start_training, 3600)
minutes, seconds = divmod(rem, 60)
print("Random Forest training : {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))


Random Forest training : 00:01:35.48


With our Random Forest model fit, we can check out the "Out-of-Bag" (OOB) prediction score:

In [15]:
# Out-of-bag accuracy as a percentage rounded to two decimals
oob_accuracy_pct = round(rf.oob_score_ * 100,2)
print(f'Our OOB prediction of accuracy is: {oob_accuracy_pct}%')

Our OOB prediction of accuracy is: 99.28%


To get an idea of which feature bands were important, we can look at the feature importance scores:

In [16]:
# One importance score per column of X; bands are numbered from 1 in the printout
for band_idx, score in enumerate(rf.feature_importances_):
    print(f'Band {band_idx + 1} importance: {round(score,4)}')

Band 1 importance: 0.0741
Band 2 importance: 0.0181
Band 3 importance: 0.1064
Band 4 importance: 0.0936
Band 5 importance: 0.1284
Band 6 importance: 0.0632
Band 7 importance: 0.1153
Band 8 importance: 0.1143
Band 9 importance: 0.1094
Band 10 importance: 0.0693
Band 11 importance: 0.0551
Band 12 importance: 0.0529


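Let's look at a cross-tabulation to see the class confusion. Note that the predictions below are made on the same pixels that were used to train the model, so this table shows how well the model fits the calibration data, not how well it generalizes to new pixels.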

In [17]:
# Setup a dataframe
# Put the reference labels next to the labels predicted by the model
# for the same (calibration) pixels
df = pd.DataFrame({
    'truth': y,
    'predict': rf.predict(X),
})

# Cross-tabulate predictions: rows = truth, columns = prediction,
# with the row/column totals added in the "All" margins
cross_tab = pd.crosstab(index=df['truth'], columns=df['predict'], margins=True)
display(cross_tab)


predict,1111,1121,1152,1171,1192,1435,1511,1771,1811,1911,1923,3199,4111,6999,8111,8411,All
truth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1111,7406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7406
1121,0,2161,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2161
1152,0,0,1636,0,0,0,0,0,0,0,0,0,0,0,0,0,1636
1171,0,0,0,490,0,0,0,0,0,0,0,0,0,0,0,0,490
1192,0,0,0,0,1526,0,0,0,0,0,0,0,0,0,0,0,1526
1435,0,0,0,0,0,1328,0,0,0,0,0,0,0,0,0,0,1328
1511,0,0,0,0,0,0,1601,0,0,0,0,0,0,0,0,0,1601
1771,0,0,0,0,0,0,0,2578,0,0,0,0,0,0,0,0,2578
1811,0,0,0,0,0,0,0,0,1804,0,0,0,0,0,0,0,1804
1911,0,0,0,0,0,0,0,0,0,403,0,0,0,0,0,0,403


## 3. Predicting the rest of the image

With our Random Forest classifier fit, we can now proceed by trying to classify the entire image.

In [36]:
# Take our full image and reshape into long 2d array (nrow * ncol, nband) for classification

# Take our full image and reshape into long 2d array (nrow * ncol, nband) for classification

# NaN values are replaced by -10, the same value as in the training matrix X,
# so the model sees missing data encoded the same way
img = np.nan_to_num(feat_arr, nan=-10)

n_rows, n_cols, n_feat = img.shape

img_as_array = img.reshape(n_rows * n_cols, n_feat)

print(f'Reshaped from {img.shape} to {img_as_array.shape}')

start_classification = time.time()

# Now predict for each pixel
class_prediction = rf.predict(img_as_array)

# Reshape our classification map back to the (rows, columns) grid of the image
class_prediction = class_prediction.reshape(n_rows, n_cols)

end_classification = time.time()

# Time elapsed during the classification (not the training), formatted as hh:mm:ss.ss
hours, rem = divmod(end_classification-start_classification, 3600)
minutes, seconds = divmod(rem, 60)
print("Random Forest classification : {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

print(class_prediction)


Reshaped from (570, 986, 12) to (562020, 12)
Random Forest training : 00:01:48.56
[[3199 3199 8111 ... 3199 3199 3199]
 [3199 3199 3199 ... 3199 3199 3199]
 [3199 3199 6999 ... 3199 3199 1121]
 ...
 [6999 6999 6999 ... 1111 1111 1111]
 [6999 6999 6999 ... 1111 1111 1111]
 [8111 8111 8111 ... 8111 8111 8111]]


Write Classification product into a GeoTIFF file

In [37]:
print(f'Size of classification image : {class_prediction.shape}')

# Open template image to get metadata (only the profile is needed, no pixel is read)
with rasterio.open(img_temp_tif) as src:
    profile = src.profile

# The classification map has a single band, whatever the template contains.
# NOTE(review): the data type is inherited from the template - confirm it can hold
# the largest class code.
profile.update(count=1)

# Write classification image; the context manager closes the file
# even if the write fails
with rasterio.open(classif_tif, 'w', **profile) as dst:
    dst.write(class_prediction, 1)

Size of classification image : (570, 986)


## 4. Reclassification

### 4.1 Open LUT and sort values

In [38]:
# Load the look-up table and order it by increasing classification code
lut_df = pd.read_excel(s4s_lut_xlsx).sort_values(by=field_classif_code, ascending=True)

# Show only the code/name columns involved in the re-classification
lut_columns = [field_classif_code, field_classif_name,
               field_reclassif_code, field_reclassif_name]

display(lut_df[lut_columns].head())

Unnamed: 0,sub_nb,sub,grp_A_nb,grp_A
0,0,Unknown,0,Remove
1,1111,Winter wheat,111,Wheat
2,1112,Spring wheat,111,Wheat
3,1113,Hard wheat,111,Wheat
4,1114,Soft wheat,111,Wheat


### 4.2 Reclassify prediction

In [41]:
# Read the classification map back from disk, so this cell does not depend on
# variables left in memory by earlier cells
with rasterio.open(classif_tif) as src:
    profile = src.profile
    classif_arr = src.read(1)

print(classif_arr)

# The masks are computed on the untouched classification while the new codes are
# written in a copy: a pixel that was already reclassified can never be remapped
# a second time when a new code happens to be equal to another old code.
reclass_prediction = classif_arr.copy()

# Classes actually present in the map: LUT rows for other classes are skipped
present_classes = set(np.unique(classif_arr).tolist())

# If a classification code is listed more than once, the first row is used
lut_pairs = lut_df.drop_duplicates(subset=field_classif_code, keep='first')

for old_class, new_class in zip(lut_pairs[field_classif_code], lut_pairs[field_reclassif_code]):

    if old_class not in present_classes:
        continue

    reclass_prediction[classif_arr == old_class] = new_class

# Classes absent from the LUT keep their original code: report them instead of
# letting them slip silently into the re-classified map
missing_classes = sorted(present_classes - set(lut_pairs[field_classif_code]))

if missing_classes:
    print(f'Warning : classes not found in the LUT are left unchanged : {missing_classes}')

print(reclass_prediction)

[[3199 3199 8111 ... 3199 3199 3199]
 [3199 3199 3199 ... 3199 3199 3199]
 [3199 3199 6999 ... 3199 3199 1121]
 ...
 [6999 6999 6999 ... 1111 1111 1111]
 [6999 6999 6999 ... 1111 1111 1111]
 [8111 8111 8111 ... 8111 8111 8111]]
[[  3   3   8 ...   3   3   3]
 [  3   3   3 ...   3   3   3]
 [  3   3   6 ...   3   3 112]
 ...
 [  6   6   6 ... 111 111 111]
 [  6   6   6 ... 111 111 111]
 [  8   8   8 ...   8   8   8]]


### 4.3 Write re-classification product into a GeoTIFF file

In [40]:
print(f'Size of re-classification image : {reclass_prediction.shape}')

# Open template image to get metadata (only the profile is needed, no pixel is read)
with rasterio.open(img_temp_tif) as src:
    profile = src.profile

# The re-classification map has a single band, whatever the template contains.
# NOTE(review): the data type is inherited from the template - confirm it can hold
# the largest re-classification code.
profile.update(count=1)

# Write re-classification image; the context manager closes the file
# even if the write fails
with rasterio.open(reclassif_tif, 'w', **profile) as dst:
    dst.write(reclass_prediction, 1)

Size of re-classification image : (570, 986)
