# Classification with Random Forest
Supervised, object-based land cover classification using the random forest algorithm.

**Import libraries**

In [3]:
import time

import gdal
import geopandas as gpd
import numpy as np
import ogr
import pandas as pd
import scipy
import scipy.stats  # explicit: `import scipy` alone does not guarantee the stats submodule is loaded
from skimage import exposure
from skimage.segmentation import slic
from sklearn.ensemble import RandomForestClassifier

# Confirmation message; it is only reached when every import above succeeded.
print('Libraries imported successfully')

ModuleNotFoundError: No module named 'geopandas'

In [2]:
# Reference timestamp; later cells subtract it from time.time() to report execution time.
seg_start=time.time()

**Define directories**

 - <b>tmp</b>: directory
 
 - <b>tmp_raster</b>: raster name
  
 - <b>naip_fn</b>: path to the raster image
 

In [3]:
# Input locations. These are absolute local paths: adjust them to your machine.
tmp = 'E:/cloudbutton/DATASET/'  # base data directory (trailing slash included)
tmp_raster = 'valenciana_704000.0-4400000.0_comunidad_mask.tif'  # raster file name

# Full path of the raster image that is segmented and classified below.
naip_fn = tmp + 'Dataset_processing/Tiling/' + tmp_raster


### Image segmentation
Segment the image with SLIC so that the resulting segments can be used as the objects to classify.

In [4]:
# Read every band of the input raster, stack the bands into one
# (rows, cols, bands) array, rescale the intensities and run SLIC to obtain
# the segment label image used by the rest of the notebook.
driverTiff = gdal.GetDriverByName('GTiff')
naip_ds = gdal.Open(naip_fn)
if naip_ds is None:
    # gdal.Open returns None on failure unless gdal.UseExceptions() is enabled;
    # fail here with a clear message instead of an AttributeError further down.
    raise FileNotFoundError(f'GDAL could not open raster: {naip_fn}')
nbands = naip_ds.RasterCount
band_data = []

for i in range(1, nbands + 1):  # GetRasterBand indices start at 1
    band = naip_ds.GetRasterBand(i).ReadAsArray()
    band_data.append(band)
band_data = np.dstack(band_data)
img = exposure.rescale_intensity(band_data)

segments = slic(img, n_segments=10000, compactness=0.1)
print('segments complete', time.time() - seg_start)

  if sys.path[0] == '':


segments complete 263.8433792591095


**Save** segments to raster

In [5]:
# Write the segment label image to a single-band GeoTIFF that carries the same
# geotransform and projection as the input raster.
# NOTE(review): tmp_raster already ends in '.tif', so this file name ends in
# '.tif.tif' -- confirm whether that is intended.
output = f'{tmp}OBIA/Segmentation/segmented_{tmp_raster}.tif'
n_cols, n_rows = naip_ds.RasterXSize, naip_ds.RasterYSize
segments_ds = driverTiff.Create(output, n_cols, n_rows, 1, gdal.GDT_Float32)
segments_ds.SetProjection(naip_ds.GetProjectionRef())
segments_ds.SetGeoTransform(naip_ds.GetGeoTransform())
segments_ds.GetRasterBand(1).WriteArray(segments)
# Drop the dataset reference so GDAL can close the file (see the GDAL Python
# "gotchas" notes on closing datasets).
segments_ds = None

### Spectral properties of image segments
Compute summary statistics (minimum, maximum, mean, variance, skewness and kurtosis) of every band of the raster image for each segment.

In [6]:
def segment_features(segment_pixels):
    """Compute per-band summary statistics for the pixels of one segment.

    Parameters
    ----------
    segment_pixels : numpy.ndarray
        2-D array of shape (npixels, nbands): one row per pixel of the
        segment, one column per band.

    Returns
    -------
    list of float
        Six values per band, concatenated band after band in the order
        minimum, maximum, mean, variance, skewness, kurtosis.

    Notes
    -----
    ``scipy.stats.describe`` yields NaN for the variance of a single pixel
    and, depending on the SciPy version, for the skewness/kurtosis of a
    single-pixel or constant-valued segment. Every non-finite statistic is
    replaced by 0.0 so the feature matrix never carries NaN/inf into the
    classifier.

    Raises
    ------
    ValueError
        If ``segment_pixels`` is not 2-D (shape unpacking fails).
    """
    features = []
    _, nbands = segment_pixels.shape
    for b in range(nbands):
        stats = scipy.stats.describe(segment_pixels[:, b])
        # (min, max) followed by mean, variance, skewness, kurtosis
        band_stats = list(stats.minmax) + list(stats)[2:]
        features += [float(v) if np.isfinite(v) else 0.0 for v in band_stats]
    return features

# Build one feature vector per segment.
#
# Pixels are grouped by segment once, with a stable sort of the flattened
# label image, instead of building a full-image boolean mask for every
# segment (cost proportional to n_segments * n_pixels). The stable sort keeps
# the pixels of a segment in row-major order, i.e. the order that
# img[segments == segment_id] would return them in.
flat_labels = segments.ravel()
pixel_order = np.argsort(flat_labels, kind='stable')
# segment_ids equals np.unique(segments); group_starts[k] is the position in
# pixel_order where the pixels of segment_ids[k] begin.
segment_ids, group_starts = np.unique(flat_labels[pixel_order], return_index=True)
group_ends = np.append(group_starts[1:], flat_labels.size)
flat_img = img.reshape(-1, img.shape[-1])  # (n_pixels, n_bands)

objects = []      # feature vector of each segment, aligned with segment_ids
object_ids = []   # segment id of each entry in objects
for segment_id, start, end in zip(segment_ids, group_starts, group_ends):
    segment_pixels = flat_img[pixel_order[start:end]]
    object_features = segment_features(segment_pixels)
    objects.append(object_features)
    object_ids.append(segment_id)

### Training and test data
Training data: a shapefile of ground-truth points (`truth_data_definitiva.shp`) is needed to train the classifier.


Test data: a portion of the ground-truth points is held out from training (30% in the split below).

In [23]:
# Load the ground-truth points and expose the shapefile's "id" attribute under
# the name "label" (the "id" column is re-created with new values further down).
column_renames = {"id":"label","geometry":"geometry"}
truth_points = gpd.read_file('E:/rf_classification/truth_data_definitiva.shp')
gdf = truth_points.rename(columns=column_renames)

Create a unique id for each land cover class (label)

In [24]:
# Distinct labels found in the truth data, each paired with a consecutive
# integer id starting at 1; df is the resulting label -> id lookup table.
class_names = gdf['label'].unique()
class_ids = np.arange(1, class_names.size + 1)
df = pd.DataFrame({'label': class_names, 'id': class_ids})

Add a new column to the GeoDataFrame with the id of each class (label)

In [25]:
# Translate every point's label into its numeric class id.
label_to_id = dict(zip(class_names, class_ids))
gdf['id'] = gdf['label'].map(label_to_id)
print(gdf)

    label                   geometry  id
0       1  POINT (-0.55361 39.71167)   1
1       1  POINT (-0.55903 39.71065)   1
2       1  POINT (-0.49431 39.70662)   1
3       1  POINT (-0.48813 39.68796)   1
4       1  POINT (-0.49374 39.66142)   1
5       1  POINT (-0.54507 39.63492)   1
6       1  POINT (-0.54804 39.62685)   1
7       1  POINT (-0.60602 39.61899)   1
8       1  POINT (-0.48554 39.65142)   1
9       1  POINT (-0.52793 39.71774)   1
10      1  POINT (-0.57780 39.63598)   1
11      1  POINT (-0.61070 39.63912)   1
12      1  POINT (-0.49545 39.59359)   1
13      1  POINT (-0.49885 39.63507)   1
14      2  POINT (-0.57473 39.68867)   2
15      2  POINT (-0.58966 39.68161)   2
16      2  POINT (-0.60366 39.68019)   2
17      2  POINT (-0.61356 39.67681)   2
18      2  POINT (-0.49606 39.61080)   2
19      2  POINT (-0.47070 39.59551)   2
20      2  POINT (-0.45132 39.60435)   2
21      2  POINT (-0.45549 39.61711)   2
22      2  POINT (-0.50543 39.61517)   2
23      2  POINT

Split the truth data into **training and test** data sets and save each to a new shapefile

In [27]:
# 70% of the observations are assigned to the training data, the remaining 30%
# to the test data. A fixed random_state makes the split (and therefore every
# downstream result) reproducible across kernel restarts.
gdf_train = gdf.sample(frac=0.7, random_state=42)
gdf_test = gdf.drop(gdf_train.index)

# save training and test data to shapefiles
gdf_train.to_file('E:/rf_classification/train_data.shp')
gdf_test.to_file('E:/rf_classification/test_data.shp')

### Associate train and test data sets with the segments 
Convert the training data to raster format so that each point can be associated with an image segment.

In [29]:
# Open the training shapefile with OGR and grab its layer for rasterization.
train_fn = 'E:/rf_classification/train_data.shp'
train_ds = ogr.Open(train_fn)
if train_ds is None:
    # ogr.Open returns None on failure unless exceptions are enabled; fail here
    # with a clear message instead of an AttributeError on GetLayer().
    raise FileNotFoundError(f'OGR could not open training data: {train_fn}')
lyr = train_ds.GetLayer()

Create a new raster layer in memory

In [31]:
# In-memory, single-band UInt16 raster with the same size, geotransform and
# projection as the input image; the training points are burned into it next.
driver = gdal.GetDriverByName('MEM')
grid_cols = naip_ds.RasterXSize
grid_rows = naip_ds.RasterYSize

# driver.Create(path, cols, rows, bands, dtype) -- the path is left empty here
target_ds = driver.Create('', grid_cols, grid_rows, 1, gdal.GDT_UInt16)
target_ds.SetGeoTransform(naip_ds.GetGeoTransform())
target_ds.SetProjection(naip_ds.GetProjection())

0

Rasterize the training points

In [36]:
# Burn the "id" attribute of every training point into band 1 of the in-memory
# raster, then read the band back as a numpy array.
options = ['ATTRIBUTE=id']
gdal.RasterizeLayer(target_ds, [1], lyr, options=options)
ground_truth = target_ds.GetRasterBand(1).ReadAsArray()
data = ground_truth.copy()  # independent second copy of the same band

Get unique values  for each land cover type

In [47]:
# Class ids present in the rasterized training data. Cells holding 0 carry no
# class (class ids start at 1), so 0 is removed explicitly rather than by
# dropping whatever the smallest value happens to be.
classes = np.unique(ground_truth)
classes = classes[classes != 0]
print(classes)

[1 2 3 4]


For each class record the associated segment ids

In [48]:
# For every class, collect the ids of the segments that contain at least one
# training cell of that class.
segments_per_class = {
    klass: set(segments[ground_truth == klass])
    for klass in classes
}

Check those segments

In [49]:
# Show the mapping {class id: set of segment ids holding training points of that class}.
# A segment id listed under more than one class has conflicting ground truth.
print(segments_per_class)

{1: {2, 131, 38, 361, 10, 273, 276, 92, 318}, 2: {4, 377, 42, 76, 368, 276, 118, 183, 120, 249}, 3: {131, 35, 4, 37, 75, 252}, 4: {352, 416, 290, 4, 356, 302, 207, 313, 283, 381}}


### Land cover classification
Based on Random Forest algorithm 

Relabel every pixel that belongs to a training segment with a value greater than the threshold, so that it can be told apart from the remaining segment ids.

In [107]:
# Work on a copy of the segment image and overwrite every training segment
# with a value that cannot collide with a segment id.
train_img = segments.copy()
threshold = train_img.max() + 1  # strictly greater than every segment id

# NOTE(review): the extra +10 means the values left after subtracting the
# threshold later are klass + 10, not klass -- confirm this is intended.
label_offset = threshold + 10
for klass, class_segments in segments_per_class.items():
    class_label = klass + label_offset
    for segment_id in class_segments:
        train_img[train_img == segment_id] = class_label

In [108]:
# Inspect the relabelled segment image.
print(train_img)

435
[[  0   0   0 ...   6   6   6]
 [  0   0   0 ...   6   6   6]
 [  0   0   0 ...   6   6   6]
 ...
 [434 434 434 ... 195 195 195]
 [434 434 434 ... 195 195 195]
 [434 434 434 ... 195 195 195]]


Set all pixels of segments without a class to 0 and subtract the threshold from the rest, so that only the class-derived values remain.

In [2]:
# Pixels above the threshold belong to training segments: shift them back down
# by the threshold. Everything else is cleared to 0.
# Note: the array is modified in place, so run this cell only once per train_img.
is_training_pixel = train_img > threshold
train_img[~is_training_pixel] = 0
train_img[is_training_pixel] -= threshold

NameError: name 'train_img' is not defined

Create objects and labels for training data

In [111]:
# Build the training set: the feature vector of every segment that holds
# training points, paired with the class of those points.
#
# A segment that holds points of more than one class would enter the training
# set several times with contradictory labels, so such segments are excluded
# and reported.
claimed_segments = [seg for klass in classes for seg in segments_per_class[klass]]
claimed_ids, claim_counts = np.unique(claimed_segments, return_counts=True)
ambiguous_segments = set(claimed_ids[claim_counts > 1])
if ambiguous_segments:
    print(f'Skipping {len(ambiguous_segments)} segment(s) labelled with more than one class:',
          sorted(ambiguous_segments))

training_objects = []  # feature vectors
training_labels = []   # class id of each feature vector, same order
for klass in classes:
    class_segments = segments_per_class[klass] - ambiguous_segments
    # objects[i] holds the features of segment_ids[i]
    class_train_object = [v for i, v in enumerate(objects) if segment_ids[i] in class_segments]
    training_labels += [klass] * len(class_train_object)
    training_objects += class_train_object

Setup **Random Forest Classifier**, train and predict

In [None]:
# Set up the random forest classifier. n_jobs=-1 uses every available core;
# the fixed random_state makes the fitted forest, and therefore the
# classification, reproducible between runs.
classifier = RandomForestClassifier(n_jobs=-1, random_state=42)
classifier.fit(training_objects, training_labels)  # fit on the labelled segments
predicted = classifier.predict(objects)  # one predicted class per segment, in objects order

Create numpy array and save to raster

In [113]:
# Paint every pixel with the class predicted for its segment.
#
# Relabelling a copy of `segments` in place is unsafe because segment ids and
# class values share the same integer range: pixels already set to class k
# would be rewritten when the segment whose id equals k is processed. A lookup
# avoids this: predicted[k] is the class of segment_ids[k], and segment_ids is
# sorted (it comes from np.unique), so searchsorted gives each pixel's k.
segment_index = np.searchsorted(segment_ids, segments)
clf = np.asarray(predicted)[segment_index].astype(segments.dtype)

Mask no-data values

In [None]:
# Flag pixels whose bands do not sum to a positive value as no-data.
#
# The mask is built with np.where instead of writing -1.0 into the band-sum
# array: that array has an unsigned integer dtype for unsigned integer
# imagery, where -1.0 cannot be stored. np.where also makes the cell safe to
# re-run, because existing -9999 cells are never multiplied by -1.
band_sum = np.sum(img, axis=2)
mask = np.where(band_sum > 0, 1.0, -1.0)  # 1.0 = valid pixel, -1.0 = no data
clf = np.where(mask > 0, clf, -9999.0)

Save classification

In [114]:
# Write the classified image to a single-band Float32 GeoTIFF that carries the
# georeferencing of the input raster; -9999.0 is registered as its no-data value.
output_rf = 'E:/rf_classification/classified_result.tif'

out_cols, out_rows = naip_ds.RasterXSize, naip_ds.RasterYSize
clfds = driverTiff.Create(output_rf, out_cols, out_rows, 1, gdal.GDT_Float32)
clfds.SetProjection(naip_ds.GetProjection())
clfds.SetGeoTransform(naip_ds.GetGeoTransform())
clfds.GetRasterBand(1).SetNoDataValue(-9999.0)
clfds.GetRasterBand(1).WriteArray(clf)
# Drop the dataset reference so GDAL can close the file (see the GDAL Python
# "gotchas" notes on closing datasets).
clfds = None

Done!


In [116]:
# Total wall-clock time since seg_start was recorded.
elapsed_s = time.time() - seg_start
print(f'Random forest classification finished successfully in {elapsed_s} s')

Random forest classification finished successfully in 13086.662047863007 s
