In [1]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# Warning used to notify implicit data conversions happening in the code.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)


In [2]:
import os
import glob
from multiprocessing import Pool, cpu_count

import rasterio
from rasterio import features
from rasterio import plot
import numpy as np

import pandas as pd
import geopandas as gpd

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.externals.joblib import parallel_backend

import matplotlib.pylab as plt

In [3]:
# point to the folder with the input data
DATA_PATH = '/home/io/ASTROSAT/code/urban_extraction'


# load training datasets
glob_path = glob.glob(os.path.join(DATA_PATH, 'training_data', '*'))
shapefiles = [f for f in glob_path if f.endswith('.shp')]

def load_raster(input_file):
    with rasterio.open(input_file) as src:
        band_rgb = src.read()
        transform = src.transform
        shape = src.shape
        profile = src.profile

        return {'band_rgb': band_rgb, 'transform': transform, 'shape': shape, 'profile': profile}



def rasterize(vector_Data):
    raster = load_raster(os.path.join(
        DATA_PATH, 'input', 'Sentinel-2_RGB.tiff'))

    labeled_pixels = np.zeros((raster['shape'][0], raster['shape'][1]))
    for i, shp in enumerate(vector_Data):
        label = i+1
        df = gpd.read_file(shp)
        geom = df['geometry']
        vectors_rasterized = features.rasterize(geom,
                                                out_shape=raster['shape'],
                                                transform=raster['transform'],
                                                all_touched=True,
                                                fill=0, default_value=label)
        labeled_pixels += vectors_rasterized

    return labeled_pixels


def training_samples():
    raster = load_raster(os.path.join(
        DATA_PATH, 'input', 'Sentinel-2_RGB.tiff'))
    # convert shape of raster from bands:rows:cols to rows:cols:bands
    raster_img = np.rollaxis(raster['band_rgb'], 0, 3)
    # produce rasterized data
    labeled_pixels = rasterize(shapefiles)

    roi_int = labeled_pixels.astype(int)
    # X is the matrix containing our features
    X = raster_img[roi_int > 0]
    # y contains the values of our training data
    y = labeled_pixels[labeled_pixels > 0]

    return X, y


def split(X, y):
    #X, y = training_samples()
    split_test_data = 0.30
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_test_data, stratify=y)

    return X_train, X_test, y_train, y_test

In [4]:
def tune(X, y, search_type):
    X, y = training_samples()
    X_train, X_test, y_train, y_test = split(X,y)
    
    param_range_c = np.logspace(0, 2, 8)
    param_range_gamma = np.logspace(-6, -1, 8)

    param_grid = {'svm__C': param_range_c,
                  'svm__gamma': param_range_gamma}

    pip = Pipeline([('scale', preprocessing.StandardScaler()),
                    ('svm', SVC(kernel='rbf', class_weight='balanced'))])

    if search_type == 'grid':
        clf = GridSearchCV(estimator=pip,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3)
                           #n_jobs=-1)

        
        clf = clf.fit(X_train, y_train)

        # print accuracy of the model
        print('Best parameters:', clf.best_params_)
        print('Classification accuracy', clf.best_score_)

    elif search_type == 'random':
        clf = RandomizedSearchCV(estimator=pip,
                                 param_distributions=param_grid,
                                 scoring='accuracy',
                                 cv=3,
                                 n_iter=15,
                                 error_score='numeric')  # it supresses the warning error
                                 #n_jobs=-1)

        clf = clf.fit(X_train, y_train)

        # print accuracy of the model
        print('Best parameters:', clf.best_params_)
        print('Classification accuracy:', clf.best_score_)

    return clf


def predict(input_data):
    X, y = training_samples()
    X_train, X_test, y_train, y_test = split(X,y)
    clf = tune(X_train, y_train, 'random')
    y_predict = clf.predict(input_data)
    return y_predict


def parallel_processing():
    raster = load_raster(os.path.join(
        DATA_PATH, 'input', 'Sentinel-2_RGB.tiff'))
    raster_img = np.rollaxis(raster['band_rgb'], 0, 3)
    # split good data into chunks for parallel processing
    cpu_n = cpu_count()

    # Reshape the data so that we make predictions for the whole raster
    new_shape = (raster_img.shape[0] *
                 raster_img.shape[1], raster_img.shape[2])

    
    img_as_array = raster_img[:, :].reshape(new_shape)
    image_array = np.copy(img_as_array)
    split = np.array_split(image_array, cpu_n)

    # run parallel processing of all data with SVM
    pool = Pool(cpu_n)
    svmLablesPredict = pool.map(predict, split)
    # join results back from the queue and insert into full matrix
    svmLablesPredict = np.hstack(svmLablesPredict)
    svm_reshape = svmLablesPredict.reshape(
        raster_img.shape[0], raster_img.shape[1])

    return svm_reshape



def model_accuracy():
    svm_classified = parallel_processing()
    labeled_pixels = rasterize(shapefiles)
    target_names = [os.path.split(s)[1][:-4] for s in shapefiles]

    for_verification = np.nonzero(labeled_pixels)
    verification_labels = labeled_pixels[for_verification]
    predicted_labels = svm_classified[for_verification]  # svm_reshape

    print('Confusion matrix: \n %s' %
          confusion_matrix(verification_labels, predicted_labels))
    print('\n')

    print('Classificaion report: \n %s' %
          classification_report(verification_labels, predicted_labels, target_names=target_names))

    return confusion_matrix, classification_report


accuracy = model_accuracy()



Best parameters: {'svm__gamma': 0.019306977288832496, 'svm__C': 51.7947467923121}
Classification accuracy: 0.9426256077795786
Best parameters: {'svm__gamma': 0.019306977288832496, 'svm__C': 51.7947467923121}
Classification accuracy: 0.9426256077795786
Best parameters: {'svm__gamma': 0.019306977288832496, 'svm__C': 51.7947467923121}
Classification accuracy: 0.9426256077795786
Best parameters: {'svm__gamma': 0.019306977288832496, 'svm__C': 51.7947467923121}
Classification accuracy: 0.9426256077795786
Best parameters: {'svm__gamma': 0.019306977288832496, 'svm__C': 51.7947467923121}
Classification accuracy: 0.9426256077795786
Best parameters: {'svm__gamma': 0.019306977288832496, 'svm__C': 51.7947467923121}
Classification accuracy: 0.9426256077795786
Best parameters: {'svm__gamma': 0.019306977288832496, 'svm__C': 51.7947467923121}
Classification accuracy: 0.9426256077795786
Best parameters: {'svm__gamma': 0.019306977288832496, 'svm__C': 51.7947467923121}
Best parameters: {'svm__gamma': 0.01

In [5]:
DATA_PATH = '/home/io/ASTROSAT/code/urban_extraction'

In [31]:
def load_raster(input_file):
    with rasterio.open(input_file) as src:
        band_rgb = src.read()
        transform = src.transform
        shape = src.shape
        profile = src.profile
        
        return {'band_rgb':band_rgb ,'transform':transform, 'shape':shape, 'profile':profile}

In [26]:
raster, transform, shape, profile = load_raster(os.path.join(DATA_PATH,'input','Sentinel-2_RGB.tiff'))


In [40]:
raster = load_raster(os.path.join(DATA_PATH,'input','Sentinel-2_RGB.tiff'))


In [45]:
raster['transform']

Affine(0.0003593084656084656, 0.0, -4.521561,
       0.0, -0.00020171566054243356, 55.955351)

In [30]:
# ground = gpd.read_file(os.path.join(DATA_PATH,'training_data','bare_ground.shp'))
# water = gpd.read_file(os.path.join(DATA_PATH,'training_data','water.shp'))
# grass = gpd.read_file(os.path.join(DATA_PATH,'training_data','grass.shp'))
# urban = gpd.read_file(os.path.join(DATA_PATH,'training_data','urban.shp'))

In [31]:
# ground_rasterized = features.rasterize([(x.geometry, 1) for i, x in ground.iterrows()],
#                                        out_shape = shape,
#                                        transform = transform,
#                                        all_touched=True,
#                                        fill=0)

# water_rasterized = features.rasterize([(x.geometry, 1) for i, x in water.iterrows()],
#                                        out_shape = shape,
#                                        transform = transform,
#                                        all_touched=True,
#                                        fill=0)

# grass_rasterized = features.rasterize([(x.geometry, 1) for i, x in grass.iterrows()],
#                                        out_shape = shape,
#                                        transform = transform,
#                                        all_touched=True,
#                                        fill=0)

# urban_rasterized = features.rasterize([(x.geometry, 1) for i, x in urban.iterrows()],
#                                        out_shape = shape,
#                                        transform = transform,
#                                        all_touched=True,
#                                        fill=0)

In [6]:
# print ('ground',ground_rasterized.sum())
# print ('water',water_rasterized.sum())
# print ('grass',grass_rasterized.sum())
# print ('urban',urban_rasterized.sum())

ground 1190
water 1049
grass 1424
urban 745


In [40]:
glob_path = glob.glob(os.path.join(DATA_PATH,'training_data','*'))

shapefiles = [f for f in glob_path if f.endswith('.shp')]

In [41]:
labeled_pixels = np.zeros((shape[0],shape[1]))


for i,shp in enumerate(shapefiles):
    label = i+1 
    df = gpd.read_file(shp)
    geom = df['geometry']
    #print (i,geom)
    vectors_rasterized = features.rasterize(geom,
                                           out_shape = shape,
                                           transform = transform,
                                           all_touched=True,
                                           default_value=label)
    labeled_pixels += vectors_rasterized


In [129]:
for i, shp in sorted(enumerate(shapefiles)):
    i = i+1
    shp_path = os.path.split(shp)
    land_classes = shp_path[1][:-4]
    print('Class {land_classes} contains {n} pixels'.format(land_classes=land_classes, n=(labeled_pixels == i).sum()))

Class urban contains 745 pixels
Class water contains 1049 pixels
Class grass contains 1424 pixels
Class bare_ground contains 1190 pixels


In [211]:
raster_img = np.rollaxis(band_rgb,0,3)

roi_int = labeled_pixels.astype(int)
# X is the matrix containing our features
X = raster_img[roi_int>0] 
# y contains the values of our training data
y = labeled_pixels[labeled_pixels>0]


#Split our dataset into training and testing. Test data will be used to make predictions
split_test_data = 0.30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_data, stratify = y)

#use pipeline to do all the steps automatically
pip = Pipeline([('scale', preprocessing.StandardScaler()), 
                ('svm', SVC(kernel='rbf', C=1, gamma=10, decision_function_shape='ovo', class_weight='balanced'))])
pip.fit(X_train, y_train)
y_predict = pip.predict(X_test)#make prediction
print (classification_report(y_test, y_predict))

             precision    recall  f1-score   support

        1.0       0.96      0.86      0.91       224
        2.0       0.97      1.00      0.98       315
        3.0       0.99      0.99      0.99       427
        4.0       0.96      0.99      0.97       357

avg / total       0.97      0.97      0.97      1323





In [212]:
#stratify
print (np.sum(y_test==1))
print (np.sum(y_train==1))

224
521


In [209]:
print (np.sum(y_test==1))
print (np.sum(y_train==1))

240
505


In [213]:
#stratify
print (np.sum(y_test==2))
print (np.sum(y_train==2))

315
734


In [210]:
print (np.sum(y_test==2))
print (np.sum(y_train==2))

322
727


In [203]:
print (np.sum(y_train==3))
print (np.sum(y_test==3))

686
738


In [204]:
print (np.sum(y_train==4))
print (np.sum(y_test==4))

616
574


In [35]:
new_shape = (raster_img.shape[0] * raster_img.shape[1], raster_img.shape[2] )

img_as_array = raster_img[:,:].reshape(new_shape)
print('Reshaped from {o} to {n}'.format(o=raster_img.shape,
                                        n=img_as_array.shape))


#Split our dataset into training and testing. Test data will be used to make predictions
split_test_data = 0.30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_data, stratify = y)

#use pipeline to do all the steps automatically
pip = Pipeline([('scale', preprocessing.StandardScaler()), 
                ('svm', SVC(kernel='rbf', C=10, gamma=0.1, decision_function_shape='ovo', class_weight='balanced'))])
pip.fit(X_train, y_train)

# Now predict for each pixel
class_prediction = pip.predict(img_as_array)

# Reshape our classification map
class_prediction = class_prediction.reshape(raster_img[:, :, 0].shape)


Reshaped from (1143, 1890, 3) to (2160270, 3)




In [36]:
profile = profile
profile.update(
            dtype=class_prediction.dtype,
            count=1,
            compress='lzw', 
            nodata=0)

with rasterio.open("/home/io/Desktop/class_prediction12.tif", 'w', **profile) as out:
    out.write_band(1, class_prediction)