<a href="https://colab.research.google.com/github/nilaynishant/AIMLTutorial/blob/main/02_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# run this only if you have not installed the requried packages
!pip install rasterio
!pip install geopandas
!pip install gitpython

# 1-Feature Extraction
## Load libraries

In [29]:
import os
import rasterio as rio
from rasterio.plot import show
import geopandas as gpd
import fiona
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import git

repo_url = 'https://github.com/nilaynishant/AIMLTutorial.git'  # set the URL of the repository
repo_dir = 'https://github.com/nilaynishant/AIMLTutorial/tree/main/Data'  # set the path to the local repository directory

git.Repo.clone_from(repo_url, repo_dir)

# # Optional: checkout a specific branch or commit
# repo = git.Repo(repo_dir)
# repo.git.checkout('branch_name_or_commit_hash')


In [31]:
# variables 
# Note: labels should be always last column with name "labels"
# Note: Make sure input labels shapefile and input raster have same CRS, otherwise code will not run

# input files
raster_loc = '/content/https:/github.com/nilaynishant/AIMLTutorial/tree/main/Data/Data/Sentinel2_Agartala.tif'
points_loc = '/content/https:/github.com/nilaynishant/AIMLTutorial/tree/main/Data/Data/Training.gpkg'
temp_point_loc = '/content/https:/github.com/nilaynishant/AIMLTutorial/tree/main/Data/Data/temp_y_points.shp'

# land cover names (for post visualization)
lulc_name = ['Vegetation', 'Builtup', 'Water', 'Agriculture','Fallow']

In [32]:
src = rio.open(raster_loc)

blue = src.read(4, masked=True)
green = src.read(3, masked=True)
red = src.read(2, masked=True)
nir = src.read(1, masked=True)

def normalize(array):
    """Normalizes numpy arrays into scale 0.0 - 1.0"""
    array_min, array_max = array.min(), array.max()
    return ((array - array_min)/(array_max - array_min))

# Normalize the bands
redn = normalize(red)
greenn = normalize(green)
bluen = normalize(blue)
nirn = normalize(nir)

In [33]:
#  reading bands from input
with rio.open(raster_loc) as img:
    bands = (img.read()).shape[0]
print('Bands of input image: ', bands)

# using ilteration to automatically create a bands list

features = []
for i in range(bands):
    features.append('band'+str(i+1))
print('Bands names: ', features)
f_len = len(features)
print(f_len)
points = gpd.read_file(points_loc)
# # adding a new column 'id' with range of points
# points = points.assign(id=range(len(points)))
# # saving nenw point file with 'id'
points.to_file(temp_point_loc) 
# # converting gdf to pd df and removing geometry
points_df = pd.DataFrame(points.drop(columns='geometry'))
# # ilterating over multiband raster
sampled = pd.Series()
print(sampled)
# #inputShape= temp_point_loc
# # Read input shapefile with fiona and iterate over each feature
with fiona.open(temp_point_loc) as shp:
    for feature in shp:
        siteID = feature['properties']['id']
        coords = feature['geometry']['coordinates']
        # Read pixel value at the given coordinates using Rasterio
        # NB: `sample()` returns an iterable of ndarrays.
        with rio.open(raster_loc) as stack_src:
                  value = [v for v in stack_src.sample([coords])]
        # Update the pandas serie accordingly
        sampled.loc[siteID] = value

# # reshaping sampled values
df1 = pd.DataFrame(sampled.values.tolist(), index=sampled.index)
df1['id'] = df1.index
df1 = pd.DataFrame(df1[0].values.tolist(), 
                   columns=features)
df1['id'] = df1.index

data = pd.merge(df1, points_df, on ='id')
print('Sampled Data: \n',data)

x = data.iloc[:,0:f_len]
X = x.values
y = data.iloc[:,-1]
Y = y.values

Bands of input image:  4
Bands names:  ['band1', 'band2', 'band3', 'band4']
4
Series([], dtype: float64)


  sampled = pd.Series()


Sampled Data: 
      band1   band2   band3  band4  id
0   1691.0  1102.0  1050.0  903.0   1
1   1691.0  1102.0  1050.0  903.0   1
2   1691.0  1102.0  1050.0  903.0   1
3   1691.0  1102.0  1050.0  903.0   1
4   1691.0  1102.0  1050.0  903.0   1
5   1691.0  1102.0  1050.0  903.0   1
6   1691.0  1102.0  1050.0  903.0   1
7   1691.0  1102.0  1050.0  903.0   1
8   1691.0  1102.0  1050.0  903.0   1
9   1691.0  1102.0  1050.0  903.0   1
10   936.0   802.0  1052.0  774.0   2
11   936.0   802.0  1052.0  774.0   2
12   936.0   802.0  1052.0  774.0   2
13   936.0   802.0  1052.0  774.0   2
14   936.0   802.0  1052.0  774.0   2
15   936.0   802.0  1052.0  774.0   2
16   936.0   802.0  1052.0  774.0   2
17   936.0   802.0  1052.0  774.0   2
18   936.0   802.0  1052.0  774.0   2
19   936.0   802.0  1052.0  774.0   2
20  3721.0   906.0   996.0  691.0   3
21  3721.0   906.0   996.0  691.0   3
22  3721.0   906.0   996.0  691.0   3
23  3721.0   906.0   996.0  691.0   3
24  3721.0   906.0   996.0  691.0 

## Train Test data split

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, stratify = Y)

print(f'X_train Shape: {X_train.shape}\nX_test Shape: {X_test.shape}\ny_train Shape: {y_train.shape}\ny_test Shape:{y_test.shape}')

X_train Shape: (29, 4)
X_test Shape: (13, 4)
y_train Shape: (29,)
y_test Shape:(13,)
