<a href="https://colab.research.google.com/github/nilaynishant/AIMLTutorial/blob/main/04_train_test_data_split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# run this only if you have not installed the requried packages
!pip install -q rasterio
!pip install -q geopandas
!pip install -q gitpython

In [18]:
import os
import rasterio as rio
from rasterio.plot import show
import geopandas as gpd
import fiona
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

import git

repo_url = 'https://github.com/nilaynishant/AIMLTutorial.git'  # set the URL of the repository
repo_dir = 'https://github.com/nilaynishant/AIMLTutorial/tree/main/Data'  # set the path to the local repository directory
if not os.path.exists('https://github.com/nilaynishant/AIMLTutorial/tree/main/Data'):
  git.Repo.clone_from(repo_url, repo_dir)
# variables 
# Note: labels should be always last column with name "labels"
# Note: Make sure input labels shapefile and input raster have same CRS, otherwise code will not run

# input files
raster_loc = '/content/https:/github.com/nilaynishant/AIMLTutorial/tree/main/Data/Data/Sentinel2_Agartala.tif'
points_loc = '/content/https:/github.com/nilaynishant/AIMLTutorial/tree/main/Data/Data/Training.gpkg'
temp_point_loc = '/content/https:/github.com/nilaynishant/AIMLTutorial/tree/main/Data/Data/temp_y_points.shp'

# land cover names (for post visualization)
lulc_name = ['Vegetation', 'Builtup', 'Water', 'Agriculture','Fallow']

#  reading bands from input
with rio.open(raster_loc) as img:
    bands = (img.read()).shape[0]
print('Bands of input image: ', bands)

# using ilteration to automatically create a bands list

features = []
for i in range(bands):
    features.append('band'+str(i+1))
print('Bands names: ', features)
f_len = len(features)

points = gpd.read_file(points_loc)
# adding a new column 'id' with range of points
points = points.assign(id1=range(len(points)))
# saving nenw point file with 'id'
points.to_file(temp_point_loc) 
# converting gdf to pd df and removing geometry
points_df = pd.DataFrame(points.drop(columns='geometry'))
# ilterating over multiband raster
sampled = pd.Series()

#inputShape= temp_point_loc
# Read input shapefile with fiona and iterate over each feature
with fiona.open(temp_point_loc) as shp:
    for feature in shp:
        siteID = feature['properties']['id1']
        coords = feature['geometry']['coordinates']
        # Read pixel value at the given coordinates using Rasterio
        # NB: `sample()` returns an iterable of ndarrays.
        with rio.open(raster_loc) as stack_src:
                  value = [v for v in stack_src.sample([coords])]
        # Update the pandas serie accordingly
        sampled.loc[siteID] = value
# reshaping sampled values
df1 = pd.DataFrame(sampled.values.tolist(), index=sampled.index)
df1['id'] = df1.index
df1 = pd.DataFrame(df1[0].values.tolist(), 
                   columns=features)
df1['id'] = df1.index

data = pd.merge(df1, points_df, on ='id')
print('Sampled Data: \n',data)

x = data.iloc[:,0:f_len]
X = x.values
y = data.iloc[:,-2]
Y = y.values

Bands of input image:  4
Bands names:  ['band1', 'band2', 'band3', 'band4']


  sampled = pd.Series()


Sampled Data: 
      band1  band2  band3  band4  id  id1
0   2664.0  500.0  601.0  446.0   1    0
1   2664.0  500.0  601.0  446.0   1    1
2   2664.0  500.0  601.0  446.0   1    2
3   2664.0  500.0  601.0  446.0   1    3
4   2664.0  500.0  601.0  446.0   1    4
5   2664.0  500.0  601.0  446.0   1    5
6   2664.0  500.0  601.0  446.0   1    6
7   2664.0  500.0  601.0  446.0   1    7
8   2664.0  500.0  601.0  446.0   1    8
9   2664.0  500.0  601.0  446.0   1    9
10  2628.0  451.0  577.0  433.0   2   10
11  2628.0  451.0  577.0  433.0   2   11
12  2628.0  451.0  577.0  433.0   2   12
13  2628.0  451.0  577.0  433.0   2   13
14  2628.0  451.0  577.0  433.0   2   14
15  2628.0  451.0  577.0  433.0   2   15
16  2628.0  451.0  577.0  433.0   2   16
17  2628.0  451.0  577.0  433.0   2   17
18  2628.0  451.0  577.0  433.0   2   18
19  2628.0  451.0  577.0  433.0   2   19
20  2335.0  481.0  613.0  476.0   3   20
21  2335.0  481.0  613.0  476.0   3   21
22  2335.0  481.0  613.0  476.0   3   22


In [5]:
import sklearn
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, stratify = Y)

print(f'X_train Shape: {X_train.shape}\nX_test Shape: {X_test.shape}\ny_train Shape: {y_train.shape}\ny_test Shape:{y_test.shape}')

X_train Shape: (35, 4)
X_test Shape: (16, 4)
y_train Shape: (35,)
y_test Shape:(16,)
