# Batching pointclouds
To tile the point cloud into batches we will use a fixed number of points, e.g. 200,000, and select the points in a circular shape. <br>
To ensure that every point is covered we calculate a grid of centroids for those circles; the grid spacing depends on the point density.

<b>This Jupyter notebook is part of a master's thesis with the topic<br>
<i>Analysis of deep learning methods for semantic segmentation of photogrammetric point clouds from aerial images</i><br>
&copy; Markus Hülsen, Matr.-Nr. 6026370<br>
Date: 15.08.2023</b>

In [1]:
import numpy as np
import laspy
import pandas as pd
import os
from scipy.spatial import KDTree
from sklearn.preprocessing import normalize

First we will create a class for a single `Dataset`.

In [2]:
class Dataset():
    """Point cloud of a single LAS/LAZ file, held in a pandas DataFrame.

    The file is read either immediately (``load=True``) or lazily on the first
    access to one of the data accessors (``labels``, ``xyz``, ``features``,
    ``xyz_and_features``, ``pointdensity``, ``len()``).

    Attributes:
        file: path to the point cloud file (*.las or *.laz).
        df_pc: DataFrame with the columns ``X``, ``Y``, ``Z`` followed by one
            column per remaining point-format dimension; ``None`` until loaded.
        xmin, xmax, ymin, ymax: XY bounding box of the point cloud; ``None``
            until loaded.
    """

    def __init__(self, file, load=True):
        """Store the file path and, if ``load`` is true, read the file right away."""
        # file = path to pointcloud file (*.las or *.laz)
        self.file = file
        self.df_pc = None
        self.xmax = self.ymax = self.xmin = self.ymin = None
        if load:
            self.load_data()

    def _ensure_loaded(self):
        """Read the file on first use so every accessor also works with ``load=False``."""
        if self.df_pc is None:
            self.load_data()

    def load_data(self):
        """Read the file and fill ``df_pc`` and the XY bounding box."""
        with laspy.open(self.file) as f:
            las = f.read()

        # read the scaled coordinates from the las data
        x = np.array(las.x)
        y = np.array(las.y)
        z = np.array(las.z)

        # bounding box in XY; the numpy reductions avoid a Python-level loop
        # over every single point
        self.xmin = x.min()
        self.ymin = y.min()
        self.xmax = x.max()
        self.ymax = y.max()

        # collect the coordinates first, then every remaining dimension
        columns = {'X': x, 'Y': y, 'Z': z}
        # the first three dimensions are skipped (assumed to be the raw X, Y, Z;
        # the scaled coordinates above are used instead)
        for dim in las.point_format.dimensions[3:]:
            columns[dim.name] = np.array(las[dim.name])

        self.df_pc = pd.DataFrame(columns, index=np.arange(len(x)))

    @property
    def labels(self):
        """Values of the ``classification`` column as a numpy array."""
        self._ensure_loaded()
        return self.df_pc.loc[:, 'classification'].to_numpy()

    @property
    def xyz(self):
        """Coordinates (columns ``X`` to ``Z``) as a numpy array of shape (n, 3)."""
        self._ensure_loaded()
        return self.df_pc.loc[:, 'X':'Z'].to_numpy()

    @property
    def features(self):
        """All columns except ``X``, ``Y``, ``Z`` as a numpy array."""
        self._ensure_loaded()
        return self.df_pc.drop(['X', 'Y', 'Z'], axis=1).to_numpy()

    @property
    def xyz_and_features(self):
        """Coordinates and features stacked column-wise (XYZ first)."""
        self._ensure_loaded()
        return np.hstack((self.xyz, self.features))

    @property
    def pointdensity(self):
        """Average number of points per unit area of the XY bounding box."""
        self._ensure_loaded()
        area = (self.xmax - self.xmin) * (self.ymax - self.ymin)
        return len(self) / area

    def __len__(self):
        """Number of points in the point cloud."""
        self._ensure_loaded()
        return self.df_pc.shape[0]

Get the `.laz` files

In [3]:
# directory in which the point cloud tiles are stored
data_path = '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited'

# collect the paths of all files in laz-format, sorted by name
lst_files = sorted(
    data_path + '/' + name
    for name in os.listdir(data_path)
    if name.endswith('.laz')
)
lst_files

['../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/554000_5798000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/554000_5799000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/554000_5800000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/554000_5801000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/555000_5798000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/555000_5799000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/555000_5800000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/555000_5801000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/556000_5798000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/556000_5799000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/556000_5800000.laz',
 '../../Daten/Datensatz_H3D/DIM_2022/7 - DBScan/edited/556000_5801000.laz']

## Test our class
Initialize the class with the first file.

In [4]:
# load the first tile; the file is read immediately because `load` defaults to True
dataset = Dataset(lst_files[0])

Test functions.

In [5]:
# quick sanity check of the accessors: point count, lower X bound,
# average density (points per unit area of the XY bounding box) and the raw coordinates
print(f'length of dataset:\t\t{len(dataset)}')
print(f'Minimum X of pointcloud:\t{dataset.xmin}')
print(f'average point density:\t\t{dataset.pointdensity:.3f}')
print(f'coordinates of pointcloud:\n{dataset.xyz}')

length of dataset:		5631164
Minimum X of pointcloud:	554000.0
average point density:		5.633
coordinates of pointcloud:
[[5.54866150e+05 5.79848756e+06 7.05600000e+01]
 [5.54864200e+05 5.79848763e+06 7.04500000e+01]
 [5.54877160e+05 5.79849281e+06 7.06100000e+01]
 ...
 [5.54330680e+05 5.79858891e+06 6.92100000e+01]
 [5.54324510e+05 5.79858784e+06 6.92700000e+01]
 [5.54330830e+05 5.79858860e+06 6.92400000e+01]]


In [6]:
# show the attribute table of the loaded point cloud (one row per point, one column per dimension)
dataset.df_pc

Unnamed: 0,X,Y,Z,intensity,return_number,number_of_returns,scan_direction_flag,edge_of_flight_line,classification,synthetic,...,planarity,eigenentropy,curvature change,local_pointdensity,roughness,label,z_to_dem,inside_road,count_veg,count_ground
0,554866.15,5798487.56,70.56,8869,1,5,0,0,6,0,...,,,,,,,0.507541,0.0,,
1,554864.20,5798487.63,70.45,8873,1,2,0,0,6,0,...,,,,,,,0.343253,0.0,,
2,554877.16,5798492.81,70.61,9356,1,2,0,0,6,0,...,,,,,,,0.510372,0.0,,
3,554864.20,5798487.79,70.47,8873,1,2,0,0,6,0,...,,,,,,,0.361733,0.0,,
4,554869.83,5798491.05,74.13,17329,1,7,0,0,6,0,...,,,,,,,4.054098,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5631159,554330.37,5798588.29,69.24,7499,1,4,0,0,6,0,...,1.228618,-0.501297,0.431701,0.713115,0.674832,1.0,2.455299,0.0,33.0,19.0
5631160,554326.79,5798588.75,69.24,7499,1,4,0,0,6,0,...,0.274811,-3.273526,0.075397,1.068392,0.123233,1.0,2.453916,0.0,42.0,2.0
5631161,554330.68,5798588.91,69.21,7499,1,5,0,0,6,0,...,0.522878,-1.338616,0.041746,0.730131,0.963822,1.0,2.435219,0.0,31.0,23.0
5631162,554324.51,5798587.84,69.27,10151,1,3,0,0,6,0,...,-0.604568,-0.630222,0.383430,0.659132,1.499709,5.0,2.428519,0.0,26.0,42.0


## Batching-Class 
Next we will create a class for batching.

In [7]:
class kNNBatchDataset(Dataset):
    """Dataset that is tiled into batches of the ``k`` nearest neighbours of grid points.

    A regular grid of centre points with distance ``spacing`` is laid over the
    XY bounding box of the point cloud. Every batch consists of the ``k``
    points that are closest (in XY) to one of these centres.

    Args:
        k: number of points per batch (neighbours queried per grid point).
        spacing: distance between neighbouring grid points, in coordinate units.
        *args, **kwargs: forwarded to ``Dataset`` (``file``, ``load``).
    """

    def __init__(self, k, spacing, *args, **kwargs):
        super(kNNBatchDataset, self).__init__(*args, **kwargs)
        # the grid is derived from the bounding box, so the data must be
        # available even if the caller passed load=False
        if self.df_pc is None:
            self.load_data()
        # distance between grid points
        self.spacing = spacing
        # number of points for kNN
        self.k = k
        # KDTree, built at the end of the constructor
        self.tree = None
        # current index
        self.currIdx = 0
        # number of columns (along X) and rows (along Y) of the grid; stored as
        # ints and never below 1, so that a tile smaller than half the spacing
        # still yields one batch instead of none
        self.num_cols = max(1, int((self.xmax - self.xmin - self.spacing/2) // self.spacing + 1))
        self.num_rows = max(1, int((self.ymax - self.ymin - self.spacing/2) // self.spacing + 1))
        # number of batches, which is equal to the number of grid points
        self.num_batches = self.num_cols * self.num_rows
        # shuffled list of batch indices (not used by getBatches)
        self.rndzer = list(range(self.num_batches))
        np.random.shuffle(self.rndzer)
        self.buildKD()

    def buildKD(self):
        """Build the KD-tree used for the neighbour queries."""
        self.tree = KDTree(self.xyz[:, 0:2])    # build only on XY

    def getBatches(self, batch_size=None):
        """Return the first ``batch_size`` batches of the grid.

        Every call starts again at the first grid point.

        Args:
            batch_size: number of grid points to query; ``None`` (default)
                queries all ``num_batches`` grid points. Larger values are
                capped at ``num_batches``.

        Returns:
            Tuple ``(points, labels)``: ``points`` holds coordinates and
            features of the ``k`` nearest points per grid point, ``labels``
            the matching classification values. ``(None, None)`` if no grid
            point was requested.
        """
        if batch_size is None:
            batch_size = self.num_batches

        num_centers = min(batch_size, self.num_batches)
        if num_centers <= 0:
            self.currIdx = 0
            return None, None

        # Column index runs 0..num_cols-1 (X direction), row index
        # 0..num_rows-1 (Y direction). Splitting the running index by num_rows
        # keeps the X-major batch order, but stays inside the grid for
        # non-square extents as well.
        col_idx, row_idx = np.divmod(np.arange(num_centers), self.num_rows)
        centers = np.column_stack((self.xmin + self.spacing/2 + col_idx * self.spacing,
                                   self.ymin + self.spacing/2 + row_idx * self.spacing))
        self.currIdx = num_centers

        _, idx = self.tree.query(centers, k=self.k)
        return self.xyz_and_features[idx, :], self.labels[idx]

Set hyperparameters for batching

In [8]:
# number of points per batch
num_points = 200000
# np.sqrt(num_points / density / pi) is the radius of a circle that contains num_points
# points at the average point density of the tile. Multiplying by sqrt(2)/2 gives half the
# side length of the square inscribed in that circle; 0.95 shrinks the result by a further 5 %.
# NOTE(review): the 0.95 factor presumably adds extra overlap to compensate for local
# density variations - confirm.
spacing = np.sqrt(num_points / dataset.pointdensity/np.pi) * np.sqrt(2)/2 * 0.95
print(spacing)

71.41417597467252


Create a batch dataset

In [9]:
# batching dataset for the first tile: reads the file, sets up the grid and builds the KD-tree
kNN_dataset = kNNBatchDataset(file=lst_files[0], k=num_points, spacing=spacing)

Get the batches and corresponding labels

In [10]:
# query all grid points at once; the first axis of the result indexes the batches
points, labels = kNN_dataset.getBatches(kNN_dataset.num_batches)
points.shape

(196, 200000, 35)

In [12]:
# one label per point of every batch
labels.shape

(196, 200000)

Function to save a DataFrame as a LAS file

In [14]:
def save_df_to_las(df, path):
    """Write a point cloud DataFrame to a LAS/LAZ file (point format 3, version 1.2).

    Args:
        df: DataFrame with the coordinate columns ``X``, ``Y``, ``Z``. Every other
            column is written as well: columns named like a dimension of point
            format 3 go into that dimension, all remaining columns are stored as
            extra dimensions of type float32.
        path: path of the output file.
    """
    header = laspy.LasHeader(point_format=3, version="1.2")
    # NOTE(review): scales and offsets of the header are left at the laspy defaults -
    # confirm that they suit the coordinate range of the data.

    # names of the dimensions that point format 3 already provides
    standard_dims = {dim.name for dim in header.point_format.dimensions}

    # register every column that is not a standard dimension as extra dimension
    for col in df.columns:
        if col not in standard_dims:
            header.add_extra_dim(laspy.ExtraBytesParams(name=col, type=np.float32))

    las_new = laspy.LasData(header)

    las_new.x = df.X.to_numpy()
    las_new.y = df.Y.to_numpy()
    las_new.z = df.Z.to_numpy()

    # Write all attribute columns. Selecting them by name (instead of slicing from
    # 'intensity' onwards) also works without an 'intensity' column and does not drop
    # attribute columns that are placed before it.
    for col in df.columns:
        if col not in ('X', 'Y', 'Z'):
            las_new[col] = df[col].to_numpy()

    las_new.write(path)

Save Batches

In [28]:
# make sure the output folder exists, otherwise writing the first batch fails on a fresh checkout
out_dir = 'batches'
os.makedirs(out_dir, exist_ok=True)

# the column names are the same for every batch, so look them up only once
batch_columns = kNN_dataset.df_pc.columns

# write every batch to its own file: batches/batch_000.laz, batches/batch_001.laz, ...
for i, batch in enumerate(points):
    save_df_to_las(pd.DataFrame(batch, columns=batch_columns), out_dir + '/batch_' + f'{i:03d}' + '.laz')