# KML data manipulation process

This notebook walks through how we process the KML data to produce the building features we need. In the project notebook, the same functions are imported from `src/kml.py`. The steps to process the file are:
- First, parse the KML file and extract building footprints
- Define bounding box and create grids within the bounding box
- Define the functions that calculate building coverage and building count in each grid cell
- Find the grid cell that each longitude/latitude point falls into
- Assign the values of building coverage and building count to the original datasets

In [1]:
import os
from fastkml import kml
from lxml import etree
import numpy as np
import pandas as pd
from shapely.geometry import Point
from shapely.geometry import Polygon, box
from xml.dom import minidom


  from pandas.core import (


In [2]:
# Resolve the project root (the parent of the initial working directory) once.
# Re-running this cell must not climb another directory level, so the value
# computed on the first run is reused on later runs within the same kernel.
if "project_root" not in globals():
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Change working directory so the relative "data/..." paths below resolve
os.chdir(project_root)

In [3]:
# Path to the KML file
file_path = "data/raw/Building_Footprint.kml"

# Read the KML file as bytes
with open(file_path, "rb") as kml_file:
    kml_data = kml_file.read()

# Declaration-free text copy, kept only for inspecting the raw XML by eye
kml_str = kml_data.decode("utf-8").split("?>", 1)[-1].strip()

# Parse straight from the raw bytes: lxml reads the encoding from the XML
# declaration itself, so nothing has to be stripped before parsing
root = etree.fromstring(kml_data)

# KML 2.2 namespace, needed to qualify element names in the searches below
namespace = {'kml': 'http://www.opengis.net/kml/2.2'}

# Collect every Folder element anywhere below the root
folders = root.findall('.//kml:Folder', namespace)

# Report the folder names so the structure of the file is visible
if not folders:
    print("⚠️ No folders found in the KML file.")
else:
    print("Folders Found:")
    for folder in folders:
        # A Folder is not required to carry a <name> child
        folder_name = folder.find('kml:name', namespace)
        if folder_name is not None:
            print(f"  Folder Name: {folder_name.text}")
        else:
            print("  Folder Name: (No name provided)")


Folders Found:
  Folder Name: Challenge_footprint


In [4]:
# Step 1: Load the original datasets (training points and the submission template)
train_csv_path = "data/raw/Training_data_uhi_index_2025-02-18.csv"
test_csv_path = "data/test/Submission_template.csv"

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# **Parse the KML file and extract building footprints**


In [5]:
def parse_kml(file_path):
    """
    Description: Parse a KML file and extract building footprints as polygons.

    For every Placemark, the first <coordinates> element is read and turned
    into a Polygon of (lon, lat) vertices. Coordinate tuples with two
    (lon,lat) or three (lon,lat,alt) components are accepted; the altitude is
    dropped and tuples of any other length are skipped. Placemarks with no
    <coordinates> element, an empty one, or fewer than three usable vertices
    are skipped rather than aborting the whole parse.

    Parameter (str): file_path: The path to KML file
    return: polygons (list): one Polygon per usable Placemark, in document order.
    Raises: ValueError if a longitude or latitude component is not numeric.
    """
    # Named `document` so the imported `fastkml.kml` module is not shadowed
    document = minidom.parse(file_path)
    polygons = []

    # Loop through each Placemark in the KML file
    for placemark in document.getElementsByTagName("Placemark"):
        coord_elements = placemark.getElementsByTagName("coordinates")
        if not coord_elements:
            continue  # Placemark carries no geometry

        # Join every text/CDATA child so an empty element yields "" instead
        # of failing on a missing first child
        coords_text = "".join(
            node.data
            for node in coord_elements[0].childNodes
            if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE)
        )

        coords = []
        for coord in coords_text.split():
            # Split each coordinate into components
            coord_parts = coord.split(',')
            if len(coord_parts) not in (2, 3):
                continue  # Skip invalid coordinates
            # KML order is lon,lat[,alt]; the altitude is ignored
            coords.append((float(coord_parts[0]), float(coord_parts[1])))

        if len(coords) < 3:
            continue  # Not enough vertices to describe a footprint

        polygons.append(Polygon(coords))  # Create a Polygon for each footprint

    return polygons

# Run the parser on the footprint file and report how many polygons came back
polygons = parse_kml(file_path)
footprint_total = len(polygons)
print(f"Extracted {footprint_total} building footprints.")


Extracted 9436 building footprints.


# **Calculate Building Coverage**

In [6]:
def extract_bounding_box(kml_file):
    """
    Extracts the bounding box (minimum and maximum latitude and longitude)
    from a KML file.

    This function:
    1. Parses the KML file and retrieves all coordinate elements.
    2. Iterates through the coordinates to determine the min/max latitude and longitude.
    3. Ignores altitude values if present.
    4. Returns the bounding box as (min_lat, max_lat, min_lon, max_lon).

    Coordinate tuples that do not have two (lon,lat) or three (lon,lat,alt)
    components are skipped, matching what `parse_kml` accepts. Empty
    <coordinates> elements are ignored.

    Parameters:
    kml_file (str): The file path to the KML file.

    Returns:
    tuple: (min_lat, max_lat, min_lon, max_lon) representing the bounding box.

    Raises:
    ValueError: If the file contains no usable coordinates, or a longitude or
        latitude component is not numeric.
    """
    root = etree.parse(kml_file).getroot()

    # Define namespaces
    ns = {'kml': 'http://www.opengis.net/kml/2.2'}

    min_lat, max_lat = float('inf'), float('-inf')
    min_lon, max_lon = float('inf'), float('-inf')
    found_any = False

    # Iterate through all coordinates elements and grow the bounding box
    for coord in root.findall('.//kml:coordinates', ns):
        # `text` is None for an empty <coordinates/> element
        for coord_pair in (coord.text or "").split():
            coord_values = coord_pair.split(',')
            if len(coord_values) not in (2, 3):
                continue  # Malformed tuple

            # KML order is lon,lat[,alt]; the altitude is not needed
            lon = float(coord_values[0])
            lat = float(coord_values[1])

            # Update the bounding box
            min_lat = min(min_lat, lat)
            max_lat = max(max_lat, lat)
            min_lon = min(min_lon, lon)
            max_lon = max(max_lon, lon)
            found_any = True

    if not found_any:
        # Without this the infinite sentinels would leak out to the caller
        raise ValueError(f"No coordinates found in KML file: {kml_file}")

    return min_lat, max_lat, min_lon, max_lon

# Compute the bounding box of the footprint file and show its four edges
bounding_box = extract_bounding_box(file_path)
min_lat, max_lat, min_lon, max_lon = bounding_box
print(
    "Bounding Box: \n"
    f"Min Lat: {min_lat}\n"
    f"Max Lat: {max_lat}\n"
    f"Min Lon: {min_lon}\n"
    f"Max Lon: {max_lon}"
)


Bounding Box: 
Min Lat: 40.751285
Max Lat: 40.869321
Min Lon: -74.0022894813697
Max Lon: -73.869205


In [7]:
# Define a function to create a grid of cells covering the bounding box
def create_grid(min_lat, max_lat, min_lon, max_lon, cell_size=500):
    """
    Description: Creates a grid of rectangular cells that covers a given bounding box.

    This function:
    1. Converts latitude and longitude degrees into approximate meters.
    2. Determines the number of grid cells required in both latitude and longitude directions.
    3. Iterates through the bounding box to generate individual grid cells.
    4. Uses the `shapely.geometry.box` function to create rectangular polygons representing each grid cell.

    The cell counts are rounded UP, so the last row and column may extend
    past `max_lat` / `max_lon`. Rounding down would leave a strip along the
    north and east edges that belongs to no cell. Every cell has the same
    size. Cells are emitted row by row: latitude index outermost, longitude
    index innermost, both starting at the minimum corner.

    Parameters:
    min_lat (float): Minimum latitude of the bounding box.
    max_lat (float): Maximum latitude of the bounding box.
    min_lon (float): Minimum longitude of the bounding box.
    max_lon (float): Maximum longitude of the bounding box.
    cell_size (int, optional): Size of each grid cell in meters (default is 500m).

    Returns:
    list: A list of `shapely.geometry.Polygon` objects representing the grid cells.

    Raises:
    ValueError: If `cell_size` is not positive.
    """
    if cell_size <= 0:
        raise ValueError("cell_size must be a positive number of meters")

    # Convert degrees to meters (approximation at mid-latitude)
    lat_to_meters = 111320  # meters per degree of latitude
    lon_to_meters = 111320 * np.cos(np.radians((max_lat + min_lat) / 2))  # meters per degree of longitude at mid-latitude

    # Number of grid cells in each direction, rounded up so the whole box is covered
    n_lat_cells = int(np.ceil((max_lat - min_lat) * lat_to_meters / cell_size))
    n_lon_cells = int(np.ceil((max_lon - min_lon) * lon_to_meters / cell_size))

    # Create the grid
    grid_cells = []
    for i in range(n_lat_cells):
        # Latitude bounds are shared by every cell in this row
        cell_min_lat = min_lat + i * cell_size / lat_to_meters
        cell_max_lat = min_lat + (i + 1) * cell_size / lat_to_meters
        for j in range(n_lon_cells):
            cell_min_lon = min_lon + j * cell_size / lon_to_meters
            cell_max_lon = min_lon + (j + 1) * cell_size / lon_to_meters

            # box() takes (minx, miny, maxx, maxy), i.e. longitude first
            grid_cells.append(box(cell_min_lon, cell_min_lat, cell_max_lon, cell_max_lat))

    return grid_cells


In [8]:

# Function to calculate building coverage for each grid cell without buffering
def calculate_coverage(building_footprints, grid_cells):
    """
    Description: Calculates the fraction of each grid cell that is covered by buildings.

    For every grid cell, the areas of its intersections with each building
    footprint are summed and divided by the cell's own area. Footprints are
    summed individually, so overlapping footprints are counted more than once
    and the fraction can exceed 1. A cell whose area is not positive gets 0.

    Parameters:
    building_footprints: A list of polygons representing building footprints.
    grid_cells: A list of polygons representing the grid cells.

    Returns:
    list: One normalized coverage value per grid cell, in the order of `grid_cells`.
    """
    coverage = []

    # Loop over each grid cell and calculate the intersection with building footprints
    for cell in grid_cells:
        cell_area = cell.area
        covered_area = 0
        for building in building_footprints:
            if building.intersects(cell):  # Direct intersection without buffer
                # Only the part of the building inside this cell counts
                covered_area += building.intersection(cell).area

        # Normalize coverage by dividing by the area of the grid cell
        coverage.append(covered_area / cell_area if cell_area > 0 else 0)

    return coverage

In [9]:
# Function to compute building count per grid cell without buffering
def building_count(building_footprints, grid_cells):
    """
    Description: Computes the number of buildings that intersect each grid cell.

    A building is counted for every grid cell it intersects, so a footprint
    that straddles several cells is counted once in each of them. No buffering
    is applied.

    Parameters:
    building_footprints (list of shapely.geometry.Polygon): A list of polygons representing building footprints.
    grid_cells (list of shapely.geometry.Polygon): A list of polygons representing the grid cells.

    Returns:
    list: A list where each entry corresponds to the number of buildings intersecting a grid cell.
    """
    # One count per grid cell, in the order of `grid_cells`
    return [
        sum(1 for building in building_footprints if building.intersects(cell))  # Direct intersection without buffer
        for cell in grid_cells
    ]

## Assign values to the original datasets

In [10]:
# Grid resolutions to process; each value is passed to create_grid as
# `cell_size`, which that function documents as meters
cell_sizes = [500,1000] 

In [11]:
def assign_coverage_and_count_to_points(df, grid_cells, coverage, building_counts):
    """
    Assigns building coverage and building count to each point based on the grid cell it falls into.

    A point is matched to the first grid cell (in list order) that covers it.
    `covers` is used rather than `contains` so that points lying exactly on a
    cell edge are still matched; with `contains` such points belong to no
    cell. Points that fall in no cell get 0 for both values, so a 0 can mean
    either "no buildings in the cell" or "outside the grid".

    Parameters:
    df (pd.DataFrame): DataFrame containing 'Longitude' and 'Latitude' columns.
    grid_cells (list): List of shapely Polygon objects representing grid cells.
    coverage (list): List of coverage values corresponding to each grid cell.
    building_counts (list): List of building count values corresponding to each grid cell.

    Returns:
    pd.DataFrame: A new DataFrame with 'building_coverage' and 'building_count' columns.
    """
    assigned_coverage = []
    assigned_building_count = []

    # Walk the two coordinate columns directly instead of building a Series per row
    for lon, lat in zip(df['Longitude'], df['Latitude']):
        point = Point(lon, lat)  # Create a point from longitude and latitude

        # Default values if point is not in any grid cell
        point_coverage = 0
        point_building_count = 0

        # Check which grid cell the point belongs to
        for i, grid_cell in enumerate(grid_cells):
            if grid_cell.covers(point):
                point_coverage = coverage[i]
                point_building_count = building_counts[i]
                break  # Stop searching once the point is found in a grid cell

        assigned_coverage.append(point_coverage)
        assigned_building_count.append(point_building_count)

    # Create a copy of the dataframe to avoid modifying the original
    df_copy = df.copy()

    # Add the new columns to the copy of the dataframe
    df_copy['building_coverage'] = assigned_coverage
    df_copy['building_count'] = assigned_building_count

    return df_copy

In [12]:
# `cell_sizes` is defined in its own cell above and the bounding box
# (min_lat, max_lat, min_lon, max_lon) comes from the extract_bounding_box
# call earlier in the notebook. Neither is repeated here, so the values stay
# in sync with the KML file.

# Make sure the output folder exists before any CSV is written
os.makedirs("data/interim", exist_ok=True)

for cell_size in cell_sizes:
    print(f"Processing for cell size: {cell_size}")

    # Create the grid cells covering the bounding box at this resolution
    grid_cells = create_grid(min_lat, max_lat, min_lon, max_lon, cell_size)

    # Calculate the building coverage and count for each grid cell
    coverage = calculate_coverage(polygons, grid_cells)
    building_counts = building_count(polygons, grid_cells)

    # Assign coverage and count to training and test datasets
    ground_df_copy = assign_coverage_and_count_to_points(train_df, grid_cells, coverage, building_counts)
    test_ground_df_copy = assign_coverage_and_count_to_points(test_df, grid_cells, coverage, building_counts)

    # Save the updated datasets, one pair of files per cell size
    ground_df_copy.to_csv(f"data/interim/ground_df_{cell_size}.csv", index=False)
    test_ground_df_copy.to_csv(f"data/interim/test_ground_df_{cell_size}.csv", index=False)

    # Print results inside the loop
    print(f"Results for cell size: {cell_size}")
    print(ground_df_copy[['building_coverage', 'building_count']].head())
    print(test_ground_df_copy[['building_coverage', 'building_count']].head())


Processing for cell size: 500


  values = values.astype(str)


Results for cell size: 500
   building_coverage  building_count
0           0.216605              44
1           0.216605              44
2           0.216605              44
3           0.216605              44
4           0.216605              44
   building_coverage  building_count
0           0.458796              43
1           0.458796              43
2           0.458796              43
3           0.458796              43
4           0.458796              43
Processing for cell size: 1000
Results for cell size: 1000
   building_coverage  building_count
0           0.297667             212
1           0.297667             212
2           0.297667             212
3           0.297667             212
4           0.297667             212
   building_coverage  building_count
0           0.430795             140
1           0.430795             140
2           0.430795             140
3           0.430795             140
4           0.430795             140


  values = values.astype(str)
