In [2]:
# Environment check: print the path of the Python interpreter running this
# kernel, so the active virtual environment can be confirmed before the
# geospatial imports below are attempted.
import sys
print(sys.executable)


/home/px4_sitl/ets_work/montreal_research/venv_montreal/bin/python


In [3]:
import geopandas as gpd
import pandas as pd
import numpy as np
import random
from shapely.geometry import Point, Polygon
from shapely.ops import transform as shapely_transform
import pyproj
from datetime import datetime, timedelta
import time
import os
import rasterio
import warnings

# --- Input / output locations ---
GEOJSON_PATH = r"/home/px4_sitl/ets_work/montreal_research/thermal_synthetic_data_generation/montreal_government_data/df.geojson"
LAND_COVER_PATH = r"/home/px4_sitl/ets_work/montreal_research/thermal_synthetic_data_generation/montreal_government_data/landcover-2020-classification.tif"
OUTPUT_FOLDER = r"/home/px4_sitl/ets_work/montreal_research/thermal_synthetic_data_generation/synthetic_generated_data"
OUTPUT_FILENAME = "synthetic_thermal_data_5yr_landcover.csv"

# --- Simulated time window ---
# The period length is named once so that the timestamp window and the target
# thermal count computed further down can never drift apart.
SIMULATION_YEARS = 5
DAYS_PER_YEAR = 365          # leap days are deliberately ignored
APPROX_DAYS_PER_MONTH = 30   # coarse month length used only for the count estimate

# NOTE(review): datetime.now() is naive local time, yet the generated timestamps
# are later written to a column named "timestamp_utc" - confirm which is intended.
end_time = datetime.now()
start_time = end_time - timedelta(days=SIMULATION_YEARS * DAYS_PER_YEAR)
total_seconds_in_period = (end_time - start_time).total_seconds()

# Thermals are only generated in these months and within [min, max) hours.
thermal_months = [5, 6, 7, 8, 9]
min_thermal_hour = 10
max_thermal_hour = 18

# --- Thermal strength categories ---
# Each value is a 5-tuple:
#   ([lift min, max] m/s, [core diameter min, max] m,
#    [max height min, max] m AGL, [duration min, max] minutes, selection weight)
thermal_categories = {
    "Weak": ([0.5, 1.5], [50, 150], [300, 1000], [5, 15], 0.4),
    "Medium": ([1.5, 3.0], [100, 300], [800, 2000], [10, 30], 0.35),
    "Strong": ([3.0, 5.0], [150, 400], [1500, 3000], [15, 45], 0.2),
    "Very Strong": ([5.0, 7.0], [150, 500], [2000, 4000], [20, 60], 0.05)
}
category_names = list(thermal_categories.keys())
# Index 4 of every category tuple is its relative selection weight.
category_weights = [thermal_categories[cat][4] for cat in category_names]

# --- Land Cover Mapping (Code to Name & Probability) ---
# !! IMPORTANT: Verify and complete this based on the XML metadata descriptions !!
land_cover_names = {
    1: "Needleleaf Forest",
    2: "Taiga Forest",
    # 3: "Missing Category 3", # Check XML if code 3 exists and add name
    5: "Broadleaf Forest",
    6: "Mixed Forest",
    8: "Shrubland",
    10: "Grassland",
    11: "Shrubland-Lichen-Moss",
    12: "Grassland-Lichen-Moss",
    13: "Barren-Lichen-Moss",
    14: "Wetland",
    15: "Cropland",
    16: "Barren lands",
    17: "Urban",
    18: "Water",
    19: "Snow/Ice",
    0: "NoData", # Handle NoData value
    'default': "Unknown" # Fallback for unexpected codes
}

# Probability that a candidate point landing on the given land-cover code is
# accepted as a thermal location; 'default' applies to codes not listed here.
land_cover_probabilities = {
    1: 0.20, 2: 0.20, 5: 0.25, 6: 0.22, 8: 0.40, 10: 0.50,
    11: 0.30, 12: 0.40, 13: 0.35, 14: 0.10, 15: 0.70, 16: 0.75,
    17: 0.85, 18: 0.01, 19: 0.00, 0: 0.00, 'default': 0.1
}

# --- Target sample size ---
# years * active months * ~days per month * active hours per day
approx_active_hours = (
    SIMULATION_YEARS
    * len(thermal_months)
    * APPROX_DAYS_PER_MONTH
    * (max_thermal_hour - min_thermal_hour)
)
thermals_per_active_hour_avg_density = 5
total_thermals_to_generate = approx_active_hours * thermals_per_active_hour_avg_density

# --- Rejection-sampling controls ---
MIN_DIST_FACTOR = 3.0          # minimum spacing, as a multiple of an earlier thermal's radius
RECENT_POINTS_CHECK = 50       # how many of the most recent thermals the spacing test looks at
MAX_SPATIAL_ATTEMPTS = 200     # candidate points tried per thermal before giving up on it
MAX_TIMESTAMP_ATTEMPTS = 100   # random timestamps tried per thermal before giving up on it

def get_random_point_in_polygon(poly, max_attempts=10000):
    """Return a random point that lies inside ``poly``.

    Points are drawn uniformly from the geometry's bounding box and the first
    one contained in the geometry is returned (rejection sampling).

    Args:
        poly: A shapely geometry (anything exposing ``is_empty``, ``bounds``,
            ``contains`` and ``representative_point``).
        max_attempts: Upper bound on bounding-box draws. Prevents an endless
            loop for zero-area or extremely thin geometries.

    Returns:
        A shapely ``Point`` inside ``poly``. If no draw succeeds within
        ``max_attempts``, the geometry's representative point is returned
        instead (deterministic, not uniformly distributed).

    Raises:
        ValueError: If ``poly`` is empty, since no point can be inside it.
    """
    if poly.is_empty:
        raise ValueError("Cannot sample a point from an empty geometry.")

    min_x, min_y, max_x, max_y = poly.bounds
    for _ in range(max_attempts):
        candidate = Point(random.uniform(min_x, max_x), random.uniform(min_y, max_y))
        if poly.contains(candidate):
            return candidate

    # Rejection sampling did not converge (degenerate or sliver geometry):
    # fall back to a point shapely computes to lie within the geometry.
    return poly.representative_point()

def sample_thermal_properties(category):
    """Draw one random set of physical properties for a strength category.

    Looks up ``category`` in the module-level ``thermal_categories`` table and
    samples each property uniformly from its [min, max] range.

    Args:
        category: Key of ``thermal_categories``.

    Returns:
        Tuple ``(lift, diameter, height, duration)``.

    Raises:
        KeyError: If ``category`` is not present in ``thermal_categories``.
    """
    spec = thermal_categories[category]
    # The first four entries of a category spec are the property ranges, in the
    # order lift, diameter, height, duration. They are sampled in that order so
    # the sequence of random draws stays the same.
    property_ranges = (spec[0], spec[1], spec[2], spec[3])
    return tuple(random.uniform(bounds[0], bounds[1]) for bounds in property_ranges)

def get_season(month):
    """Map a calendar month number to the season label stored in the output.

    Args:
        month: Month number (1-12).

    Returns:
        "Summer" for June-August, "Spring/Fall" for May and September,
        "Off-Season" for anything else.
    """
    summer_months = (6, 7, 8)
    shoulder_months = (5, 9)

    if month in summer_months:
        label = "Summer"
    elif month in shoulder_months:
        label = "Spring/Fall"
    else:
        label = "Off-Season"
    return label

def get_time_of_day(hour):
    """Map an hour of the day to the time-of-day label stored in the output.

    Args:
        hour: Hour of the day (0-23).

    Returns:
        "Morning" for [10, 12), "Afternoon" for [12, 16),
        "Late Afternoon" for [16, 18), otherwise "Off-Hours".
    """
    # (inclusive start hour, exclusive end hour, label)
    bands = (
        (10, 12, "Morning"),
        (12, 16, "Afternoon"),
        (16, 18, "Late Afternoon"),
    )
    for first_hour, end_hour, label in bands:
        if first_hour <= hour < end_hour:
            return label
    return "Off-Hours"

# --- Synthetic thermal generation (main script) ---
# Steps:
#   1. Load the region polygons (GeoJSON) and open the land-cover raster.
#   2. For every thermal: draw a timestamp inside the thermal season/hours, then
#      draw an area-weighted random point that passes the land-cover acceptance
#      test and the minimum-spacing test against recently generated thermals.
#   3. Sample the thermal's physical properties and record one output row.
#   4. Write all rows to CSV.
print(f"Starting synthetic thermal data generation with land cover bias...")
print(f"Reading GeoJSON from: {GEOJSON_PATH}")
print(f"Using Land Cover from: {LAND_COVER_PATH}")

try:
    gdf = gpd.read_file(GEOJSON_PATH)
except Exception as e:
    print(f"Error reading GeoJSON file: {e}")
    # Raise SystemExit directly instead of calling exit(): exit() is an
    # interactive helper, is not guaranteed to exist, and reports status 0.
    raise SystemExit(1) from e

source_crs_geojson = gdf.crs
if source_crs_geojson is None:
    source_crs_geojson = "EPSG:32188"
    print(f"Warning: GeoJSON CRS not set. Assuming {source_crs_geojson}.")
    gdf.crs = source_crs_geojson
else:
    print(f"Source GeoJSON CRS identified as: {source_crs_geojson}")

target_crs_latlon = "EPSG:4326"

print(f"Loading Land Cover data...")
land_cover_dataset = None
try:
    land_cover_dataset = rasterio.open(LAND_COVER_PATH)
    source_crs_raster = land_cover_dataset.crs
    print(f"Land Cover CRS identified as: {source_crs_raster}")
    if source_crs_raster != source_crs_geojson:
        print(f"CRS mismatch detected. Will transform points from {source_crs_geojson} to {source_crs_raster} for sampling.")
        project_coords = pyproj.Transformer.from_crs(source_crs_geojson, source_crs_raster, always_xy=True).transform
    else:
        project_coords = None
except Exception as e:
    print(f"Error opening land cover file {LAND_COVER_PATH}: {e}")
    # Do not leak the raster handle if the failure happened after it was opened.
    if land_cover_dataset is not None:
        land_cover_dataset.close()
    raise SystemExit(1) from e

synthetic_data = []
generated_points_history = []
generated_count = 0

# Stop instead of spinning forever if no thermal can be placed at all (for
# example when every sampled land-cover code has acceptance probability 0).
MAX_CONSECUTIVE_FAILURES = 1000

try:
    print(f"Target includes all {len(gdf)} regions found in GeoJSON.")
    total_area = gdf.geometry.area.sum()
    print(f"Total area: {total_area:.2f} square units (in source GeoJSON CRS units)")

    if len(gdf) == 0 or not total_area > 0:
        print("Error: GeoJSON contains no regions with a positive area; nothing to sample from.")
        raise SystemExit(1)

    # Per-region lookup tables, built once instead of iterating over the
    # GeoDataFrame (and recomputing every area) on every sampling attempt.
    region_geometries = list(gdf.geometry)
    region_names = gdf['NOM'].tolist()
    cumulative_region_areas = np.cumsum(gdf.geometry.area.to_numpy())
    last_region_index = len(region_geometries) - 1

    # One reusable transformer for the output latitude/longitude columns,
    # instead of building and reprojecting a one-row GeoSeries per thermal.
    try:
        to_latlon = pyproj.Transformer.from_crs(source_crs_geojson, target_crs_latlon, always_xy=True)
    except Exception as e:
        to_latlon = None
        print(f"Warning: Could not set up conversion to {target_crs_latlon}; latitude/longitude will be empty. Error: {e}")

    print(f"Generating approximately {total_thermals_to_generate} thermals...")
    start_exec_time = time.time()
    consecutive_failures = 0

    # Scope the RuntimeWarning filter to the generation loop; the previous
    # filters are restored on exit, including on error or interrupt.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)

        while generated_count < total_thermals_to_generate:
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                print(f"Warning: {consecutive_failures} consecutive thermals could not be placed; stopping early at {generated_count} thermals.")
                break

            # --- 1. Timestamp: rejection-sample the whole period until the draw
            # falls inside the thermal months and hours. ---
            # NOTE(review): start_time derives from datetime.now() (naive local
            # time), but the value is written to "timestamp_utc" below - confirm.
            timestamp = None
            for _ in range(MAX_TIMESTAMP_ATTEMPTS):
                random_second_offset = random.uniform(0, total_seconds_in_period)
                candidate_time = start_time + timedelta(seconds=random_second_offset)
                if candidate_time.month in thermal_months and min_thermal_hour <= candidate_time.hour < max_thermal_hour:
                    timestamp = candidate_time
                    break
            if timestamp is None:
                consecutive_failures += 1
                continue

            # --- 2. Location ---
            point_accepted = False
            point_geojson_crs = None
            land_cover_code = None
            region_name = None

            for _ in range(MAX_SPATIAL_ATTEMPTS):
                # Area-weighted region choice: first region whose cumulative
                # area reaches the draw; clamp to the last region in case
                # floating-point rounding puts the draw past the final sum.
                rand_area_val = random.uniform(0, total_area)
                region_index = int(np.searchsorted(cumulative_region_areas, rand_area_val, side="left"))
                if region_index > last_region_index:
                    region_index = last_region_index

                point_candidate_geojson_crs = get_random_point_in_polygon(region_geometries[region_index])

                # Raster sampling needs coordinates in the raster's CRS.
                coords_for_sampling = (point_candidate_geojson_crs.x, point_candidate_geojson_crs.y)
                if project_coords:
                    try:
                        coords_for_sampling = project_coords(point_candidate_geojson_crs.x, point_candidate_geojson_crs.y)
                    except Exception:
                        continue  # Unprojectable candidate: try another point.

                try:
                    lc_value_generator = land_cover_dataset.sample([coords_for_sampling], indexes=1)
                    land_cover_code = next(lc_value_generator)[0]
                except IndexError:
                    land_cover_code = 0  # Treated as NoData.
                except Exception:
                    continue  # Sampling failed: try another point.

                if land_cover_code == land_cover_dataset.nodata:
                    land_cover_code = 0

                probability = land_cover_probabilities.get(land_cover_code, land_cover_probabilities.get('default', 0))
                if random.random() >= probability:
                    continue  # Point rejected by land cover.

                # Spacing test against the most recent thermals only. Spacing
                # is judged from the earlier thermal's radius; the candidate's
                # own radius is not known yet (its category is drawn after
                # acceptance).
                is_too_close = False
                for prev_point_geojson_crs, prev_radius in generated_points_history[-RECENT_POINTS_CHECK:]:
                    if prev_radius is None:
                        continue
                    distance = point_candidate_geojson_crs.distance(prev_point_geojson_crs)
                    if distance < MIN_DIST_FACTOR * prev_radius:
                        is_too_close = True
                        break
                if is_too_close:
                    continue

                point_geojson_crs = point_candidate_geojson_crs
                region_name = region_names[region_index]
                point_accepted = True
                break

            if not point_accepted:
                consecutive_failures += 1
                continue
            consecutive_failures = 0

            # --- 3. Latitude / longitude for the output row ---
            latitude = None
            longitude = None
            if to_latlon is not None:
                try:
                    # always_xy=True: input is (easting, northing), output is (lon, lat).
                    longitude, latitude = to_latlon.transform(point_geojson_crs.x, point_geojson_crs.y)
                except Exception as e:
                    print(f"Warning: Lat/Lon conversion failed for point {point_geojson_crs}. Error: {e}")

            # --- 4. Thermal properties and derived labels ---
            category = random.choices(category_names, weights=category_weights, k=1)[0]
            lift_mps, diameter_m, max_height_m_agl, duration_min = sample_thermal_properties(category)
            radius_m = diameter_m / 2.0
            season = get_season(timestamp.month)
            time_of_day = get_time_of_day(timestamp.hour)
            land_cover_name = land_cover_names.get(land_cover_code, land_cover_names.get('default', "Unknown")) # Lookup name

            # Keep the spacing history bounded: trim once it is twice the window.
            generated_points_history.append((point_geojson_crs, radius_m))
            if len(generated_points_history) > RECENT_POINTS_CHECK * 2:
                generated_points_history = generated_points_history[-RECENT_POINTS_CHECK:]

            synthetic_data.append({
                "timestamp_utc": timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                "latitude": latitude,
                "longitude": longitude,
                "easting": point_geojson_crs.x,
                "northing": point_geojson_crs.y,
                "region_name": region_name,
                "season": season,
                "time_of_day": time_of_day,
                "land_cover_type": land_cover_name, # Changed from land_cover_code
                "strength_category": category,
                "lift_rate_mps": round(lift_mps, 2),
                "core_diameter_m": round(diameter_m, 1),
                "radius_m": round(radius_m, 1),
                "max_height_m_agl": round(max_height_m_agl),
                "duration_min": round(duration_min)
            })
            generated_count += 1

            if generated_count % 1000 == 0:
                print(f"  Generated {generated_count}/{total_thermals_to_generate} thermals...")
finally:
    # --- Close Raster --- (runs on success, error and keyboard interrupt alike)
    land_cover_dataset.close()

# --- Save Data ---
output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILENAME)
print(f"\nSaving {len(synthetic_data)} generated thermals to: {output_path}")

if synthetic_data:
    df_output = pd.DataFrame(synthetic_data)
    cols_order = [
        "timestamp_utc", "latitude", "longitude", "easting", "northing",
        "region_name", "season", "time_of_day", "land_cover_type", # Updated column name
        "strength_category", "lift_rate_mps",
        "core_diameter_m", "radius_m", "max_height_m_agl", "duration_min"
    ]
    df_output = df_output.reindex(columns=[col for col in cols_order if col in df_output.columns])
    try:
        # Create the output folder if needed so a long run is not lost to a
        # missing directory.
        os.makedirs(OUTPUT_FOLDER, exist_ok=True)
        df_output.to_csv(output_path, index=False)
        print("Save successful.")
    except Exception as e:
        print(f"Error saving CSV file: {e}")
else:
    print("No data generated to save.")

end_exec_time = time.time()
print(f"Script finished in {end_exec_time - start_exec_time:.2f} seconds.")

Starting synthetic thermal data generation with land cover bias...
Reading GeoJSON from: /home/px4_sitl/ets_work/montreal_research/thermal_synthetic_data_generation/montreal_government_data/df.geojson
Using Land Cover from: /home/px4_sitl/ets_work/montreal_research/thermal_synthetic_data_generation/montreal_government_data/landcover-2020-classification.tif
Source GeoJSON CRS identified as: EPSG:32188
Loading Land Cover data...
Land Cover CRS identified as: EPSG:3979
CRS mismatch detected. Will transform points from EPSG:32188 to EPSG:3979 for sampling.
Target includes all 34 regions found in GeoJSON.
Total area: 619200849.51 square units (in source GeoJSON CRS units)
Generating approximately 30000 thermals...
  Generated 1000/30000 thermals...
  Generated 2000/30000 thermals...
  Generated 3000/30000 thermals...
  Generated 4000/30000 thermals...
  Generated 5000/30000 thermals...
  Generated 6000/30000 thermals...
  Generated 7000/30000 thermals...
  Generated 8000/30000 thermals...
 