In [1]:
import json
import os

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from shapely.geometry import Point
from sklearn.model_selection import (
    train_test_split,
)  # Though chronological split is preferred
from sklearn.preprocessing import MinMaxScaler

# Configuration & Constants

In [2]:
# --- Input files (paths are relative; swap in full paths if the data lives elsewhere) ---
GRID_FEATURES_PATH = "./GIS/danang_grid_with_features.parquet"
MOVEMENT_DATA_PATH = "./HistoricalMovement/danang_movement_processed_decrypted.parquet"
HISTORICAL_WEATHER_PATH = "./WeatherForecast/danang_historical_weather_oikolab_processed.csv"

# --- Spatial reference ---
# CRS the movement points are reprojected to before the spatial join with the grid.
TARGET_CRS = "EPSG:32648"

# --- LSTM windowing ---
# Number of consecutive observations that make up one input sequence.
SEQUENCE_LENGTH = 10
# Offset, in steps past the end of the window, of the observation used as the
# target (1 = the observation immediately after the window).
PREDICTION_HORIZON = 1

# Load Pre-processed Data

In [3]:
def _load_dataset(reader, path, label):
    """Read one input file, print its shape, and return the loaded frame.

    Parameters
    ----------
    reader : callable
        Function that takes a path and returns an object exposing ``.shape``
        (``gpd.read_parquet``, ``pd.read_parquet`` or ``pd.read_csv`` below).
    path : str
        Location of the file to read.
    label : str
        Lower-case dataset name used in the progress and error messages.

    Returns
    -------
    The object returned by ``reader(path)``.

    Raises
    ------
    FileNotFoundError
        If ``reader`` reports that ``path`` does not exist.
    RuntimeError
        If ``reader`` fails for any other reason; the original exception is
        chained so the full traceback stays visible.
    """
    try:
        frame = reader(path)
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"ERROR: {label.capitalize()} file not found at {path}"
        ) from exc
    except Exception as exc:
        raise RuntimeError(f"ERROR: Could not load {label}: {exc}") from exc
    print(f"Successfully loaded {label}: {frame.shape}")
    return frame


# Errors are raised instead of printed-then-exit(): an exception halts the
# notebook run at this cell and keeps the traceback, whereas exit() is aimed at
# ending the interpreter/kernel session.
grid_gdf = _load_dataset(gpd.read_parquet, GRID_FEATURES_PATH, "grid data")
# Index the grid by 'grid_id' (only when it is a unique column) so later cells
# can join on it directly.
if "grid_id" in grid_gdf.columns and grid_gdf["grid_id"].is_unique:
    grid_gdf = grid_gdf.set_index("grid_id")

movement_df = _load_dataset(pd.read_parquet, MOVEMENT_DATA_PATH, "movement data")

historical_weather_df = _load_dataset(
    pd.read_csv, HISTORICAL_WEATHER_PATH, "historical weather data"
)

Successfully loaded grid data: (212350, 238)
Successfully loaded movement data: (6857, 10)
Successfully loaded historical weather data: (10521, 25)


# Further Processing and Feature Engineering

### GIS Data

In [4]:
# Diagnostic only: when the one-hot column for the 'none' building type is
# absent, list the btype_* columns that do exist. The check uses grid_gdf alone;
# it previously also read `movement_with_grid`, which is not created until the
# movement-data cell and therefore raised NameError on a fresh kernel.
if "btype_none" not in grid_gdf.columns:
    print(
        "Available btype columns:",
        [col for col in grid_gdf.columns if col.startswith("btype_")],
    )

# Untouched copy of the grid, re-read from disk, used only for the
# before/after normalization printout below.
grid_gdf_original_for_stats = gpd.read_parquet(GRID_FEATURES_PATH)

# Candidate columns are picked by name: anything containing 'count', 'density'
# or 'length' (e.g. poi/building counts, road_density, road_length_m).
gis_cols_to_normalize = [
    col
    for col in grid_gdf.columns
    if "count" in col or "density" in col or "length" in col
]
# Keep only numeric columns; MinMaxScaler cannot handle anything else.
gis_numeric_cols_to_normalize = [
    col for col in gis_cols_to_normalize if pd.api.types.is_numeric_dtype(grid_gdf[col])
]

if gis_numeric_cols_to_normalize:
    if "road_density" in grid_gdf_original_for_stats.columns:
        print("\nGIS data 'road_density' BEFORE normalization (sample & describe):")
        print(grid_gdf_original_for_stats["road_density"].head())
        print(grid_gdf_original_for_stats["road_density"].describe())

    # Each selected column is rescaled independently to the [0, 1] range.
    scaler_gis = MinMaxScaler()
    grid_gdf[gis_numeric_cols_to_normalize] = scaler_gis.fit_transform(
        grid_gdf[gis_numeric_cols_to_normalize]
    )
    print(f"Normalized GIS columns in grid_gdf: {gis_numeric_cols_to_normalize}")

    if "road_density" in grid_gdf.columns:
        print("GIS data 'road_density' AFTER normalization (sample & describe):")
        print(grid_gdf["road_density"].head())
        print(grid_gdf["road_density"].describe())
else:
    print("No GIS columns found to normalize in grid_gdf.")


GIS data 'road_density' BEFORE normalization (sample & describe):
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: road_density, dtype: float64
count    212350.000000
mean          0.003638
std           0.011472
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           0.121717
Name: road_density, dtype: float64
Normalized GIS columns in grid_gdf: ['building_count_apartments', 'building_count_bridge', 'building_count_cathedral', 'building_count_church', 'building_count_civic', 'building_count_college', 'building_count_commercial', 'building_count_construction', 'building_count_dam', 'building_count_dormitory', 'building_count_grandstand', 'building_count_hangar', 'building_count_hospital', 'building_count_hotel', 'building_count_house', 'building_count_hut', 'building_count_industrial', 'building_count_navigationaid', 'building_count_no', 'building_count_office', 'building_count_public', 'building_count_residential', 'building_count_r

### Weather Data

In [5]:
# Normalize the weather timestamps to timezone-aware UTC.
# pd.to_datetime(..., utc=True) handles every input form in one call: strings
# are parsed, naive datetimes are localized to UTC, and already-aware datetimes
# are converted to UTC. The previous code called .dt.tz_localize("UTC") on any
# datetime column, which raises TypeError when the column is already tz-aware.
historical_weather_df["timestamp_utc"] = pd.to_datetime(
    historical_weather_df["timestamp_utc"], utc=True
)

# Index by timestamp so the merge cell can align weather rows by time.
weather_df_processed = historical_weather_df.set_index("timestamp_utc")

# Numerical weather features to rescale (extend this list as needed).
weather_cols_to_normalize = [
    "temp_c",
    "relative_humidity",
    "wind_speed_mps",
    "cloud_cover",
    "precip_mm",
]
# Only scale the columns that actually exist in the loaded file.
weather_cols_present = [
    col for col in weather_cols_to_normalize if col in weather_df_processed.columns
]

if weather_cols_present:
    # Each present column is rescaled independently to the [0, 1] range.
    scaler_weather = MinMaxScaler()
    weather_df_processed[weather_cols_present] = scaler_weather.fit_transform(
        weather_df_processed[weather_cols_present]
    )
    print(f"Normalized weather columns: {weather_cols_present}")
else:
    print("No weather columns found to normalize.")

Normalized weather columns: ['temp_c', 'relative_humidity', 'wind_speed_mps', 'cloud_cover', 'precip_mm']


### Movement Data

In [6]:
# Validate the required column before anything indexes it. Failures raise
# instead of calling exit(): an exception stops the notebook run at this cell
# and keeps the traceback.
if "TimestampUTC" not in movement_df.columns:
    raise KeyError("ERROR: 'TimestampUTC' column not found in movement_df.")

print(f"Original movement data shape: {movement_df.shape}")
print("Movement data sample BEFORE dropping NaNs:")
print(movement_df[["TimestampUTC", "Latitude", "Longitude"]].head())

# Normalize timestamps to timezone-aware UTC in one step. utc=True parses
# strings (including mixed UTC offsets), localizes naive datetimes to UTC and
# converts aware datetimes to UTC; unparseable values become NaT and are
# removed by the dropna below. Parsing with utc=False could leave an object
# column on mixed offsets, on which the .dt accessor fails.
movement_df["TimestampUTC"] = pd.to_datetime(
    movement_df["TimestampUTC"], errors="coerce", utc=True
)

# Rows without a position or a timestamp cannot be placed on the grid.
movement_df.dropna(subset=["Latitude", "Longitude", "TimestampUTC"], inplace=True)
print(
    f"\nMovement data shape AFTER dropping NaNs in Lat/Lon/Timestamp: {movement_df.shape}"
)
print("Movement data sample AFTER dropping NaNs:")
print(movement_df[["TimestampUTC", "Latitude", "Longitude"]].head())

# Build point geometries from lon/lat (declared as EPSG:4326) and reproject
# them to TARGET_CRS for the spatial join.
try:
    movement_gdf = gpd.GeoDataFrame(
        movement_df,
        geometry=gpd.points_from_xy(movement_df.Longitude, movement_df.Latitude),
        crs="EPSG:4326",
    ).to_crs(TARGET_CRS)
except Exception as e:
    raise RuntimeError(
        f"ERROR: Could not convert movement data to GeoDataFrame or reproject: {e}"
    ) from e

# The join needs 'grid_id' as a regular column, so undo the index if it is set.
if grid_gdf.index.name == "grid_id":
    grid_gdf_for_join = grid_gdf.reset_index()
else:
    grid_gdf_for_join = grid_gdf

for required_column in ("geometry", "grid_id"):
    if required_column not in grid_gdf_for_join.columns:
        raise KeyError(
            f"ERROR: '{required_column}' column not found in grid_gdf_for_join. Check grid data loading."
        )

print("\nPerforming spatial join between movement data and grid cells...")
# Left join keeps every movement point; points that fall inside no grid cell
# get a missing grid_id and are dropped right after.
movement_with_grid = gpd.sjoin(
    movement_gdf,
    grid_gdf_for_join[["grid_id", "geometry"]],
    how="left",
    predicate="within",
)
original_gridded_count = len(movement_with_grid)
movement_with_grid.dropna(subset=["grid_id"], inplace=True)
print(f"Movement data shape after assigning grid_id: {original_gridded_count}")
print(
    f"Movement data shape after dropping non-gridded points: {movement_with_grid.shape}"
)
print("Movement data sample WITH grid_id:")
print(movement_with_grid[["TimestampUTC", "Latitude", "Longitude", "grid_id"]].head())

# 'index_right' is bookkeeping added by the spatial join; it is not a feature.
if "index_right" in movement_with_grid.columns:
    movement_with_grid.drop(columns=["index_right"], inplace=True)

# Cyclical encodings: map each calendar component onto the unit circle so that
# the last and first values of a period (e.g. hour 23 and hour 0) end up close.
dt_col_movement = movement_with_grid["TimestampUTC"]
cyclical_components = {
    "hour": (dt_col_movement.dt.hour, 24),
    "day_of_week": (dt_col_movement.dt.dayofweek, 7),
    "month": (dt_col_movement.dt.month - 1, 12),  # shift months to 0..11
}
for component_name, (component_values, period) in cyclical_components.items():
    angle = 2 * np.pi * component_values / period
    movement_with_grid[f"{component_name}_sin_mov"] = np.sin(angle)
    movement_with_grid[f"{component_name}_cos_mov"] = np.cos(angle)

print("\nMovement data sample with cyclical time features:")
print(
    movement_with_grid[
        [
            "TimestampUTC",
            "hour_sin_mov",
            "hour_cos_mov",
            "day_of_week_sin_mov",
            "day_of_week_cos_mov",
        ]
    ].head()
)

Original movement data shape: (6857, 10)
Movement data sample BEFORE dropping NaNs:
               TimestampUTC   Latitude   Longitude
0 2024-11-20 11:05:23+00:00  16.074451  108.152405
1 2024-11-20 11:05:23+00:00  16.074451  108.152405
2 2024-11-20 11:09:48+00:00  16.074177  108.152565
3 2024-11-20 11:05:23+00:00  16.074451  108.152405
4 2024-11-20 06:38:47+00:00  16.074172  108.152865

Movement data shape AFTER dropping NaNs in Lat/Lon/Timestamp: (6857, 10)
Movement data sample AFTER dropping NaNs:
               TimestampUTC   Latitude   Longitude
0 2024-11-20 11:05:23+00:00  16.074451  108.152405
1 2024-11-20 11:05:23+00:00  16.074451  108.152405
2 2024-11-20 11:09:48+00:00  16.074177  108.152565
3 2024-11-20 11:05:23+00:00  16.074451  108.152405
4 2024-11-20 06:38:47+00:00  16.074172  108.152865

Performing spatial join between movement data and grid cells...
Movement data shape after assigning grid_id: 6857
Movement data shape after dropping non-gridded points: (6857, 13)
Movemen

# Merging Data

In [7]:
# Largest time gap allowed between a movement record (rounded to the hour) and
# the weather row attached to it. Without a tolerance, direction="nearest"
# silently attaches arbitrarily distant weather to records outside the weather
# file's time range, and the NaN inspection below can never flag them.
# NOTE(review): one hour assumes the weather file is hourly - widen this if the
# weather data is coarser.
WEATHER_MATCH_TOLERANCE = pd.Timedelta(hours=1)

# Grid attributes without the polygon geometry; the movement frame already
# carries its own point geometry.
grid_attributes = grid_gdf.drop(columns=["geometry"], errors="ignore")

if grid_gdf.index.name == "grid_id":
    # Grid is indexed by 'grid_id': join the movement column against that index.
    if "grid_id" not in movement_with_grid.columns:
        raise KeyError("ERROR: 'grid_id' not in movement_with_grid columns for joining.")
    merged_df = movement_with_grid.join(grid_attributes, on="grid_id", how="left")
else:
    # Grid has 'grid_id' as a regular column: plain column merge.
    merged_df = pd.merge(
        movement_with_grid,
        grid_attributes,
        on="grid_id",
        how="left",
    )

# Key used to align movement records with weather rows.
merged_df["timestamp_round_hour_utc"] = merged_df["TimestampUTC"].dt.round("h")

# Make sure the weather index is a timezone-aware UTC datetime index.
if not pd.api.types.is_datetime64_any_dtype(weather_df_processed.index):
    weather_df_processed.index = pd.to_datetime(weather_df_processed.index, utc=True)
elif weather_df_processed.index.tz is None:
    weather_df_processed.index = weather_df_processed.index.tz_localize("UTC")

# merge_asof requires both sides to be sorted on the merge key.
merged_df = merged_df.sort_values("timestamp_round_hour_utc")
weather_df_processed = weather_df_processed.sort_index()

merged_df = pd.merge_asof(
    merged_df,
    weather_df_processed,
    left_on="timestamp_round_hour_utc",
    right_index=True,
    direction="nearest",
    tolerance=WEATHER_MATCH_TOLERANCE,
    suffixes=("", "_weather"),
)
print(f"Merged data shape: {merged_df.shape}")
# Rows with no weather observation within the tolerance show up here as NaNs in
# the weather columns and are removed by the dropna further down.
print("\nNaN counts in merged_df AFTER weather merge (first 10 columns):")
print(merged_df.isnull().sum().head(10))
print("NaN counts in merged_df AFTER weather merge (weather columns):")
weather_related_cols_in_merged = [
    col
    for col in merged_df.columns
    if col in weather_df_processed.columns or "_weather" in col
]
print(merged_df[weather_related_cols_in_merged].isnull().sum())

# An entirely empty DBDatePublishedUTC column would make the blanket dropna
# below discard every row, so remove it first.
if (
    "DBDatePublishedUTC" in merged_df.columns
    and merged_df["DBDatePublishedUTC"].isna().all()
):
    print("DBDatePublishedUTC column is all NaN. Dropping it before final dropna.")
    merged_df.drop(columns=["DBDatePublishedUTC"], inplace=True)

# Drops a row if ANY column is missing. Consider a more targeted dropna based
# on the columns that are actually used as features.
merged_df.dropna(inplace=True)
print(f"Merged data shape after final dropna: {merged_df.shape}")
print("\nSample of Merged Data (first 5 rows):")
pd.set_option("display.max_columns", None)
merged_df.head()

Merged data shape: (6857, 279)

NaN counts in merged_df AFTER weather merge (first 10 columns):
LocationID               0
DeviceID                 0
TimestampUTC             0
Latitude                 0
Longitude                0
Confidence               0
Description              0
StatusCode               0
DBDatePublishedUTC    6857
EncryptedPayloadDB       0
dtype: int64
NaN counts in merged_df AFTER weather merge (weather columns):
coordinates_lat_lon    0
model_name             0
model_elevation_m      0
utc_offset_hrs         0
temp_c                 0
relative_humidity      0
wind_speed_mps         0
wind_deg               0
wind_gust_mps          0
cloud_cover            0
precip_mm              0
hour_of_day            0
day_of_week            0
day_of_year            0
month_of_year          0
year                   0
hour_sin               0
hour_cos               0
day_of_week_sin        0
day_of_week_cos        0
month_sin              0
month_cos              0
day_of_y

Unnamed: 0,LocationID,DeviceID,TimestampUTC,Latitude,Longitude,Confidence,Description,StatusCode,EncryptedPayloadDB,geometry,grid_id,hour_sin_mov,hour_cos_mov,day_of_week_sin_mov,day_of_week_cos_mov,month_sin_mov,month_cos_mov,building_count_apartments,building_count_bridge,building_count_cathedral,building_count_church,building_count_civic,building_count_college,building_count_commercial,building_count_construction,building_count_dam,building_count_dormitory,building_count_grandstand,building_count_hangar,building_count_hospital,building_count_hotel,building_count_house,building_count_hut,building_count_industrial,building_count_navigationaid,building_count_no,building_count_office,building_count_public,building_count_residential,building_count_restaurant,building_count_retail,building_count_roof,building_count_ruins,building_count_school,building_count_sports_centre,building_count_stadium,building_count_temple,building_count_terrace,building_count_tower,building_count_train_station,building_count_transportation,building_count_university,building_count_warehouse,building_count_yes,poi_count_vinmart+,poi_count_alcohol,poi_count_amusement_arcade,poi_count_apartment,poi_count_art,poi_count_artwork,poi_count_atm,poi_count_attraction,poi_count_baby_goods,poi_count_bag,poi_count_bakery,poi_count_bank,poi_count_bar,poi_count_beauty,poi_count_bed,poi_count_bench,poi_count_beverages,poi_count_bicycle,poi_count_bicycle_parking,poi_count_bicycle_rental,poi_count_bicycle_repair_station,poi_count_biergarten,poi_count_bleachers,poi_count_bookmaker,poi_count_books,poi_count_bureau_de_change,poi_count_bus_station,poi_count_butcher,poi_count_cafe,poi_count_camp_site,poi_count_car,poi_count_car_parts,poi_count_car_rental,poi_count_car_repair,poi_count_car_wash,poi_count_casino,poi_count_chalet,poi_count_charging_station,poi_count_chocolate,poi_count_cinema,poi_count_clock,poi_count_clothes,poi_count_coffee,poi_count_community_centre,poi_count_computer,poi_count_confectionery,poi_cou
nt_convenience,poi_count_copyshop,poi_count_cosmetics,poi_count_dentist,poi_count_department_store,poi_count_doctors,poi_count_drinking_water,poi_count_dry_cleaning,poi_count_electronics,poi_count_events_venue,poi_count_fast_food,poi_count_ferry_terminal,poi_count_fitness_centre,poi_count_fitness_station,poi_count_florist,poi_count_food_court,poi_count_fountain,poi_count_fuel,poi_count_furniture,poi_count_gallery,poi_count_garden,poi_count_gift,poi_count_grave_yard,poi_count_greengrocer,poi_count_guest_house,poi_count_hairdresser,poi_count_hospital,poi_count_hostel,poi_count_hotel,poi_count_ice_cream,poi_count_information,poi_count_interior_decoration,poi_count_internet_cafe,poi_count_jewelry,poi_count_karaoke_box,poi_count_kindergarten,poi_count_laundry,poi_count_library,poi_count_marketplace,poi_count_massage,poi_count_mobile_phone,poi_count_motel,poi_count_motorcycle,poi_count_motorcycle_parking,poi_count_motorcycle_rental,poi_count_motorcycle_repair,poi_count_museum,poi_count_musical_instrument,poi_count_nightclub,poi_count_online_gaming,poi_count_optician,poi_count_outdoor,poi_count_park,poi_count_parking,poi_count_pastry,poi_count_pawnbroker,poi_count_pet,poi_count_pharmacy,poi_count_photo,poi_count_photo_booth,poi_count_picnic_site,poi_count_pitch,poi_count_place_of_worship,poi_count_platform,poi_count_playground,poi_count_police,poi_count_post_box,poi_count_post_office,poi_count_pub,poi_count_radiotechnics,poi_count_reception_desk,poi_count_resort,poi_count_restaurant,poi_count_sanitation,poi_count_sauna,poi_count_school,poi_count_scooter_rental,poi_count_seafood,poi_count_second_hand,poi_count_shelter,poi_count_shoes,poi_count_shower,poi_count_smartshop,poi_count_smoking_area,poi_count_sports,poi_count_sports_centre,poi_count_sports_hall,poi_count_stadium,poi_count_station,poi_count_stationery,poi_count_stop_position,poi_count_storage_rental,poi_count_supermarket,poi_count_swimming_pool,poi_count_tattoo,poi_count_taxi,poi_count_tea,poi_count_theatre,poi_cou
nt_theme_park,poi_count_ticket,poi_count_toilets,poi_count_townhall,poi_count_track,poi_count_travel_agency,poi_count_tyres,poi_count_university,poi_count_variety_store,poi_count_vending_machine,poi_count_veterinary,poi_count_viewpoint,poi_count_waste_basket,poi_count_waste_disposal,poi_count_watches,poi_count_water_park,poi_count_water_sports,poi_count_wilderness_hut,poi_count_yes,poi_count_zoo,total_poi_count,road_length_m,road_density,btype_apartments,btype_bridge,btype_church,btype_college,btype_commercial,btype_construction,btype_dam,btype_dormitory,btype_grandstand,btype_hangar,btype_hospital,btype_hotel,btype_house,btype_industrial,btype_navigationaid,btype_no,btype_none,btype_office,btype_public,btype_residential,btype_retail,btype_roof,btype_school,btype_sports_centre,btype_stadium,btype_temple,btype_tower,btype_train_station,btype_transportation,btype_university,btype_warehouse,btype_yes,timestamp_round_hour_utc,coordinates_lat_lon,model_name,model_elevation_m,utc_offset_hrs,temp_c,relative_humidity,wind_speed_mps,wind_deg,wind_gust_mps,cloud_cover,precip_mm,hour_of_day,day_of_week,day_of_year,month_of_year,year,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,month_sin,month_cos,day_of_year_sin,day_of_year_cos
383,395,afirx1LlNk5vh7BnbGukU+L8o9E3pHhd/uogNOdmdv8=,2024-11-14 00:05:26+00:00,16.074298,108.152192,175.0,found,0,LOV2RgABBIWu4zxJRX7u/P68FRyE8fAlXW7CXFbwlmEcY0...,POINT (837278.062 1779724.608),cell_78463,0.0,1.0,0.433884,-0.900969,-0.866025,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.009346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.510276,0.510276,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,2024-11-14 00:00:00+00:00,"(16.068, 108.212)",era5,90.12,7.0,0.415924,0.87234,0.099362,186.99,2.91,0.1,0.0,0,3,319,11,2024,0.0,1.0,0.433884,-0.900969,-0.866025,0.5,-0.726225,0.687457
2675,2687,afirx1LlNk5vh7BnbGukU+L8o9E3pHhd/uogNOdmdv8=,2024-11-14 00:05:26+00:00,16.074298,108.152192,175.0,found,0,LOV2RgABBIWu4zxJRX7u/P68FRyE8fAlXW7CXFbwlmEcY0...,POINT (837278.062 1779724.608),cell_78463,0.0,1.0,0.433884,-0.900969,-0.866025,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.009346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.510276,0.510276,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,2024-11-14 00:00:00+00:00,"(16.068, 108.212)",era5,90.12,7.0,0.415924,0.87234,0.099362,186.99,2.91,0.1,0.0,0,3,319,11,2024,0.0,1.0,0.433884,-0.900969,-0.866025,0.5,-0.726225,0.687457
192,204,afirx1LlNk5vh7BnbGukU+L8o9E3pHhd/uogNOdmdv8=,2024-11-14 00:05:26+00:00,16.074298,108.152192,175.0,found,0,LOV2RgABBIWu4zxJRX7u/P68FRyE8fAlXW7CXFbwlmEcY0...,POINT (837278.062 1779724.608),cell_78463,0.0,1.0,0.433884,-0.900969,-0.866025,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.009346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.510276,0.510276,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,2024-11-14 00:00:00+00:00,"(16.068, 108.212)",era5,90.12,7.0,0.415924,0.87234,0.099362,186.99,2.91,0.1,0.0,0,3,319,11,2024,0.0,1.0,0.433884,-0.900969,-0.866025,0.5,-0.726225,0.687457
574,586,afirx1LlNk5vh7BnbGukU+L8o9E3pHhd/uogNOdmdv8=,2024-11-14 00:05:26+00:00,16.074298,108.152192,175.0,found,0,LOV2RgABBIWu4zxJRX7u/P68FRyE8fAlXW7CXFbwlmEcY0...,POINT (837278.062 1779724.608),cell_78463,0.0,1.0,0.433884,-0.900969,-0.866025,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.009346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.510276,0.510276,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,2024-11-14 00:00:00+00:00,"(16.068, 108.212)",era5,90.12,7.0,0.415924,0.87234,0.099362,186.99,2.91,0.1,0.0,0,3,319,11,2024,0.0,1.0,0.433884,-0.900969,-0.866025,0.5,-0.726225,0.687457
956,968,afirx1LlNk5vh7BnbGukU+L8o9E3pHhd/uogNOdmdv8=,2024-11-14 00:05:26+00:00,16.074298,108.152192,175.0,found,0,LOV2RgABBIWu4zxJRX7u/P68FRyE8fAlXW7CXFbwlmEcY0...,POINT (837278.062 1779724.608),cell_78463,0.0,1.0,0.433884,-0.900969,-0.866025,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.009346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.510276,0.510276,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,2024-11-14 00:00:00+00:00,"(16.068, 108.212)",era5,90.12,7.0,0.415924,0.87234,0.099362,186.99,2.91,0.1,0.0,0,3,319,11,2024,0.0,1.0,0.433884,-0.900969,-0.866025,0.5,-0.726225,0.687457


# Sequence Generation

In [8]:
# Order rows so that each device's observations are contiguous and ascending in
# time; the sliding windows below rely on this ordering.
merged_df.sort_values(by=["DeviceID", "TimestampUTC"], inplace=True)

# --- Feature selection -------------------------------------------------------
# GIS features are picked by name from the grid columns; 'geometry' is excluded
# explicitly in case it is present.
gis_feature_columns = [
    col
    for col in grid_gdf.columns
    if col != "geometry"
    and (
        col.startswith(("btype_", "poi_count_", "building_count_"))
        or "density" in col
        or "length" in col
    )
]

# Weather features: the normalized columns plus the cyclical time columns that
# come from the weather file. If the weather merge had to rename a clashing
# column it carries a '_weather' suffix, so try that name as a fallback.
weather_feature_candidates = list(weather_cols_present) + [
    "hour_sin",
    "hour_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "month_sin",
    "month_cos",
]
weather_feature_columns = [
    col if col in merged_df.columns else col + "_weather"
    for col in weather_feature_candidates
]
weather_feature_columns = [
    col for col in weather_feature_columns if col in merged_df.columns
]

# Cyclical time features derived from the movement timestamps.
movement_time_features = [
    "hour_sin_mov",
    "hour_cos_mov",
    "day_of_week_sin_mov",
    "day_of_week_cos_mov",
    "month_sin_mov",
    "month_cos_mov",
]

final_gis_features_in_merged = [
    col for col in gis_feature_columns if col in merged_df.columns
]

# De-duplicate and sort so the feature order is deterministic across runs.
feature_columns = sorted(
    set(final_gis_features_in_merged + weather_feature_columns + movement_time_features)
)

# The target is the grid cell id, so it must be present.
if "grid_id" not in merged_df.columns:
    raise KeyError(
        "ERROR: 'grid_id' column is missing from merged_df. Cannot create target variable."
    )

# Class index for every grid cell. Built before the windows so the mapping and
# num_classes exist even when no sequence can be generated.
unique_grid_ids = grid_gdf_for_join["grid_id"].unique()
grid_id_to_index = {grid_id: i for i, grid_id in enumerate(unique_grid_ids)}
num_classes = len(unique_grid_ids)

# --- Sliding windows per device ----------------------------------------------
sequences = []
targets = []
device_ids_for_sequences = []

# Rows consumed by one sample: the input window plus the gap up to the target.
window_span = SEQUENCE_LENGTH + PREDICTION_HORIZON

for device_id, group in merged_df.groupby("DeviceID"):
    # Cast to a single float dtype. The feature set mixes boolean (btype_*) and
    # float columns, which `.values` would return as an object array; object
    # arrays are then written by np.save as pickles.
    feature_values = group[feature_columns].to_numpy(dtype=np.float32)
    target_values = group["grid_id"].to_numpy()

    # Devices with fewer than window_span rows produce an empty range.
    for start in range(len(feature_values) - window_span + 1):
        sequences.append(feature_values[start : start + SEQUENCE_LENGTH])
        # Target: the grid_id PREDICTION_HORIZON steps after the window's last row.
        targets.append(target_values[start + window_span - 1])
        device_ids_for_sequences.append(device_id)

if not sequences:
    print(
        "WARNING: No sequences generated. Check SEQUENCE_LENGTH, PREDICTION_HORIZON, and data length per device."
    )
    # Define empty arrays so the emptiness guard in the splitting cell works
    # instead of failing on an undefined name.
    X = np.empty((0, SEQUENCE_LENGTH, len(feature_columns)), dtype=np.float32)
    y = np.empty((0,), dtype=np.int64)
else:
    X = np.array(sequences)
    # -1 marks targets whose grid_id is not in the mapping.
    y_indices = np.array(
        [grid_id_to_index.get(target, -1) for target in targets], dtype=np.int64
    )

    # Drop unmapped targets from X, y AND the per-sequence device ids, so all
    # three stay aligned for the device-based split.
    valid_indices_mask = y_indices != -1
    X = X[valid_indices_mask]
    y = y_indices[valid_indices_mask]
    device_ids_for_sequences = [
        device_id
        for device_id, keep in zip(device_ids_for_sequences, valid_indices_mask)
        if keep
    ]

    print(f"Generated {len(X)} sequences.")
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    print(f"Number of unique grid_ids (classes): {num_classes}")

Generated 6846 sequences.
X shape: (6846, 10, 252), y shape: (6846,)
Number of unique grid_ids (classes): 212350


# Data Splitting and Scaling

In [9]:
# Smallest device count for which the 70/30 then 50/50 device split works:
# with 3 devices the 30% hold-out contains a single device, and splitting one
# device into validation and test makes train_test_split raise ValueError.
MIN_DEVICES_FOR_DEVICE_SPLIT = 4

if X.size == 0 or y.size == 0:
    print("No data to split or scale.")
else:
    unique_devices = pd.Series(device_ids_for_sequences).unique()
    if len(unique_devices) >= MIN_DEVICES_FOR_DEVICE_SPLIT:
        # Preferred: keep every device entirely inside one split.
        if len(device_ids_for_sequences) != len(X):
            raise ValueError(
                "device_ids_for_sequences and X have different lengths "
                f"({len(device_ids_for_sequences)} vs {len(X)}); "
                "cannot build per-device masks."
            )
        train_devices, temp_devices = train_test_split(
            unique_devices, test_size=0.3, random_state=42
        )
        val_devices, test_devices = train_test_split(
            temp_devices, test_size=0.5, random_state=42
        )

        train_mask = np.isin(device_ids_for_sequences, train_devices)
        val_mask = np.isin(device_ids_for_sequences, val_devices)
        test_mask = np.isin(device_ids_for_sequences, test_devices)

        X_train, y_train = X[train_mask], y[train_mask]
        X_val, y_val = X[val_mask], y[val_mask]
        X_test, y_test = X[test_mask], y[test_mask]
        print(
            f"Split data by DeviceID: Train ({len(X_train)}), Val ({len(X_val)}), Test ({len(X_test)})"
        )
    else:
        print("Falling back to chronological-like random split.")

        # Unshuffled 70/15/15 split in the order the sequences were generated
        # (per device, ascending time). This is only truly chronological when
        # there is a single device.
        if len(X) > 1:
            train_size = int(0.7 * len(X))
            val_size = int(0.15 * len(X))
            val_end = train_size + val_size

            X_train, y_train = X[:train_size], y[:train_size]
            X_val, y_val = X[train_size:val_end], y[train_size:val_end]
            X_test, y_test = X[val_end:], y[val_end:]
            print(
                f"Split data chronologically (approx): Train ({len(X_train)}), Val ({len(X_val)}), Test ({len(X_test)})"
            )

        else:  # Not enough sequences for a split
            print("Not enough sequences for a split. Using all data for training.")
            X_train, y_train = X, y
            X_val, y_val = (
                np.array([]).reshape(0, X.shape[1], X.shape[2])
                if X.ndim == 3
                else np.array([])
            ), np.array([])
            X_test, y_test = (
                np.array([]).reshape(0, X.shape[1], X.shape[2])
                if X.ndim == 3
                else np.array([])
            ), np.array([])

    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # --- Scaling --------------------------------------------------------------
    # One MinMaxScaler per feature, fitted on the training split only and on
    # ALL timesteps of that feature at once. Passing the (n_sequences,
    # SEQUENCE_LENGTH) slice straight to the scaler would treat every timestep
    # as its own column and give each its own min/max.
    n_timesteps = X_train.shape[1]
    n_features = X_train.shape[2]

    # Scaled outputs always use a real float dtype, even if X arrived as an
    # object array (mixed bool/float source columns).
    scaled_dtype = (
        X_train.dtype if np.issubdtype(X_train.dtype, np.floating) else np.float64
    )

    scalers = {}  # feature index -> fitted MinMaxScaler
    X_train_scaled = np.empty(X_train.shape, dtype=scaled_dtype)
    X_val_scaled = (
        np.empty(X_val.shape, dtype=scaled_dtype) if X_val.shape[0] > 0 else X_val
    )
    X_test_scaled = (
        np.empty(X_test.shape, dtype=scaled_dtype) if X_test.shape[0] > 0 else X_test
    )

    for i in range(n_features):
        scalers[i] = MinMaxScaler()
        # Flatten (n_sequences, n_timesteps) into one column, scale, restore.
        train_column = X_train[:, :, i].astype(np.float64).reshape(-1, 1)
        X_train_scaled[:, :, i] = (
            scalers[i].fit_transform(train_column).reshape(-1, n_timesteps)
        )
        if X_val.shape[0] > 0:
            val_column = X_val[:, :, i].astype(np.float64).reshape(-1, 1)
            X_val_scaled[:, :, i] = (
                scalers[i].transform(val_column).reshape(-1, n_timesteps)
            )
        if X_test.shape[0] > 0:
            test_column = X_test[:, :, i].astype(np.float64).reshape(-1, 1)
            X_test_scaled[:, :, i] = (
                scalers[i].transform(test_column).reshape(-1, n_timesteps)
            )

    print("Applied MinMax scaling to features.")

Falling back to chronological-like random split.
Split data chronologically (approx): Train (4792), Val (1026), Test (1028)
X_train shape: (4792, 10, 252), y_train shape: (4792,)
X_val shape: (1026, 10, 252), y_val shape: (1026,)
X_test shape: (1028, 10, 252), y_test shape: (1028,)
Applied MinMax scaling to features.


# Saving the AI-ready data

In [10]:
ai_ready_dir = "ai_ready_data"
# exist_ok=True creates the directory if needed without a separate
# check-then-create step.
os.makedirs(ai_ready_dir, exist_ok=True)


def _save_split(split_name, features, labels):
    """Write one split to ``X_<split_name>.npy`` / ``y_<split_name>.npy``.

    Parameters
    ----------
    split_name : str
        Suffix used in the file names ("train", "val" or "test").
    features : np.ndarray
        Scaled feature windows for the split.
    labels : np.ndarray
        Target class indices for the split.

    Feature arrays that are not already of a floating dtype (for example an
    object array) are cast to float64 first, and both files are written with
    ``allow_pickle=False`` so they can be read back with a plain ``np.load``.
    """
    if not np.issubdtype(features.dtype, np.floating):
        features = features.astype(np.float64)
    np.save(
        os.path.join(ai_ready_dir, f"X_{split_name}.npy"), features, allow_pickle=False
    )
    np.save(
        os.path.join(ai_ready_dir, f"y_{split_name}.npy"), labels, allow_pickle=False
    )


# The training split is always written; validation and test only when non-empty.
_save_split("train", X_train_scaled, y_train)
if X_val.size > 0:
    _save_split("val", X_val_scaled, y_val)
if X_test.size > 0:
    _save_split("test", X_test_scaled, y_test)

# JSON object keys must be strings and values must be plain Python numbers, so
# coerce both explicitly in case the grid ids or indices are NumPy scalars.
with open(os.path.join(ai_ready_dir, "grid_id_to_index.json"), "w") as f:
    json.dump({str(grid_id): int(idx) for grid_id, idx in grid_id_to_index.items()}, f)
# Feature order matches the last axis of the saved X arrays.
with open(os.path.join(ai_ready_dir, "feature_columns.json"), "w") as f:
    json.dump(list(feature_columns), f)
print(
    f"Saved AI-ready data, grid_id mapping, and feature list to '{ai_ready_dir}' directory."
)

Saved AI-ready data, grid_id mapping, and feature list to 'ai_ready_data' directory.
