In [4]:
import os

import geopandas as gpd
import pandas as pd

In [5]:
# Location of the EM-DAT custom-request export (.xlsx).
# The EMDAT_PATH environment variable overrides the default so the notebook is
# not tied to one machine's home directory; the default is the original path,
# so behaviour is unchanged when the variable is not set.
emdat_path = os.environ.get(
    "EMDAT_PATH",
    "/home/nissim/Documents/dev/arg-inundaciones/data/public_emdat_custom_request_2025-06-26_fe5041a7-dc26-4391-aef0-f49e5e5d8657.xlsx",
)
# Load the first sheet of the workbook into a DataFrame.
emdat = pd.read_excel(emdat_path)

In [12]:
# Show every column name of the EM-DAT export as a plain Python list.
print(list(emdat.columns))

['DisNo.', 'Historic', 'Classification Key', 'Disaster Group', 'Disaster Subgroup', 'Disaster Type', 'Disaster Subtype', 'External IDs', 'Event Name', 'ISO', 'Country', 'Subregion', 'Region', 'Location', 'Origin', 'Associated Types', 'OFDA/BHA Response', 'Appeal', 'Declaration', "AID Contribution ('000 US$)", 'Magnitude', 'Magnitude Scale', 'Latitude', 'Longitude', 'River Basin', 'Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month', 'End Day', 'Total Deaths', 'No. Injured', 'No. Affected', 'No. Homeless', 'Total Affected', "Reconstruction Costs ('000 US$)", "Reconstruction Costs, Adjusted ('000 US$)", "Insured Damage ('000 US$)", "Insured Damage, Adjusted ('000 US$)", "Total Damage ('000 US$)", "Total Damage, Adjusted ('000 US$)", 'CPI', 'Admin Units', 'Entry Date', 'Last Update']


In [10]:
# Exploratory check of how the 'Admin Units' column is stored, to decide how it
# has to be parsed further down.
import json

admin_units = emdat['Admin Units']
first_value = admin_units.iloc[0]
second_value = admin_units.iloc[1]

# Check the data type of the column
print("Column dtype:", admin_units.dtype)
print("Column type:", type(admin_units))

# Check the type of individual elements
print("\nFirst element type:", type(first_value))
print("Second element type:", type(second_value))

# Look at the actual content of a few rows
print("\nFirst row content:")
print(first_value)
print("\nSecond row content:")
print(second_value)

# Check if it's a string that needs parsing
if isinstance(first_value, str):
    print("\nIt's a string - might need JSON parsing")
    try:
        parsed = json.loads(first_value)
    except json.JSONDecodeError:
        # Only a genuine parse failure means "not JSON"; any other exception
        # (including KeyboardInterrupt) is allowed to propagate.
        print("Not valid JSON")
    else:
        print("Successfully parsed as JSON:", type(parsed))
elif isinstance(first_value, list):
    print("\nIt's already a list")
    print("List element types:", [type(item) for item in first_value])
else:
    # e.g. a missing cell; report it instead of silently printing nothing.
    print("\nUnexpected element type:", type(first_value))

Column dtype: object
Column type: <class 'pandas.core.series.Series'>

First element type: <class 'str'>
Second element type: <class 'str'>

First row content:
[{"adm1_code":431,"adm1_name":"Catamarca"},{"adm1_code":434,"adm1_name":"Cordoba"},{"adm1_code":438,"adm1_name":"Jujuy"},{"adm1_code":440,"adm1_name":"La Rioja"},{"adm1_code":445,"adm1_name":"Salta"},{"adm1_code":450,"adm1_name":"Santiago Del Estero"},{"adm1_code":452,"adm1_name":"Tucuman"}]

Second row content:
[{"adm1_code":430,"adm1_name":"Buenos Aires D.f."},{"adm1_code":434,"adm1_name":"Cordoba"},{"adm1_code":439,"adm1_name":"La Pampa"},{"adm2_code":4386,"adm2_name":"Avellaneda"},{"adm2_code":4395,"adm2_name":"Berisso"},{"adm2_code":4445,"adm2_name":"Lanus"},{"adm2_code":4477,"adm2_name":"Quilmes"},{"adm2_code":82738,"adm2_name":"San Miguel"},{"adm2_code":190525,"adm2_name":"San  Fernando"},{"adm2_code":4631,"adm2_name":"Parana"},{"adm2_code":4836,"adm2_name":"Rosario"}]

It's a string - might need JSON parsing
Successfully parsed as JSON: <class 'list'>

In [9]:
import json
import pandas as pd
import numpy as np

def extract_admin_names(admin_units_str):
    """
    Extract adm1 and adm2 names from a JSON string of admin units.

    Args:
        admin_units_str: JSON text encoding an array of objects, each of which
            may carry an 'adm1_name' and/or an 'adm2_name' key. Any value that
            is not str/bytes/bytearray (e.g. NaN or None from a missing cell)
            is treated as "no admin units".

    Returns:
        (adm1_names, adm2_names) as a tuple of two lists, in input order.
        Both lists are empty when the input is missing, is not valid JSON,
        or does not decode to a JSON array.
    """
    # Missing cells are not strings; handle them explicitly rather than
    # relying on an exception being swallowed.
    if not isinstance(admin_units_str, (str, bytes, bytearray)):
        return [], []

    try:
        parsed = json.loads(admin_units_str)
    except ValueError:
        # json.JSONDecodeError (and UnicodeDecodeError for bytes input) are
        # ValueError subclasses; anything else is a real bug and must surface.
        return [], []

    if not isinstance(parsed, list):
        return [], []

    adm1_names = []
    adm2_names = []
    for unit in parsed:
        # Skip malformed entries instead of discarding the whole record.
        if not isinstance(unit, dict):
            continue
        if 'adm1_name' in unit:
            adm1_names.append(unit['adm1_name'])
        if 'adm2_name' in unit:
            adm2_names.append(unit['adm2_name'])

    return adm1_names, adm2_names

# Parse each 'Admin Units' cell once, then split the resulting
# (adm1_names, adm2_names) pairs into two list-valued columns.
admin_name_pairs = emdat['Admin Units'].map(extract_admin_names)
emdat['adm1_names'] = admin_name_pairs.map(lambda pair: pair[0])
emdat['adm2_names'] = admin_name_pairs.map(lambda pair: pair[1])

# Summary counts: how many events carry at least one name at each level.
rows_with_adm1 = (emdat['adm1_names'].map(len) > 0).sum()
rows_with_adm2 = (emdat['adm2_names'].map(len) > 0).sum()

print("=== EXTRACTION RESULTS ===")
print(f"Total rows processed: {len(emdat)}")
print(f"Rows with adm1 names: {rows_with_adm1}")
print(f"Rows with adm2 names: {rows_with_adm2}")

# Print the first (up to) five rows as a sanity check.
print("\n=== SAMPLE RESULTS ===")
sample = emdat[['adm1_names', 'adm2_names']].head(5)
for i, (adm1, adm2) in enumerate(zip(sample['adm1_names'], sample['adm2_names'])):
    print(f"Row {i}:")
    print(f"  adm1_names: {adm1}")
    print(f"  adm2_names: {adm2}")
    print()

=== EXTRACTION RESULTS ===
Total rows processed: 43
Rows with adm1 names: 27
Rows with adm2 names: 17

=== SAMPLE RESULTS ===
Row 0:
  adm1_names: ['Catamarca', 'Cordoba', 'Jujuy', 'La Rioja', 'Salta', 'Santiago Del Estero', 'Tucuman']
  adm2_names: []

Row 1:
  adm1_names: ['Buenos Aires D.f.', 'Cordoba', 'La Pampa']
  adm2_names: ['Avellaneda', 'Berisso', 'Lanus', 'Quilmes', 'San Miguel', 'San  Fernando', 'Parana', 'Rosario']

Row 2:
  adm1_names: ['Buenos Aires', 'Cordoba', 'La Pampa', 'Santa Fe']
  adm2_names: []

Row 3:
  adm1_names: []
  adm2_names: ['Iriondo']

Row 4:
  adm1_names: ['Buenos Aires']
  adm2_names: []



In [10]:
import geopandas as gpd
from shapely.geometry import box
import numpy as np
from utils.pygeoboundaries.main import get_area_of_interest_by_names

def get_flood_bounding_box(emdat_row, country_iso3):
    """
    Build a bounding box for one flood event from its admin unit names.

    ADM2 names are used when the row has any; otherwise ADM1 names are used.
    The names are handed to get_area_of_interest_by_names together with the
    level and the country code, and the dictionary it returns is turned into
    a shapely box.

    Args:
        emdat_row: Single row from the emdat dataframe; must expose
            'adm2_names' (and 'adm1_names' when adm2 is empty) as sized
            collections of names.
        country_iso3: ISO3 country code forwarded to the lookup.

    Returns:
        (bbox_geometry, bbox_dict): the shapely geometry and the dictionary
        it was built from (keys 'min_lon', 'min_lat', 'max_lon', 'max_lat'),
        or (None, None) when the row has no admin names or the lookup
        returns None.
    """
    adm2_names = emdat_row['adm2_names']

    # Pick the finest admin level that has names (adm2 before adm1).
    if len(adm2_names) > 0:
        print(f"Filtered by {len(adm2_names)} adm2 names: {adm2_names}")
        adm_level, unit_names = "ADM2", adm2_names
    else:
        adm1_names = emdat_row['adm1_names']
        if len(adm1_names) == 0:
            print("No admin names found!")
            return None, None
        print(f"Filtered by {len(adm1_names)} adm1 names: {adm1_names}")
        adm_level, unit_names = "ADM1", adm1_names

    # Get bounding box using geoBoundaries
    bbox_dict = get_area_of_interest_by_names(
        unit_names=unit_names,
        adm_level=adm_level,
        country_iso3=country_iso3,
    )
    if bbox_dict is None:
        print("No matching administrative units found!")
        return None, None

    # box() takes (minx, miny, maxx, maxy), i.e. longitude before latitude.
    corner_keys = ('min_lon', 'min_lat', 'max_lon', 'max_lat')
    bbox_geometry = box(*[bbox_dict[key] for key in corner_keys])

    print(f"Bounding box coordinates: {bbox_dict}")
    print(f"Bounding box area: {bbox_geometry.area:.6f}")

    return bbox_geometry, bbox_dict

# Example usage for a single row
# Test with row 1 (which has both adm1 and adm2 names)
test_row = emdat.iloc[1]

# Take the country code from the event itself instead of hard-coding it.
# NOTE(review): presumably the 'ISO' column holds the ISO3 code the boundary
# lookup expects - confirm. 'ARG' (the previous hard-coded value) is kept as
# the fallback when the column is absent or the cell is not a non-empty string.
row_iso = test_row.get('ISO')
country_iso3 = row_iso.strip() if isinstance(row_iso, str) and row_iso.strip() else 'ARG'

print("=== TESTING ROW 1 ===")
print(f"adm1_names: {test_row['adm1_names']}")
print(f"adm2_names: {test_row['adm2_names']}")

bbox, bbox_dict = get_flood_bounding_box(test_row, country_iso3)

if bbox is not None:
    print(f"\nBounding box geometry: {bbox}")

    # bounds is (minx, miny, maxx, maxy); usable for a satellite imagery search
    bounds = bbox.bounds
    print(f"Min/Max coordinates: {bounds}")

=== TESTING ROW 1 ===
adm1_names: ['Buenos Aires D.f.', 'Cordoba', 'La Pampa']
adm2_names: ['Avellaneda', 'Berisso', 'Lanus', 'Quilmes', 'San Miguel', 'San  Fernando', 'Parana', 'Rosario']
Filtered by 8 adm2 names: ['Avellaneda', 'Berisso', 'Lanus', 'Quilmes', 'San Miguel', 'San  Fernando', 'Parana', 'Rosario']
Fetching full ADM2 data for ARG...

=== METADATA FOR ARG ADM2 ===
boundaryID: ARG-ADM2-61730980
boundaryName: Argentina
boundaryISO: ARG
boundaryYearRepresented: 2020
boundaryType: ADM2
boundaryCanonical: departments
boundarySource: Instituto Geografico Nacional and UNHCR, OCHA ROLAC
boundaryLicense: Creative Commons Attribution 3.0 Intergovernmental Organisations (CC BY 3.0 IGO)
licenseDetail: nan
licenseSource: data.humdata.org/dataset/argentina-administrative-level-0-boundaries
boundarySourceURL: data.humdata.org/dataset/argentina-administrative-level-0-boundaries
sourceDataUpdateDate: Thu Jan 19 07:31:04 2023
buildDate: Dec 12, 2023
Continent: Latin America and the Caribbean