# This is a temporary notebook used for doing some initial analysis. This is not updated for the new pipeline.

# Load the Combined Dataset

In [1]:
import fiona
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [8]:
landslides = gpd.read_file('./bc_or_wa_merged_landslides.geojson')

In [9]:
print(landslides.shape)

landslides.head()

(90618, 53)


Unnamed: 0,Reference,Type_details,Contributo,DATE_RANGE,Latitude,Longitude,SLOPE,MOVEMENT,Type,Resource_road_type,...,Point_location,Comment,Shape_Area,MVMT_AZIMUTH_DEG,Volume_estimate_method,Shape_Length,REACTIVATION,GEOL,AVG_SCARP_DIST_FT,geometry
0,,,,,,,4.256579,Flow,,,...,,,11291.764767,0.0,,447.239405,,vol.M.cr.gr.nd.nd.bas,0.0,"MULTIPOLYGON (((-122.88334 45.36111, -122.8832..."
1,,,,,,,3.350513,Flow,,,...,,,13701.014637,180.0,,455.775817,,vol.M.cr.gr.nd.nd.bas,0.0,"MULTIPOLYGON (((-122.88317 45.36171, -122.8831..."
2,,,,,,,5.811321,Flow,,,...,,,14879.315815,337.5,,568.959463,,vol.M.cr.gr.nd.nd.bas,0.0,"MULTIPOLYGON (((-122.88144 45.36087, -122.8814..."
3,,,,,,,5.413578,Flow,,,...,,,20034.804942,315.0,,591.09711,,sed.Q.qsd.mf.nd.nd.fine,0.0,"MULTIPOLYGON (((-122.87991 45.36164, -122.8800..."
4,,,,,,,13.638928,Flow,,,...,,,8462.716893,270.0,,391.627031,,vol.M.cr.gr.nd.nd.bas,0.0,"MULTIPOLYGON (((-122.88515 45.35148, -122.885 ..."


In [10]:
landslides.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 90618 entries, 0 to 90617
Data columns (total 53 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   Reference                  4240 non-null   object  
 1   Type_details               4240 non-null   object  
 2   Contributo                 4240 non-null   object  
 3   DATE_RANGE                 386 non-null    object  
 4   Latitude                   4240 non-null   float64 
 5   Longitude                  4240 non-null   float64 
 6   SLOPE                      75139 non-null  float64 
 7   MOVEMENT                   90618 non-null  object  
 8   Type                       4240 non-null   object  
 9   Resource_road_type         4240 non-null   object  
 10  TYPE_MOVE                  42383 non-null  object  
 11  HS_HEIGHT                  65508 non-null  float64 
 12  VOLUME                     15055 non-null  float64 
 13  NAME                   

In [12]:
origin_count = landslides['original_dataset'].value_counts()
print(origin_count)

original_dataset
OR    71318
WA    15060
BC     4240
Name: count, dtype: int64


In [29]:
# Calculate completeness for each column
def calculate_completeness(df):
    completeness = {}
    total_rows = len(df)
    
    for col in df.columns:
        # Count non-null values that are not empty strings
        filled_count = df[col].notna().sum() - (df[col] == '').sum()
        completeness_pct = (filled_count / total_rows) * 100
        completeness[col] = completeness_pct
    
    return pd.Series(completeness).sort_values(ascending=False)

completeness_results = calculate_completeness(landslides)
print("Column Completeness (% filled):")
print(completeness_results.round(2))

Column Completeness (% filled):
geometry                     100.00
original_dataset             100.00
MATERIAL                     100.00
MOVEMENT                     100.00
LANDSLIDE_ID                 100.00
Shape_Area                    95.32
Shape_Length                  95.32
SLOPE                         82.92
VOLUME_MERGED                 81.41
MVMT_AZIMUTH_DEG              80.69
DESCRIPTION                   78.70
REF_ID_COD                    78.70
FAIL_DEPTH                    72.33
HS_HEIGHT                     72.29
AREA                          72.20
CONFIDENCE                    66.74
RELATIVE_AGE                  63.40
MOVE_CODE                     63.27
AVG_SCARP_DIST_FT             53.05
FAN_HEIGHT                    49.04
TYPE_MOVE                     46.77
GEOL                          45.72
MAP_UNIT_L                    37.74
DEEP_SHAL                     32.95
NAME                           9.24
Longitude                      4.68
Latitude                       4

Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.

  filled_count = df[col].notna().sum() - (df[col] == '').sum()


In [28]:
# Check for rows that have values in both VOL and VOLUME columns
both_vol_mask = landslides['VOL'].notna() & landslides['VOLUME'].notna()
rows_with_both = landslides[both_vol_mask]

if len(rows_with_both) > 0:
    print(f"Error: Found {len(rows_with_both)} rows with values in both VOL and VOLUME columns")
    print("Sample rows with both values:")
    print(rows_with_both[['VOL', 'VOLUME', 'original_dataset']].head())
else:
    print("No conflicts found. Proceeding with merge...")
    
    # Create a new merged volume column
    landslides['VOLUME_MERGED'] = landslides['VOL'].fillna(landslides['VOLUME'])
    
    # Drop the original columns
    landslides = landslides.drop(columns=['VOL', 'VOLUME'])
    
    print("Successfully merged VOL and VOLUME into VOLUME_MERGED column")
    print(f"New column has {landslides['VOLUME_MERGED'].notna().sum()} non-null values")

No conflicts found. Proceeding with merge...
Successfully merged VOL and VOLUME into VOLUME_MERGED column
New column has 73772 non-null values


# Column Inspection

## MATERIAL

In [None]:

# Find the original_dataset values for each valid Material
material_by_origin = landslides[landslides['MATERIAL'].notna() & (landslides['MATERIAL'] != '')].groupby('MATERIAL')['original_dataset'].value_counts()
print("Material distribution by original dataset:")
print(material_by_origin)

Material distribution by original dataset:
MATERIAL         original_dataset
<NA>             OR                  28961
Complex          OR                   6007
                 BC                      7
Debris           OR                  14719
                 BC                   2419
Debris+Earth     OR                      2
Debris+Rock      OR                     17
Earth            OR                   9105
                 BC                    766
Earth or debris  WA                  14712
Earth+Rock       OR                   1110
Other            OR                      1
Rock             OR                  11396
                 BC                    982
                 WA                    348
Submarine        BC                      3
Water            BC                     63
Name: count, dtype: int64


In [17]:
material_counts = landslides['MATERIAL'].value_counts()
print("Material counts:")
print(material_counts)

Material counts:
MATERIAL
<NA>               28961
Debris             17138
Earth or debris    14712
Rock               12726
Earth               9871
Complex             6014
Earth+Rock          1110
Water                 63
Debris+Rock           17
Submarine              3
Debris+Earth           2
Other                  1
Name: count, dtype: int64


## Material                       4.68

In [14]:

# Find the original_dataset values for each valid Material
material_by_origin = landslides[landslides['Material'].notna() & (landslides['Material'] != '')].groupby('Material')['original_dataset'].value_counts()
print("Material distribution by original dataset:")
print(material_by_origin)

Material distribution by original dataset:
Material       original_dataset
 Surficial     BC                     2
Anthropogenic  BC                     9
Rock           BC                  1192
Surficial      BC                  3033
Surficial      BC                     4
Name: count, dtype: int64


## Movement

In [19]:
movement_counts = landslides['MOVEMENT'].value_counts()
print("Movement counts:")
print(movement_counts)

Movement counts:
MOVEMENT
<NA>                                28961
Flow                                25680
Complex                             14123
Slide-Rotational                     9720
Slide-Translational                  6612
Complex+Slide-Rotational+Flow        1404
Slide                                1119
Fall                                 1075
Avalanche                             624
Flood                                 391
Slide-Rotational+Flow                 288
Complex+Slide-Translational+Flow      224
Deformation                           211
Slide-Translational+Flow               77
Topple                                 41
Complex+Slide+Flow                     19
Spread                                 18
Slide-Translational+Fall               16
Slide+Flow+Flow+Slide                   7
Slide+Submarine                         3
Complex+Flow                            1
Slide-Rotational+Fall                   1
Slide-Translational+Flow+Fall           1
Slide-Ro

## Slope

In [None]:
slope_by_origin = landslides[landslides['SLOPE'].notna()].groupby('SLOPE')['original_dataset'].value_counts()
print("Slope distribution by original dataset:")
print(slope_by_origin)


Slope distribution by original dataset:
SLOPE       original_dataset
0.000000    OR                  17983
0.910504    OR                      1
0.929008    OR                      1
1.000000    OR                     33
1.149971    OR                      1
                                ...  
75.000000   OR                      1
78.000000   OR                      1
80.000000   OR                      6
85.000000   OR                      1
315.000000  OR                      1
Name: count, Length: 6723, dtype: int64


In [21]:
slope_origins = landslides[landslides['SLOPE'].notna()]['original_dataset'].unique()
print("Unique original datasets for SLOPE:")
print(slope_origins)

Unique original datasets for SLOPE:
['OR' 'WA']


## MVMT_AZIMUTH_DEG

In [22]:
mvmt_azimuth_deg_origin = landslides[landslides['MVMT_AZIMUTH_DEG'].notna()].groupby('MVMT_AZIMUTH_DEG')['original_dataset'].value_counts()
print("Movement azimuth degrees by original dataset:")
print(mvmt_azimuth_deg_origin)

Movement azimuth degrees by original dataset:
MVMT_AZIMUTH_DEG  original_dataset
0.0               OR                  19565
                  WA                     43
20.0              OR                      1
22.5              OR                   2852
                  WA                    813
                                      ...  
338.5             OR                      1
355.0             OR                      1
357.5             OR                      1
360.0             OR                   2037
                  WA                    923
Name: count, Length: 62, dtype: int64


In [23]:
mvmt_azimuth_deg_origin_origins = landslides[landslides['MVMT_AZIMUTH_DEG'].notna()]['original_dataset'].unique()
print("Unique original datasets for MVMT_AZIMUTH_DEG:")
print(mvmt_azimuth_deg_origin_origins)

Unique original datasets for MVMT_AZIMUTH_DEG:
['OR' 'WA']


## FAIL_DEPTH

## HS_HEIGHT

## AREA

## Confidence

## VOL

In [27]:
vol_origins = landslides[landslides['VOL'].notna()]['original_dataset'].unique()
print("Unique original datasets for VOLUME:")
print(vol_origins)

Unique original datasets for VOLUME:
['OR']


## RELATIVE_AGE

## MOVE_CODE

# AVG_SCARP_DIST_FT             53.05


# FAN_HEIGHT                    49.04


# TYPE_MOVE                     46.77


# GEOL                          45.72


# MAP_UNIT_L                    37.74


# DEEP_SHAL                     32.95


# VOLUME                        16.61


In [None]:
volume_origins = landslides[landslides['VOLUME'].notna()]['original_dataset'].unique()
print("Unique original datasets for VOLUME:")
print(volume_origins)

Unique original datasets for VOLUME:
['WA']


# NAME                           9.24


# Longitude                      4.68
# Latitude                       4.68


Type                           4.68
Discharge estimate (m3/s)      4.58
DATE_MOVE                      4.34
Point_location                 3.31
Timing                         2.47
Reference                      2.05
Trigger                        2.05
Event_verification             1.67
Contributo                     1.61
Watercours                     1.06
Study_area                     0.97
Cont_details                   0.90
YEAR                           0.78
Type_details                   0.46
DATE_RANGE                     0.43
Size_class                     0.28
Resource_road_type             0.22
Resource_road_activity         0.22
Bedrock_type                   0.12
Comment                        0.11
Volume_estimate_method         0.10
MONTH                          0.09
DAY                            0.05
REACTIVATION                   0.02

In [None]:
# Function to find unique origins for each valid value in specified columns
def find_origins_by_value(df, columns):
    results = {}
    
    for col in columns:
        print(f"\n=== {col} ===")
        
        # Check if column exists
        if col not in df.columns:
            print(f"Column {col} not found in dataframe")
            continue
            
        # Filter for valid (non-null, non-empty) values
        valid_mask = df[col].notna() & (df[col] != '')
        
        if valid_mask.sum() == 0:
            print(f"No valid values found in {col}")
            continue
            
        # Show unique origins for this column
        unique_origins = df[valid_mask]['original_dataset'].unique()
        print(f"Unique origins: {list(unique_origins)}")
        
        # Group by column value and show origin datasets
        value_origins = df[valid_mask].groupby(col)['original_dataset'].unique()
        
        print(f"Total unique values: {len(value_origins)}")
        
        # Show first 10 values as examples
        for i, (value, origins) in enumerate(value_origins.items()):
            if i < 10:  # Show first 10 examples
                print(f"  {value}: {list(origins)}")
            elif i == 10:
                print(f"  ... and {len(value_origins) - 10} more values")
                break
        
        results[col] = value_origins
    
    return results

# All columns from your completeness results
all_columns = [
    'geometry', 'original_dataset', 'MATERIAL', 'MOVEMENT', 'LANDSLIDE_ID',
    'Shape_Area', 'Shape_Length', 'SLOPE', 'VOLUME_MERGED', 'MVMT_AZIMUTH_DEG',
    'DESCRIPTION', 'REF_ID_COD', 'FAIL_DEPTH', 'HS_HEIGHT', 'AREA',
    'CONFIDENCE', 'RELATIVE_AGE', 'MOVE_CODE', 'AVG_SCARP_DIST_FT',
    'FAN_HEIGHT', 'TYPE_MOVE', 'GEOL', 'MAP_UNIT_L', 'DEEP_SHAL',
    'NAME', 'Longitude', 'Latitude', 'Type', 'Material',
    'Discharge estimate (m3/s)', 'DATE_MOVE', 'Point_location',
    'Timing', 'Reference', 'Trigger', 'Event_verification',
    'Contributo', 'Watercours', 'Study_area', 'Cont_details',
    'YEAR', 'Type_details', 'DATE_RANGE', 'Size_class',
    'Resource_road_type', 'Resource_road_activity', 'Bedrock_type',
    'Comment', 'Volume_estimate_method', 'MONTH', 'DAY', 'REACTIVATION'
]

print("Analyzing origin datasets for all columns...")
origins_by_value = find_origins_by_value(landslides, all_columns)

In [None]:
# Function to find unique origins for each valid value in specified columns
def find_origins_by_value(df, columns):
    results = {}
    
    for col in columns:
        print(f"\n=== {col} ===")
        
        # Check if column exists
        if col not in df.columns:
            print(f"Column {col} not found in dataframe")
            continue
            
        # Filter for valid (non-null, non-empty) values
        valid_mask = df[col].notna() & (df[col] != '')
        
        if valid_mask.sum() == 0:
            print(f"No valid values found in {col}")
            continue
            
        # Show unique origins for this column
        unique_origins = df[valid_mask]['original_dataset'].unique()
        print(f"Unique origins: {list(unique_origins)}")
        
        # Group by column value and show origin datasets
        value_origins = df[valid_mask].groupby(col)['original_dataset'].unique()
        
        print(f"Total unique values: {len(value_origins)}")
        
        # # Show first 10 values as examples
        # for i, (value, origins) in enumerate(value_origins.items()):
        #     if i < 10:  # Show first 10 examples
        #         print(f"  {value}: {list(origins)}")
        #     elif i == 10:
        #         print(f"  ... and {len(value_origins) - 10} more values")
        #         break
        
        results[col] = value_origins
    
    return results

# All columns from your completeness results
all_columns = [
    'geometry', 'original_dataset', 'MATERIAL', 'MOVEMENT', 'LANDSLIDE_ID',
    'Shape_Area', 'Shape_Length', 'SLOPE', 'VOLUME_MERGED', 'MVMT_AZIMUTH_DEG',
    'DESCRIPTION', 'REF_ID_COD', 'FAIL_DEPTH', 'HS_HEIGHT', 'AREA',
    'CONFIDENCE', 'RELATIVE_AGE', 'MOVE_CODE', 'AVG_SCARP_DIST_FT',
    'FAN_HEIGHT', 'TYPE_MOVE', 'GEOL', 'MAP_UNIT_L', 'DEEP_SHAL',
    'NAME', 'Longitude', 'Latitude', 'Type', 'Material',
    'Discharge estimate (m3/s)', 'DATE_MOVE', 'Point_location',
    'Timing', 'Reference', 'Trigger', 'Event_verification',
    'Contributo', 'Watercours', 'Study_area', 'Cont_details',
    'YEAR', 'Type_details', 'DATE_RANGE', 'Size_class',
    'Resource_road_type', 'Resource_road_activity', 'Bedrock_type',
    'Comment', 'Volume_estimate_method', 'MONTH', 'DAY', 'REACTIVATION'
]

print("Analyzing origin datasets for all columns...")
origins_by_value = find_origins_by_value(landslides, all_columns)

Analyzing origin datasets for all columns...

=== geometry ===


Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.

  valid_mask = df[col].notna() & (df[col] != '')


Unique origins: ['OR', 'WA', 'BC']
Total unique values: 90348
  MULTIPOLYGON (EMPTY): ['WA']
  MULTIPOLYGON (((-124.53287953207415 42.874662140959664, -124.53295961832292 42.874623478054964, -124.5330180714885 42.874624941461555, -124.53305202973584 42.87462371361384, -124.53318245510326 42.874683378378805, -124.53324862183396 42.87472748203755, -124.53339041904049 42.874811771929124, -124.53344203148932 42.87485640267738, -124.53344147317704 42.87492080270478, -124.53336623839581 42.874959290524174, -124.53322841514391 42.875007194363334, -124.53309886135146 42.87503333847575, -124.53302368017852 42.875060497691585, -124.53292495598158 42.87513560087509, -124.53290242219597 42.875186489606335, -124.5328579382197 42.87520002008438, -124.53281912806993 42.8752014232834, -124.53276192143083 42.87517010893017, -124.53271796913185 42.87514308455673, -124.5326881207834 42.87508455219724, -124.53265800934186 42.87497357165508, -124.53261496177255 42.87486305880079, -124.53265621192394 42.874

In [31]:
# Function to find unique origins for each valid value in specified columns
def find_origins_by_value(df, columns):
    results = {}
    
    for col in columns:
        print(f"\n=== {col} ===")
        
        # Check if column exists
        if col not in df.columns:
            print(f"Column {col} not found in dataframe")
            continue
            
        # Filter for valid (non-null, non-empty) values
        valid_mask = df[col].notna() & (df[col] != '')
        
        if valid_mask.sum() == 0:
            print(f"No valid values found in {col}")
            continue
            
        # Show unique origins for this column
        unique_origins = df[valid_mask]['original_dataset'].unique()
        print(f"Unique origins: {list(unique_origins)}")
        
        # Group by column value and show origin datasets
        value_origins = df[valid_mask].groupby(col)['original_dataset'].unique()
        
        print(f"Total unique values: {len(value_origins)}")
        
        # # Show first 10 values as examples
        # for i, (value, origins) in enumerate(value_origins.items()):
        #     if i < 10:  # Show first 10 examples
        #         print(f"  {value}: {list(origins)}")
        #     elif i == 10:
        #         print(f"  ... and {len(value_origins) - 10} more values")
        #         break
        
        results[col] = value_origins
    
    return results

# All columns from your completeness results
all_columns = [
    'geometry', 'original_dataset', 'MATERIAL', 'MOVEMENT', 'LANDSLIDE_ID',
    'Shape_Area', 'Shape_Length', 'SLOPE', 'VOLUME_MERGED', 'MVMT_AZIMUTH_DEG',
    'DESCRIPTION', 'REF_ID_COD', 'FAIL_DEPTH', 'HS_HEIGHT', 'AREA',
    'CONFIDENCE', 'RELATIVE_AGE', 'MOVE_CODE', 'AVG_SCARP_DIST_FT',
    'FAN_HEIGHT', 'TYPE_MOVE', 'GEOL', 'MAP_UNIT_L', 'DEEP_SHAL',
    'NAME', 'Longitude', 'Latitude', 'Type', 'Material',
    'Discharge estimate (m3/s)', 'DATE_MOVE', 'Point_location',
    'Timing', 'Reference', 'Trigger', 'Event_verification',
    'Contributo', 'Watercours', 'Study_area', 'Cont_details',
    'YEAR', 'Type_details', 'DATE_RANGE', 'Size_class',
    'Resource_road_type', 'Resource_road_activity', 'Bedrock_type',
    'Comment', 'Volume_estimate_method', 'MONTH', 'DAY', 'REACTIVATION'
]

print("Analyzing origin datasets for all columns...")
origins_by_value = find_origins_by_value(landslides, all_columns)


Analyzing origin datasets for all columns...

=== geometry ===
Unique origins: ['OR', 'WA', 'BC']


Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.

  valid_mask = df[col].notna() & (df[col] != '')


Total unique values: 90348

=== original_dataset ===
Unique origins: ['OR', 'WA', 'BC']
Total unique values: 3

=== MATERIAL ===
Unique origins: ['OR', 'WA', 'BC']
Total unique values: 12

=== MOVEMENT ===
Unique origins: ['OR', 'WA', 'BC']
Total unique values: 25

=== LANDSLIDE_ID ===
Unique origins: ['OR', 'WA', 'BC']
Total unique values: 89733

=== Shape_Area ===
Unique origins: ['OR', 'WA']
Total unique values: 86279

=== Shape_Length ===
Unique origins: ['OR', 'WA']
Total unique values: 86279

=== SLOPE ===
Unique origins: ['OR', 'WA']
Total unique values: 6658

=== VOLUME_MERGED ===
Unique origins: ['OR', 'WA']
Total unique values: 55298

=== MVMT_AZIMUTH_DEG ===
Unique origins: ['OR', 'WA']
Total unique values: 45

=== DESCRIPTION ===
Unique origins: ['OR']
Total unique values: 3

=== REF_ID_COD ===
Unique origins: ['OR']
Total unique values: 346

=== FAIL_DEPTH ===
Unique origins: ['OR', 'WA']
Total unique values: 13694

=== HS_HEIGHT ===
Unique origins: ['OR', 'WA']
Total uniq