In [56]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [57]:
# Step 1: Load and Explore the Data
# Banner marking the start of the exploration section (rule / title / rule).
section_rule = "=" * 60
print(section_rule, "DISASTER EVENTS DATASET - INITIAL EXPLORATION", section_rule, sep="\n")

DISASTER EVENTS DATASET - INITIAL EXPLORATION


In [58]:
# Location of the raw CSV. The default is the original author's local path;
# set the DISASTER_EVENTS_CSV environment variable to point at the file on
# another machine instead of editing this cell.
RAW_DATA_PATH = os.environ.get(
    "DISASTER_EVENTS_CSV",
    "C:\\Users\\lmode\\OneDrive\\Documentos\\DATA_ANALYTICS_PROJECTS\\4_Disaster_Events_2025\\synthetic_disaster_events_2025.csv",
)
df = pd.read_csv(RAW_DATA_PATH)

In [59]:
# Basic information: row/column counts followed by the dtype of every column.
n_records, n_columns = df.shape
print("\n1. DATASET OVERVIEW")
print("-" * 60)
print(f"Total Records: {n_records:,}")
print(f"Total Columns: {n_columns}")
print("\nColumn Names and Types:")
print(df.dtypes)


1. DATASET OVERVIEW
------------------------------------------------------------
Total Records: 20,000
Total Columns: 13

Column Names and Types:
event_id                         int64
disaster_type                   object
location                        object
latitude                       float64
longitude                      float64
date                            object
severity_level                   int64
affected_population              int64
estimated_economic_loss_usd    float64
response_time_hours            float64
aid_provided                    object
infrastructure_damage_index    float64
is_major_disaster                int64
dtype: object


In [60]:
# First few rows, printed under a section heading.
sample_rows = df.head()
print("\n2. SAMPLE DATA (First 5 rows)", "-" * 60, sample_rows, sep="\n")


2. SAMPLE DATA (First 5 rows)
------------------------------------------------------------
   event_id      disaster_type location   latitude  longitude        date  \
0         1           Wildfire    Chile -34.681672 -71.819529  2025-08-27   
1         2          Hurricane    India  22.128569  78.023951  2023-05-29   
2         3  Volcanic Eruption    Italy  42.316058  11.031447  2023-01-15   
3         4            Drought    Chile -33.436253 -69.984615  2024-02-08   
4         5  Volcanic Eruption   Turkey  39.400977  37.006822  2023-12-23   

   severity_level  affected_population  estimated_economic_loss_usd  \
0               8                31104                   2768213.39   
1               5                29340                   5996226.87   
2               7                34804                   9222541.48   
3               8                31191                   1827703.09   
4               8                46284                  13435921.49   

   response_time_h

In [61]:
# Check for missing values: per-column null count and its share of all rows.
print("\n3. MISSING VALUES CHECK")
print("-" * 60)
missing = df.isna().sum()
missing_pct = missing / len(df) * 100
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})
# Show only the columns that actually contain nulls (empty frame when there are none).
has_missing = missing_df['Missing Count'] > 0
print(missing_df[has_missing])


3. MISSING VALUES CHECK
------------------------------------------------------------
Empty DataFrame
Columns: [Missing Count, Percentage]
Index: []


In [62]:
# Statistical summary of the numeric columns.
numeric_summary = df.describe()
print("\n4. STATISTICAL SUMMARY", "-" * 60, numeric_summary, sep="\n")


4. STATISTICAL SUMMARY
------------------------------------------------------------
           event_id      latitude     longitude  severity_level  \
count  20000.000000  20000.000000  20000.000000    20000.000000   
mean   10000.500000     18.749811     41.976629        5.489300   
std     5773.647028     25.026759     82.535922        2.866269   
min        1.000000    -43.127383   -102.133704        1.000000   
25%     5000.750000      3.560059      8.346726        3.000000   
50%    10000.500000     24.361006     74.600322        5.000000   
75%    15000.250000     38.285895    117.460836        8.000000   
max    20000.000000     48.452161    145.902669       10.000000   

       affected_population  estimated_economic_loss_usd  response_time_hours  \
count         20000.000000                 2.000000e+04         20000.000000   
mean          27641.248950                 4.831073e+06            36.369664   
std           16017.199074                 3.624308e+06            20.4

In [63]:
# Unique values in categorical columns: cardinality for each, plus the
# frequency table for disaster type and aid provided.
print("\n5. CATEGORICAL COLUMNS ANALYSIS")
print("-" * 60)
disaster_types = df['disaster_type']
aid_flags = df['aid_provided']
print(f"Disaster Types: {disaster_types.nunique()}")
print(disaster_types.value_counts())
print(f"\nLocations: {df['location'].nunique()}")
print(f"\nAid Types: {aid_flags.nunique()}")
print(aid_flags.value_counts())


5. CATEGORICAL COLUMNS ANALYSIS
------------------------------------------------------------
Disaster Types: 7
disaster_type
Earthquake           2910
Landslide            2891
Wildfire             2870
Hurricane            2866
Drought              2863
Volcanic Eruption    2827
Flood                2773
Name: count, dtype: int64

Locations: 8

Aid Types: 2
aid_provided
Yes    14030
No      5970
Name: count, dtype: int64


In [64]:
# Date range: parse the date column, then report the earliest and latest event.
print("\n6. DATE RANGE")
print("-" * 60)
df['date'] = pd.to_datetime(df['date'])
first_event, last_event = df['date'].min(), df['date'].max()
print(f"From: {first_event}")
print(f"To: {last_event}")


6. DATE RANGE
------------------------------------------------------------
From: 2022-12-08 00:00:00
To: 2025-12-07 00:00:00


In [65]:
# Severity levels: number of events per level, ordered by level rather than by frequency.
severity_counts = df['severity_level'].value_counts()
print("\n7. SEVERITY DISTRIBUTION")
print("-" * 60)
print(severity_counts.sort_index())


7. SEVERITY DISTRIBUTION
------------------------------------------------------------
severity_level
1     2026
2     1979
3     1991
4     2031
5     1993
6     1986
7     1995
8     2053
9     2025
10    1921
Name: count, dtype: int64


In [66]:
# Major disasters: count and percentage of flagged vs. unflagged events.
print("\n8. MAJOR DISASTERS")
print("-" * 60)
total_events = len(df)
major_count = df['is_major_disaster'].sum()  # summing the flag column counts the rows flagged 1
regular_count = total_events - major_count
print(f"Major Disasters: {major_count:,} ({(major_count / total_events * 100):.2f}%)")
print(f"Regular Events: {regular_count:,} ({(regular_count / total_events * 100):.2f}%)")


8. MAJOR DISASTERS
------------------------------------------------------------
Major Disasters: 8,001 (40.01%)
Regular Events: 11,999 (59.99%)


In [67]:
# Closing banner for the exploration section.
closing_rule = "=" * 60
print("\n" + closing_rule, "DATA EXPLORATION COMPLETE", closing_rule, sep="\n")


DATA EXPLORATION COMPLETE


In [68]:
from datetime import datetime

In [69]:
# Banner marking the start of the cleaning section.
cleaning_rule = "=" * 60
print(cleaning_rule, "DATA CLEANING & PREPROCESSING", cleaning_rule, sep="\n")

DATA CLEANING & PREPROCESSING


In [70]:
# 1. Convert date to datetime
print("\n1. Converting date column to datetime...")
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates


1. Converting date column to datetime...


In [71]:
# 2. Create temporal features for analysis
print("2. Creating temporal features...")
date_parts = df['date'].dt  # datetime accessor shared by every derived column below
df['year'] = date_parts.year
df['month'] = date_parts.month
df['month_name'] = date_parts.month_name()
df['quarter'] = date_parts.quarter
df['day_of_week'] = date_parts.day_name()
iso_calendar = date_parts.isocalendar()
df['week_of_year'] = iso_calendar.week

2. Creating temporal features...


In [72]:
# 3. Handle any missing values
print("3. Handling missing values...")
initial_rows = len(df)
# Drop every row that has a null in any column.
df = df.dropna(axis=0, how='any')
rows_removed = initial_rows - len(df)
print(f"   Rows removed: {rows_removed}")

3. Handling missing values...
   Rows removed: 0


In [73]:
# 4. Remove duplicates if any
print("4. Checking for duplicates...")
duplicates = df.duplicated().sum()  # number of rows that repeat an earlier row exactly
if duplicates == 0:
    print("   No duplicates found")
else:
    df = df.drop_duplicates()
    print(f"   Duplicates removed: {duplicates}")

4. Checking for duplicates...
   No duplicates found


In [74]:
# 5. Validate data ranges
print("5. Validating data ranges...")
# Keep only rows where every check passes:
#   - severity level within 1..10 (inclusive)
#   - response time, affected population and economic loss non-negative
#   - infrastructure damage index within 0..1 (inclusive)
rows_in_range = (
    df['severity_level'].between(1, 10)
    & (df['response_time_hours'] >= 0)
    & (df['affected_population'] >= 0)
    & (df['estimated_economic_loss_usd'] >= 0)
    & df['infrastructure_damage_index'].between(0, 1)
)
df = df[rows_in_range]

5. Validating data ranges...


In [75]:
# 6. Create categorical severity labels
print("6. Creating severity categories...")
def categorize_severity(severity):
    """Map a numeric severity level to a coarse label.

    Parameters
    ----------
    severity : number
        Severity score to classify.

    Returns
    -------
    str or float
        'Low' for values <= 3, 'Medium' for <= 6, 'High' for <= 8 and
        'Critical' for anything above 8. NaN when the input is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # value would fall through to the final branch and be labelled 'Critical'.
    if pd.isna(severity):
        return np.nan
    if severity <= 3:
        return 'Low'
    elif severity <= 6:
        return 'Medium'
    elif severity <= 8:
        return 'High'
    else:
        return 'Critical'

# Label every event with its severity band.
df['severity_category'] = df['severity_level'].map(categorize_severity)

6. Creating severity categories...


In [76]:
# 7. Create economic loss categories
print("7. Creating economic impact categories...")
def categorize_economic_loss(loss):
    """Map an economic loss amount to an impact band.

    Parameters
    ----------
    loss : number
        Loss amount; compared against thresholds of 1e6, 1e7 and 1e8.

    Returns
    -------
    str or float
        'Minor (<$1M)', 'Moderate ($1M-$10M)', 'Severe ($10M-$100M)' or
        'Catastrophic (>$100M)'. Each band includes its lower threshold, so
        exactly 100,000,000 is 'Catastrophic'. NaN when the input is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # value would fall through and be labelled 'Catastrophic (>$100M)'.
    if pd.isna(loss):
        return np.nan
    if loss < 1_000_000:
        return 'Minor (<$1M)'
    elif loss < 10_000_000:
        return 'Moderate ($1M-$10M)'
    elif loss < 100_000_000:
        return 'Severe ($10M-$100M)'
    else:
        return 'Catastrophic (>$100M)'

# Label every event with its economic impact band.
df['economic_impact_category'] = df['estimated_economic_loss_usd'].map(categorize_economic_loss)

7. Creating economic impact categories...


In [77]:
# 8. Create response time categories
print("8. Creating response time categories...")
def categorize_response(hours):
    """Map a response time in hours to a speed band.

    Parameters
    ----------
    hours : number
        Response time; compared against thresholds of 6, 24 and 72.

    Returns
    -------
    str or float
        'Immediate (<6h)', 'Fast (6-24h)', 'Moderate (24-72h)' or
        'Slow (>72h)'. Each band includes its lower threshold, so exactly 72
        is 'Slow'. NaN when the input is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # value would fall through and be labelled 'Slow (>72h)'.
    if pd.isna(hours):
        return np.nan
    if hours < 6:
        return 'Immediate (<6h)'
    elif hours < 24:
        return 'Fast (6-24h)'
    elif hours < 72:
        return 'Moderate (24-72h)'
    else:
        return 'Slow (>72h)'

# Label every event with its response-speed band.
df['response_category'] = df['response_time_hours'].map(categorize_response)

8. Creating response time categories...


In [78]:
# 9. Create affected population categories
print("9. Creating population impact categories...")
def categorize_population(pop):
    """Map an affected-population count to a size band.

    Parameters
    ----------
    pop : number
        Population count; compared against thresholds of 1,000, 10,000 and
        100,000.

    Returns
    -------
    str or float
        'Small (<1K)', 'Medium (1K-10K)', 'Large (10K-100K)' or
        'Very Large (>100K)'. Each band includes its lower threshold, so
        exactly 100,000 is 'Very Large'. NaN when the input is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # value would fall through and be labelled 'Very Large (>100K)'.
    if pd.isna(pop):
        return np.nan
    if pop < 1000:
        return 'Small (<1K)'
    elif pop < 10000:
        return 'Medium (1K-10K)'
    elif pop < 100000:
        return 'Large (10K-100K)'
    else:
        return 'Very Large (>100K)'

# Label every event with its population-impact band.
df['population_impact_category'] = df['affected_population'].map(categorize_population)

9. Creating population impact categories...


In [79]:
# 10. Save cleaned data
print("10. Saving cleaned dataset...")
cleaned_csv_path = 'disaster_events_cleaned.csv'
# index=False keeps the row index out of the written file.
df.to_csv(cleaned_csv_path, index=False)

10. Saving cleaned dataset...


In [80]:
# Recap of the cleaned dataset: size, date span, cardinalities and the
# columns added during cleaning.
summary_rule = "=" * 60
first_day = df['date'].min().date()
last_day = df['date'].max().date()

print("\n" + summary_rule)
print("CLEANING SUMMARY")
print(summary_rule)
print(f"Final dataset shape: {df.shape}")
print(f"Total events: {len(df):,}")
print(f"Date range: {first_day} to {last_day}")
print(f"Disaster types: {df['disaster_type'].nunique()}")
print(f"Locations: {df['location'].nunique()}")
print("\nNew columns created:")
for created_columns_line in (
    "- Temporal: year, month, month_name, quarter, day_of_week, week_of_year",
    "- Categories: severity_category, economic_impact_category",
    "- Categories: response_category, population_impact_category",
):
    print(created_columns_line)

print("\n✓ Cleaned data saved to 'disaster_events_cleaned.csv'")
print(summary_rule)


CLEANING SUMMARY
Final dataset shape: (20000, 23)
Total events: 20,000
Date range: 2022-12-08 to 2025-12-07
Disaster types: 7
Locations: 8

New columns created:
- Temporal: year, month, month_name, quarter, day_of_week, week_of_year
- Categories: severity_category, economic_impact_category
- Categories: response_category, population_impact_category

✓ Cleaned data saved to 'disaster_events_cleaned.csv'


In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [82]:
# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Load cleaned data.
# Read the file written by the cleaning step ('disaster_events_cleaned.csv')
# rather than a copy in a user-specific Downloads folder, so the analysis runs
# on exactly what this notebook produced. That file carries the derived columns
# used below (month_name, severity_category, ...).
df = pd.read_csv('disaster_events_cleaned.csv')
# CSV stores dates as text; restore the datetime dtype for the .dt accessors.
df['date'] = pd.to_datetime(df['date'])

print("=" * 60)
print("DISASTER EVENTS ANALYSIS & VISUALIZATIONS")
print("=" * 60)

DISASTER EVENTS ANALYSIS & VISUALIZATIONS


In [83]:
# ============================================
# 1. TEMPORAL ANALYSIS
# ============================================
# Section heading followed by a dashed rule.
print("\n1. TEMPORAL PATTERNS", "-" * 60, sep="\n")


1. TEMPORAL PATTERNS
------------------------------------------------------------


In [84]:
# Events over time: number of events in each calendar month of the data.
year_month = df['date'].dt.to_period('M')
events_by_month = df.groupby(year_month).size()
print(f"Average events per month: {events_by_month.mean():.1f}")

# Seasonal analysis: events per month name (all years pooled), in calendar order.
calendar_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
seasonal = df.groupby('month_name').size().reindex(calendar_order)
peak_month, peak_events = seasonal.idxmax(), seasonal.max()
print(f"Peak month: {peak_month} ({peak_events} events)")

Average events per month: 540.5
Peak month: July (1787 events)


In [85]:
# ============================================
# 2. DISASTER TYPE ANALYSIS
# ============================================
print("\n2. DISASTER TYPE ANALYSIS")
print("-" * 60)

# One row per disaster type. Named aggregation sets the display labels
# directly; the dict is unpacked because the labels are not valid identifiers.
summary_spec = {
    'Count': ('event_id', 'count'),
    'Total Affected': ('affected_population', 'sum'),
    'Total Loss ($)': ('estimated_economic_loss_usd', 'sum'),
    'Avg Severity': ('severity_level', 'mean'),
    'Avg Response (hrs)': ('response_time_hours', 'mean'),
}
disaster_summary = (
    df.groupby('disaster_type')
    .agg(**summary_spec)
    .round(2)
    .sort_values('Count', ascending=False)
)
print(disaster_summary)



2. DISASTER TYPE ANALYSIS
------------------------------------------------------------
                   Count  Total Affected  Total Loss ($)  Avg Severity  \
disaster_type                                                            
Earthquake          2910        81009081    1.433931e+10          5.51   
Landslide           2891        80219411    1.398122e+10          5.50   
Wildfire            2870        78944575    1.360251e+10          5.50   
Hurricane           2866        79723895    1.391690e+10          5.50   
Drought             2863        78450094    1.383185e+10          5.45   
Volcanic Eruption   2827        78777543    1.382094e+10          5.55   
Flood               2773        75700380    1.312871e+10          5.42   

                   Avg Response (hrs)  
disaster_type                          
Earthquake                      36.05  
Landslide                       36.19  
Wildfire                        36.30  
Hurricane                       36.15  
Droug

In [86]:
# ============================================
# 3. SEVERITY ANALYSIS
# ============================================
print("\n3. SEVERITY ANALYSIS")
print("-" * 60)

severity_dist = df['severity_category'].value_counts()
print(severity_dist)

# Major disasters analysis: totals over the rows flagged is_major_disaster == 1.
is_major = df['is_major_disaster'] == 1
major_disasters = df[is_major]
major_affected = major_disasters['affected_population'].sum()
major_loss = major_disasters['estimated_economic_loss_usd'].sum()
print(f"\nMajor Disasters: {len(major_disasters):,}")
print(f"Total affected by major disasters: {major_affected:,}")
print(f"Total economic loss from major disasters: ${major_loss:,.0f}")


3. SEVERITY ANALYSIS
------------------------------------------------------------
severity_category
Medium      6010
Low         5996
High        4048
Critical    3946
Name: count, dtype: int64

Major Disasters: 8,001
Total affected by major disasters: 339,166,370
Total economic loss from major disasters: $59,330,143,427


In [87]:
# ============================================
# 4. GEOGRAPHIC ANALYSIS
# ============================================
print("\n4. GEOGRAPHIC ANALYSIS")
print("-" * 60)

# value_counts sorts by frequency, so the first ten entries are the top ten.
events_per_location = df['location'].value_counts()
top_locations = events_per_location.head(10)
print("Top 10 Most Affected Locations:")
print(top_locations)


4. GEOGRAPHIC ANALYSIS
------------------------------------------------------------
Top 10 Most Affected Locations:
location
India          2583
Indonesia      2530
Turkey         2515
Chile          2515
Italy          2485
Japan          2477
Philippines    2453
USA            2442
Name: count, dtype: int64


In [88]:
# ============================================
# 5. RESPONSE TIME ANALYSIS
# ============================================
print("\n5. RESPONSE TIME ANALYSIS")
print("-" * 60)

# Mean, median and extremes of response time for each disaster type.
response_by_type = df.groupby('disaster_type')['response_time_hours']
response_stats = response_by_type.agg(['mean', 'median', 'min', 'max']).round(2)
print(response_stats)


5. RESPONSE TIME ANALYSIS
------------------------------------------------------------
                    mean  median   min    max
disaster_type                                
Drought            36.93   37.58  1.00  71.96
Earthquake         36.05   36.23  1.06  71.99
Flood              36.18   36.50  1.02  71.98
Hurricane          36.15   36.88  1.08  71.97
Landslide          36.19   35.98  1.00  71.98
Volcanic Eruption  36.78   36.86  1.02  71.94
Wildfire           36.30   36.22  1.00  71.93


In [89]:

# ============================================
# 6. ECONOMIC IMPACT ANALYSIS
# ============================================
print("\n6. ECONOMIC IMPACT ANALYSIS")
print("-" * 60)

losses = df['estimated_economic_loss_usd']
total_loss = losses.sum()
avg_loss = losses.mean()
print(f"Total Economic Loss: ${total_loss:,.0f}")
print(f"Average Loss per Event: ${avg_loss:,.0f}")

# Total loss per disaster type, largest first.
economic_by_type = (
    df.groupby('disaster_type')['estimated_economic_loss_usd']
    .sum()
    .sort_values(ascending=False)
)
costliest_type, costliest_loss = economic_by_type.idxmax(), economic_by_type.max()
print(f"\nMost Economically Damaging Disaster Type: {costliest_type} (${costliest_loss:,.0f})")


6. ECONOMIC IMPACT ANALYSIS
------------------------------------------------------------
Total Economic Loss: $96,621,452,916
Average Loss per Event: $4,831,073

Most Economically Damaging Disaster Type: Earthquake ($14,339,311,149)


In [90]:
# ============================================
# 7. CORRELATION ANALYSIS
# ============================================
print("\n7. CORRELATION ANALYSIS")
print("-" * 60)

# Pairwise correlation matrix over the numeric impact metrics.
metric_columns = [
    'severity_level',
    'affected_population',
    'estimated_economic_loss_usd',
    'response_time_hours',
    'infrastructure_damage_index',
]
correlations = df[metric_columns].corr()
print(correlations)


7. CORRELATION ANALYSIS
------------------------------------------------------------
                             severity_level  affected_population  \
severity_level                     1.000000             0.875442   
affected_population                0.875442             1.000000   
estimated_economic_loss_usd        0.678421             0.772898   
response_time_hours               -0.005492            -0.005973   
infrastructure_damage_index        0.672367             0.587946   

                             estimated_economic_loss_usd  response_time_hours  \
severity_level                                  0.678421            -0.005492   
affected_population                             0.772898            -0.005973   
estimated_economic_loss_usd                     1.000000            -0.002595   
response_time_hours                            -0.002595             1.000000   
infrastructure_damage_index                     0.460182             0.001305   

                  

In [91]:
# ============================================
# 8. KEY INSIGHTS
# ============================================
# Banner for the closing insights section.
insights_rule = "=" * 60
print("\n" + insights_rule, "KEY INSIGHTS", insights_rule, sep="\n")


KEY INSIGHTS


In [92]:
# Headline figures for the whole dataset. `major_disasters` is the filtered
# frame built in the severity analysis cell.
n_events = len(df)
n_major = len(major_disasters)
n_highest_severity = int((df['severity_level'] >= 9).sum())  # rows with severity 9 or above
most_common_type = df['disaster_type'].mode().iloc[0]

insight_lines = [
    f"1. Dataset contains {n_events:,} disaster events",
    f"2. {n_major:,} ({n_major / n_events * 100:.1f}%) are classified as major disasters",
    f"3. Total population affected: {df['affected_population'].sum():,}",
    f"4. Total economic loss: ${df['estimated_economic_loss_usd'].sum():,.0f}",
    f"5. Average response time: {df['response_time_hours'].mean():.1f} hours",
    f"6. Most common disaster type: {most_common_type}",
    f"7. Highest severity events: {n_highest_severity:,}",
    f"8. Average infrastructure damage: {df['infrastructure_damage_index'].mean():.2%}",
]
for insight in insight_lines:
    print(insight)

print("\n✓ Analysis complete! Ready for dashboard creation.")
print("=" * 60)

1. Dataset contains 20,000 disaster events
2. 8,001 (40.0%) are classified as major disasters
3. Total population affected: 552,824,979
4. Total economic loss: $96,621,452,916
5. Average response time: 36.4 hours
6. Most common disaster type: Earthquake
7. Highest severity events: 3,946
8. Average infrastructure damage: 55.73%

✓ Analysis complete! Ready for dashboard creation.


In [93]:
# Write a trimmed copy of the data containing only the columns listed here.
essential_cols = [
    'date',
    'disaster_type',
    'location',
    'latitude',
    'longitude',
    'severity_level',
    'affected_population',
    'estimated_economic_loss_usd',
    'response_time_hours',
    'aid_provided',
    'infrastructure_damage_index',
    'is_major_disaster',
    'severity_category',
    'economic_impact_category',
]
dashboard_frame = df[essential_cols]
dashboard_frame.to_csv('disaster_events.csv', index=False)
    