In [1]:
import pandas as pd

# Location of the wildfire-damage CSV inside the Kaggle input directory
DATA = '/kaggle/input/california-wildfire-damage-2014-feb2025/California Wildfire Damage.csv'

# Read the incidents with 'Date' parsed as datetimes, then derive the calendar
# year and month of each incident as two extra columns in one chained step.
df = pd.read_csv(filepath_or_buffer=DATA, parse_dates=['Date']).assign(
    year=lambda frame: frame['Date'].dt.year,
    month=lambda frame: frame['Date'].dt.month,
)
df.head()

Unnamed: 0,Incident_ID,Date,Location,Area_Burned (Acres),Homes_Destroyed,Businesses_Destroyed,Vehicles_Damaged,Injuries,Fatalities,Estimated_Financial_Loss (Million $),Cause,year,month
0,INC1000,2020-11-22,Sonoma County,14048,763,474,235,70,19,2270.57,Lightning,2020,11
1,INC1001,2021-09-23,Sonoma County,33667,1633,4,263,100,2,1381.14,Lightning,2021,9
2,INC1002,2022-02-10,Shasta County,26394,915,291,31,50,6,2421.96,Human Activity,2022,2
3,INC1003,2021-05-17,Sonoma County,20004,1220,128,34,28,0,3964.16,Unknown,2021,5
4,INC1004,2021-09-22,Sonoma County,40320,794,469,147,0,15,1800.09,Unknown,2021,9


First let's just make some bar charts and histograms.

In [2]:
from plotly import express
from plotly.offline import init_notebook_mode

# Initialise plotly's offline notebook mode before drawing any figures
init_notebook_mode(connected=True)

# Number of incidents per calendar year
year_hist = express.histogram(data_frame=df, x='year')
year_hist.show(renderer='iframe_connected')

How many incidents are there in a typical year?

In [3]:
# Count the incidents recorded in each year, then average those yearly counts
incidents_per_year = df['year'].value_counts()
incidents_per_year.mean()

10.0

Is there a fire season?

In [4]:
# Incidents by month of the year, pooled over all years
month_hist = express.histogram(data_frame=df, x='month', nbins=12)
month_hist.show(renderer='iframe_connected')

There is only a weak fire season: fires occur in every month of the year, but they are unevenly distributed across the months.

In [5]:
# Distribution of acres burned per incident, coloured by year
area_hist = express.histogram(
    data_frame=df,
    x='Area_Burned (Acres)',
    color='year',
    nbins=40,
)
area_hist.show(renderer='iframe_connected')

The damage, in terms of acres burned, also seems somewhat randomly distributed, with no real pattern.

In [6]:
# Mean acres burned per incident, one value per year.
mean_area_by_year = df[['year', 'Area_Burned (Acres)']].groupby(by='year').mean().reset_index()

# The values are already aggregated, so draw them as plain bars. A histogram would
# re-bin the years (nbins is only an upper bound) and sum the means that fall into
# the same bin, and it labels the y axis "sum of ..." even though these are averages.
express.bar(data_frame=mean_area_by_year, x='year', y='Area_Burned (Acres)').show(renderer='iframe_connected',)

The average damage per fire per year doesn't show a trend either.

In [7]:
# Distribution of homes destroyed per incident, coloured by year
homes_hist = express.histogram(
    data_frame=df,
    x='Homes_Destroyed',
    color='year',
    nbins=40,
)
homes_hist.show(renderer='iframe_connected')

Are they concentrated geographically?

In [8]:
# Incident count per location, transposed so it displays as one wide row
location_counts = df['Location'].value_counts()
location_counts.to_frame().transpose()

Location,Shasta County,Sonoma County,San Diego County,Butte County,Mendocino County,Napa Valley,Orange County,Riverside County,Los Angeles County,Santa Barbara County
count,14,12,12,11,11,10,9,8,7,6


Yes. California has 58 counties, but only ten locations are represented in this dataset (nine counties plus Napa Valley), and even among those, fires are unevenly distributed.

In [9]:
# Incident count per recorded cause, transposed so it displays as one wide row
cause_counts = df['Cause'].value_counts()
cause_counts.to_frame().transpose()

Cause,Human Activity,Lightning,Unknown
count,38,31,31


And the fires are fairly evenly distributed across our three recorded causes.

In [10]:
# Show every column name so the severity measures can be picked out below
all_columns = df.columns
all_columns

Index(['Incident_ID', 'Date', 'Location', 'Area_Burned (Acres)',
       'Homes_Destroyed', 'Businesses_Destroyed', 'Vehicles_Damaged',
       'Injuries', 'Fatalities', 'Estimated_Financial_Loss (Million $)',
       'Cause', 'year', 'month'],
      dtype='object')

In [11]:
from sklearn.manifold import TSNE

# Numeric severity measures used for the embedding (and for the correlation table later)
COLUMNS = ['Area_Burned (Acres)',
       'Homes_Destroyed', 'Businesses_Destroyed', 'Vehicles_Damaged',
       'Injuries', 'Fatalities', 'Estimated_Financial_Loss (Million $)']
# Single source of truth for the seed, so the embedding is reproducible
RANDOM_STATE = 2025

# t-SNE is driven by pairwise distances between rows. The raw columns live on very
# different scales (acres in the tens of thousands, fatalities under 20), so z-score
# each column first; otherwise the largest-unit columns dominate the embedding.
severity = df[COLUMNS]
# A constant column has zero spread; divide by 1 instead so it becomes 0 rather than NaN
spread = severity.std(ddof=0).replace(0, 1)
severity_scaled = (severity - severity.mean()) / spread

reducer = TSNE(random_state=RANDOM_STATE)
embedding = reducer.fit_transform(X=severity_scaled)

# Reuse df's index so the column-wise concat below lines up row for row
plot_df = pd.DataFrame(data=embedding, columns=['x', 'y'], index=df.index)
plot_df = pd.concat(axis='columns', objs=[plot_df, df[['Incident_ID', 'Date', 'Location', 'Cause']]])

In [12]:
# t-SNE embedding, one point per incident, coloured by location
location_scatter = express.scatter(
    data_frame=plot_df,
    x='x',
    y='y',
    color='Location',
    hover_name='Incident_ID',
    hover_data=['Date', 'Location', 'Cause'],
)
location_scatter.show(renderer='iframe_connected')

What does this tell us? It tells us that our different ways of estimating severity do not seem to have an underlying variable that causes them to cluster together; also, if we can estimate severity, it doesn't seem to be correlated with location.

In [13]:
# Same t-SNE embedding, this time coloured by recorded cause
cause_scatter = express.scatter(
    data_frame=plot_df,
    x='x',
    y='y',
    color='Cause',
    hover_name='Incident_ID',
    hover_data=['Date', 'Location', 'Cause'],
)
cause_scatter.show(renderer='iframe_connected')

And we can only see a little clustering by cause.

In [14]:
# Pairwise Pearson correlation between the severity measures
severity_measures = df.loc[:, COLUMNS]
severity_measures.corr(method='pearson')

Unnamed: 0,Area_Burned (Acres),Homes_Destroyed,Businesses_Destroyed,Vehicles_Damaged,Injuries,Fatalities,Estimated_Financial_Loss (Million $)
Area_Burned (Acres),1.0,0.051915,0.028195,-0.136432,0.094843,0.050394,0.075187
Homes_Destroyed,0.051915,1.0,0.113493,-0.073115,0.01527,-0.045863,0.046645
Businesses_Destroyed,0.028195,0.113493,1.0,-0.075566,-0.103607,0.073564,-0.07799
Vehicles_Damaged,-0.136432,-0.073115,-0.075566,1.0,0.119331,-0.177314,-0.02445
Injuries,0.094843,0.01527,-0.103607,0.119331,1.0,-0.037908,0.079737
Fatalities,0.050394,-0.045863,0.073564,-0.177314,-0.037908,1.0,0.184919
Estimated_Financial_Loss (Million $),0.075187,0.046645,-0.07799,-0.02445,0.079737,0.184919,1.0


As we might have expected, all our measures of severity are seemingly uncorrelated when we measure using the Pearson correlation.

In [15]:
# Incident count for every (location, cause) pair, ordered by location then cause
pair_counts = df.value_counts(subset=['Location', 'Cause'])
pair_counts.to_frame().sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Location,Cause,Unnamed: 2_level_1
Butte County,Human Activity,4
Butte County,Lightning,3
Butte County,Unknown,4
Los Angeles County,Human Activity,3
Los Angeles County,Lightning,3
Los Angeles County,Unknown,1
Mendocino County,Human Activity,4
Mendocino County,Lightning,4
Mendocino County,Unknown,3
Napa Valley,Human Activity,6
