In [32]:
import pandas as pd
import xarray as xr
import numpy as np
import scipy.interpolate
from netCDF4 import Dataset
import bottleneck
import pyarrow
import fastparquet
import scipy.ndimage as ndimage
import geopandas

import matplotlib.pyplot as plt

# Reading in the Data

In [35]:
# Import Merged Data from Notebook in Cloud
data = pd.read_parquet('../Data/ds_df.parquet', engine = 'pyarrow')

In [5]:
data.shape

(5488140, 10)

In [6]:
data.head()

Unnamed: 0,lat,lon,firemask,MaxFRP,BurnDate,Uncertainty,QA,FirstDay,LastDay,FWI
0,42.059122,-125.292524,3.0,,,,0.0,,,
1,42.059122,-125.288031,3.0,,,,0.0,,,
2,42.059122,-125.283538,3.0,,,,0.0,,,
3,42.059122,-125.279044,3.0,,,,0.0,,,
4,42.059122,-125.274551,3.0,,,,0.0,,,


In [36]:
data

Unnamed: 0,lat,lon,firemask,MaxFRP,BurnDate,Uncertainty,QA,FirstDay,LastDay,FWI
0,42.059122,-125.292524,3.0,,,,0.0,,,
1,42.059122,-125.288031,3.0,,,,0.0,,,
2,42.059122,-125.283538,3.0,,,,0.0,,,
3,42.059122,-125.279044,3.0,,,,0.0,,,
4,42.059122,-125.274551,3.0,,,,0.0,,,
...,...,...,...,...,...,...,...,...,...,...
5488135,32.411215,-113.834520,5.0,,,,3.0,174.5,175.5,59.633713
5488136,32.411215,-113.830027,5.0,,,,3.0,174.5,175.5,59.633713
5488137,32.411215,-113.825533,5.0,,,,3.0,174.5,175.5,59.633713
5488138,32.411215,-113.821040,5.0,,,,3.0,174.5,175.5,59.633713


In [7]:
data.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'BurnDate', 'Uncertainty', 'QA',
       'FirstDay', 'LastDay', 'FWI'],
      dtype='object')

In [96]:
# Drop columns that were not deemed important features
less_than_onepercent = [
 'gaugeQualityInfo',
 'observationTimeFlag',
 'satelliteInfoFlag',
 'ECanop_tavg',
 'Qg_tavg',
 'Qh_tavg',
 'Qs_acc',
 'Qsm_acc',
 'SWE_inst',
 'SWdown_f_tavg',
 'SnowDepth_inst',
 'Snowf_tavg',
 'SoilMoi40_100cm_inst',
 'SoilTMP10_40cm_inst',
 'Tveg_tavg',
 'Uncertainty']

data.drop(less_than_onepercent, axis = 1, inplace = True)

In [8]:
data['FWI'].describe()

count    3.954940e+06
mean     5.484965e+01
std      1.338468e+01
min      7.421529e-01
25%      5.058295e+01
50%      5.684244e+01
75%      6.316099e+01
max      8.262209e+01
Name: FWI, dtype: float64

# Exploring the Data

In [9]:
d = data.astype('float')

In [10]:
d['firemask'].value_counts()

5.0    3750363
3.0    1725997
4.0      11727
4.5         48
3.5          5
Name: firemask, dtype: int64

In [12]:
# d.info()

In [11]:
# MAJOR ASSUMPTION: Fill all nulls with 0
d.fillna(0, inplace = True)

# Create Target Categories

In [13]:
# Start with all the values in FireMask
d['label'] = d['firemask']

In [14]:
d['label'].value_counts()

5.0    3750363
3.0    1725997
4.0      11727
4.5         48
3.5          5
Name: label, dtype: int64

In [15]:
# Create categorical values for active fire pixels
# FireMask
# 3 = water, non-fire
# 4 = cloud (land or water)
# 5 = land, non-fire
# 6 = unknown (land or water)
# 7 = low confidence fire pixel - classified as land
# 8, 9 = fire (nominal and high confidence fire pixels - classified as fire

d.loc[d['firemask'] == 3, 'label'] = 'water'
d.loc[(d['firemask'] == 5) | (d['firemask'] == 7), 'label'] = 'land'
d.loc[(d['firemask'] == 9) | (d['firemask'] == 8), 'label'] = 'active_fire'

# 4, 6, and zero are unknown right now - we will fill those in later. 
d.loc[d['firemask'] == 0, 'label'] = 'unknown'
d.loc[d['firemask'] == 4, 'label'] = 'unknown'
d.loc[d['firemask'] == 6, 'label'] = 'unknown'
d.loc[d['firemask'] == 3.5, 'label'] = 'active_fire'
d.loc[d['firemask'] == 4.0, 'label'] = 'cloud'
d.loc[d['firemask'] == 4.5, 'label'] = 'active_fire'

In [16]:
# If burn date is earlier than December 21 (day 344), mark as burned
d.loc[(d['BurnDate'] < 344) & (d['BurnDate'] > 0), 'label'] = 'burned'

# Note: Max value of BurnDate is day 334

In [17]:
# Using the other datasets, let's determine whether unknown pixels are land or water

# If there is a FWI value for the pixel, it is land
d.loc[(d['label']=='unknown') & (d['FWI'] > 0), 'label'] = 'land'

#If there is no (zero) FWI value, it is water
d.loc[(d['label']=='unknown') & (d['FWI'] == 0), 'label'] = 'water'

In [25]:
d['label'].value_counts()

land      3658050
water     1725746
burned     104291
4.5            48
3.5             5
Name: label, dtype: int64

In [19]:
# For "land" areas, create consolidated categories from FWI numerical
# based on the ranges appropriate for Australia
# just used extreme category

# d.loc[(d['FWI'] > 64) & (d['label'] == 'land'), 'label'] = 'fwi_veryhigh'

Source: https://www.bushfirecrc.com/sites/default/files/managed/resource/dowdy_and_mills-fwi.pdf![fire_thresholds.png](attachment:fire_thresholds.png)

# Drop columns that were used to create labels

In [24]:
d.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'BurnDate', 'Uncertainty', 'QA',
       'FirstDay', 'LastDay', 'FWI', 'label'],
      dtype='object')

In [26]:
# Drop all variables that were used to create the label columns. 
data_v3 = d.drop(['firemask', 'MaxFRP','BurnDate', 'QA','FirstDay', 'LastDay', 'FWI'], axis = 1)

### Check that data is aligned properly

In [28]:
data_v3

Unnamed: 0,lat,lon,Uncertainty,label
0,42.059122,-125.292524,0.0,water
1,42.059122,-125.288031,0.0,water
2,42.059122,-125.283538,0.0,water
3,42.059122,-125.279044,0.0,water
4,42.059122,-125.274551,0.0,water
...,...,...,...,...
5488135,32.411215,-113.834520,0.0,land
5488136,32.411215,-113.830027,0.0,land
5488137,32.411215,-113.825533,0.0,land
5488138,32.411215,-113.821040,0.0,land


In [30]:
# Export data as clean parquet gzip file
data_v3.to_parquet('../Data/data_v3.parquet')

ArrowTypeError: ("Expected a bytes object, got a 'float' object", 'Conversion failed for column label with type object')

In [70]:
data_v3['label'].value_counts()

water           661027
fwi_veryhigh    217468
land            153509
burned            2234
active_fire        138
Name: label, dtype: int64