### This notebook creates a dataset with consolidated categories

In [1]:
import pandas as pd
import xarray as xr
import numpy as np
import scipy.interpolate
from netCDF4 import Dataset
import bottleneck
import pyarrow
import scipy.ndimage as ndimage

import matplotlib.pyplot as plt

# Reading in the Data

In [49]:
# Import Merged Data from Notebook in Cloud
data = pd.read_parquet('../Data/ds_df.parquet', engine = 'pyarrow')

In [50]:
data.head()

Unnamed: 0,lat,lon,firemask,MaxFRP,BurnDate,Uncertainty,QA,FirstDay,LastDay,FWI
0,42.059122,-125.292524,3.0,,,,0.0,,,
1,42.059122,-125.288031,3.0,,,,0.0,,,
2,42.059122,-125.283538,3.0,,,,0.0,,,
3,42.059122,-125.279044,3.0,,,,0.0,,,
4,42.059122,-125.274551,3.0,,,,0.0,,,


In [51]:
data.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'BurnDate', 'Uncertainty', 'QA',
       'FirstDay', 'LastDay', 'FWI'],
      dtype='object')

# Exploring the Data

In [52]:
data['firemask'].value_counts()

5.0    3750363
3.0    1725997
4.0      11727
4.5         48
3.5          5
Name: firemask, dtype: int64

In [53]:
d = data.astype('float')

In [54]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5488140 entries, 0 to 5488139
Data columns (total 10 columns):
 #   Column       Dtype  
---  ------       -----  
 0   lat          float64
 1   lon          float64
 2   firemask     float64
 3   MaxFRP       float64
 4   BurnDate     float64
 5   Uncertainty  float64
 6   QA           float64
 7   FirstDay     float64
 8   LastDay      float64
 9   FWI          float64
dtypes: float64(10)
memory usage: 418.7 MB


In [55]:
## Assumption set all nulls to zero!
d.fillna(0, inplace = True)

# Create Target Categories

In [56]:
# Start with all the values in FireMask
d['label'] = d['firemask']

In [57]:
d['label'].value_counts()

5.0    3750363
3.0    1725997
4.0      11727
4.5         48
3.5          5
Name: label, dtype: int64

In [83]:
# Create categorical values for active fire pixels
# FireMask
# 3 = water, non-fire
# 4 = cloud (land or water)
# 5 = land, non-fire
# 6 = unknown (land or water)
# 7, 8, 9 = fire (low, nominal and high confidence)

d.loc[(d['firemask'] == 9) | (d['firemask'] == 8) | (data['firemask'] == 7), 'label'] = 'active_fire'
d.loc[d['firemask'] == 3, 'label'] = 'water'

# update to accomodate
d.loc[d['firemask'] == 3.5, 'label'] = 'active_fire'
d.loc[d['firemask'] == 4.0, 'label'] = 'cloud'
d.loc[d['firemask'] == 4.5, 'label'] = 'active_fire'

In [84]:
d.loc[((d['LastDay'] > 0) & (d['firemask']==5)), 'label'] = 'land'
d.loc[((d['LastDay'] > 0) & (d['firemask']==4)), 'label'] = 'burned'
#updated version to accomodate
# d.loc[((d['LastDay'] > 0) & (d['firemask']==4.5)), 'label'] = 'burned'

In [85]:
# Create categorical values from FWI numerical, only for areas that are not active fire or burned
d.loc[(d['FWI'] < 5) & (d['label'] == 5), 'label'] = 'land'
d.loc[(d['FWI'] < 5) & (d['label'] == 5), 'label'] = 'land'

d.loc[(d['FWI'] >= 5) & (d['FWI'] < 8) & (d['label'] == 5), 'label'] = 'fwi_moderate'
d.loc[(d['FWI'] >= 5) & (d['FWI'] < 8) & (d['label'] == 4), 'label'] = 'fwi_moderate'

d.loc[(d['FWI'] >= 8)  & (d['label'] == 5), 'label'] = 'fwi_high'
d.loc[(d['FWI'] >= 8)  & (d['label'] == 4), 'label'] = 'fwi_high'

In [86]:
d.loc[(d['label']==5), 'label'] = 'land'

In [76]:
# CLOUD COVER ASSUMPTION:
# Assuming if soil average value, then land. If not, water. 
d.loc[(d['label']==4) & (d['ESoil_tavg'] > 0), 'label'] = 'land'
d.loc[(d['label']==4), 'label'] = 'water'

## ASUMING UNKNOWNS AND ZEROS ARE WATER
d.loc[(d['label']==6), 'label'] = 'water'
d.loc[d['label']==0, 'label'] = 'water'

KeyError: 'ESoil_tavg'

In [None]:
# # Create categorical values for burned pixels, only for areas that are not active fire
# # burned
# # 0.0 = unburned
# # -2.0 = water

# data.loc[(data['burned'] != 0.) & (data['burned'] != -2.) & (data['label'] != 'active_fire'), 'label'] = 'burned'

In [16]:
# # for missing data from viirs - i.e. cloud and unknown pixels, fill in missing information from categories in burned dataset

# # for cloud pixels...
# # where 'unburned', code as land
# data.loc[(data['label'] == 4) & (data['burned'] == 0), 'label'] = 'land'
# # where 'water', code as water
# data.loc[(data['label'] == 4) & (data['burned'] == -2.), 'label'] = 'water'                              
                                 

# # for unknown pixels...
# # where unburned, code as land
# data.loc[(data['label'] == 6) & (data['burned'] == 0), 'label'] = 'land'
# # where 'water', code as water
# data.loc[(data['label'] == 6) & (data['burned'] == -2.), 'label'] = 'water'    

# Drop columns that were used to create labels

In [87]:
d.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'BurnDate', 'Uncertainty', 'QA',
       'FirstDay', 'LastDay', 'FWI', 'label'],
      dtype='object')

In [88]:
# Create smaller dataset for first model run
newdata_v1 = d.drop(['firemask', 'BurnDate', 'FirstDay', 'LastDay', 'FWI'], axis = 1)

In [89]:
newdata_v1['label'].value_counts()

burned         3755646
water          1725997
fwi_high          6005
cloud              356
land                83
active_fire         53
Name: label, dtype: int64

In [69]:
# Export data as clean parquet gzip file
newdata_v1.to_parquet('../Data/newdata_v2.parquet')

In [70]:
newdata_v1['label'].value_counts()

burned      3744275
water       1737777
fwi_high       6005
land             83
Name: label, dtype: int64