In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import random
InteractiveShell.ast_node_interactivity = "all"
import os
from datetime import datetime
import matplotlib.pyplot as plt
import plotly.express as px
import glob
from tqdm import tqdm

import plotly.figure_factory as ff
import plotly.express as px
import numpy as np

import itertools



In [2]:
# Chronological split boundaries (calendar years); the split cell uses them as
# year < TRAIN_UNTIL, VAL_BETWEEN[0] <= year < VAL_BETWEEN[1], year == TEST_ON.
TRAIN_UNTIL = 2018
VAL_BETWEEN = (2018, 2020)
TEST_ON = 2020

# Bounding box in decimal degrees as (lower, upper); detections are kept only
# when strictly inside both ranges.
LAT_RANGE = (36, 42)
LON_RANGE = (26, 45)

# Number of decimal places coordinates are rounded to when forming grid cells.
PRECISION = 1
# Minimum detections per cell-month for the binary `fire` target to be 1.
MIN_FIRE_RECORDS = 2

# Simple Baseline Model

### Training period 2013-2020

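We zoomed in on the 36–42° latitude, 26–45° longitude bounding box (`LAT_RANGE`, `LON_RANGE`; despite the `aus_` variable prefix this region is not Australia) and used aggressive aggregation for a simple baseline prediction model.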
* Temporal resolution: Monthly
* Spatial resolution: coordinates rounded to 1 decimal place (`PRECISION = 1`), i.e. a 0.1° grid (~11 km in latitude)
* Binary Target: At least two fire readings

In [3]:
# Read the pre-processed daily fire detections; `acq_date` is parsed to
# datetime so year/month can be extracted later.
aus_fires = pd.read_csv(
    '../data/processed_data/fire_daily.csv.gz',
    parse_dates=['acq_date'],
)
aus_fires.shape
aus_fires.head()

(463430, 6)

Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence
0,36.0,43.38,2016-11-25,Aqua,MODIS,62
1,36.0,43.42,2016-11-13,Terra,MODIS,71
2,36.0,43.89,2016-06-10,Terra,MODIS,100
3,36.01,36.34,2016-09-20,Terra,MODIS,65
4,36.01,37.2,2016-04-23,Aqua,MODIS,95


In [4]:
# Keep only detections with confidence strictly above 80 that lie strictly
# inside the LAT_RANGE x LON_RANGE bounding box.
#
# The three filters are combined into one boolean mask and the result is
# materialised with .copy(): later cells add/overwrite columns on `aus_fires`,
# and doing that on a frame produced by chained boolean slicing triggers
# pandas' SettingWithCopyWarning.
aus_fires = aus_fires.loc[
    (aus_fires.confidence > 80)
    & (aus_fires.latitude > LAT_RANGE[0]) & (aus_fires.latitude < LAT_RANGE[1])
    & (aus_fires.longitude > LON_RANGE[0]) & (aus_fires.longitude < LON_RANGE[1])
].copy()

aus_fires.shape

(50528, 6)

In [5]:
# Derive the calendar fields and snap coordinates onto the PRECISION-decimal
# grid, then count detections per (grid cell, year, month).
aus_fires['year'] = aus_fires['acq_date'].dt.year
aus_fires['month'] = aus_fires['acq_date'].dt.month
aus_fires['latitude'] = aus_fires['latitude'].round(PRECISION)
aus_fires['longitude'] = aus_fires['longitude'].round(PRECISION)

fires = (
    aus_fires
    .groupby(['latitude', 'longitude', 'year', 'month'])
    .size()
    .reset_index(name='fire_cnt')
)

fires.shape
fires.head()
fires.nunique()

(17655, 5)

Unnamed: 0,latitude,longitude,year,month,fire_cnt
0,36.0,27.8,2013,7,38
1,36.0,27.8,2016,6,7
2,36.0,27.9,2013,7,8
3,36.0,36.1,2020,8,1
4,36.0,36.1,2020,9,1


latitude      61
longitude    191
year           9
month         12
fire_cnt      72
dtype: int64

In [6]:
# Build the complete (grid cell x year x month) panel so that cell-months
# without any detection are present explicitly (fire_cnt is NaN after the left
# merge and can then be treated as zero).
#
# Grid step is 10 ** -PRECISION. The previous `10^PRECISION` used `^`, which is
# bitwise XOR in Python (11 for PRECISION = 1, 8 for PRECISION = 2); the grid
# only came out complete for PRECISION = 1 because rounding followed by
# drop_duplicates() hid the wrong step. Axes are generated as integers and
# scaled down so no floating-point step error accumulates in np.arange.
# Lower bounds are inclusive, upper bounds exclusive, as before.
grid_scale = 10 ** PRECISION
lats = (np.arange(round(LAT_RANGE[0] * grid_scale),
                  round(LAT_RANGE[1] * grid_scale)) / grid_scale).round(PRECISION)
lons = (np.arange(round(LON_RANGE[0] * grid_scale),
                  round(LON_RANGE[1] * grid_scale)) / grid_scale).round(PRECISION)
years = fires.year.unique()
months = fires.month.unique()

# Every grid cell ...
unq_combs = list(itertools.product(lats, lons))
coords = pd.DataFrame(unq_combs, columns=["latitude", "longitude"])

# ... crossed with every (year, month) that occurs in the fire data.
unq_combs = list(itertools.product(years, months))
times = pd.DataFrame(unq_combs, columns=["year", "month"])

# Constant key used to express the cross join as an ordinary merge; the `one`
# column is kept in `history` because later cells select it.
coords['one'] = 1
times['one'] = 1

base = pd.merge(coords, times, how='outer', on='one').drop_duplicates()
# NOTE(review): detections whose rounded coordinate lands exactly on an upper
# bound (e.g. latitude == LAT_RANGE[1]) have no matching grid cell and are
# dropped by this left merge - confirm that is intended.
history = base.merge(fires, how='left', on= ['latitude', 'longitude', 'year', 'month']).drop_duplicates()

In [7]:
# Cell-months that had no row in `fires` carry NaN after the left merge;
# replace every NaN with 0 so they count as zero detections.
history = history.fillna(0)
# Distribution of the most frequent detection counts.
history.fire_cnt.value_counts().head()

0.0    1213606
1.0       9333
2.0       3388
3.0       1517
4.0        880
Name: fire_cnt, dtype: int64

In [8]:
# Binary target: 1 when a cell-month has at least MIN_FIRE_RECORDS detections.
history['fire'] = history['fire_cnt'].ge(MIN_FIRE_RECORDS).astype(int)

In [9]:
# Sanity check of the full panel: dimensions, first rows and column means
# (`fire` is 0/1, so its mean is the share of positive cell-months).
history.shape
history.head()
history.mean()

(1231200, 7)

Unnamed: 0,latitude,longitude,one,year,month,fire_cnt,fire
0,36.0,26.0,1,2013,7,0.0,0
1,36.0,26.0,1,2013,6,0.0,0
2,36.0,26.0,1,2013,8,0.0,0
3,36.0,26.0,1,2013,9,0.0,0
4,36.0,26.0,1,2013,5,0.0,0


latitude       38.950000
longitude      35.450000
one             1.000000
year         2017.000000
month           6.500000
fire_cnt        0.040971
fire            0.006710
dtype: float64

In [10]:
# Mean detection count and mean target per grid cell, aggregated by year ...
yearly = (
    history
    .groupby(['latitude', 'longitude', 'year'])[['fire_cnt', 'fire']]
    .mean()
    .reset_index()
)
# ... and by (year, month).
monthly = (
    history
    .groupby(['latitude', 'longitude', 'year', 'month'])[['fire_cnt', 'fire']]
    .mean()
    .reset_index()
)

In [11]:
# Lag-1-year features: shifting `year` forward by one means that joining on
# `year` attaches the previous year's aggregates to each row.
last_year = (
    yearly
    .assign(year=yearly['year'] + 1)
    .rename(columns={'fire_cnt': 'fire_cnt_last_year',
                     'fire': 'fire_last_year'})
)
last_year.head(3)

Unnamed: 0,latitude,longitude,year,fire_cnt_last_year,fire_last_year
0,36.0,26.0,2014,0.0,0.0
1,36.0,26.0,2015,0.0,0.0
2,36.0,26.0,2016,0.0,0.0


In [12]:
# Same-month-last-year features: shift `year` forward by one so a join on
# (year, month) attaches the value observed twelve months earlier.
last_year_month = (
    monthly
    .assign(year=monthly['year'] + 1)
    .rename(columns={'fire_cnt': 'fire_cnt_last_year_same_month',
                     'fire': 'fire_last_year_same_month'})
)
last_year_month.head(3)

Unnamed: 0,latitude,longitude,year,month,fire_cnt_last_year_same_month,fire_last_year_same_month
0,36.0,26.0,2014,1,0.0,0.0
1,36.0,26.0,2014,2,0.0,0.0
2,36.0,26.0,2014,3,0.0,0.0


In [13]:
# Historical features: for every (grid cell, target year), the mean of the
# yearly aggregates over all strictly earlier years of that cell.
#
# Each target year is paired with every source year of the same cell; only
# pairs whose source year precedes the target year are kept, and the result is
# grouped by the target year.
# The previous version filtered and grouped the other way round
# (target < source, grouped by the source year), so each "before" value was
# simply that same year's own aggregate, leaking the target into the features.
past = history[['latitude', 'longitude', 'year']].drop_duplicates().merge(
    yearly, on=['latitude', 'longitude'], suffixes=('_target', '_source'))
past = past[past.year_source < past.year_target]
past = past.groupby(
    ['latitude', 'longitude', 'year_target'])[['fire_cnt', 'fire']].mean().reset_index()
# The earliest year has no predecessor and is therefore absent from `past`.
past.columns = ['latitude', 'longitude', 'year', 'fire_cnt_before', 'fire_before']
past.head(3)

Unnamed: 0,latitude,longitude,year,fire_cnt_before,fire_before
0,36.0,26.0,2014,0.0,0.0
1,36.0,26.0,2015,0.0,0.0
2,36.0,26.0,2016,0.0,0.0


In [14]:
# Assemble the modelling table: attach the historical, last-year and
# same-month-last-year features to every cell-month, then drop the helper
# cross-join key. Left joins leave NaN where no earlier data exists.
X = (
    history
    .merge(past, how='left', on=['latitude', 'longitude', 'year'])
    .merge(last_year, how='left', on=['latitude', 'longitude', 'year'])
    .merge(last_year_month, how='left', on=['latitude', 'longitude', 'year', 'month'])
    .drop(columns='one')
)

X.head()
X.shape

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month
0,36.0,26.0,2013,7,0.0,0,,,,,,
1,36.0,26.0,2013,6,0.0,0,,,,,,
2,36.0,26.0,2013,8,0.0,0,,,,,,
3,36.0,26.0,2013,9,0.0,0,,,,,,
4,36.0,26.0,2013,5,0.0,0,,,,,,


(1231200, 12)

# Temperature Data

In [15]:
# Load the temperature table, rename its coordinate columns to the join keys
# used for X, and make sure month/year are integers.
temp_df = (
    pd.read_csv('../data/processed_data/temperatures.csv')
    .rename(columns={"latitude": "latitude_rounded",
                     "longitude": "longitude_rounded"})
    .astype({"month": int, "year": int})
)

In [16]:
# Preview the first rows of the temperature table.
temp_df.head()

Unnamed: 0,longitude_rounded,latitude_rounded,month,year,temperature_min,temperature_max,temperature_avg
0,41.5,39.5,1,2010,-9.653281,0.584216,-4.722202
1,41.5,39.5,1,2011,-12.800349,-0.425585,-6.618499
2,41.5,39.5,1,2012,-14.500847,-2.92638,-8.831257
3,41.5,39.5,1,2013,-13.762959,-2.342323,-7.790593
4,41.5,39.5,1,2014,-14.346466,-1.696002,-8.878721


In [17]:
def half_round(number):
    """Snap a value to the midpoint of the unit interval that contains it.

    The input is floored and shifted by 0.5 (e.g. 36.2 -> 36.5, 41.9 -> 41.5).
    Works element-wise on anything ``np.floor`` accepts (scalars, arrays,
    pandas Series).
    """
    whole_part = np.floor(number)
    return whole_part + 0.5

In [18]:
# Map each grid cell onto the x.5-degree keys used by the temperature table.
X = X.assign(
    latitude_rounded=half_round(X["latitude"]),
    longitude_rounded=half_round(X["longitude"]),
)

In [19]:
# Display the temperature table (pandas shows a truncated view).
temp_df

Unnamed: 0,longitude_rounded,latitude_rounded,month,year,temperature_min,temperature_max,temperature_avg
0,41.5,39.5,1,2010,-9.653281,0.584216,-4.722202
1,41.5,39.5,1,2011,-12.800349,-0.425585,-6.618499
2,41.5,39.5,1,2012,-14.500847,-2.926380,-8.831257
3,41.5,39.5,1,2013,-13.762959,-2.342323,-7.790593
4,41.5,39.5,1,2014,-14.346466,-1.696002,-8.878721
...,...,...,...,...,...,...,...
8764,39.5,36.5,8,2020,26.594876,37.505002,31.937727
8765,40.5,36.5,8,2020,27.430649,38.359726,32.924143
8766,43.5,36.5,8,2020,27.192889,37.828102,32.954136
8767,41.5,36.5,8,2020,28.239824,39.302035,33.967337


In [20]:
# Attach the temperature columns to every cell-month; the left join keeps
# rows without a matching temperature record (their temperatures are NaN).
X = X.merge(
    temp_df,
    how="left",
    on=["latitude_rounded", "longitude_rounded", "month", "year"],
)

In [21]:
# Fill missing temperatures with the mean over all cells of the same
# (month, year). If a whole (month, year) group has no temperature value its
# mean is NaN, so those rows remain NaN.
for temp_col in ("temperature_min", "temperature_max", "temperature_avg"):
    X[temp_col] = X.groupby(['month', 'year'])[temp_col].transform(
        lambda grp: grp.fillna(grp.mean()))

In [22]:
# Display the feature table after the temperature merge and fill
# (pandas shows a truncated view).
X

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,latitude_rounded,longitude_rounded,temperature_min,temperature_max,temperature_avg
0,36.0,26.0,2013,7,0.0,0,,,,,,,36.5,26.5,18.599343,29.769557,24.007236
1,36.0,26.0,2013,6,0.0,0,,,,,,,36.5,26.5,,,
2,36.0,26.0,2013,8,0.0,0,,,,,,,36.5,26.5,18.265671,29.882760,23.987780
3,36.0,26.0,2013,9,0.0,0,,,,,,,36.5,26.5,,,
4,36.0,26.0,2013,5,0.0,0,,,,,,,36.5,26.5,11.915374,24.333828,17.964211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1231195,41.9,44.9,2017,3,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,41.5,44.5,-2.173986,11.006170,4.180194
1231196,41.9,44.9,2017,10,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,41.5,44.5,4.561971,16.334282,10.323669
1231197,41.9,44.9,2017,2,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,41.5,44.5,,,
1231198,41.9,44.9,2017,1,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,41.5,44.5,-10.291933,2.448870,-4.368988


# Train-Test-Val Splitting

In [23]:
# Chronological split: train on years before TRAIN_UNTIL, validate on
# [VAL_BETWEEN[0], VAL_BETWEEN[1]), test on TEST_ON. Rows with NaN features are
# kept (no dropna); rows from years after TEST_ON end up in no split.
train = X.loc[X['year'] < TRAIN_UNTIL]
valid = X.loc[(VAL_BETWEEN[0] <= X['year']) & (X['year'] < VAL_BETWEEN[1])]
test = X.loc[X['year'] == TEST_ON]

# Persist each split without the DataFrame index.
train.to_csv('../data/train_val_test_data/train.csv', index=False)
valid.to_csv('../data/train_val_test_data/valid.csv', index=False)
test.to_csv('../data/train_val_test_data/test.csv', index=False)

In [24]:
# Check the size of each split and preview the training rows.
train.shape, valid.shape, test.shape
train.head()

((684000, 17), (273600, 17), (136800, 17))

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,latitude_rounded,longitude_rounded,temperature_min,temperature_max,temperature_avg
0,36.0,26.0,2013,7,0.0,0,,,,,,,36.5,26.5,18.599343,29.769557,24.007236
1,36.0,26.0,2013,6,0.0,0,,,,,,,36.5,26.5,,,
2,36.0,26.0,2013,8,0.0,0,,,,,,,36.5,26.5,18.265671,29.88276,23.98778
3,36.0,26.0,2013,9,0.0,0,,,,,,,36.5,26.5,,,
4,36.0,26.0,2013,5,0.0,0,,,,,,,36.5,26.5,11.915374,24.333828,17.964211
