In [1]:
import pandas as pd
import numpy as np
import itertools

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Defining the Target Variables
- Define the time-splits for your train, validation and test data.
- Define the latitude & longitude ranges of your target country, zone, etc. The example is for Turkey.
- Define the floating point precision for yout longitude-latitude values.
- Define after how many fire calls you will consider to be a wild-fire.
- Define after what confidence level (in percentage) you will treat a report as a wild-fire occurence.

In [2]:
TRAIN_UNTIL = 2018
VAL_BETWEEN = (2018, 2020)
TEST_ON = 2020

LAT_RANGE = (36, 42)
LON_RANGE = (26, 45)

PRECISION = 1
MIN_FIRE_RECORDS = 2

CONFIDENCE_THRESHOLD = 80

# LGBM Regressor Model

We zoomed in Turkey and used aggressive aggregation for a simple baseline prediction model.
* Temporal resolution: Monthly
* Spatial resolution: 'PRECISION' Decimal degree ~ ( 10^(2-'PRECISION') km grid)
* Binary Target: At least 'MIN_FIRE_RECORDS' fire readings

In [3]:
fires = pd.read_csv('../data/processed_data/fire_daily.csv.gz', parse_dates=['acq_date'])
fires.head()

Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence
0,36.0,43.38,2016-11-25,Aqua,MODIS,62
1,36.0,43.42,2016-11-13,Terra,MODIS,71
2,36.0,43.89,2016-06-10,Terra,MODIS,100
3,36.01,36.34,2016-09-20,Terra,MODIS,65
4,36.01,37.2,2016-04-23,Aqua,MODIS,95


- We are pruning the data based on our desired confidence level.
- We are only taking the zones that falls within our target latitude & longitude ranges.

In [4]:
fires = fires[fires.confidence > CONFIDENCE_THRESHOLD]

fires = fires[
        (fires.latitude > LAT_RANGE[0]) & (fires.latitude < LAT_RANGE[1]) & \
        (fires.longitude > LON_RANGE[0]) & (fires.longitude < LON_RANGE[1])]

fires.shape

(50528, 6)

We are extracting year and month. Then rounding the coordinate pairs with our defined floating point precision.

In [5]:
fires['year'] = fires.acq_date.dt.year
fires['month'] = fires.acq_date.dt.month
fires.latitude = fires.latitude.round(PRECISION)
fires.longitude = fires.longitude.round(PRECISION)

Grouping all the reports based on year, month and coordinates.

In [6]:
fires = fires.groupby(['latitude', 'longitude', 'year', 'month']).size().reset_index()
fires.columns = ['latitude', 'longitude', 'year', 'month', 'fire_cnt']

fires.shape
fires.head()
fires.nunique()

(17655, 5)

Unnamed: 0,latitude,longitude,year,month,fire_cnt
0,36.0,27.8,2013,7,38
1,36.0,27.8,2016,6,7
2,36.0,27.9,2013,7,8
3,36.0,36.1,2020,8,1
4,36.0,36.1,2020,9,1


latitude      61
longitude    191
year           9
month         12
fire_cnt      72
dtype: int64

Creating all the possible coordinate pairs with our desired precision. If we only take the coordinates found in the fire dataset, we would train a model that tends to produce false positives. So we should give all possible coordinates and aim for the model to predict the minority class.

In [7]:
lats = np.arange(LAT_RANGE[0], LAT_RANGE[1], 1/(10^PRECISION)).round(PRECISION)
lons = np.arange(LON_RANGE[0], LON_RANGE[1], 1/(10^PRECISION)).round(PRECISION)
years = fires.year.unique()
months = fires.month.unique()

unq_combs = list(itertools.product(lats, lons))
coords = pd.DataFrame(unq_combs, columns=["latitude", "longitude"])

unq_combs = list(itertools.product(years, months))
times = pd.DataFrame(unq_combs, columns=["year", "month"])

coords['one'] = 1
times['one'] = 1

base = pd.merge(coords, times, how='outer', on='one').drop_duplicates()
history = base.merge(fires, how='left', on= ['latitude', 'longitude', 'year', 'month']).drop_duplicates()

Let's see how many reports can be obtained for a coordinate throughout the area.

In [8]:
history = history.fillna(0)
history.fire_cnt.value_counts().head()

0.0    1213606
1.0       9333
2.0       3388
3.0       1517
4.0        880
Name: fire_cnt, dtype: int64

We mark areas that have been reported more than our threshold value as wildfire. This is our ground-truth value.

In [9]:
history['fire'] = 1 * (history['fire_cnt'] >= MIN_FIRE_RECORDS)

- For each year, we add the count of wild-fires and the count of fire reports for the last year as new features.
- For each month, we add the count of wild-fires and the count of fire reports for the same month in last year as new features.

In [10]:
yearly = history.groupby(
    ['latitude', 'longitude', 'year'])[['fire_cnt', 'fire']].sum().reset_index()
monthly = history.groupby(
    ['latitude', 'longitude', 'year', 'month'])[['fire_cnt', 'fire']].sum().reset_index()

In [11]:
last_year = yearly.copy()
last_year.year += 1
last_year.columns = ['latitude', 'longitude', 'year', 'fire_cnt_last_year', 'fire_last_year']
last_year.head()

Unnamed: 0,latitude,longitude,year,fire_cnt_last_year,fire_last_year
0,36.0,26.0,2014,0.0,0
1,36.0,26.0,2015,0.0,0
2,36.0,26.0,2016,0.0,0
3,36.0,26.0,2017,0.0,0
4,36.0,26.0,2018,0.0,0


In [12]:
last_year_month = monthly.copy()
last_year_month.year += 1
last_year_month.columns = ['latitude', 'longitude', 'year', 'month', 'fire_cnt_last_year_same_month', 'fire_last_year_same_month']
last_year_month.head()

Unnamed: 0,latitude,longitude,year,month,fire_cnt_last_year_same_month,fire_last_year_same_month
0,36.0,26.0,2014,1,0.0,0
1,36.0,26.0,2014,2,0.0,0
2,36.0,26.0,2014,3,0.0,0
3,36.0,26.0,2014,4,0.0,0
4,36.0,26.0,2014,5,0.0,0


By shifting the years by one and comparing the two year columns, we calculate the number of fire reports and wild-fires in the previous years for each year.

In [13]:
past = yearly.copy()
past['one'] = 1
past = history[['latitude', 'longitude', 'year', 'one']].drop_duplicates().merge(
    past, on=['latitude', 'longitude', 'one'])
past = past[past.year_x < past.year_y]
past = past.groupby(['latitude', 'longitude', 'year_y'])[['fire_cnt', 'fire']].sum().reset_index()
past.columns = ['latitude', 'longitude', 'year', 'fire_cnt_before', 'fire_before']
past.head(3)

Unnamed: 0,latitude,longitude,year,fire_cnt_before,fire_before
0,36.0,26.0,2014,0.0,0
1,36.0,26.0,2015,0.0,0
2,36.0,26.0,2016,0.0,0


We combine the historical report and wildfire statistics we extracted with the main dataframe.

In [14]:
X = history.merge(past, how='left', on=['latitude', 'longitude', 'year'])
X = X.merge(last_year, how='left', on=['latitude', 'longitude', 'year'])
X = X.merge(last_year_month, how='left', on=['latitude', 'longitude', 'year', 'month'])
X = X.drop(columns='one')

X.head()
X.shape

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month
0,36.0,26.0,2013,7,0.0,0,,,,,,
1,36.0,26.0,2013,6,0.0,0,,,,,,
2,36.0,26.0,2013,8,0.0,0,,,,,,
3,36.0,26.0,2013,9,0.0,0,,,,,,
4,36.0,26.0,2013,5,0.0,0,,,,,,


(1231200, 12)

# Temperature Data
We are reading the temperature data we created from the Berkeley dataset.

In [15]:
temp_df = pd.read_csv('../data/processed_data/temperatures.csv')
temp_df = temp_df.rename(columns={"latitude":"latitude_rounded",
                                 "longitude":"longitude_rounded"})
temp_df["month"] = temp_df["month"].astype(int)
temp_df["year"] = temp_df["year"].astype(int)

Latitude and longitude pairs in the data is formatted like:
- 39.5
- 40.5
- 41.5
So when we merging the temperature data with our wildfire statistics, we should treat the wildfire data with this conversion (only for merging, we will preserve the actual latitude & longitude pairs).

In [16]:
def half_round(number):
    return np.floor(number) + 0.5

In [17]:
X["latitude_rounded"] = half_round(X["latitude"])
X["longitude_rounded"] = half_round(X["longitude"])

In [18]:
temp_df.head()

Unnamed: 0,longitude_rounded,latitude_rounded,month,year,temperature_min,temperature_max,temperature_avg
0,41.5,39.5,1,2010,-9.653281,0.584216,-4.722202
1,41.5,39.5,1,2011,-12.800349,-0.425585,-6.618499
2,41.5,39.5,1,2012,-14.500847,-2.92638,-8.831257
3,41.5,39.5,1,2013,-13.762959,-2.342323,-7.790593
4,41.5,39.5,1,2014,-14.346466,-1.696002,-8.878721


We are merging the historical wildfire statistics data with the temperature data.

In [19]:
X = pd.merge(X,
             temp_df,
             how="left",
             on=["latitude_rounded", "longitude_rounded", "month", "year"]
            )

We are mean-imputing the temperature values on yearly basis.

In [20]:
# X["temperature_min"] = X.groupby(['year'])["temperature_min"].transform(lambda x: x.fillna(x.mean()))
# X["temperature_max"] = X.groupby(['year'])["temperature_max"].transform(lambda x: x.fillna(x.mean()))
# X["temperature_avg"] = X.groupby(['year'])["temperature_avg"].transform(lambda x: x.fillna(x.mean()))

In [21]:
X.fire_cnt_before.max()

1560.0

# Train-Test-Val Splitting

In [22]:
train = X[X.year < TRAIN_UNTIL].dropna()
valid = X[(X.year >= VAL_BETWEEN[0]) & (X.year < VAL_BETWEEN[1])]
test = X[X.year == TEST_ON]

train.to_csv('../data/train_val_test_data/train.csv', index=False)
valid.to_csv('../data/train_val_test_data/valid.csv', index=False)
test.to_csv('../data/train_val_test_data/test.csv', index=False)
X.to_csv('../data/train_val_test_data/full_data.csv', index=False)


In [23]:
train.shape, valid.shape, test.shape
train

((310800, 17), (273600, 17), (136800, 17))

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,latitude_rounded,longitude_rounded,temperature_min,temperature_max,temperature_avg
1094,36.0,27.0,2016,8,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,36.5,27.5,22.391532,33.257656,27.912769
1096,36.0,27.0,2016,5,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,36.5,27.5,15.340218,26.191366,20.770048
1099,36.0,27.0,2016,3,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,36.5,27.5,9.315493,20.847536,15.278423
1100,36.0,27.0,2016,10,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,36.5,27.5,15.409964,27.127140,21.206294
1101,36.0,27.0,2016,2,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,36.5,27.5,8.481730,21.042125,15.131624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1231192,41.9,44.9,2017,5,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,41.5,44.5,7.967444,19.820513,14.006850
1231195,41.9,44.9,2017,3,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,41.5,44.5,-2.173986,11.006170,4.180194
1231196,41.9,44.9,2017,10,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,41.5,44.5,4.561971,16.334282,10.323669
1231198,41.9,44.9,2017,1,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,41.5,44.5,-10.291933,2.448870,-4.368988
