## Create final binned daily data (final_binned_daily_data)
### Create a dataframe with a single row per day, containing a column for each bin (with total acres), a column for mean AQI, and daily wind-speed columns. (Max and min AQI are not computed in this notebook.) Write the result to a table named final_binned_daily_data.

In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

%matplotlib inline


In [2]:
import os

# Connection target for the MySQL `fires` database, written as
# "user:password@host/database". Setting the FIRES_DB_CONNECTION environment
# variable overrides the built-in default, so credentials do not have to be
# edited into the notebook.
# NOTE(review): the fallback still embeds a plaintext password; remove it once
# the environment variable is configured wherever this notebook runs.
rds_connection_string = os.environ.get(
    "FIRES_DB_CONNECTION", "root:12345678@127.0.0.1/fires"
)
engine = create_engine(f'mysql+pymysql://{rds_connection_string}')


In [3]:
# Pull the whole binned_by_acres table into a DataFrame, then preview it.
data = pd.read_sql(
    sql="select * from binned_by_acres",
    con=engine,
)
data.head()

  result = self._query(query)


Unnamed: 0,index,date,overall_aqi,fire_id,size,distance,bins
0,0,2001-01-01,153,8659.0,0.1,380.928215,bigger then 300
1,1,2001-01-01,153,9387.0,0.1,136.804378,within 150
2,2,2001-01-01,153,17163.0,0.1,251.976491,within 300
3,3,2001-01-01,153,30114.0,0.1,200.71576,within 250
4,4,2001-01-01,153,30201.0,0.1,467.441792,bigger then 300


In [4]:
# Keep only the columns used below; this drops the stored `index` column.
data = data.loc[
    :, ['date', 'overall_aqi', 'fire_id', 'size', 'distance', 'bins']
]
data.head()

Unnamed: 0,date,overall_aqi,fire_id,size,distance,bins
0,2001-01-01,153,8659.0,0.1,380.928215,bigger then 300
1,2001-01-01,153,9387.0,0.1,136.804378,within 150
2,2001-01-01,153,17163.0,0.1,251.976491,within 300
3,2001-01-01,153,30114.0,0.1,200.71576,within 250
4,2001-01-01,153,30201.0,0.1,467.441792,bigger then 300


## Group by date and bins (i.e., create total acres per bin per day)
* Note: because each fire's acres are counted only on its end date, grouping by day is likely an oversimplification.

In [5]:
# Sum `size` for every (calendar day, bin) pair. The result is a Series with
# a two-level (date, bins) index.
grouped_acres = (
    data
    .groupby([pd.Grouper(key="date", freq="D"), 'bins'])['size']
    .sum()
)

In [6]:
# Check what kind of object the grouped sum returned.
type(grouped_acres)

pandas.core.series.Series

In [7]:
# Inspect the first 30 (date, bin) totals.
grouped_acres.head(30)

date        bins           
2001-01-01  bigger then 300        1.2
            within 150             0.1
            within 250             0.1
            within 300             0.1
2001-01-02  bigger then 300        0.3
            within 150             0.1
2001-01-03  bigger then 300        0.3
            within 200             0.1
            within 250             0.1
2001-01-04  bigger then 300        5.7
            within 200             4.0
            within 300             0.1
2001-01-05  bigger then 300       25.7
            within 150            24.0
            within 200            15.0
            within 250            81.3
2001-01-06  bigger then 300        3.0
            within 300             2.5
2001-01-07  bigger then 300    10353.1
            within 150             5.0
            within 200             0.1
            within 250             5.0
2001-01-08  bigger then 300        1.0
            within 150            10.0
            within 50              0

In [8]:
# Reshape to one row per date and one column per bin by moving the `bins`
# index level into the columns. (date, bin) pairs with no rows become NaN.
grouped_acres = grouped_acres.unstack("bins")

In [9]:
# Preview the pivoted table; bins with no rows on a date show up as NaN.
grouped_acres.head()

bins,bigger then 300,within 100,within 150,within 200,within 250,within 300,within 50
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-01-01,1.2,,0.1,,0.1,0.1,
2001-01-02,0.3,,0.1,,,,
2001-01-03,0.3,,,0.1,0.1,,
2001-01-04,5.7,,,4.0,,0.1,
2001-01-05,25.7,,24.0,15.0,81.3,,


In [10]:
# Replace the NaN left by absent (date, bin) pairs with 0.
grouped_acres = grouped_acres.fillna(value=0)

In [11]:
# Preview the table after NaN entries were replaced with 0.
grouped_acres.head()

bins,bigger then 300,within 100,within 150,within 200,within 250,within 300,within 50
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-01-01,1.2,0.0,0.1,0.0,0.1,0.1,0.0
2001-01-02,0.3,0.0,0.1,0.0,0.0,0.0,0.0
2001-01-03,0.3,0.0,0.0,0.1,0.1,0.0,0.0
2001-01-04,5.7,0.0,0.0,4.0,0.0,0.1,0.0
2001-01-05,25.7,0.0,24.0,15.0,81.3,0.0,0.0


## Define air quality (mean overall AQI per day)

In [12]:
# Mean `overall_aqi` for each calendar day, as a Series indexed by date.
aqi = (
    data
    .groupby([pd.Grouper(key="date", freq="D")])['overall_aqi']
    .mean()
)

In [13]:
# Convert the Series of daily means into a one-column DataFrame and preview it.
aqi = aqi.to_frame()
aqi.head()

Unnamed: 0_level_0,overall_aqi
date,Unnamed: 1_level_1
2001-01-01,153.0
2001-01-02,140.0
2001-01-03,140.0
2001-01-04,107.0
2001-01-05,124.0


In [14]:
# Preview the daily AQI frame.
aqi.head()

Unnamed: 0_level_0,overall_aqi
date,Unnamed: 1_level_1
2001-01-01,153.0
2001-01-02,140.0
2001-01-03,140.0
2001-01-04,107.0
2001-01-05,124.0


## Create dataframe with wind data

In [15]:
# Load the complete wind_data table into a DataFrame.
wind_data = pd.read_sql(
    sql="select * from wind_data",
    con=engine,
)

In [16]:
# Restrict the wind table to the date and the three wind-speed measurements.
wind_data = wind_data.loc[
    :,
    ['date', 'avg_daily_wind_speed', 'fastest_2_min_speed', 'fastest_5_min_speed'],
]
wind_data.head()

Unnamed: 0,date,avg_daily_wind_speed,fastest_2_min_speed,fastest_5_min_speed
0,2000-12-01,3.58,14.1,16.1
1,2000-12-02,2.91,8.1,8.9
2,2000-12-03,3.8,13.0,14.1
3,2000-12-04,2.91,8.9,10.1
4,2000-12-05,4.03,10.1,12.1


In [17]:
# Parse `date` into pandas datetimes so it can serve as the merge key below.
wind_data = wind_data.assign(date=pd.to_datetime(wind_data['date']))

In [18]:
# Check the column dtypes after the date conversion.
wind_data.dtypes

date                    datetime64[ns]
avg_daily_wind_speed           float64
fastest_2_min_speed            float64
fastest_5_min_speed            float64
dtype: object

## Merge acres, AQI, and wind data

In [20]:
# Preview the per-day, per-bin acreage table before merging.
grouped_acres.head()

bins,bigger then 300,within 100,within 150,within 200,within 250,within 300,within 50
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-01-01,1.2,0.0,0.1,0.0,0.1,0.1,0.0
2001-01-02,0.3,0.0,0.1,0.0,0.0,0.0,0.0
2001-01-03,0.3,0.0,0.0,0.1,0.1,0.0,0.0
2001-01-04,5.7,0.0,0.0,4.0,0.0,0.1,0.0
2001-01-05,25.7,0.0,24.0,15.0,81.3,0.0,0.0


In [21]:
# Inner-join the per-bin acreage with the daily AQI on `date`.
all_data = pd.merge(
    left=grouped_acres,
    right=aqi,
    how="inner",
    on="date",
)
all_data.head()

Unnamed: 0_level_0,bigger then 300,within 100,within 150,within 200,within 250,within 300,within 50,overall_aqi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,1.2,0.0,0.1,0.0,0.1,0.1,0.0,153.0
2001-01-02,0.3,0.0,0.1,0.0,0.0,0.0,0.0,140.0
2001-01-03,0.3,0.0,0.0,0.1,0.1,0.0,0.0,140.0
2001-01-04,5.7,0.0,0.0,4.0,0.0,0.1,0.0,107.0
2001-01-05,25.7,0.0,24.0,15.0,81.3,0.0,0.0,124.0


In [22]:
# Attach the wind measurements by `date`. This is an inner join, so only
# dates present on both sides are kept.
all_data = pd.merge(
    left=all_data,
    right=wind_data,
    how="inner",
    on="date",
)

In [23]:
# Preview the fully merged daily table.
all_data.head()

Unnamed: 0,date,bigger then 300,within 100,within 150,within 200,within 250,within 300,within 50,overall_aqi,avg_daily_wind_speed,fastest_2_min_speed,fastest_5_min_speed
0,2001-01-01,1.2,0.0,0.1,0.0,0.1,0.1,0.0,153.0,3.58,10.1,10.1
1,2001-01-02,0.3,0.0,0.1,0.0,0.0,0.0,0.0,140.0,2.91,10.1,10.1
2,2001-01-03,0.3,0.0,0.0,0.1,0.1,0.0,0.0,140.0,2.68,10.1,10.1
3,2001-01-04,5.7,0.0,0.0,4.0,0.0,0.1,0.0,107.0,3.58,8.9,10.1
4,2001-01-05,25.7,0.0,24.0,15.0,81.3,0.0,0.0,124.0,4.7,15.0,17.0


## Write to database

In [24]:
# Persist the merged table, replacing any existing final_binned_daily_data.
# NOTE(review): index=True also writes the DataFrame's row index as a column;
# confirm downstream readers want it.
all_data.to_sql(
    name="final_binned_daily_data",
    con=engine,
    if_exists="replace",
    index=True,
)