# Create database table of cleaned wildfire info (cleaned_wildfire_data)
### This notebook reads in from the wildfire_data table, cleans it, and writes the information to database table "cleaned_wildfire_data" in the table "fires"
Cleaning involved:
* For tables that don't have a containment date, create one based on average length of fires of that class
* Calculate distance from SF using latitude and longitude of each fire (using python library geopy)


In [2]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from datetime import datetime,date,timedelta

from geopy.distance import geodesic

In [3]:
rds_connection_string = "root:12345678@127.0.0.1/fires"
engine = create_engine(f'mysql+pymysql://{rds_connection_string}')


In [4]:
wildfire_df = pd.read_sql("select * from wildfire_data",con=engine)
wildfire_df = wildfire_df[['fire_id','fire_name', 'discovery_date', 'containment_date', 'size', 'class', 'fire_year', 'lat', 'long']]
wildfire_df.head()

  result = self._query(query)


Unnamed: 0,fire_id,fire_name,discovery_date,containment_date,size,class,fire_year,lat,long
0,0,FOUNTAIN,2005-02-02,2005-02-02,0.1,A,2005,40.036944,-121.005833
1,1,PIGEON,2004-05-12,2004-05-12,0.25,A,2004,38.933056,-120.404444
2,2,SLACK,2004-05-31,2004-05-31,0.1,A,2004,38.984167,-120.735556
3,3,DEER,2004-06-28,2004-07-03,0.1,A,2004,38.559167,-119.913333
4,4,STEVENOT,2004-06-28,2004-07-03,0.1,A,2004,38.559167,-119.933056


## For fires that don't have an end date, add one based on the average fire duration for that class of firee

In [5]:
# Create a dictionary of fire durations for each class of rie

wildfire_complete = wildfire_df.dropna()
wildfire_complete["duration"] = (wildfire_complete["containment_date"] - wildfire_complete["discovery_date"]).dt.days
duration_class = wildfire_complete.groupby("class").agg({"duration":"mean"})
duration_class_dict = duration_class.to_dict()["duration"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [6]:
duration_class_dict

{'A': 0.5803060142402666,
 'B': 0.8150366358712966,
 'C': 2.560782681099084,
 'D': 5.583904109589041,
 'E': 7.70935960591133,
 'F': 18.538709677419355,
 'G': 32.92018779342723}

In [8]:
# Check for fires with no containment date
wildfire_df[wildfire_df["containment_date"].isnull()].iloc[0:10,]

Unnamed: 0,fire_id,fire_name,discovery_date,containment_date,size,class,fire_year,lat,long
821,821,PAINTED,2005-12-05,NaT,0.1,A,2005,39.409167,-120.541667
2035,2035,NEILSON,2006-08-23,NaT,7.0,B,2006,34.425,-117.495833
2203,2203,WHEELER,2006-09-27,NaT,5.0,B,2006,38.513333,-119.988611
2235,2235,HARVEY,2006-06-26,NaT,1247.0,F,2006,40.058889,-122.908056
2294,2294,TITUS,2006-07-23,NaT,5792.0,G,2006,41.602778,-123.376944
2524,2524,BEAR,2006-08-22,NaT,0.1,A,2006,34.213056,-116.855556
2599,2599,SAND FLAT,2006-09-03,NaT,178.0,D,2006,38.403889,-119.779444
2600,2600,DEER,2006-07-21,NaT,0.1,A,2006,38.355833,-119.792222
2924,2924,BLUE,2006-09-29,NaT,0.5,B,2006,34.251667,-117.420833
3816,3816,WILLIS,2007-09-24,NaT,0.1,A,2007,41.685833,-123.665556


In [9]:
# Check each row of fire data; if it doesn't have a containment date, add one
for index, row in wildfire_df.iterrows():
    if pd.isnull(row["containment_date"]):
        this_class = row["class"]
        wildfire_df.loc[index,"containment_date"] = row["discovery_date"] + pd.Timedelta(str(duration_class_dict[this_class]) + " days")

In [11]:
# Confirm that we don't have any fires with a blank containment date
wildfire_df[wildfire_df["containment_date"].isnull()]

Unnamed: 0,fire_id,fire_name,discovery_date,containment_date,size,class,fire_year,lat,long


In [12]:
wildfire_df.head()

Unnamed: 0,fire_id,fire_name,discovery_date,containment_date,size,class,fire_year,lat,long
0,0,FOUNTAIN,2005-02-02,2005-02-02,0.1,A,2005,40.036944,-121.005833
1,1,PIGEON,2004-05-12,2004-05-12,0.25,A,2004,38.933056,-120.404444
2,2,SLACK,2004-05-31,2004-05-31,0.1,A,2004,38.984167,-120.735556
3,3,DEER,2004-06-28,2004-07-03,0.1,A,2004,38.559167,-119.913333
4,4,STEVENOT,2004-06-28,2004-07-03,0.1,A,2004,38.559167,-119.933056


## Calculate the distance from san francisco based on lat & longitude


In [13]:
# Set variable with coordinates for SF
sf = (37.7749, -122.4194)

In [16]:
wildfire_df.head()

Unnamed: 0,fire_id,fire_name,discovery_date,containment_date,size,class,fire_year,lat,long,distance
0,0,FOUNTAIN,2005-02-02,2005-02-02,0.1,A,2005,40.036944,-121.005833,0
1,1,PIGEON,2004-05-12,2004-05-12,0.25,A,2004,38.933056,-120.404444,0
2,2,SLACK,2004-05-31,2004-05-31,0.1,A,2004,38.984167,-120.735556,0
3,3,DEER,2004-06-28,2004-07-03,0.1,A,2004,38.559167,-119.913333,0
4,4,STEVENOT,2004-06-28,2004-07-03,0.1,A,2004,38.559167,-119.933056,0


In [17]:
# For each fire, set the distance from SF
for index, row in wildfire_df.iterrows():
    wildfire_df.loc[index,"distance"] = geodesic(sf,(row['lat'], row['long'])).miles

In [18]:
wildfire_df.head()

Unnamed: 0,fire_id,fire_name,discovery_date,containment_date,size,class,fire_year,lat,long,distance
0,0,FOUNTAIN,2005-02-02,2005-02-02,0.1,A,2005,40.036944,-121.005833,173.637835
1,1,PIGEON,2004-05-12,2004-05-12,0.25,A,2004,38.933056,-120.404444,135.485925
2,2,SLACK,2004-05-31,2004-05-31,0.1,A,2004,38.984167,-120.735556,123.749694
3,3,DEER,2004-06-28,2004-07-03,0.1,A,2004,38.559167,-119.913333,146.784946
4,4,STEVENOT,2004-06-28,2004-07-03,0.1,A,2004,38.559167,-119.933056,145.787248


In [19]:
wildfire_df.describe()

Unnamed: 0,fire_id,size,fire_year,lat,long,distance
count,107193.0,107193.0,107193.0,107193.0,107193.0,107193.0
mean,53596.0,78.425757,2007.436092,37.468897,-120.235776,212.891927
std,30944.09804,2434.184724,4.003132,2.551825,2.128429,122.593826
min,0.0,0.001,2001.0,32.544965,-124.402883,2.05581
25%,26798.0,0.1,2004.0,35.348056,-121.739266,117.182986
50%,53596.0,0.25,2007.0,37.650278,-120.577757,181.09066
75%,80394.0,1.0,2011.0,39.4344,-118.708611,284.098647
max,107192.0,315578.8,2014.0,42.03809,-114.1555,564.025098


## Write to database

In [20]:
wildfire_df.to_sql(name="cleaned_wildfire_data", con=engine, if_exists="replace", index=False)

In [21]:
wildfire_df.head()

Unnamed: 0,fire_id,fire_name,discovery_date,containment_date,size,class,fire_year,lat,long,distance
0,0,FOUNTAIN,2005-02-02,2005-02-02,0.1,A,2005,40.036944,-121.005833,173.637835
1,1,PIGEON,2004-05-12,2004-05-12,0.25,A,2004,38.933056,-120.404444,135.485925
2,2,SLACK,2004-05-31,2004-05-31,0.1,A,2004,38.984167,-120.735556,123.749694
3,3,DEER,2004-06-28,2004-07-03,0.1,A,2004,38.559167,-119.913333,146.784946
4,4,STEVENOT,2004-06-28,2004-07-03,0.1,A,2004,38.559167,-119.933056,145.787248
