## Python for Data Science: Mari McMurtrie Project

In [54]:
# Project - California Wild fire and Precipitation 

# 1) Select new dataset
# 2) Read csv/txt files with Pandas
# 3) Combine & massage data files
# 4) Perform EDA (Early Data Exploration) (descriptive statistics, histograms)
# 5) Use groupby, correlations to understand the dataset
# 6) Plot data with Matplotlib for visualizing the dataset
# 7) Use advanced visualization for the dataset
# 8) Identify target and factors
# 9) Explain your dataset with high-level analysis


In [55]:
# 1) Select new dataset
# 2) Read csv/txt with Pandas

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

#fire_header = ["objectid","year_","state","agency","unit_id","fire_name","inc_num","alarm_date","cont_date","cause","comments","report_ac","gis_acres","c_method","objective","fire_num","shape_length","shape_area"]
fire_data = pd.read_csv("calfire_frap.csv",header = 0)
rain_data = pd.read_csv("climate_sf_ca_english.csv", header = 0)

# Backup data, just in case.
backup_fire_data = fire_data.copy()
backup_rain_data = rain_data.copy()

In [56]:
fire_data.head()

Unnamed: 0,objectid,year_,state,agency,unit_id,fire_name,inc_num,alarm_date,cont_date,cause,comments,report_ac,gis_acres,c_method,objective,fire_num,shape_length,shape_area
0,1,2007,CA,CCO,LAC,OCTOBER,246393,2007-10-21,2007-10-23,14.0,,,25.736713,8.0,1.0,233414,1902.439051,104152.8
1,2,2007,CA,CCO,LAC,MAGIC,233077,2007-10-22,2007-10-25,14.0,,,2824.877197,8.0,1.0,233077,20407.965662,11431870.0
2,3,2007,CA,USF,ANF,RANCH,166,2007-10-20,2007-11-15,2.0,,54716.0,58410.335938,7.0,1.0,166,169150.71569,236378200.0
3,4,2007,CA,CCO,LAC,EMMA,201384,2007-09-11,2007-09-11,14.0,,,172.214951,8.0,1.0,201384,6117.777086,696929.2
4,5,2007,CA,CCO,LAC,CORRAL,259483,2007-11-24,2007-11-27,14.0,,,4707.99707,8.0,1.0,259483,22907.182174,19052590.0


In [57]:
rain_data.head()

Unnamed: 0,year,month,average temperature for the month,highest temp in month,lowest temp in month,precipitation,normal average temp,normal average precipitation
0,1982,6,15.4,,,2,16.8,3.1
1,1982,7,16.5,,,0,17.5,0.1
2,1982,8,17.4,,,0,18.0,1.1
3,1982,9,17.8,,,25,18.2,4.4
4,1982,10,16.2,,,50,16.4,22.8


In [58]:
fire_data_shape = fire_data.shape
rain_data_shape = rain_data.shape
print("Fire data shape:{0}".format(fire_data_shape))
print("Rain data shape:{0}".format(rain_data_shape))


Fire data shape:(14847, 18)
Rain data shape:(439, 8)


In [59]:
##### Fire Data Cleanup ##### 
# Each data column is described here: http://frap.fire.ca.gov/projects/fire_data/fire_perimeters_data_description 
# I drop some columns which are not necessary
fire_data = fire_data.drop("objectid", axis = 1)
fire_data = fire_data.drop("comments", axis = 1)
fire_data = fire_data.drop("agency", axis = 1)
fire_data = fire_data.drop("unit_id", axis = 1)
fire_data = fire_data.drop("inc_num", axis = 1)
fire_data = fire_data.drop("fire_num", axis = 1)
fire_data = fire_data.drop("shape_length", axis = 1)
fire_data = fire_data.drop("shape_area", axis = 1)
fire_data = fire_data.drop("objective", axis = 1)

# Also rename some columns to be more descriptive
fire_data = fire_data.rename(columns={'c_method': 'data_collection_method', 'year_': 'year'})
fire_data.fillna(0)
fire_data.head()

Unnamed: 0,year,state,fire_name,alarm_date,cont_date,cause,report_ac,gis_acres,data_collection_method
0,2007,CA,OCTOBER,2007-10-21,2007-10-23,14.0,,25.736713,8.0
1,2007,CA,MAGIC,2007-10-22,2007-10-25,14.0,,2824.877197,8.0
2,2007,CA,RANCH,2007-10-20,2007-11-15,2.0,54716.0,58410.335938,7.0
3,2007,CA,EMMA,2007-09-11,2007-09-11,14.0,,172.214951,8.0
4,2007,CA,CORRAL,2007-11-24,2007-11-27,14.0,,4707.99707,8.0


In [60]:
##### Add More Column to Fire Data #####
# Adding: month ... parsed from alarm_date.

def parse_alarm_date(row):
    if isinstance(row['alarm_date'], str):
        alarm_date_list = row['alarm_date'].split("-")
        return int(alarm_date_list[1])
    return 0

fire_data['month'] = fire_data.apply(lambda row : parse_alarm_date(row), axis = 1)
cols = fire_data.columns.tolist()
cols = cols[:2] + cols[-1:] + cols[3:]
fire_data = fire_data[cols]
fire_data.head()

Unnamed: 0,year,state,month,alarm_date,cont_date,cause,report_ac,gis_acres,data_collection_method,month.1
0,2007,CA,10,2007-10-21,2007-10-23,14.0,,25.736713,8.0,10
1,2007,CA,10,2007-10-22,2007-10-25,14.0,,2824.877197,8.0,10
2,2007,CA,10,2007-10-20,2007-11-15,2.0,54716.0,58410.335938,7.0,10
3,2007,CA,9,2007-09-11,2007-09-11,14.0,,172.214951,8.0,9
4,2007,CA,11,2007-11-24,2007-11-27,14.0,,4707.99707,8.0,11


In [61]:
##### Rain Data Cleanup #####
# Drop some colums in rain_data
rain_data = rain_data.drop("highest temp in month", axis = 1)
rain_data = rain_data.drop("lowest temp in month", axis = 1)

# Rename columns name
rain_data = rain_data.rename(columns={"average temperature for the month": "avg_temp", "normal average temp":"normal_avg_temp", "normal average precipitation":"normal_avg_precipitation"})
rain_data.fillna(0)
rain_data.head()


Unnamed: 0,year,month,avg_temp,precipitation,normal_avg_temp,normal_avg_precipitation
0,1982,6,15.4,2,16.8,3.1
1,1982,7,16.5,0,17.5,0.1
2,1982,8,17.4,0,18.0,1.1
3,1982,9,17.8,25,18.2,4.4
4,1982,10,16.2,50,16.4,22.8


In [62]:
##### Add More Column to Rain Data #####
# Adding: precipitation_diff ... to see if a precipitation is higher/lower than normal average
# Adding: tempature_diff ... to see if a tempature is higher/lower than normal average

def get_diff(row, column1, column2) :
    precipitation = normal_avg_precipitation = 0
    try:
        precipitation = float(row[column1])
        normal_avg_precipitation = float(row[column2])
    except ValueError:
        print("cannot convert to float.")
    return  precipitation - normal_avg_precipitation

rain_data['precipitation_diff'] = rain_data.apply(lambda row: get_diff(row, "precipitation", "normal_avg_precipitation"), axis=1)
rain_data['tempature_diff'] = rain_data.apply(lambda row: get_diff(row, "avg_temp", "normal_avg_temp"), axis=1)

rain_data.head()


cannot convert to float.
cannot convert to float.
cannot convert to float.
cannot convert to float.
cannot convert to float.
cannot convert to float.


Unnamed: 0,year,month,avg_temp,precipitation,normal_avg_temp,normal_avg_precipitation,precipitation_diff,tempature_diff
0,1982,6,15.4,2,16.8,3.1,-1.1,-1.4
1,1982,7,16.5,0,17.5,0.1,-0.1,-1.0
2,1982,8,17.4,0,18.0,1.1,-1.1,-0.6
3,1982,9,17.8,25,18.2,4.4,20.6,-0.4
4,1982,10,16.2,50,16.4,22.8,27.2,-0.2


In [63]:
# 3) Perform EDA (Early Data Exploration)
fire_data.describe()

Unnamed: 0,year,month,cause,report_ac,gis_acres,data_collection_method,month.1
count,14847.0,14847.0,14805.0,7088.0,14841.0,7494.0,14847.0
mean,1988.606318,6.650771,8.796082,2146.328182,1730.762064,4.692554,6.650771
std,20.122024,2.892108,5.326349,13293.335145,9878.15996,3.100902,2.892108
min,1950.0,0.0,1.0,0.0,0.001357,1.0,0.0
25%,1973.0,6.0,2.0,17.45,25.658598,1.0,6.0
50%,1991.0,7.0,9.0,67.96,142.55217,6.0,7.0
75%,2007.0,8.0,14.0,400.0,624.751709,8.0,8.0
max,2017.0,12.0,19.0,499945.0,501082.03125,8.0,12.0


In [64]:
rain_data.describe()

Unnamed: 0,year,month,normal_avg_temp,normal_avg_precipitation,precipitation_diff,tempature_diff
count,439.0,439.0,439.0,439.0,439.0,439.0
mean,2000.205011,6.539863,14.512073,42.845558,-1.584055,0.166515
std,10.57454,3.451905,2.824496,40.792311,44.339984,1.092019
min,1982.0,1.0,10.1,0.1,-103.5,-3.3
25%,1991.0,4.0,12.7,3.1,-17.8,-0.5
50%,2000.0,7.0,15.3,22.8,-1.1,0.1
75%,2009.0,10.0,17.5,70.3,5.05,0.9
max,2018.0,12.0,18.2,105.8,240.2,4.0


In [65]:
import bokeh
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.transform import linear_cmap
output_notebook()

### Cause of Fire color coded ###
p = figure(plot_width=1000, plot_height=600)
p.xaxis.axis_label = "Month"
p.yaxis.axis_label = "Year"
p.circle('month', 'year',size=10, color=linear_cmap('cause', 'Viridis256', 0, 20),source=fire_data)
show(p)


In [66]:
### Cause of Fire in Histogram ###
# Cause Code	Description
# 1	Lightning
# 2	Equipment Use
# 3	Smoking
# 4	Campfire
# 5	Debris
# 6	Railroad
# 7	Arson
# 8	Playing with Fire
# 9	Miscellaneous
# 10	Vehicle
# 11	Power Line
# 12	Firefighter Training
# 13	Non-Firefighter Training
# 14	Unknown/Unidentified
# 15	Structure
# 16	Aircraft
# 17	Volcanic
# 18	Escaped Prescribed Burn
# 19	Illegal Alien Campfire

import plotly.plotly as py
import plotly.graph_objs as go

fire_cause_data = [go.Histogram(x=fire_data['cause'])]
py.iplot(fire_cause_data, filename='fire-cause-histogram')


In [67]:
# calculate total acres burned per year
import plotly.plotly as py
import plotly.graph_objs as go

acres_burned_df = fire_data.groupby('year').gis_acres.sum()
isinstance(acres_burned_df, pd.Series)
burned_data = [go.Bar(
    x = acres_burned_df.index,
    y = acres_burned_df
)]

py.iplot(burned_data, filename='burned-data-bar')



In [68]:
# Structure damage per year 
import numpy as np
import plotly.plotly as py
import pandas as pd

ca_fire_damage_df = pd.read_csv('calfire_damage.csv')
ca_fire_damage_df.head()
#isinstance(ca_fire_damage_df, pd.DataFrame)
damage_data = [go.Bar(
    x = ca_fire_damage_df.year,
    y = ca_fire_damage_df.structures
)]

py.iplot(damage_data, filename='damage-data-bar')

In [69]:
# Fire in US map
import numpy as np
import plotly.plotly as py
import pandas as pd
import math 

us_fire_df = pd.read_csv('us_fires_7.csv')
us_fire_df.head()

# Want only CA data
ca_fire_df = us_fire_df[us_fire_df.state.isin(['CA'])]

# Sort by 'fire_size_class'
ca_fire_sorted_df = ca_fire_df.sort_values(by=['fire_size_class'])
#ca_fire_sorted_df.head()
#Text appears when hover
ca_fire_sorted_df['text'] = '<br>Fire Name:' + ca_fire_sorted_df['fire_name']
#ca_fire_sorted_df.describe()
#ca_fire_sorted_df.groupby('fire_size_class').fire_size_class.value_counts()

# fire_size_class - total 12243 entry
# A                  6469
# B                  5089
# C                   531
# D                    89
# E                    34
# F                    25
# G                     6

limits = [(0,6468),(6469,11557),(11558, 12088),(12089,12177),(12178,12211),(12212,12236),(12237,12242)]
colors = ["lightgrey","rgb(0,116,217)","rgb(42,220,90)","rgb(228,243,15)","rgb(255,137,0)","rgb(255,0,247)", "rgb(255,0,0)"]
cities = []
scale = 5000

for i in range(len(limits)):
    lim = limits[i]
    df_sub = ca_fire_sorted_df[lim[0]:lim[1]]
    city = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['longitude'],
        lat = df_sub['latitude'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['fire_size'].apply(np.log) * 2,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    cities.append(city)

layout = dict(
        title = 'Recent California Fire By Size',
        showlegend = True,
        geo = dict(
            scope=['ca'],
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict(data=cities, layout=layout)
py.iplot(fig, validate=False, filename='d3-bubble-map-populations')


Columns (12,14) have mixed types. Specify dtype option on import or set low_memory=False.



High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~marimcmurtrie/0 or inside your plot.ly account where it is named 'd3-bubble-map-populations'


In [70]:
# Has rainfall been decreasing? 
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

rain_data.head(30)
rain_data_0 = rain_data.drop([0, 1, 2, 3, 4, 5, 6])
#Create a trace
trace0 = go.Scatter(
    x = (rain_data_0['year'], rain_data_0['month']),
    y = rain_data_0['precipitation_diff'],
      marker=dict(color='#851e52')
)

data = [trace0]

py.iplot(data, filename='basic-line')

In [107]:
# Tempature increasing every year? 
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

rain_data_0 = rain_data.drop([0, 1, 2, 3, 4, 5, 6])
#Create a trace
trace0 = go.Scatter(
    x = (rain_data_0['year'], rain_data_0['month']),
    y = rain_data_0['tempature_diff'],
      marker=dict(color='#323e52')
)

data = [trace0]

py.iplot(data, filename='basic-line')

In [112]:
# Now, let's see if there is any co-relation between rainfall and fire
# x - year (from 1983)
# y - # of fires & rain fall

#Don't need first 6 row, it is 1982 data.
rain_data_0 = rain_data[rain_data.year > 1983]
rain_amount_series_per_year = rain_data_0.groupby('year').precipitation.sum()
#isinstance(rain_series_per_year, pd.Series)

#Want only CA fire after 1982
fire_data_sorted = fire_data.sort_values(by=['year'])
fire_data_sorted = fire_data_sorted[fire_data_sorted.year > 1983]
#only CA fire
ca_fire_data = fire_data_sorted[fire_data_sorted.state.isin(['CA'])]
ca_fire_data1 = ca_fire_data.groupby('year').count()
ca_fire_data1.state
rain_amount_series_per_year

trace1 = go.Scatter(
    x =  rain_amount_series_per_year.index,
    y = rain_amount_series_per_year,
    name='precipitation'
)

trace2 = go.Scatter(
    x=ca_fire_data1.index,
    y=ca_fire_data1.state,
    name='number of fire',
    yaxis='y2'
)

data = [trace1, trace2]
layout = go.Layout(
    title='Precipitation & Number of Fire per Year',
    yaxis=dict(
        title='precipitation'
    ),
    yaxis2=dict(
        title='number of fire',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='multiple-yaxes')