# Team WyldFyrez
## Data clean up and editing for use

In [3]:
#import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

The initial CSV file, `fire_init.csv`, is too large to add to GitHub, so it has been added to the repo as a zip file. Unzip it so that `Resources/fire_init.csv` exists before running the next cell.

In [4]:
# Read the raw wildfire CSV into a DataFrame.
# The dtype keys must match the CSV header exactly (column names are case-sensitive):
# a key that matches no column does not apply to anything, so the header spellings
# STATE and DISCOVERY_DATE are used here.
fire_csv = "Resources/fire_init.csv"

df_fire = pd.read_csv(
    fire_csv,
    low_memory=False,
    dtype={
        'STAT_CAUSE_DESCR': str,
        'STATE': str,
        'CONT_DATE': str,
        'DISCOVERY_DATE': str,
        'FIRE_SIZE': float,
        'LATITUDE': float,
        'LONGITUDE': float,
    },
)

df_fire.head(10)

Unnamed: 0.1,Unnamed: 0,OBJECTID,STAT_CAUSE_DESCR,STATE,FIRE_YEAR,CONT_DATE,DISCOVERY_DATE,FIRE_SIZE,LATITUDE,LONGITUDE
0,0,1,Miscellaneous,CA,2005,2005-02-02T00:00:00.000Z,2005-02-02T00:00:00.000Z,0.1,40.036944,-121.005833
1,1,2,Lightning,CA,2004,2004-05-12T00:00:00.000Z,2004-05-12T00:00:00.000Z,0.25,38.933056,-120.404444
2,2,3,Debris Burning,CA,2004,2004-05-31T00:00:00.000Z,2004-05-31T00:00:00.000Z,0.1,38.984167,-120.735556
3,3,4,Lightning,CA,2004,2004-07-03T00:00:00.000Z,2004-06-28T00:00:00.000Z,0.1,38.559167,-119.913333
4,4,5,Lightning,CA,2004,2004-07-03T00:00:00.000Z,2004-06-28T00:00:00.000Z,0.1,38.559167,-119.933056
5,5,6,Lightning,CA,2004,2004-07-01T00:00:00.000Z,2004-06-30T00:00:00.000Z,0.1,38.635278,-120.103611
6,6,7,Lightning,CA,2004,2004-07-02T00:00:00.000Z,2004-07-01T00:00:00.000Z,0.1,38.688333,-120.153333
7,7,8,Debris Burning,CA,2005,2005-03-08T00:00:00.000Z,2005-03-08T00:00:00.000Z,0.8,40.968056,-122.433889
8,8,9,Debris Burning,CA,2005,2005-03-15T00:00:00.000Z,2005-03-15T00:00:00.000Z,1.0,41.233611,-122.283333
9,9,10,Lightning,CA,2004,2004-07-02T00:00:00.000Z,2004-07-01T00:00:00.000Z,0.1,38.548333,-120.149167


In [5]:
# Keep only the columns used by the analysis; this also leaves out the
# "Unnamed: 0" / "Unnamed: 0.1" index columns that came in with the CSV.
keep_columns = [
    'OBJECTID',
    'STAT_CAUSE_DESCR',
    'STATE',
    'FIRE_YEAR',
    'CONT_DATE',
    'DISCOVERY_DATE',
    'FIRE_SIZE',
    'LATITUDE',
    'LONGITUDE',
]
df_sub = df_fire[keep_columns]
df_sub.head()

Unnamed: 0,OBJECTID,STAT_CAUSE_DESCR,STATE,FIRE_YEAR,CONT_DATE,DISCOVERY_DATE,FIRE_SIZE,LATITUDE,LONGITUDE
0,1,Miscellaneous,CA,2005,2005-02-02T00:00:00.000Z,2005-02-02T00:00:00.000Z,0.1,40.036944,-121.005833
1,2,Lightning,CA,2004,2004-05-12T00:00:00.000Z,2004-05-12T00:00:00.000Z,0.25,38.933056,-120.404444
2,3,Debris Burning,CA,2004,2004-05-31T00:00:00.000Z,2004-05-31T00:00:00.000Z,0.1,38.984167,-120.735556
3,4,Lightning,CA,2004,2004-07-03T00:00:00.000Z,2004-06-28T00:00:00.000Z,0.1,38.559167,-119.913333
4,5,Lightning,CA,2004,2004-07-03T00:00:00.000Z,2004-06-28T00:00:00.000Z,0.1,38.559167,-119.933056


In [6]:
# Keep only fully populated rows: a missing value in any column removes the row.
# This ultimately removed nearly everything from years 1992 through 2004.
# Note: the surviving rows keep their original index labels.
df_dropna = df_sub.dropna(axis="index", how="any")
df_dropna.count()

OBJECTID            988934
STAT_CAUSE_DESCR    988934
STATE               988934
FIRE_YEAR           988934
CONT_DATE           988934
DISCOVERY_DATE      988934
FIRE_SIZE           988934
LATITUDE            988934
LONGITUDE           988934
dtype: int64

In [7]:
# Pull both date columns out as plain Python lists to support the date format clean up
date_contained = df_dropna["CONT_DATE"].to_list()
date_discovery = df_dropna["DISCOVERY_DATE"].to_list()

In [8]:
# Discovery date: separate the calendar date from the timestamp and parse it as a datetime
date_disc = pd.Series(date_discovery)

# Values look like "2005-02-02T00:00:00.000Z"; splitting on "T" puts the date in
# column 0 and the time in column 1
start = date_disc.str.split(pat="T", expand=True)
start_df = pd.DataFrame(start)

start_clean = start_df.rename(columns={0: "Date Discovery", 1: "Time"})
start_clean = start_clean.assign(
    **{"Date Discovery": pd.to_datetime(start_clean["Date Discovery"])}
)

start_clean.head()

Unnamed: 0,Date Discovery,Time
0,2005-02-02,00:00:00.000Z
1,2004-05-12,00:00:00.000Z
2,2004-05-31,00:00:00.000Z
3,2004-06-28,00:00:00.000Z
4,2004-06-28,00:00:00.000Z


In [9]:
# Contained date: separate the calendar date from the timestamp and parse it as a datetime
date_cont = pd.Series(date_contained)

# Values look like "2005-02-02T00:00:00.000Z"; splitting on "T" puts the date in
# column 0 and the time in column 1
end = date_cont.str.split(pat="T", expand=True)
end_df = pd.DataFrame(end)

end_clean = end_df.rename(columns={0: "Date Contained", 1: "Time"})
end_clean = end_clean.assign(
    **{"Date Contained": pd.to_datetime(end_clean["Date Contained"])}
)

end_clean.head()

Unnamed: 0,Date Contained,Time
0,2005-02-02,00:00:00.000Z
1,2004-05-12,00:00:00.000Z
2,2004-05-31,00:00:00.000Z
3,2004-07-03,00:00:00.000Z
4,2004-07-03,00:00:00.000Z


In [10]:
# Duration of each fire in whole days: contained date minus discovery date.
# .dt.days reads the day component of every timedelta in one vectorized step
# instead of looping over the values in Python.
# The resulting Series is left unnamed, so a later column-wise concat labels it 0.
duration = (end_clean['Date Contained'] - start_clean['Date Discovery']).dt.days

In [11]:
# Concatenating the multiple dataframes we've created above into one.
# df_dropna still carries the index labels of the rows that survived dropna (with gaps),
# while start_clean, end_clean and duration were rebuilt from lists and are numbered
# 0..n-1. pd.concat(axis=1) aligns on index labels, so df_dropna is renumbered first to
# pair each fire with its own dates and duration rather than with another row's.
df_concat = pd.concat(
    [df_dropna.reset_index(drop=True), start_clean, end_clean, duration],
    axis=1,
)

# Every input has one row per fire, so the combined frame must not gain rows.
assert len(df_concat) == len(df_dropna), "row count changed: inputs are misaligned"
df_concat.count()

OBJECTID            988934
STAT_CAUSE_DESCR    988934
STATE               988934
FIRE_YEAR           988934
CONT_DATE           988934
DISCOVERY_DATE      988934
FIRE_SIZE           988934
LATITUDE            988934
LONGITUDE           988934
Date Discovery      988934
Time                988934
Date Contained      988934
Time                988934
0                   988934
dtype: int64

In [12]:
# The unnamed duration Series came through the concat labelled 0; give it a real name
df_concat = df_concat.rename({0: "Duration"}, axis="columns")

In [13]:
# Keep the final set of columns: the parsed dates replace the raw date strings,
# and the split-off "Time" columns and FIRE_YEAR are left behind
final_columns = [
    'OBJECTID',
    'STAT_CAUSE_DESCR',
    'STATE',
    'Date Discovery',
    'Date Contained',
    'FIRE_SIZE',
    'LATITUDE',
    'LONGITUDE',
    'Duration',
]
df_clean = df_concat[final_columns]
df_clean.head()

Unnamed: 0,OBJECTID,STAT_CAUSE_DESCR,STATE,Date Discovery,Date Contained,FIRE_SIZE,LATITUDE,LONGITUDE,Duration
0,1,Miscellaneous,CA,2005-02-02,2005-02-02,0.1,40.036944,-121.005833,0
1,2,Lightning,CA,2004-05-12,2004-05-12,0.25,38.933056,-120.404444,0
2,3,Debris Burning,CA,2004-05-31,2004-05-31,0.1,38.984167,-120.735556,0
3,4,Lightning,CA,2004-06-28,2004-07-03,0.1,38.559167,-119.913333,5
4,5,Lightning,CA,2004-06-28,2004-07-03,0.1,38.559167,-119.933056,5


In [14]:
# Sanity check: number of non-null entries in each column
df_clean.notna().sum()

OBJECTID            988934
STAT_CAUSE_DESCR    988934
STATE               988934
Date Discovery      988934
Date Contained      988934
FIRE_SIZE           988934
LATITUDE            988934
LONGITUDE           988934
Duration            988934
dtype: int64

In [None]:
# Write out the final, cleaned csv file for our data set to use in our graphs and charts workbook.
# df_clean is the cleaned frame built in the cells above; the previously referenced
# df_dropcln is not defined anywhere in this notebook and raised NameError on a fresh kernel.
df_clean.to_csv("Resources/fire_clean.csv")