## Data Cleaning

In [2]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

# File to load
fire_csv = "../Datasets/Fires.csv"

# Columns of the csv to load; all other columns are skipped by read_csv
columns = ['LATITUDE',
           'LONGITUDE',
           'STATE',
           'STAT_CAUSE_DESCR',
           'DISCOVERY_DOY',
           'CONT_DOY',
           'FIRE_YEAR',
           'FIRE_SIZE',
           'FIRE_SIZE_CLASS']

# Read the full wildfire csv, restricted to the columns listed above.
# (For a quick test run, pass nrows=1000 to read_csv.)
uncutfires_df = pd.read_csv(fire_csv, usecols=columns)

# Keep fires from 2000 onward whose cause is not 'Arson', require a recorded
# containment day, and store that day as an integer.
uncutfires_df = (
    uncutfires_df[
        (uncutfires_df['FIRE_YEAR'] >= 2000)
        & (uncutfires_df['STAT_CAUSE_DESCR'] != 'Arson')
    ]
    .dropna(subset=['CONT_DOY'])
    .reset_index(drop=True)
    .astype({'CONT_DOY': 'int'})
)

# Drop fires discovered and contained on the same day-of-year, then order
# chronologically. sort_values returns a new frame here instead of sorting a
# filtered slice in place, which is what triggers SettingWithCopyWarning.
cutfires_df = (
    uncutfires_df[uncutfires_df['DISCOVERY_DOY'] != uncutfires_df['CONT_DOY']]
    .sort_values(by=['FIRE_YEAR', 'DISCOVERY_DOY'])
)

# Remove rows with a missing value in any remaining column and renumber rows
fires_df = cutfires_df.dropna(axis=0).reset_index(drop=True)

fires_df


Unnamed: 0,FIRE_YEAR,DISCOVERY_DOY,STAT_CAUSE_DESCR,CONT_DOY,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE
0,2000,1,Debris Burning,5,250.0,D,34.710278,-94.866111,OK
1,2000,1,Fireworks,2,7320.0,G,36.376500,-96.292200,OK
2,2000,1,Miscellaneous,2,1200.0,F,35.704200,-101.545600,TX
3,2000,1,Debris Burning,20,5.0,B,36.448910,-118.738110,CA
4,2000,3,Powerline,4,0.1,A,38.367662,-82.216708,WV
...,...,...,...,...,...,...,...,...,...
121087,2015,358,Debris Burning,359,44.2,C,35.213580,-95.804700,OK
121088,2015,358,Miscellaneous,359,75.0,C,36.441100,-96.143300,OK
121089,2015,360,Powerline,361,300.0,E,35.827317,-101.419350,TX
121090,2015,362,Children,363,1.0,B,27.697300,-82.307200,FL


In [None]:
# Adding extra columns for analysis.
#
# The columns are computed on whole Series at once rather than row by row.
# The previous per-row `fires_df[col][i] = value` form is chained assignment:
# pandas does not guarantee that it writes into the original frame (it is a
# no-op under Copy-on-Write), and it is very slow on a frame this size.

# January 1st of each fire's year, as a Timestamp
year_start = pd.to_datetime(fires_df['FIRE_YEAR'].astype(str), format='%Y')

# Day-of-year is 1-based, so day N is (N - 1) days after January 1st
disc_date = year_start + pd.to_timedelta(fires_df['DISCOVERY_DOY'] - 1, unit='D')
cont_date = year_start + pd.to_timedelta(fires_df['CONT_DOY'] - 1, unit='D')

# NOTE(review): CONT_DOY is interpreted within FIRE_YEAR, so a fire contained
# in the following calendar year gets a negative FIRE_DURATION and a
# FIRE_CONT_DATE earlier than FIRE_DISC_DATE. Confirm whether such rows exist
# and how they should be handled.
fires_df = (
    fires_df
    .assign(
        FIRE_DURATION=fires_df['CONT_DOY'] - fires_df['DISCOVERY_DOY'],
        # .dt.date keeps plain datetime.date objects, as before
        FIRE_DISC_DATE=disc_date.dt.date,
        FIRE_CONT_DATE=cont_date.dt.date,
        FIRE_MONTH_DISC=disc_date.dt.month,
        FIRE_YEAR_DISC=disc_date.dt.year,
    )
    # drop now unnecessary columns
    .drop(columns=['CONT_DOY', 'DISCOVERY_DOY', 'FIRE_YEAR'])
    .sort_values(by=['FIRE_DISC_DATE', 'FIRE_SIZE'])
    .reset_index(drop=True)
)

# rearrange columns for readability
fires_df = fires_df[['FIRE_DISC_DATE',
                     'FIRE_CONT_DATE',
                     'FIRE_DURATION',
                     'FIRE_SIZE_CLASS',
                     'FIRE_SIZE',
                     'STAT_CAUSE_DESCR',
                     'STATE',
                     'LATITUDE',
                     'LONGITUDE',
                     'FIRE_YEAR_DISC',
                     'FIRE_MONTH_DISC']]
fires_df.head()

In [4]:
# Write the cleaned fire table to a csv in the working directory, without the row index
fires_df.to_csv(
    path_or_buf="Culled Wildfires 2000-2015.csv",
    index=False,
)