# Random Forest

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from mlxtend.evaluate import paired_ttest_5x2cv
from sklearn.tree import export_graphviz
import pandas as pd
import numpy as np

In [29]:
# Load database
wdf_df = pd.read_csv('V4_WDF_OFFICIAL.csv')
wdf_df.head()

Unnamed: 0.1,Unnamed: 0,FIPS_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,...,LONGITUDE,FIPS_NAME,DISCOVERY_MONTH,DISCOVERY_DAY,AWND,PRCP,TAVG,DURING_A_DROUGHT,WATERSHD,POP_BY_COUNTY
0,0,025-01,NORRISH,2000,945.0,Miscellaneous,5/31/2000,1800.0,87.4,C,...,-115.2919,Imperial,5,30,,0.0,93.5,0.0,0,142359
1,1,025-02,GUN,2001,1245.0,Miscellaneous,5/20/2001,1345.0,0.1,A,...,-116.1486,Imperial,5,20,4.92,0.0,88.4,0.0,0,143295
2,2,025-03,ONEAL,2001,1600.0,Campfire,7/17/2001,2200.0,18.0,C,...,-116.0892,Imperial,7,17,10.405,0.0,91.0,0.0,0,143295
3,3,025-04,EXPEDITION,2002,1934.0,Lightning,12/28/2002,2000.0,0.1,A,...,-115.205,Imperial,12,28,3.58,0.0,56.4,1.0,0,144818
4,4,025-05,M L K,2002,1345.0,Lightning,1/20/2002,1430.0,0.1,A,...,-115.1186,Imperial,1,20,6.04,0.0,52.6,0.0,0,144818


In [30]:
wdf_df.shape

(17245, 21)

**Step 1**: Add FIRE_SIZE_CLASS to 2016-Sep 2020 data based on FIRE_SIZE. Remove data points that don't have a FIRE_SIZE listed.

Definition of the following columns:
- FIRE_SIZE = Estimate of acres within the final perimeter of the fire.
- FIRE_SIZE_CLASS = Code for fire size based on the number of acres within the final fire perimeter expenditures (A=greater than 0 but less than or equal to 0.25 acres, B=0.26-9.9 acres, C=10.0-99.9 acres, D=100-299 acres, E=300 to 999 acres, F=1000 to 4999 acres, and G=5000+ acres).

In [31]:
# Remove for null values in FIRE_SIZE from the dataframe
wdf_df = wdf_df[wdf_df['FIRE_SIZE'].notnull()]
wdf_df.reset_index(inplace=True, drop=True)
wdf_df.shape

(17234, 21)

In [32]:
# Check columns that have null vales in FIRE_SIZE_CLASS
# This should be from FIRE_YEAR 2016-Sept 2020
wdf_df[wdf_df['FIRE_SIZE_CLASS'].isnull()]

Unnamed: 0.1,Unnamed: 0,FIPS_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,...,LONGITUDE,FIPS_NAME,DISCOVERY_MONTH,DISCOVERY_DAY,AWND,PRCP,TAVG,DURING_A_DROUGHT,WATERSHD,POP_BY_COUNTY
91,91,025-174,Carter Fire,2020,,,4/4/2020,,275.0,,...,-115.586500,Imperial,4,4,12.750,0.0,70.666667,0.0,0,182333
576,584,029-1360,Cedar Fire,2016,,,,,29322.0,,...,-118.567800,Kern,8,16,6.825,0.0,81.400000,1.0,0,880856
577,585,029-1361,Cotton Fire,2016,,,,,61.0,,...,-120.251000,Kern,6,11,9.840,0.0,72.800000,1.0,1,880856
578,586,029-1362,Deer Fire,2016,,,,,1785.0,,...,-118.722720,Kern,7,1,10.065,0.0,84.400000,1.0,1,880856
579,587,029-1363,Erskine Fire,2016,,,,,48019.0,,...,-118.456280,Kern,6,23,7.605,0.0,81.400000,1.0,1,880856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17144,17221,111-148,South Fire,2019,,,,,131.0,,...,-117.461363,Ventura,7,14,5.220,0.0,76.400000,0.0,1,846006
17145,17222,111-149,Wendy Fire,2019,,,,,91.0,,...,-118.950833,Ventura,10,10,14.425,0.0,57.600000,0.0,0,846006
17147,17224,111-150,Elizabeth Fire,2020,,,,,289.0,,...,-119.178080,Ventura,6,10,6.410,0.0,74.166667,0.0,1,851297
17148,17225,111-151,Holser Fire,2020,,,,,3000.0,,...,-118.758965,Ventura,8,17,5.520,0.0,82.666667,0.0,0,851297


In [33]:
null_test = wdf_df['FIRE_SIZE_CLASS'].isnull()
null_test

0        False
1        False
2        False
3        False
4        False
         ...  
17229    False
17230    False
17231    False
17232    False
17233    False
Name: FIRE_SIZE_CLASS, Length: 17234, dtype: bool

In [34]:
# Apply FIRE_SIZE_CLASS to 2016-2020 rows
for i in range(len(wdf_df)):
    if null_test[i] == True:
        if wdf_df.FIRE_SIZE[i] > 0 and wdf_df.FIRE_SIZE[i] <= 0.25:
            wdf_df.FIRE_SIZE_CLASS[i] = 'A'
        if wdf_df.FIRE_SIZE[i] >= 0.26 and wdf_df.FIRE_SIZE[i] <= 9.9:
            wdf_df.FIRE_SIZE_CLASS[i] = 'B'
        if wdf_df.FIRE_SIZE[i] >= 10.0 and wdf_df.FIRE_SIZE[i] <= 99.9:
            wdf_df.FIRE_SIZE_CLASS[i] = 'C'
        if wdf_df.FIRE_SIZE[i] >= 100 and wdf_df.FIRE_SIZE[i] <= 299:
            wdf_df.FIRE_SIZE_CLASS[i] = 'D'
        if wdf_df.FIRE_SIZE[i] >= 300 and wdf_df.FIRE_SIZE[i] <= 999:
            wdf_df.FIRE_SIZE_CLASS[i] = 'E'
        if wdf_df.FIRE_SIZE[i] >= 1000 and wdf_df.FIRE_SIZE[i] <= 4999:
            wdf_df.FIRE_SIZE_CLASS[i] = 'F'
        if wdf_df.FIRE_SIZE[i] >= 5000:
            wdf_df.FIRE_SIZE_CLASS[i] = 'G'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wdf_df.FIRE_SIZE_CLASS[i] = 'G'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wdf_df.FIRE_SIZE_CLASS[i] = 'C'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wdf_df.FIRE_SIZE_CLASS[i] = 'F'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wdf_df.FIRE_SIZE_CLASS[i] = 'D'
A value is trying to be set on a copy of

In [35]:
# Check FIRE_SIZE_CLASS was added from 2016-2020
wdf_df[wdf_df['FIRE_SIZE_CLASS'].isnull()]

Unnamed: 0.1,Unnamed: 0,FIPS_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,...,LONGITUDE,FIPS_NAME,DISCOVERY_MONTH,DISCOVERY_DAY,AWND,PRCP,TAVG,DURING_A_DROUGHT,WATERSHD,POP_BY_COUNTY


**Step 2**: Check null values in the following columns: AWND, PRCP, TAVG, DURING_A_DROUGHT, WATERSHD, POP_BY_COUNTY. Remove rows if there are null values.

In [36]:
wdf_df.shape

(17234, 21)

In [37]:
# Check AWND
wdf_df[wdf_df['AWND'].isnull()]

Unnamed: 0.1,Unnamed: 0,FIPS_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,...,LONGITUDE,FIPS_NAME,DISCOVERY_MONTH,DISCOVERY_DAY,AWND,PRCP,TAVG,DURING_A_DROUGHT,WATERSHD,POP_BY_COUNTY
0,0,025-01,NORRISH,2000,945.0,Miscellaneous,5/31/2000,1800.0,87.4,C,...,-115.2919,Imperial,5,30,,0.0,93.5,0.0,0,142359
3494,3527,037-2741,Martindale Fire,2020,,,,,230.0,D,...,-118.4,Los Angeles,9,28,,0.0,77.285714,0.0,1,10079000
3500,3534,037-2748,Woolsey Fire,2018,,,,,96949.0,G,...,-118.70128,Los Angeles and Ventura,11,8,,,,1.0,0,10070000
10913,10953,065-6576,Candy Fire,2020,,,,,227.0,D,...,-117.374419,Riverside,9,29,,0.0,81.111111,0.0,0,2517830
17012,17084,083-250,Thomas Fire,2017,,,,,281893.0,G,...,-119.09124,Santa Barbara and Ventura,12,4,,,,1.0,0,91574


In [38]:
# Remove null values from AWND
wdf_df = wdf_df[wdf_df['AWND'].notnull()]
wdf_df.shape

(17229, 21)

In [39]:
# Check PRCP
wdf_df[wdf_df['PRCP'].isnull()]

Unnamed: 0.1,Unnamed: 0,FIPS_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,...,LONGITUDE,FIPS_NAME,DISCOVERY_MONTH,DISCOVERY_DAY,AWND,PRCP,TAVG,DURING_A_DROUGHT,WATERSHD,POP_BY_COUNTY


In [40]:
# Check TAVG
wdf_df[wdf_df['TAVG'].isnull()]

Unnamed: 0.1,Unnamed: 0,FIPS_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,...,LONGITUDE,FIPS_NAME,DISCOVERY_MONTH,DISCOVERY_DAY,AWND,PRCP,TAVG,DURING_A_DROUGHT,WATERSHD,POP_BY_COUNTY


In [41]:
# Check DURING_A_DROUGHT
wdf_df[wdf_df['DURING_A_DROUGHT'].isnull()]

Unnamed: 0.1,Unnamed: 0,FIPS_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,...,LONGITUDE,FIPS_NAME,DISCOVERY_MONTH,DISCOVERY_DAY,AWND,PRCP,TAVG,DURING_A_DROUGHT,WATERSHD,POP_BY_COUNTY
15387,15449,073-367,HAYBARN,2004,1630.0,Missing/Undefined,,,120.0,D,...,-117.315,San Diego,9,22,6.997143,0.0,71.166667,,0,1254000
15512,15574,073-480,SUNDEVIL,2005,1300.0,Missing/Undefined,,,160.0,D,...,-117.114444,San Diego,9,5,4.57,0.000741,72.764706,,0,1248000
15894,15957,073-829,JULIETT,2008,1442.0,Missing/Undefined,,,4026.0,F,...,-117.296111,San Diego,10,13,8.456,0.0,59.941176,,0,2977000


In [42]:
# Remove null values from DURING_A_DROUGHT
wdf_df = wdf_df[wdf_df['DURING_A_DROUGHT'].notnull()]
wdf_df.shape

(17226, 21)

In [43]:
# Check WATERSHD
wdf_df[wdf_df['WATERSHD'].isnull()]

Unnamed: 0.1,Unnamed: 0,FIPS_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,...,LONGITUDE,FIPS_NAME,DISCOVERY_MONTH,DISCOVERY_DAY,AWND,PRCP,TAVG,DURING_A_DROUGHT,WATERSHD,POP_BY_COUNTY


In [44]:
# Check POP_BY_COUNTY
wdf_df[wdf_df['POP_BY_COUNTY'].isnull()]

Unnamed: 0.1,Unnamed: 0,FIPS_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,...,LONGITUDE,FIPS_NAME,DISCOVERY_MONTH,DISCOVERY_DAY,AWND,PRCP,TAVG,DURING_A_DROUGHT,WATERSHD,POP_BY_COUNTY


**Step 3**: Random Forest Classifier

In [45]:
# Create x and y Dataframes
X = wdf_df[['AWND', 'PRCP', 'TAVG', 'DURING_A_DROUGHT', 'WATERSHD', 'POP_BY_COUNTY']]
y = wdf_df['FIRE_SIZE_CLASS']

# Create train and test sets
# random_state as 42 is an arbitrary value
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Create DecisionTreeClassifer class
# Apply max_depth=6 based on the decision tree model
rf = RandomForestClassifier(random_state=0, max_depth=6)

# Train the model on the data
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, random_state=0)

In [46]:
# Measuring model performance
print('Score:', rf.score(X_test, y_test))

Score: 0.627792436235708


In [47]:
# Cross Validation
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')
print("Accuracy: {} +/- {}%".format(round(scores.mean()*100, 5), round(scores.std()*100, 5)))

Accuracy: 62.34295 +/- 0.01999%


**Step 4**: Dummy Classifier
The dummy classifier gives you a measure of “baseline” performance — i.e. the success rate one should expect to achieve even if simply guessing.

**Link**: https://medium.com/@mamonu/what-is-the-scikit-learn-dummy-classifier-95549d9cd44<br>
**Documentation**: https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html

In [48]:
# Dummy Classifier
# Create DummyClassifier class
dummy_clf = DummyClassifier(strategy="stratified", random_state=0)

# Train the model on the data
dummy_clf.fit(X, y)

# Train a dummy classifier to make predictions based on the class values
print('Prediction Values:', dummy_clf.predict(X))

# Measuring model performance
print('Score:', dummy_clf.score(X, y))

Prediction Values: ['A' 'B' 'A' ... 'A' 'B' 'A']
Score: 0.47521188900499245


In [49]:
# Cross Validation
# Gather the mean and standard deviation of the accuracy
cv_scores = cross_val_score(dummy_clf, X, y, cv=5, scoring='accuracy')
sm_best_tree_cv_score = cv_scores.mean()
sm_best_tree_cv_score_std = cv_scores.std()
print('The best mean cross-validation accuracy {} +/- {}% on training dataset'.format(
      round(sm_best_tree_cv_score*100,5), round(sm_best_tree_cv_score_std*100, 5)))

The best mean cross-validation accuracy 47.81723 +/- 0.47223% on training dataset


**Step 5**: 5x2cv Paired t Test

The 5x2cv paired t test is a procedure for comparing the performance of two models (classifiers or regressors). For this case we will compare the decision tree model with the dummy classifier.

**Link**: http://rasbt.github.io/mlxtend/user_guide/evaluate/paired_ttest_5x2cv/<br>
**Documentation**: http://rasbt.github.io/mlxtend/#welcome-to-mlxtends-documentation

In [50]:
# 5x2cv Paired t Test
t, p = paired_ttest_5x2cv(estimator1=rf, estimator2=dummy_clf, X=X, y=y, random_seed=1)

print('t statistic: %.3f' % t)
print('p value: %.9f' % p)

t statistic: 36.428
p value: 0.000000293
