# Wildfire Analysis - eXtreme Gradient Boosting
## Michael Peters and Nick Latham
05.01.21

## Importing the appropriate modules

In [1]:
# First XGBoost model for Washington Wildfires dataset
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from pandas import DataFrame
from collections import defaultdict
import csv
import numpy 


## Collecting the data from our clean data CSV file

In [2]:
# Column labels for the five fields collected into each row of X
header = ["date","county","cause","binlat","binlon"]

# Load the cleaned fire records.
#   X: one list per CSV row holding fields 0, 2, 3, 6 and 7 (kept as strings)
#   Y: field 8 of the same row, converted to int
X = []
Y = []
# newline='' is what the csv module asks for when handed a file object, so
# that newlines embedded in quoted fields are interpreted by the reader.
with open('data/clean_fire_data.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for fire in reader:
        if not fire:
            # csv.reader yields [] for a blank line; skip it instead of
            # failing on fire[0] with an IndexError.
            continue
        X.append([fire[0], fire[2], fire[3], fire[6], fire[7]])
        Y.append(int(fire[8]))

## Perform label encoding to transform strings into ints
This step is necessary because XGBClassifier requires numeric input and cannot be fit on raw string columns.

In [3]:
# Tabular view of the raw rows so that each feature can be encoded column by column.
df = DataFrame(X, columns=header)

# One LabelEncoder per column name, created on first lookup.
encoder_dict = defaultdict(LabelEncoder)


def _encode_column(column):
    """Fit the encoder registered under this column's name and return the encoded values."""
    return encoder_dict[column.name].fit_transform(column)


labeled_df = df.apply(_encode_column)


def inverse_transform_lambda(x):
    """Map an encoded column back to its original values via the encoder stored under its name."""
    return encoder_dict[x.name].inverse_transform(x)

## Scikit-learn train test splitting of X and Y data
We are using 33% of the data for a test split

In [4]:
# Fraction of the rows held out for testing.
test_size = 0.33
# Fixed random_state so the shuffle, and therefore the split, is repeatable.
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    labeled_df,
    Y,
    random_state=seed,
    test_size=test_size,
)

## Initialize and fit the XGBClassifier to our training data

In [5]:
# Classifier built with the library's default hyperparameters.
model = XGBClassifier()
# Train on the encoded training split; left as the cell's last expression so
# the fitted model's repr is shown as the cell output.
model.fit(X=X_train, y=y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Make predictions over the test split data

In [6]:
# Model output for every row of the held-out features.
y_pred = model.predict(X_test)
# Round the whole array in one call, then unpack it into a plain list of values.
predictions = list(numpy.round(y_pred))



## Calculate the percent accuracy by comparing the predictions against the true labels in y_test

In [7]:
# Score the rounded predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
# Report the score as a percentage with two decimals.
print("Accuracy: %.2f%%" % (100.0 * accuracy,))

Accuracy: 34.60%
