This notebook produces a 1-day forecast and compares it to a persistence baseline.

Two types of forecast are:
1. Persistence: copy value from today to tomorrow
2. ML applied to features (INCLUDING the emissions from today) to predict the value for tomorrow

THIS VERSION TRIES K-FOLD CROSS-VALIDATION

In [7]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
np.set_printoptions(threshold=10000)
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split, KFold

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import seaborn as sns

In [24]:
fires = pd.read_csv('training_data_1day_forecast.csv')
print(fires.columns)

# Split the 'YYYY-MM-DD' date string into integer year/month/day features.
# Vectorised string slicing replaces three per-row Python loops.
date = fires['Current Day']
years = date.str[0:4].astype(int)
months = date.str[5:7].astype(int)
days = date.str[8:10].astype(int)
fires['years'] = years
fires['months'] = months
fires['days'] = days

# Keep only rows whose emissions are strictly positive on BOTH days.
# log10 is undefined for zero/negative values, and NaN compares False against 0,
# so this single mask removes zeros, negatives and NaNs before the transform.
# Boolean filtering keeps the original row labels, just like DataFrame.drop did.
emission_cols = ['biomass_12Z_today', 'biomass_12Z_tomorrow']
has_valid_emissions = (fires[emission_cols] > 0).all(axis=1)
fires = fires.loc[has_valid_emissions].copy()  # copy -> safe to assign columns below

# Take the log10 of the emissions (they are back-transformed with 10** after prediction)
fires[emission_cols] = np.log10(fires[emission_cols])

fires

Index(['Incident Number', 'Fire Name', 'Current Day', 'Lat Fire', 'Lon Fire',
       'Number of VIIRS points', 'TLML_12Z', 'QLML_12Z', 'SPEEDLML_12Z',
       'PS_12Z', 'T_12Z_700mb', 'T_12Z_500mb', 'QV_12Z_700mb', 'PBLH_12Z',
       'TCZPBL_12Z', 'precip', 'fccs', 'slp', 'asp', 'ESATLML_12Z', 'ELML_12Z',
       'HDWLML', 'RHLML_12Z', 'Td_12Z_700mb', 'E_700mb', 'HAINES',
       'biomass_12Z_today', 'biomass_12Z_tomorrow'],
      dtype='object')


Unnamed: 0,Incident Number,Fire Name,Current Day,Lat Fire,Lon Fire,Number of VIIRS points,TLML_12Z,QLML_12Z,SPEEDLML_12Z,PS_12Z,...,HDWLML,RHLML_12Z,Td_12Z_700mb,E_700mb,HAINES,biomass_12Z_today,biomass_12Z_tomorrow,years,months,days
0,10662684.0,PAINTED WAGON,2019-04-08,33.786944,-112.753333,3.0,290.748108,0.004608,10.770707,91644.742188,...,188.413662,0.131074,-66.278872,0.008245,5.0,-9.117619,-12.006459,2019,4,8
23,10663171.0,LONE MOUNTAIN,2019-07-01,33.808056,-105.738611,0.0,291.934814,0.010633,4.458162,81414.648438,...,72.434264,0.250921,-57.266238,0.026697,6.0,-9.107107,-9.442254,2019,7,1
24,10663171.0,LONE MOUNTAIN,2019-07-02,33.808056,-105.738611,9.0,290.191132,0.010948,4.315524,81379.093750,...,59.692758,0.288293,-54.788338,0.036158,5.0,-9.442254,-10.761181,2019,7,2
28,10664535.0,Roaring,2019-06-07,33.893500,-108.996000,27.0,287.163910,0.004974,3.406101,77239.523438,...,46.335181,0.150047,-58.488769,0.022917,5.0,-7.814943,-8.327804,2019,6,7
29,10664700.0,Bugbee,2019-03-13,35.749783,-101.621633,51.0,279.685045,0.005048,16.990888,86966.758045,...,118.275528,0.282796,-64.174218,0.010963,5.0,-6.928345,-8.670228,2019,3,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8074,11976103.0,THREE HILLS,2020-10-27,31.606250,-111.502100,2.0,551.875977,0.002961,5.530831,178207.718750,...,396910.353185,0.000046,-62.045197,0.014528,6.0,-9.962971,-8.149332,2020,10,27
8076,11976592.0,AIRPORT,2020-12-02,33.895278,-117.598056,6.0,289.403717,0.002007,7.640186,94568.148438,...,132.200734,0.063979,-73.379708,0.002993,5.0,-9.393548,-7.278838,2020,12,2
8078,11978031.0,LESLIE GULCH,2020-11-06,43.330020,-117.222300,44.0,284.868561,0.004524,6.578714,87044.875000,...,74.408757,0.178665,-60.880654,0.016899,5.0,-8.982938,-9.578338,2020,11,6
8081,11979081.0,LAURA 2,2020-11-17,40.034034,-120.115073,48.0,280.841309,0.002591,6.511267,83903.031250,...,59.584851,0.129047,-68.852754,0.005765,5.0,-7.243073,-8.373600,2020,11,17


## Split into features, labels, and persistence forecast

In [25]:
# Start from every column, then strip the free-text name, the raw date string
# and the prediction target; whatever is left is fed to the models.
# NOTE(review): 'Incident Number' stays in the feature list; it looks like an
# identifier rather than a physical predictor - confirm this is intended.
feat_names = list(fires.columns)
for excluded in ('Fire Name', 'Current Day', 'biomass_12Z_tomorrow'):
    feat_names.remove(excluded)  # ValueError if the column is absent

# Target column, and the column the persistence baseline copies forward
label_names = ['biomass_12Z_tomorrow']
persistence_names = ['biomass_12Z_today']
print(feat_names, label_names, persistence_names)


['Incident Number', 'Lat Fire', 'Lon Fire', 'Number of VIIRS points', 'TLML_12Z', 'QLML_12Z', 'SPEEDLML_12Z', 'PS_12Z', 'T_12Z_700mb', 'T_12Z_500mb', 'QV_12Z_700mb', 'PBLH_12Z', 'TCZPBL_12Z', 'precip', 'fccs', 'slp', 'asp', 'ESATLML_12Z', 'ELML_12Z', 'HDWLML', 'RHLML_12Z', 'Td_12Z_700mb', 'E_700mb', 'HAINES', 'biomass_12Z_today', 'years', 'months', 'days'] ['biomass_12Z_tomorrow'] ['biomass_12Z_today']


## Do the k-folding, training, and evaluating

In [63]:
nsplits = 5
# #rows = #splits, #cols = #models we test (0=persistence, 1=linreg, 2=svr)
rmse = np.zeros((nsplits, 3))
mad = np.zeros((nsplits, 3))
# NOTE: despite its name, r2 stores the Pearson correlation coefficient r
# returned by np.corrcoef, not the coefficient of determination.
r2 = np.zeros((nsplits, 3))
dates= np.zeros((nsplits,2)) #placeholder for start/end dates; never filled in this cell

# Initialize the k-folder. shuffle is left at its default, so every fold is a
# contiguous block of rows in file order.
kf = KFold(n_splits=nsplits)

for fold, (train, test) in enumerate(kf.split(fires)):
    fires_train = fires.iloc[train]
    fires_test = fires.iloc[test]

    # Date of the first and last test row. The recorded output suggests rows are
    # not sorted chronologically, so this is not necessarily the min/max date.
    test_days = fires_test.loc[:, 'Current Day'].values
    print(test_days[0], test_days[-1])

    # Do the train/test split for this fold
    x_train = fires_train.loc[:, feat_names]
    x_test = fires_test.loc[:, feat_names]
    y_train = fires_train.loc[:, label_names]
    y_test = fires_test.loc[:, label_names].values

    # Persistence forecast: tomorrow's emission equals today's
    y_pred_persistence = fires_test.loc[:, persistence_names].values

    # LINEAR REGRESSION (scaling is necessary)
    lin_reg = make_pipeline(StandardScaler(), LinearRegression())
    lin_reg.fit(x_train, y_train)
    y_pred_linreg = lin_reg.predict(x_test)

    # SVR (scaling is necessary!); SVR wants a 1-D target, hence the ravel
    svr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
    svr.fit(x_train, np.ravel(y_train))
    y_pred_svr = svr.predict(x_test)

    # Transform everything back out of log space; [:,0] flattens the (n, 1)
    # arrays so all four vectors are 1-D and subtract element-wise.
    y_test = 10**y_test[:,0]
    y_pred_persistence = 10**y_pred_persistence[:,0]
    y_pred_linreg = 10**y_pred_linreg[:,0]
    y_pred_svr = 10**y_pred_svr
    print(y_test.shape, y_pred_persistence.shape, y_pred_linreg.shape, y_pred_svr.shape)

    # Metrics are computed inline so this cell does not depend on helper
    # functions defined further down the notebook (fresh-kernel Run All safe).
    for model, y_pred in enumerate((y_pred_persistence, y_pred_linreg, y_pred_svr)):
        error = y_pred - y_test
        rmse[fold, model] = np.sqrt(np.mean(error**2))
        mad[fold, model] = np.mean(np.abs(error))
        r2[fold, model] = np.corrcoef(y_pred, y_test)[0,1]

    print(fold + 1)

print(rmse)
print(mad)
print(r2)


2019-04-08 2019-09-21
(787,) (787,) (787,) (787,)
1
2019-09-22 2020-10-08
(787,) (787,) (787,) (787,)
2
2020-10-09 2020-09-17
(787,) (787,) (787,) (787,)
3
2020-09-18 2020-10-07
(787,) (787,) (787,) (787,)
4
2020-10-08 2020-12-13
(786,) (786,) (786,) (786,)
5
[[7.12024229e-08 7.11701774e-08 7.10112377e-08]
 [1.04318019e-07 1.25873094e-07 1.21783577e-07]
 [1.20275365e-07 1.30367161e-07 1.34764593e-07]
 [2.72860995e-07 3.15051355e-07 4.38205413e-07]
 [2.89632054e-07 1.08117697e-06 3.51077171e-07]]
[[3.01429753e-08 2.31842846e-08 2.32681380e-08]
 [4.89897383e-08 5.11354844e-08 5.04441256e-08]
 [6.19160280e-08 6.03230734e-08 6.36483618e-08]
 [1.19672852e-07 1.26039597e-07 1.68625367e-07]
 [1.30969912e-07 2.83580412e-07 1.39898971e-07]]
[[0.62721929 0.49785572 0.54576481]
 [0.72972799 0.63294307 0.58650731]
 [0.77661279 0.71596809 0.77177046]
 [0.81357896 0.75310619 0.35734818]
 [0.73876496 0.55446397 0.52958776]]


## Define the error metrics (RMSE, MAD)

In [41]:
def RMSE(y_pred,y_test):
    """Root-mean-square error between predictions and observations.

    Both inputs are flattened to 1-D before subtraction, so an (n,) prediction
    vector and an (n, 1) target column are compared element-wise instead of
    being silently broadcast to an (n, n) matrix.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_test : array-like
        Observed values; same number of elements as ``y_pred`` (a single
        value on either side is broadcast).

    Returns
    -------
    numpy.float64
        sqrt(mean((y_pred - y_test)**2)).

    Raises
    ------
    ValueError
        If the two inputs hold a different number of elements.
    """
    y_pred = np.asarray(y_pred).ravel()
    y_test = np.asarray(y_test).ravel()
    if y_pred.size != y_test.size and 1 not in (y_pred.size, y_test.size):
        raise ValueError(
            'y_pred and y_test must have the same number of elements, got %d and %d'
            % (y_pred.size, y_test.size))
    return np.sqrt(np.mean((y_pred - y_test)**2))

def MAD(y_pred,y_test):
    """Mean absolute deviation between predictions and observations.

    Both inputs are flattened to 1-D before subtraction, so an (n,) prediction
    vector and an (n, 1) target column are compared element-wise instead of
    being silently broadcast to an (n, n) matrix.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_test : array-like
        Observed values; same number of elements as ``y_pred`` (a single
        value on either side is broadcast).

    Returns
    -------
    numpy.float64
        mean(|y_pred - y_test|).

    Raises
    ------
    ValueError
        If the two inputs hold a different number of elements.
    """
    y_pred = np.asarray(y_pred).ravel()
    y_test = np.asarray(y_test).ravel()
    if y_pred.size != y_test.size and 1 not in (y_pred.size, y_test.size):
        raise ValueError(
            'y_pred and y_test must have the same number of elements, got %d and %d'
            % (y_pred.size, y_test.size))
    return np.mean(np.abs(y_pred-y_test))

# Disabled helper, kept as a bare string expression so it is never executed.
# NOTE(review): np.corrcoef(...)[0,1] is a Pearson correlation, so the name
# "R2" looks like a misnomer - confirm before re-enabling.
"""def R2(y_pred, y_test):
    return np.corrcoef(y_pred, y_test)[0,1]"""

'def R2(y_pred, y_test):\n    return np.corrcoef(y_pred, y_test)[0,1]'

## Evaluate the models

In [None]:
# NOTE(review): this cell looks like a leftover from an earlier, non-k-fold
# version of the notebook.
# - y_pred_linreg_dropped and y_pred_svr_dropped are not assigned anywhere in
#   the visible notebook, so a fresh run would stop with a NameError here -
#   confirm whether the cell that trained the "dropped" models was deleted.
# - The k-fold loop earlier in the notebook already applies 10** (and [:,0])
#   to y_pred_linreg, y_pred_svr, y_test and y_pred_persistence; running this
#   afterwards would apply the back-transform a second time - TODO confirm the
#   intended execution order.
print(y_pred_linreg.shape, y_pred_svr.shape, y_test.shape)
# Undo the log10 transform with 10**x; [:,0] selects the first column of a
# 2-D prediction array.
y_pred_linreg = 10**y_pred_linreg[:,0]
y_pred_linreg_dropped = 10**y_pred_linreg_dropped[:,0]
y_pred_svr = 10**y_pred_svr
y_pred_svr_dropped = 10**y_pred_svr_dropped
y_test = 10**y_test
y_pred_persistence = 10**y_pred_persistence
print(y_pred_linreg.shape, y_pred_svr.shape,y_pred_svr_dropped.shape, y_test.shape, y_pred_persistence.shape)

In [None]:
# Root-mean-square error of every forecast against the held-out truth.
# Uses the notebook's RMSE helper (defined in the metrics cell above) instead
# of repeating the formula five times.
# NOTE(review): y_pred_linreg_dropped / y_pred_svr_dropped are not assigned in
# the visible notebook - confirm where the "dropped" models are trained.
rmse_linreg = RMSE(y_pred_linreg, y_test)
rmse_linreg_dropped = RMSE(y_pred_linreg_dropped, y_test)

rmse_svr = RMSE(y_pred_svr, y_test)
rmse_svr_dropped = RMSE(y_pred_svr_dropped, y_test)
rmse_persistence = RMSE(y_pred_persistence, y_test)

print('LinReg RMSE:', rmse_linreg, 'LinReg RMSE Dropped:', rmse_linreg_dropped,'SVR RMSE:', rmse_svr, 'SVR RMSE Dropped:', rmse_svr_dropped, 'persistence RMSE:', rmse_persistence)

In [None]:
# Mean absolute deviation of every forecast against the held-out truth.
# Uses the notebook's MAD helper (defined in the metrics cell above) instead
# of repeating the formula.
# NOTE(review): y_pred_svr_dropped is not assigned in the visible notebook -
# confirm where the "dropped" model is trained.
mad_linreg = MAD(y_pred_linreg, y_test)
mad_svr = MAD(y_pred_svr, y_test)
mad_svr_dropped = MAD(y_pred_svr_dropped, y_test)
mad_persistence = MAD(y_pred_persistence, y_test)

print('LinReg MAD:', mad_linreg, 'SVR MAD:', mad_svr, 'SVR MAD Dropped:', mad_svr_dropped, 'Persistence MAD:', mad_persistence)

In [None]:
# Correlation between each forecast and the truth: entry [0,1] of the 2x2
# matrix returned by np.corrcoef. The values are printed under the label "R2".
r2_linreg, r2_svr, r2_svr_dropped, r2_persistence = (
    np.corrcoef(prediction, y_test)[0,1]
    for prediction in (y_pred_linreg, y_pred_svr, y_pred_svr_dropped, y_pred_persistence)
)
print('LinReg R2:', r2_linreg, 'SVR R2:', r2_svr, 'SVR R2 Dropped:', r2_svr_dropped, 'Persistence R2:', r2_persistence)

In [None]:
# Axis window shared by every figure in this cell
low = -1e-9
high = 1e-7


def _scatter_linear(y_pred, title, detailed=True):
    """Scatter predictions against y_test on linear axes with a red 1:1 line.

    detailed=True  -> large title, axis labels with units, grid on.
    detailed=False -> default-size title, generic axis labels, no grid.
    """
    plt.figure(figsize=(10,10))
    plt.scatter(y_test, y_pred)
    diagonal = np.linspace(low, high)
    plt.plot(diagonal, diagonal, c='r')  # perfect-forecast reference line
    if detailed:
        plt.title(title, fontsize=32)
        plt.xlabel('True Emission Rate Values (kg$m^{-2}$$s^{-1}$)', fontsize=24)
        plt.ylabel('Predicted Emission Rate Values (kg$m^{-2}$$s^{-1}$)', fontsize=24)
    else:
        plt.title(title)
        plt.xlabel('True Values')
        plt.ylabel('Predicted Values')
    plt.xlim([low, high])
    plt.ylim([low, high])
    if detailed:
        plt.grid(True)
    plt.show()


_scatter_linear(y_pred_linreg, 'Linear Regression')
_scatter_linear(y_pred_svr, 'Support Vector Regression')
_scatter_linear(y_pred_svr_dropped,
                'Support Vector Regression Model Performance on Test Set (Features Dropped)',
                detailed=False)
_scatter_linear(y_pred_persistence, 'Persistence ')

In [None]:
# Two subsets to highlight: very small true values, and cases where the truth
# is above 1e-10 while the persistence forecast lies between 1e-22 and 1e-10.
inds_low = np.where(y_test<1e-11)
inds_high = np.where((y_test>1e-10)&(y_pred_persistence>1e-22)&(y_pred_persistence<1e-10))
print(inds_high)
# Axis window shared by every log-log figure in this cell
low = 1e-25
high = 1e-5


def _loglog_highlighted(y_pred, title):
    """Log-log scatter of y_pred vs y_test with the inds_low / inds_high subsets over-plotted."""
    plt.figure(figsize=(10,10))
    plt.scatter(y_test, y_pred)
    for subset in (inds_low, inds_high):
        plt.scatter(y_test[subset], y_pred[subset])
    diagonal = np.linspace(low, high)
    plt.plot(diagonal, diagonal, c='r')  # perfect-forecast reference line
    plt.title(title,  fontsize=32)
    plt.xlabel('True Emission Rate Values (kg$m^{-2}$$s^{-1}$)', fontsize=24)
    plt.ylabel('Predicted Emission Rate Values (kg$m^{-2}$$s^{-1}$)', fontsize=24)
    plt.xlim([low, high])
    plt.ylim([low, high])
    plt.xscale('log')
    plt.yscale('log')
    plt.grid(True)
    plt.show()


def _loglog_error_coloured(y_pred, title):
    """Log-log scatter of y_pred vs y_test, coloured by absolute error on a log colour scale."""
    plt.figure(figsize=(10,10))
    plt.scatter(y_test, y_pred, c=np.abs(y_pred-y_test), norm=matplotlib.colors.LogNorm())
    diagonal = np.linspace(low, high)
    plt.plot(diagonal, diagonal, c='r')  # perfect-forecast reference line
    plt.title(title)
    plt.xlabel('True Values')
    plt.ylabel('Predicted Values')
    plt.xlim([low, high])
    plt.ylim([low, high])
    plt.xscale('log')
    plt.yscale('log')
    plt.colorbar()
    plt.clim(1e-30,1e-6)
    plt.show()


_loglog_highlighted(y_pred_linreg, 'Linear Regression')
_loglog_error_coloured(y_pred_linreg_dropped,
                       'Linear Regression Model Performance on Test Set (Features Dropped)')
_loglog_highlighted(y_pred_svr, 'Support Vector Regression')
_loglog_error_coloured(y_pred_svr_dropped,
                       'Support Vector Regression Model Performance on Test Set (Features Dropped)')
_loglog_highlighted(y_pred_persistence, 'Persistence')

In [None]:
# Axis window shared by every density figure in this cell
low = -1e-9
high = 1e-7


def _kde_vs_truth(y_pred, title):
    """Filled 2-D kernel-density plot of y_pred against y_test with a red 1:1 line."""
    plt.figure(figsize=(10,10))
    # fill= is the current spelling of the deprecated shade= keyword
    sns.kdeplot(x=y_test, y=y_pred, fill=True)
    diagonal = np.linspace(low, high)
    plt.plot(diagonal, diagonal, c='r')  # perfect-forecast reference line
    plt.title(title)
    plt.xlabel('True Values')
    plt.ylabel('Predicted Values')
    plt.xlim([low, high])
    plt.ylim([low, high])
    plt.show()


_kde_vs_truth(y_pred_linreg, 'Linear Regression Model Performance on Test Set (Full Range)')
_kde_vs_truth(y_pred_svr, 'Support Vector Regression Model Performance on Test Set (Full Range)')
# The "Features Dropped" figure previously re-plotted y_pred_svr (copy-paste slip);
# it now uses the dropped-feature predictions like the sibling scatter cells.
# NOTE(review): y_pred_svr_dropped is not assigned in the visible notebook -
# confirm where the "dropped" model is trained.
_kde_vs_truth(y_pred_svr_dropped,
              'Support Vector Regression Model Performance on Test Set (Features Dropped)')
_kde_vs_truth(y_pred_persistence, 'Persistence Model Performance on Test Set (Full Range)')

In [None]:
def REC(y_pred, y_test, thresholds=None):
    """Regression Error Characteristic curve based on absolute percent error.

    For every tolerance in ``thresholds`` the curve gives the percentage of
    samples whose error ``|(y_pred - y_test) / y_test| * 100`` is at or below
    that tolerance.

    Parameters
    ----------
    y_pred : array-like
        Predicted values. Flattened to 1-D, so (n,) and (n, 1) inputs are
        compared element-wise rather than broadcast against each other.
    y_test : array-like
        Observed values, same number of elements as ``y_pred``. Samples with
        ``y_test == 0`` yield inf/NaN errors and never count as within a
        finite tolerance.
    thresholds : array-like, optional
        Error tolerances in percent. Defaults to 0..40000 in steps of 5.

    Returns
    -------
    thresholds : numpy.ndarray
        The tolerances that were evaluated.
    acc : numpy.ndarray
        Percentage (0-100) of samples within each tolerance.

    Raises
    ------
    ValueError
        If ``y_test`` is empty or the inputs hold a different number of
        elements.
    """
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    y_test = np.asarray(y_test, dtype=float).ravel()
    if y_test.size == 0:
        raise ValueError('y_test must contain at least one sample')
    if y_pred.size not in (1, y_test.size):
        raise ValueError(
            'y_pred and y_test must have the same number of elements, got %d and %d'
            % (y_pred.size, y_test.size))

    if thresholds is None:
        thresholds = np.linspace(0,40000, 8001) #5% increments
    else:
        thresholds = np.asarray(thresholds, dtype=float)

    percent_error = np.abs((y_pred-y_test)/y_test)*100

    # Sorting once and binary-searching each tolerance replaces the per-threshold
    # boolean mask; side='right' counts the errors <= tolerance. NaNs sort last,
    # so they are never counted.
    n_correct = np.searchsorted(np.sort(percent_error), thresholds, side='right')
    acc = n_correct*100/y_test.size
    return thresholds, acc

# Compute the REC curve (error tolerance vs. share of samples within it) for each method
thresholds_linreg, accuracy_linreg = REC(y_pred_linreg, y_test)
thresholds_svr, accuracy_svr = REC(y_pred_svr, y_test)
print(thresholds_svr[0:5])  # quick look at the tolerance grid

# NOTE(review): y_pred_svr_dropped is not assigned in the visible notebook -
# confirm where the "dropped" model is trained.
thresholds_svr_dropped, accuracy_svr_dropped = REC(y_pred_svr_dropped, y_test)


thresholds_persistence, accuracy_persistence = REC(y_pred_persistence, y_test)

# Plot the curves; the legend order matches the plotting order
plt.figure(figsize=(10,10))
plt.plot(thresholds_linreg, accuracy_linreg)
plt.plot(thresholds_svr, accuracy_svr)
#plt.plot(thresholds_svr_dropped, accuracy_svr_dropped)
plt.plot(thresholds_persistence, accuracy_persistence)
plt.legend(['Linear Regression', 'SVR', 'Persistence'],fontsize=15)
plt.title('REC Curve for the Methods Tested')
# REC returns values on a 0-100 scale, so the axis is a percentage, not a fraction
plt.ylabel('Percent Correct')
plt.xlabel('% Error Tolerance')
plt.show()

In [None]:
# Dump the tolerance grid and the linear-regression curve for inspection
print(len(thresholds_linreg), thresholds_linreg, accuracy_linreg)
# Sum of 5 * accuracy for each method; 5 matches the default spacing of the
# REC tolerance grid, which makes this a rectangle-rule area under each curve.
print((5*accuracy_linreg).sum())
print((5*accuracy_svr).sum())
print((5*accuracy_svr_dropped).sum())
print((5*accuracy_persistence).sum())