# Decision trees 2

In [1]:
import json
import csv
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500)
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
import geopandas as gpd
import shapely
from shapely.geometry import Point, MultiPoint, Polygon, MultiPolygon
from shapely.affinity import scale
import matplotlib.pyplot as plt
import glob
import os
import datetime

from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split

### Loading data

In [2]:
bids = pd.read_csv("../data/as_bid_aggregated_data.csv")
prices = pd.read_csv("../data/AS_price_vol.csv")
plan = pd.read_csv("../data/as_plan.csv")
energy = pd.read_csv("../data/energy_price.csv")
weather = pd.read_csv("../data/weather_forecast_ercot.csv")
gen = pd.read_csv("../data/generation.csv")

A bit more data cleaning

In [3]:
#bids
bids.drop(columns=['RRSLD_Unweighted Average Price','RRSLD_Max Price','RRSLD_Min Price',
                  'RRSLD_Total Quantity','RRSLD_Number of Bids','RRSLD_Weighted Avg Price'], inplace=True)

bids['hour'] = bids['hr_beg']

In [4]:
#generation
gen['hr_beg'] = gen['hr_beg'].apply(lambda x: x[0:2]) 
gen.drop(columns=['Imports','Other','ST'], inplace=True)

gen['year'] = pd.to_datetime(gen['date']).dt.year
gen = gen.loc[gen['year']>2011,:]
gen.reset_index(inplace=True, drop=True)
gen.drop(columns=['year'], inplace=True)

In [5]:
#energy
energy.drop_duplicates(subset=['date','hr_beg'], inplace=True)

In [6]:
data_frames = [bids, prices, plan, energy, weather, gen]

for i,df in enumerate(data_frames):
    df['date'] = pd.to_datetime(df['date']).dt.date
    df['hr_beg'] = df['hr_beg'].astype(int)
    df.sort_values(by=['date','hr_beg'], inplace=True)
    df.drop_duplicates(inplace=True)
    #df.set_index(keys=['date','hr_beg'], inplace=True)
    

In [7]:
"""vetting = {}
for i,df in enumerate(data_frames):
    df['date'] = df.index.get_level_values(0)
    df['year'] = df['date'].dt.year
    vetting[i] = df.groupby('year').count()
    df.drop(columns=['date','year'], inplace=True)

vetting[0] #bids are complete from 2011-2018; columns 6 from the end can go [-6:]
vetting[1] #prices are complete from 2008 to 2018
vetting[2] #ohhhh dropped a bunch of duplicates from plan, I bet...
vetting[3] #energy prices (2014-2018) are also screwed up; have 1.5x too mcuh?
#vetting[4] #weather is good from 2014-2019
#vetting[5] #imports are no good, solar is bad in 2010 and 2011, biomass bad in 2010, other is n o good; ST maybe we can fill in """

"vetting = {}\nfor i,df in enumerate(data_frames):\n    df['date'] = df.index.get_level_values(0)\n    df['year'] = df['date'].dt.year\n    vetting[i] = df.groupby('year').count()\n    df.drop(columns=['date','year'], inplace=True)\n\nvetting[0] #bids are complete from 2011-2018; columns 6 from the end can go [-6:]\nvetting[1] #prices are complete from 2008 to 2018\nvetting[2] #ohhhh dropped a bunch of duplicates from plan, I bet...\nvetting[3] #energy prices (2014-2018) are also screwed up; have 1.5x too mcuh?\n#vetting[4] #weather is good from 2014-2019\n#vetting[5] #imports are no good, solar is bad in 2010 and 2011, biomass bad in 2010, other is n o good; ST maybe we can fill in "

In [8]:
#ugly merge for when we have unique index
"""union = prices.merge(bids, how='inner', right_index=True, left_index=True)

for df in data_frames[2:]:
    union = union.merge(df, how='inner', right_index=True, left_index=True)"""

"union = prices.merge(bids, how='inner', right_index=True, left_index=True)\n\nfor df in data_frames[2:]:\n    union = union.merge(df, how='inner', right_index=True, left_index=True)"

In [9]:
union = prices.merge(bids, how='inner', on=['date','hr_beg'])

In [10]:
union.shape

(71841, 47)

In [11]:
union = union.merge(plan, how='inner', on=['date','hr_beg'])

In [12]:
union.shape

(43892, 51)

In [13]:
union = union.merge(weather, how='inner', on=['date','hr_beg'])

In [14]:
union.shape

(43855, 59)

In [15]:
union = union.merge(gen, how='inner', on=['date','hr_beg'])

In [16]:
union.shape

(43855, 80)

In [17]:
union = union.merge(energy, how='inner', on=['date','hr_beg'])

In [18]:
union.shape

(43854, 92)

In [19]:
#dealing with NAs
union.fillna(method='bfill',axis=0, inplace=True)

In [20]:
union.sort_values(by=['date','hr_beg'], inplace=True)

### Processing features

In [114]:
cut_labels = ['0-20', '20+']
cut_bins = [0, 20, 1000]
union['cuts'] = pd.cut(union['price_DAH_REGDN'], bins=cut_bins, labels=cut_labels)

In [115]:
#separating features that are forecasts, and can be used as-is, and those that have to be lagged
realtime = ['date','Coast', 'East', 'FarWest', 'North', 'NorthCentral', 'SouthCentral',
       'Southern', 'West','NSPIN_Quantity', 'REGDN_Quantity',
       'REGUP_Quantity', 'RRS_Quantity'] #just things that are forecasts
lagged = list(set(union.columns) - set(realtime + ['cuts']))

In [116]:
features_realtime = union[realtime]
features_lagged = union[lagged]
target = union['cuts']

In [117]:
#shifting lagged features by 48 and 72 hours
x = features_lagged.shift(24)
x.columns = x.columns+"_24"

y = features_lagged.shift(48)
y.columns = y.columns+"_48"

z =  features_lagged.shift(72)
z.columns = z.columns+"_72"

In [118]:
#concatenating
new = pd.concat([x, y,z,features_realtime,target], axis=1)
new = new.dropna()

In [119]:
new['year'] = pd.to_datetime(new['date']).dt.year

In [120]:
new.drop(columns=['hour_48','hour_24'], inplace=True)

In [121]:
new.groupby('year').count()
#finally

Unnamed: 0_level_0,Nuclear.y_24,RRSNC_Max Price_24,NGCC_24,price_DAH_NSPIN_24,OFFNS_Total Quantity_24,ONNS_Number of Bids_24,REGDN_Weighted Avg Price_24,REGUP_Min Price_24,ONNS_Total Quantity_24,Nuclear.x_24,REGDN_Unweighted Average Price_24,price_DAH_S_24,price_DAH_W_24,RRSGN_Max Price_24,Total_24,solar_ISO_24,vol_DAH_RRS_24,Wind_24,ONNS_Min Price_24,wind_24,Coal_24,max_ramp_abs_24,REGUP_Unweighted Average Price_24,price_RT15_N_24,Solar_24,price_DAH_houston_24,OFFNS_Weighted Avg Price_24,price_DAH_busavg_24,ONNS_Unweighted Average Price_24,ONNS_Weighted Avg Price_24,RRSGN_Unweighted Average Price_24,price_RT15_hubavg_24,GT_24,RRSGN_Min Price_24,REGDN_Number of Bids_24,load_net_24,REGUP_Weighted Avg Price_24,Biomass_24,price_RT15_houston_24,RRSGN_Number of Bids_24,OFFNS_Max Price_24,RRSNC_Number of Bids_24,OFFNS_Number of Bids_24,Load.MW_24,vol_DAH_REGDN_24,UPV_24,Hydro_24,price_DAH_RRS_24,max_ramp_24,RRSNC_Total Quantity_24,RRSGN_Weighted Avg Price_24,vol_DAH_REGUP_24,REGDN_Min Price_24,hr_beg_24,price_RT15_S_24,ramp_24,price_RT15_busavg_24,RRSNC_Unweighted Average Price_24,RRSNC_Min Price_24,RRSNC_Weighted Avg Price_24,OFFNS_Unweighted Average Price_24,RRSGN_Total Quantity_24,load_naked_24,REGDN_Total Quantity_24,price_RT15_W_24,ramp_abs_24,REGUP_Number of Bids_24,price_DAH_hubavg_24,price_DAH_REGUP_24,vol_DAH_NSPIN_24,ng_price_24,REGUP_Total Quantity_24,price_DAH_REGDN_24,REGDN_Max Price_24,REGUP_Max Price_24,ONNS_Max Price_24,price_DAH_N_24,OFFNS_Min Price_24,Nuclear.y_48,RRSNC_Max Price_48,NGCC_48,price_DAH_NSPIN_48,OFFNS_Total Quantity_48,ONNS_Number of Bids_48,REGDN_Weighted Avg Price_48,REGUP_Min Price_48,ONNS_Total Quantity_48,Nuclear.x_48,REGDN_Unweighted Average Price_48,price_DAH_S_48,price_DAH_W_48,RRSGN_Max Price_48,Total_48,solar_ISO_48,vol_DAH_RRS_48,Wind_48,ONNS_Min Price_48,wind_48,Coal_48,max_ramp_abs_48,REGUP_Unweighted Average Price_48,price_RT15_N_48,Solar_48,price_DAH_houston_48,OFFNS_Weighted Avg Price_48,price_DAH_busavg_48,ONNS_Unweighted Average Price_48,ONNS_Weighted Avg Price_48,RRSGN_Unweighted Average Price_48,price_RT15_hubavg_48,GT_48,RRSGN_Min Price_48,REGDN_Number of Bids_48,load_net_48,REGUP_Weighted Avg Price_48,Biomass_48,price_RT15_houston_48,RRSGN_Number of Bids_48,OFFNS_Max Price_48,RRSNC_Number of Bids_48,OFFNS_Number of Bids_48,Load.MW_48,vol_DAH_REGDN_48,UPV_48,Hydro_48,price_DAH_RRS_48,max_ramp_48,RRSNC_Total Quantity_48,RRSGN_Weighted Avg Price_48,vol_DAH_REGUP_48,REGDN_Min Price_48,hr_beg_48,price_RT15_S_48,ramp_48,price_RT15_busavg_48,RRSNC_Unweighted Average Price_48,RRSNC_Min Price_48,RRSNC_Weighted Avg Price_48,OFFNS_Unweighted Average Price_48,RRSGN_Total Quantity_48,load_naked_48,REGDN_Total Quantity_48,price_RT15_W_48,ramp_abs_48,REGUP_Number of Bids_48,price_DAH_hubavg_48,price_DAH_REGUP_48,vol_DAH_NSPIN_48,ng_price_48,REGUP_Total Quantity_48,price_DAH_REGDN_48,REGDN_Max Price_48,REGUP_Max Price_48,ONNS_Max Price_48,price_DAH_N_48,OFFNS_Min Price_48,Nuclear.y_72,RRSNC_Max Price_72,NGCC_72,price_DAH_NSPIN_72,OFFNS_Total Quantity_72,ONNS_Number of Bids_72,REGDN_Weighted Avg Price_72,REGUP_Min Price_72,ONNS_Total Quantity_72,Nuclear.x_72,REGDN_Unweighted Average Price_72,price_DAH_S_72,price_DAH_W_72,RRSGN_Max Price_72,Total_72,solar_ISO_72,vol_DAH_RRS_72,Wind_72,ONNS_Min Price_72,wind_72,Coal_72,max_ramp_abs_72,REGUP_Unweighted Average Price_72,price_RT15_N_72,Solar_72,price_DAH_houston_72,OFFNS_Weighted Avg Price_72,price_DAH_busavg_72,ONNS_Unweighted Average Price_72,ONNS_Weighted Avg Price_72,RRSGN_Unweighted Average Price_72,price_RT15_hubavg_72,GT_72,RRSGN_Min Price_72,REGDN_Number of Bids_72,load_net_72,REGUP_Weighted Avg Price_72,Biomass_72,price_RT15_houston_72,hour_72,RRSGN_Number of Bids_72,OFFNS_Max Price_72,RRSNC_Number of Bids_72,OFFNS_Number of Bids_72,Load.MW_72,vol_DAH_REGDN_72,UPV_72,Hydro_72,price_DAH_RRS_72,max_ramp_72,RRSNC_Total Quantity_72,RRSGN_Weighted Avg Price_72,vol_DAH_REGUP_72,REGDN_Min Price_72,hr_beg_72,price_RT15_S_72,ramp_72,price_RT15_busavg_72,RRSNC_Unweighted Average Price_72,RRSNC_Min Price_72,RRSNC_Weighted Avg Price_72,OFFNS_Unweighted Average Price_72,RRSGN_Total Quantity_72,load_naked_72,REGDN_Total Quantity_72,price_RT15_W_72,ramp_abs_72,REGUP_Number of Bids_72,price_DAH_hubavg_72,price_DAH_REGUP_72,vol_DAH_NSPIN_72,ng_price_72,REGUP_Total Quantity_72,price_DAH_REGDN_72,REGDN_Max Price_72,REGUP_Max Price_72,ONNS_Max Price_72,price_DAH_N_72,OFFNS_Min Price_72,date,Coast,East,FarWest,North,NorthCentral,SouthCentral,Southern,West,NSPIN_Quantity,REGDN_Quantity,REGUP_Quantity,RRS_Quantity,cuts
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1
2014,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647,8647
2015,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752,8752
2016,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785,8785
2017,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766,8766
2018,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764,8764
2019,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6


# Modeling

In [122]:
X = new[new['year']<2018].drop(columns=['cuts','date'])
X_test = new[new['year']==2018].drop(columns=['cuts','date'])
y = new.loc[new['year']<2018,'cuts']
y_test = new.loc[new['year']==2018,'cuts']

In [123]:
# split test set -- NOT TIME SERIES SPECIFIC; going to hand-select 2018
#X, X_test, y, y_test = train_test_split(features, target, random_state = 1, test_size = .2)

# split between train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 1, test_size = 0.25)

### First tree -- no restrictions

In [124]:
#SOLUTION
first_tree = DecisionTreeClassifier()
first_tree.fit(X_train, y_train)

print("Number of features: {}".format(first_tree.tree_.n_features))
print("Number of nodes (internal and terminal): {}".format(first_tree.tree_.node_count), "\n")

train_score = first_tree.score(X_train, y_train)
val_score = first_tree.score(X_val, y_val)

print('Train Score: ', train_score)
print('Validation Score: ', val_score)

Number of features: 248
Number of nodes (internal and terminal): 1279 

Train Score:  1.0
Validation Score:  0.9450675211718929


### Tuned tree -- some error in setting max_features

In [125]:
# possible solution
tuned_tree = DecisionTreeClassifier(max_leaf_nodes=1000, max_features=40)
tuned_tree.fit(X_train, y_train)

print("Number of features: {}".format(tuned_tree.tree_.n_features))
print("Number of nodes (leaves): {}".format(tuned_tree.tree_.node_count),"\n")

tuned_train_score = tuned_tree.score(X_train, y_train)
tuned_val_score = tuned_tree.score(X_val, y_val)

print('Train Score: ', tuned_train_score)
print('Validation Score: ', tuned_val_score)

Number of features: 248
Number of nodes (leaves): 1495 

Train Score:  1.0
Validation Score:  0.9459830624856946


In [139]:
feature_importance = tuned_tree.feature_importances_
#relative feature importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
pd.DataFrame({'feature':X_train.columns, 'importance':feature_importance}).sort_values(by='importance', ascending=False).head(15)

Unnamed: 0,feature,importance
72,price_DAH_REGDN_24,100.0
44,vol_DAH_REGDN_24,25.357275
43,Load.MW_24,9.039973
122,vol_DAH_REGDN_48,5.927415
2,NGCC_24,5.724985
150,price_DAH_REGDN_48,5.6352
176,Coal_72,5.340461
154,price_DAH_N_48,5.308809
192,REGUP_Weighted Avg Price_72,5.185259
242,West,5.050391


### Bagging

In [126]:
# solution
from sklearn.ensemble import BaggingClassifier

bag_tree = BaggingClassifier(random_state = 10, n_estimators = 100, max_samples = 700, max_features = 75)
bag_tree.fit(X_train, y_train)

bag_train_score = bag_tree.score(X_train, y_train)
bag_val_score = bag_tree.score(X_val, y_val)

print('Train Score: ', bag_train_score)
print('Validation Score: ', bag_val_score)

Train Score:  0.9583778422096749
Validation Score:  0.954337376974136


In [34]:
"""#Cross-validation
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist = {'n_estimators': randint(3, 200),
              'max_features': randint(2, 108),
              'max_samples': randint(1, 2000)}

rnd_search = RandomizedSearchCV(bag_tree, param_distributions=param_dist, 
                                cv=8, n_iter=20, random_state = 2)
rnd_search.fit(X_train, y_train)

print(rnd_search.best_score_) # This is cross validation error from the training data set.
print(rnd_search.best_params_)"""

KeyboardInterrupt: 

In [111]:
bag_tree = BaggingClassifier(random_state = 10, n_estimators = 165, max_samples = 1288, max_features =106)
bag_tree.fit(X_train, y_train)

bag_train_score = bag_tree.score(X_train, y_train)
bag_val_score = bag_tree.score(X_val, y_val)

print('Train Score: ', bag_train_score)
print('Validation Score: ', bag_val_score)

Train Score:  0.9603235159468946
Validation Score:  0.9551384756237126


### Random Forest

In [127]:
# Initial try
from sklearn.ensemble import RandomForestClassifier

rf_tree = RandomForestClassifier()
rf_tree.fit(X_train, y_train)

rf_train_score = rf_tree.score(X_train, y_train)
rf_val_score = rf_tree.score(X_val, y_val)

print('Train Score: ', rf_train_score)
print('Validation Score: ', rf_val_score)

Train Score:  0.9999618495345644
Validation Score:  0.961547264820325


### Gradient boosting classifier

In [36]:
# solution
from sklearn.ensemble import GradientBoostingClassifier

gb_tree = GradientBoostingClassifier()
gb_tree.fit(X_train, y_train)

gb_train_score = gb_tree.score(X_train, y_train)
gb_val_score = gb_tree.score(X_val, y_val)

print('Train Score: ', gb_train_score)
print('Validation Score: ', gb_val_score)

KeyboardInterrupt: 

blah blah, we put in all the functions, cross-validate, find the best one...
then want to show the feature importances!
and then also want to translate to actual outputs

**see if you can predict big price spike days for REG UP!

# Picking a winner, for now
In the end this will just be the tree with the best x-validated classification score

In [128]:
models = [first_tree, tuned_tree, bag_tree, rf_tree]
for i in models:
    print('Test Score: ', i.score(X_test, y_test))

Test Score:  0.9447740757644911
Test Score:  0.9442035600182564
Test Score:  0.9742126882701962
Test Score:  0.9752396166134185


In [129]:
y_pred = rf_tree.predict(X_test)

In [130]:
y_test

35084    0-20
35085    0-20
35086    0-20
35087    0-20
35088    0-20
         ... 
43843    0-20
43844    0-20
43845    0-20
43846    0-20
43847    0-20
Name: cuts, Length: 8764, dtype: category
Categories (2, object): [0-20 < 20+]

In [131]:
results = pd.DataFrame({"pred": y_pred,
                        "test": y_test})

In [132]:
results['match?'] = y_pred == y_test

In [133]:
results.groupby(['test','match?']).count() #well, this misses the point entirely
#this is why these results look so good...lol

Unnamed: 0_level_0,Unnamed: 1_level_0,pred
test,match?,Unnamed: 2_level_1
0-20,False,6
0-20,True,8530
20+,False,211
20+,True,17


* What other questions can we ask? What defines a spike? During what hour have forecasted spikes happened in the past? Do we clean this wind data after all?

* OK, adding 24h lagged data helped a lot... "a lot"


* Add in more stats about bid stack. 
* Try to predict w variables in real time, then try to predict those variables?? 
* Also check how well your bid stack worked

* Two good feature importance links (can use to improve model): https://towardsdatascience.com/explaining-feature-importance-by-example-of-a-random-forest-d9166011959e ; https://explained.ai/rf-importance/index.html

* Try to understand how to use rolling averages ro whatever to decompose parts of this; use fourier thing scott talked about