In [3]:
import pandas as pd
from tqdm.notebook import tqdm
import csv
import requests
import statsmodels.formula.api as sm
import numpy as np

In [162]:
# create the date csv  - DONE

In [163]:
# system load pathname and retrieval?? - SKIP

In [164]:
# add the new lagged week variables || remove the old one 

In [165]:
# add lag48

In [166]:
# mark rows with bad data + delete those rows at end

In [167]:
## Inputs

year = 2020

#Choose the node. For a full list, see https://www.iso-ne.com/markets-operations/settlements/pricing-node-tables/
node = ['LD.NEW_HAVN46']

In [168]:
## Iterable including all dates to be retrieved

year_start = str(year) + '-01-01'
year_end = str(year + 1) + '-01-01'
leap_check = pd.Timestamp(year, 1, 1)

#create a dataframe including every day of the year
df_date_time = pd.DataFrame(
        {'Days': pd.date_range(year_start, year_end, freq='1D', inclusive='left')}
     )

#convert into yearmonthday format, e.g. Jan., 1st, 2020 = 20200101
df_date_time['year'] = pd.DatetimeIndex(df_date_time['Days']).year

df_date_time['month'] = pd.DatetimeIndex(df_date_time['Days']).month
df_date_time["month"] = df_date_time.month.map("{:02}".format)

df_date_time['day'] = pd.DatetimeIndex(df_date_time['Days']).day
df_date_time['day'] = df_date_time.day.map("{:02}".format)

df_date_time['date_time'] = df_date_time['year'].astype(str) + df_date_time['month'] + df_date_time['day']

df_date_time = df_date_time.drop(['Days', 'month', 'day', 'year'], axis=1)

if leap_check.is_leap_year:
    df_date_time = df_date_time.drop(365)

In [169]:
#retrieve and clean first day, and define initial dataframe
url = "https://www.iso-ne.com/static-transform/csv/histRpts/rt-lmp/lmp_rt_final_" + df_date_time['date_time'][0] + ".csv"
main_df = pd.read_csv(url, skiprows=4, usecols=range(1,10))

#Remove other nodes
main_df = main_df[main_df['Location Name'].isin(node)]
main_df = main_df.reset_index(drop=True)

#Create replacement dataframe (for when data doesn't exist)
replacement_df = main_df.iloc[0:24]

In [170]:
#retrieve remaining days and concat with main_df

#drop retrieved day
first_day = df_date_time['date_time'][0]
df_date_time = df_date_time.drop(0)

#retrieve data, clean and concat
for x in tqdm(df_date_time.index):
#     print(df_date_time['date_time'][x])
    
    url_iter = "https://www.iso-ne.com/static-transform/csv/histRpts/rt-lmp/lmp_rt_final_" + str(df_date_time['date_time'][x]) + ".csv"

    request = requests.get(url_iter)
    
    if request.status_code == 200: 
        temp_df = pd.read_csv(url_iter, skiprows=4, usecols=range(1,10))

        #Remove other nodes
        temp_df = temp_df[temp_df['Location Name'].isin(node)]

        main_df = pd.concat([main_df, temp_df], ignore_index=True)
    else: 
        main_df = pd.concat([main_df, replacement_df], ignore_index=True)

  0%|          | 0/364 [00:00<?, ?it/s]

In [171]:
#Retrieve and clean system load and add to main_df
pathname = "rt_hourlysysload_" + str(first_day) + "_" + str(df_date_time['date_time'][364]) + ".csv"

system_load_df = pd.read_csv(pathname, skiprows=4, usecols=range(1,4))
system_load_df = system_load_df.drop([0, 8761])
system_load_df = system_load_df.reset_index()

main_df['Total_Load'] = system_load_df['Total Load']

#Rename certain columns to remove spaces (due to naming issues with libs)
main_df.rename(columns = {'Locational Marginal Price':'LMP', 'Hour Ending':'Hour_Ending'}, inplace = True)

In [172]:
## Capture lagged price variables

In [173]:
# capture individual lagged prices, last 24hrs
for x in range(24,48):
    column_name = str("lag" + str(x))
    main_df[column_name] = main_df['LMP'].shift(x)

In [174]:
## capture daily average

main_df.to_csv("backup_2.csv")
data = pd.read_csv("backup_2.csv")

# remove daylight savings
data = data[data['Hour_Ending'] != "02X"]


data['Hour_Ending'] = data['Hour_Ending'].astype(int) - 1
data['Date'] = pd.to_datetime(data['Date']) + pd.to_timedelta(data['Hour_Ending'], unit='h')
data = data.drop(['Hour_Ending'], axis=1)

davg_data = data.resample('1D', on='Date').mean()

davg_data = davg_data[['LMP', 'Total_Load']].drop(['Total_Load'], axis=1)
davg_data['Date_col'] = davg_data.index

data = pd.read_csv("backup_2.csv", index_col=0)
data['Date'] = pd.to_datetime(data['Date'])

merged_data = data.merge(davg_data, left_on='Date', right_on='Date_col')
merged_data['LMP_y'] = merged_data['LMP_y'].shift(periods=24)

main_df = merged_data

In [175]:
# capture calendar variables
main_df['Date']= pd.to_datetime(main_df['Date'])
main_df['month'] = main_df['Date'].dt.month
main_df['day'] = main_df['Date'].dt.day

In [176]:
# retrieve & add NEISO solar production
solar_df = pd.read_excel('https://www.iso-ne.com/static-assets/documents/2020/04/hourly_solar_gen_' + str(year) + '.xlsx', sheet_name='HourlyData')
solar_df = solar_df.drop(['year', 'local_day', 'LOCAL_HOUR_END', '_FREQ_'], axis=1)
main_df['tot_solar_mwh'] = solar_df['tot_solar_mwh']

# retrieve & add NEISO wind production ## NOTE:
wind_df = pd.read_excel('https://www.iso-ne.com/static-assets/documents/2020/04/hourly_wind_gen_' + str(year) + '.xlsx', sheet_name='HourlyData')
wind_df = wind_df.drop(['year', 'local_day', 'local_hour_end', '_FREQ_'], axis=1)
main_df['tot_wind_mwh'] = wind_df['tot_wind_mwh']

In [177]:
# drop rows with empty lagged variables
main_df = main_df.iloc[47:]
main_df

# set null solar & wind values to zero.
main_df['tot_solar_mwh'] = main_df['tot_solar_mwh'].fillna(0)
main_df['tot_wind_mwh'] = main_df['tot_wind_mwh'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df['tot_solar_mwh'] = main_df['tot_solar_mwh'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df['tot_wind_mwh'] = main_df['tot_wind_mwh'].fillna(0)


In [178]:
main_df

Unnamed: 0,Date,Hour_Ending,Location ID,Location Name,Location Type,LMP_x,Energy Component,Congestion Component,Marginal Loss Component,Total_Load,...,lag44,lag45,lag46,lag47,LMP_y,Date_col,month,day,tot_solar_mwh,tot_wind_mwh
47,2020-01-02,24,4456,LD.NEW_HAVN46,NETWORK NODE,17.97,18.03,0.00,-0.06,12319.18,...,17.10,17.57,18.45,23.05,22.023750,2020-01-02,1,2,0.0,320.464
48,2020-01-03,01,4456,LD.NEW_HAVN46,NETWORK NODE,17.08,17.11,0.00,-0.03,11843.06,...,17.07,17.10,17.57,18.45,23.864167,2020-01-03,1,3,0.0,337.901
49,2020-01-03,02,4456,LD.NEW_HAVN46,NETWORK NODE,16.96,16.94,0.00,0.02,11461.28,...,17.03,17.07,17.10,17.57,23.864167,2020-01-03,1,3,0.0,362.103
50,2020-01-03,03,4456,LD.NEW_HAVN46,NETWORK NODE,16.80,16.76,0.00,0.04,11407.83,...,16.91,17.03,17.07,17.10,23.864167,2020-01-03,1,3,0.0,392.822
51,2020-01-03,04,4456,LD.NEW_HAVN46,NETWORK NODE,16.76,16.71,0.00,0.05,11484.35,...,6.25,16.91,17.03,17.07,23.864167,2020-01-03,1,3,0.0,432.739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2020-12-30,20,4456,LD.NEW_HAVN46,NETWORK NODE,46.75,47.02,0.07,-0.34,17340.46,...,3.47,21.47,29.45,31.93,22.975833,2020-12-30,12,30,0.0,750.562
8756,2020-12-30,21,4456,LD.NEW_HAVN46,NETWORK NODE,44.17,44.28,0.21,-0.32,16986.14,...,17.99,3.47,21.47,29.45,22.975833,2020-12-30,12,30,0.0,812.757
8757,2020-12-30,22,4456,LD.NEW_HAVN46,NETWORK NODE,56.71,56.87,0.26,-0.42,16336.00,...,7.59,17.99,3.47,21.47,22.975833,2020-12-30,12,30,0.0,870.641
8758,2020-12-30,23,4456,LD.NEW_HAVN46,NETWORK NODE,50.09,50.10,0.24,-0.25,15572.73,...,-4.17,7.59,17.99,3.47,22.975833,2020-12-30,12,30,0.0,856.217
