In [49]:
import json
import csv
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import geopandas as gpd
import shapely
from shapely.geometry import Point, MultiPoint, Polygon, MultiPolygon
from shapely.affinity import scale
import matplotlib.pyplot as plt

import glob
import os
import datetime

# Bid data (Aggregated Ancillary Service Offer Curve_thru_2016-2019)

In [285]:
#loading all data and concatenating
path = r'/Users/margaretmccall/Documents/2020 Spring/CE 295/0 - Final Project/Data--ERCOT/Aggregated Ancillary Service Offer Curve_thru_2016-2019'
all_files = glob.glob(path + "/*.csv")

dfs = []
bad_files = []

for file in all_files:
    try:
        x = pd.read_csv(file)
        dfs.append(x)
    except pd.errors.EmptyDataError:
        bad_files.append(file)
        
df = pd.concat(dfs, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


In [286]:
df.head()

Unnamed: 0,AncillaryType,DSTFlag,DeliveryDate,HourEnding,Price,Quantity
0,OFFNS,N,01/12/2014,01:00,0.01,214.0
1,OFFNS,N,01/12/2014,01:00,50.01,2039.0
2,OFFNS,N,01/12/2014,01:00,60.01,2104.0
3,OFFNS,N,01/12/2014,01:00,10.0,1891.0
4,OFFNS,N,01/12/2014,01:00,5.0,1869.0


In [287]:
df.shape

(20707971, 6)

In [291]:
np.sum(df.duplicated(subset=['AncillaryType','DeliveryDate','HourEnding','Price','Quantity'],keep='first'))

35834

In [292]:
df.drop_duplicates(subset=['AncillaryType','DeliveryDate','HourEnding','Price','Quantity'],
                   keep='first', inplace=True)

In [293]:
df.head()

Unnamed: 0,AncillaryType,DSTFlag,DeliveryDate,HourEnding,Price,Quantity
0,OFFNS,N,01/12/2014,01:00,0.01,214.0
1,OFFNS,N,01/12/2014,01:00,50.01,2039.0
2,OFFNS,N,01/12/2014,01:00,60.01,2104.0
3,OFFNS,N,01/12/2014,01:00,10.0,1891.0
4,OFFNS,N,01/12/2014,01:00,5.0,1869.0


In [294]:
df['hr_end'] = df['HourEnding'].apply(lambda x: int(x[:2]))
df['hr_beg'] = df['hr_end'] - 1
df.drop(columns=['hr_end'],inplace=True)

In [295]:
df['date'] = pd.to_datetime(df['DeliveryDate']).dt.date

In [307]:
df.head()

Unnamed: 0,AncillaryType,DSTFlag,Price,Quantity,hr_beg,date
0,OFFNS,N,0.01,214.0,0,2014-01-12
1,OFFNS,N,50.01,2039.0,0,2014-01-12
2,OFFNS,N,60.01,2104.0,0,2014-01-12
3,OFFNS,N,10.0,1891.0,0,2014-01-12
4,OFFNS,N,5.0,1869.0,0,2014-01-12


In [311]:
df = df[df['date']>datetime.date(2013,12,31)]

In [46]:
"""full_dt_hrbeg = []
for d, t in zip(df['date'], df['hr_end']):
    full_dt_hrbeg.append(datetime.datetime.combine(d,datetime.time(t-1)))"""

In [312]:
df.drop(columns=['DeliveryDate','HourEnding'], inplace=True)

KeyError: "['DeliveryDate' 'HourEnding'] not found in axis"

https://towardsdatascience.com/pandas-groupby-aggregate-transform-filter-c95ba3444bbb is great

In [313]:
grouped = df.groupby(['AncillaryType','date','hr_beg'])#this is a good one

In [314]:
aggregation = {
    'Unweighted Average Price': pd.NamedAgg(column='Price', aggfunc='mean'),
    'Max Price': pd.NamedAgg(column='Price', aggfunc='max'),
    'Min Price': pd.NamedAgg(column='Price', aggfunc='min'),
    'Total Quantity': pd.NamedAgg(column='Quantity', aggfunc='sum'),
    'Number of Bids': pd.NamedAgg(column='Price', aggfunc='size')
}
grouped.agg(**aggregation)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unweighted Average Price,Max Price,Min Price,Total Quantity,Number of Bids
AncillaryType,date,hr_beg,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
OFFNS,2014-01-01,0,43.319091,300.01,0.01,38144.7,22
OFFNS,2014-01-01,1,43.342727,300.01,0.01,38216.4,22
OFFNS,2014-01-01,2,43.348182,300.01,0.01,38336.9,22
OFFNS,2014-01-01,3,43.350000,300.01,0.01,38374.5,22
OFFNS,2014-01-01,4,46.431000,300.01,0.01,33460.8,20
...,...,...,...,...,...,...,...
RRSNC,2020-01-01,19,666.500000,1333.00,0.00,3236.0,2
RRSNC,2020-01-01,20,666.500000,1333.00,0.00,3226.4,2
RRSNC,2020-01-01,21,666.500000,1333.00,0.00,3222.6,2
RRSNC,2020-01-01,22,666.500000,1333.00,0.00,3155.2,2


In [315]:
#want weighted average price
def wavg(group, avg_name, weight_name):
    """ https://pbpython.com/weighted-average.html
    """
    d = group[avg_name]
    w = group[weight_name]
    try:
        return (d * w).sum() / w.sum()
    except ZeroDivisionError:
        return d.mean()

In [316]:
x = pd.Series(grouped.apply(wavg, "Price", "Quantity"), name="Weighted Avg Price")

In [317]:
grouped_data = pd.concat([grouped.agg(**aggregation), x], axis=1)

In [318]:
grouped_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unweighted Average Price,Max Price,Min Price,Total Quantity,Number of Bids,Weighted Avg Price
AncillaryType,date,hr_beg,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
OFFNS,2014-01-01,0,43.319091,300.01,0.01,38144.7,22,59.512790
OFFNS,2014-01-01,1,43.342727,300.01,0.01,38216.4,22,59.505340
OFFNS,2014-01-01,2,43.348182,300.01,0.01,38336.9,22,59.498483
OFFNS,2014-01-01,3,43.350000,300.01,0.01,38374.5,22,59.492013
OFFNS,2014-01-01,4,46.431000,300.01,0.01,33460.8,20,64.004757
...,...,...,...,...,...,...,...,...
RRSNC,2020-01-01,19,666.500000,1333.00,0.00,3236.0,2,667.529821
RRSNC,2020-01-01,20,666.500000,1333.00,0.00,3226.4,2,667.532885
RRSNC,2020-01-01,21,666.500000,1333.00,0.00,3222.6,2,667.534103
RRSNC,2020-01-01,22,666.500000,1333.00,0.00,3155.2,2,667.556193


In [78]:
#grouped_data.to_csv("as_bid_aggregated_data.csv")

In [319]:
products = df['AncillaryType'].unique()
output = grouped_data.loc[(products[0]),:]
output.columns = [products[0] + "_" + str(col) for col in output.columns]

for prod in products[1:]:
    x = grouped_data.loc[(prod),:]
    x.columns = [prod + "_" + str(col) for col in x.columns]
    output = pd.concat([output, x], axis=1)

In [320]:
output

Unnamed: 0_level_0,Unnamed: 1_level_0,OFFNS_Unweighted Average Price,OFFNS_Max Price,OFFNS_Min Price,OFFNS_Total Quantity,OFFNS_Number of Bids,OFFNS_Weighted Avg Price,ONNS_Unweighted Average Price,ONNS_Max Price,ONNS_Min Price,ONNS_Total Quantity,...,RRSGN_Min Price,RRSGN_Total Quantity,RRSGN_Number of Bids,RRSGN_Weighted Avg Price,RRSNC_Unweighted Average Price,RRSNC_Max Price,RRSNC_Min Price,RRSNC_Total Quantity,RRSNC_Number of Bids,RRSNC_Weighted Avg Price
date,hr_beg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014-01-01,0,43.319091,300.01,0.01,38144.7,22,59.512790,22.016970,250.0,1.00,32531.5,...,0.01,107723.6,48,30.195520,1.5,3.0,0.0,2236.8,2,1.599651
2014-01-01,1,43.342727,300.01,0.01,38216.4,22,59.505340,20.935143,250.0,1.00,34781.2,...,0.01,107145.0,50,30.000687,1.5,3.0,0.0,2235.8,2,1.599696
2014-01-01,2,43.348182,300.01,0.01,38336.9,22,59.498483,20.970000,250.0,1.00,34741.4,...,0.01,105503.8,50,30.275497,1.5,3.0,0.0,2236.2,2,1.599678
2014-01-01,3,43.350000,300.01,0.01,38374.5,22,59.492013,21.002571,250.0,1.00,34761.4,...,0.01,105024.0,50,30.418845,1.5,3.0,0.0,2238.2,2,1.599589
2014-01-01,4,46.431000,300.01,0.01,33460.8,20,64.004757,20.756286,250.0,1.00,34829.3,...,0.01,104199.8,49,30.477332,1.5,3.0,0.0,2242.4,2,1.599402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-01,19,11.671731,150.00,0.01,76496.3,52,14.797209,25.570435,250.0,0.01,26482.4,...,0.01,116485.9,66,14.518144,666.5,1333.0,0.0,3236.0,2,667.529821
2020-01-01,20,9.414118,150.00,0.01,75019.8,51,11.993506,30.986250,250.0,0.01,27609.0,...,0.01,136089.2,68,12.992571,666.5,1333.0,0.0,3226.4,2,667.532885
2020-01-01,21,5.883125,150.00,0.01,69390.8,48,7.481639,32.252609,250.0,0.01,26498.2,...,0.01,114358.4,61,14.025406,666.5,1333.0,0.0,3222.6,2,667.534103
2020-01-01,22,5.103878,150.00,0.01,69486.8,49,6.996216,29.804000,250.0,0.01,29510.6,...,0.01,105783.7,57,16.574780,666.5,1333.0,0.0,3155.2,2,667.556193


In [321]:
output.to_csv("df_as_bid_aggregated_data.csv")

In [None]:
#ok now you need to check actual project requirements to see if you walk through your thought process in 
#a meaningful way or just need to guess your way to the outcome
#like we should test for correlation between things rigorously if we need to be able to justify
#our process

# DAM AS Plan_2016-2019

In [254]:
#loading all data and concatenating
path = r'/Users/margaretmccall/Documents/2020 Spring/CE 295/0 - Final Project/Data--ERCOT/DAM AS Plan_2016-2019'
all_files = glob.glob(path + "/*.csv")

dfs = []
bad_files = []

for file in all_files:
    try:
        x = pd.read_csv(file)
        dfs.append(x)
    except pd.errors.EmptyDataError:
        bad_files.append(file)
        
df_plan = pd.concat(dfs, ignore_index=True)

In [266]:
#testing that the insane number of reported duplicates is real...
df_plan.loc[(df_plan['DeliveryDate']=='02/05/2016') &
           (df_plan['HourEnding']=='12:00'),:]

Unnamed: 0,DeliveryDate,HourEnding,AncillaryType,Quantity,DSTFlag
323836,02/05/2016,12:00,NSPIN,2286,N
323837,02/05/2016,12:00,REGDN,281,N
323838,02/05/2016,12:00,REGUP,296,N
323839,02/05/2016,12:00,RRS,2808,N
642380,02/05/2016,12:00,NSPIN,2286,N
642381,02/05/2016,12:00,REGDN,281,N
642382,02/05/2016,12:00,REGUP,296,N
642383,02/05/2016,12:00,RRS,2808,N


In [267]:
df_plan.drop_duplicates(subset=['DeliveryDate','HourEnding','AncillaryType','Quantity'], 
                        keep="first", inplace=True)

In [269]:
df_plan.reset_index(inplace=True, drop=True)

In [270]:
df_plan.head()

Unnamed: 0,DeliveryDate,HourEnding,AncillaryType,Quantity,DSTFlag
0,12/24/2018,01:00,NSPIN,1198,N
1,12/24/2018,01:00,REGDN,278,N
2,12/24/2018,01:00,REGUP,211,N
3,12/24/2018,01:00,RRS,2985,N
4,12/24/2018,02:00,NSPIN,1198,N


In [271]:
df_plan['hr_end'] = df_plan['HourEnding'].apply(lambda x: int(x[:2]))
df_plan['hr_beg'] = df_plan['hr_end'] - 1
df_plan.drop(columns=['hr_end'],inplace=True)

In [272]:
df_plan.drop(columns=['HourEnding','DSTFlag'],inplace=True)

In [273]:
df_plan.rename(columns={'DeliveryDate':'date'},inplace=True)

In [185]:
#df_plan.pivot_table(values=['Quantity'],index=['date','hr_beg'],columns=['AncillaryType'])

In [282]:
products = df_plan['AncillaryType'].unique()
output = df_plan.loc[df_plan['AncillaryType']==products[0],['date','hr_beg','Quantity']]
output.rename(columns={'Quantity':products[0]+"_"+'Quantity'}, inplace=True)

for prod in products[1:]:
    x = df_plan.loc[df_plan['AncillaryType']==prod, ['date','hr_beg','Quantity']]
    output = output.merge(x, how='outer', on=['date','hr_beg'])
    output.rename(columns={'Quantity':prod+"_"+'Quantity'}, inplace=True)

In [289]:
output.to_csv("df_as_plan.csv", index=False)