In [None]:
import pandas as pd
import numpy as np

In [None]:
# shared tag lists and lookups that the later cells reuse
old_tags = ['1DW.PV', '2DW.PV', '3DW.PV', 'WTHR_T_LOWER_F']
new_tags = ['901DW.PV', '902DW.PV', '903DW.PV', '41AI117A.PV']
all_tags = [*old_tags, *new_tags]
# temperature measurement tag (it is also the last entry of new_tags)
temp = '41AI117A.PV'
# new tag -> old tag, paired by position in the two lists above
tags_dict = {new: old for new, old in zip(new_tags, old_tags)}

In [None]:
%%time
# read in the dataframe from the csv
# change the filename to whatever is the latest PI datalink pull
df = pd.read_csv('https://raw.githubusercontent.com/nollijish/datasets/main/Cogen_wthr_temp_20221006-20000101.csv.gz',
                 sep=',',
                 parse_dates=['TIMESTAMP'],
                 low_memory=False,
                 compression='gzip'
                )

In [None]:
# status strings the process historian writes in place of a reading
bad_pv_strings = ['No Data', 'Bad Data', 'Bad',
                  'Intf Shut', 'Bad Input',
                  'I/O Timeout', 'Configure',
                  'Scan Off', 'Out of Serv',
                  'Comm Fail']
# True wherever a cell holds one of the status strings above
mask_badpv = df.isin(bad_pv_strings)
# blank those cells out; the frame-shaped boolean mask has to go through plain
# df[...] indexing here (the df.loc[mask_badpv] form was tried and does not work)
df[mask_badpv] = np.nan

In [None]:
# wherever a new tag has no value, backfill it from the old tag it replaced
for new_tag, old_tag in tags_dict.items():
    missing = df[new_tag].isna()
    # row mask + column label must go through .loc (df[mask, col] does not work)
    df.loc[missing, new_tag] = df.loc[missing, old_tag]

In [None]:
# the old tag columns have been folded into the new ones, so drop them
df = df.drop(old_tags, axis='columns')

In [None]:
# drop rows that have no temperature measurement
df = df.dropna(subset=[temp], how='any')
# drop rows where every one of the new tag columns is NaN
df = df.dropna(subset=new_tags, how='all')

In [None]:
# convert all sample values to float (the tag columns come in as object because
# the raw data mixes numeric readings with status strings)
# only the new_tags columns are cast so the TIMESTAMP column is left untouched
# NOTE: cast with astype + rebinding instead of `df.loc[:, new_tags] = ...`;
# from pandas 2.0 a full-slice .loc assignment writes into the existing object
# arrays in place, which leaves the columns as dtype=object
df = df.astype(dict.fromkeys(new_tags, np.float64))
print(df.dtypes)

In [None]:
# remove out of tolerance weather values
# lowest recorded temperature in Anacortes was 5°F
# highest recorded temperature in Anacortes was 98°F
# the limits leave roughly 20°F of margin past each record: keep readings
# strictly between -20°F and 120°F
temp_lo, temp_hi = -20.0, 120.0
mask_temp = df[temp].gt(temp_lo) & df[temp].lt(temp_hi)
df = df[mask_temp]

In [None]:
# order the rows by ascending temperature and renumber the index from 0
df = df.sort_values(by=temp, ignore_index=True)

In [None]:
# keep only December, January and February rows for the lo temp dataframe
winter_months = [12, 1, 2]
mask_month = df['TIMESTAMP'].dt.month.isin(winter_months)
df_wint = df[mask_month]

# print the 0.2% quantile of the winter temperatures
print(df_wint[temp].quantile(q=0.002))

In [None]:
# display summary statistics (DataFrame.describe) for the winter-months subset
df_wint.describe()

In [None]:
# keep only the winter rows at or below the 0.2% quantile temperature
lo_cutoff = df_wint[temp].quantile(q=0.002)
mask_lo = df_wint[temp].le(lo_cutoff)

df_wint_lo = df_wint.loc[mask_lo]

In [None]:
# write the lowest-temperature winter rows (df_wint_lo) to a csv in the working directory
df_wint_lo.to_csv('./lo_perc_temps.csv')

In [None]:
# create power tags list: every new tag except the temperature tag
# compare with != (tag equality); `not in temp` would be a substring test against
# the temperature tag's name and could drop any tag whose name appears inside it
pwr_tags = [tag for tag in new_tags if tag != temp]

In [None]:
# convert any na generator power output values to 0.0
# fillna with a per-column dict only touches the power tag columns and returns a
# new frame; rebinding the name avoids assigning into a slice of df_wint, which
# can trigger pandas' SettingWithCopyWarning
df_wint_lo = df_wint_lo.fillna(dict.fromkeys(pwr_tags, 0.0))

In [None]:
# find all times when any generator was operating at or below 32MW
# the generators will typically operate down to 35MW as determined by economic considerations
# the generators are typically not operated below 35MW due to concerns around flame out
# (power readings that were NaN have been set to 0.0 above, so they are flagged too)
mask_off = df_wint_lo[pwr_tags].le(32).any(axis='columns')
df_wint_lo_off = df_wint_lo.loc[mask_off]

In [None]:
# write every low-temperature row (at or below the 0.2% quantile) in which a
# generator is off or na to a csv in the working directory
df_wint_lo_off.to_csv('./lo_perc_gen_off.csv')

In [None]:
# group the low-temperature rows into daily bins on TIMESTAMP so that readings
# from the same day are segmented together
daily_bins = pd.Grouper(key='TIMESTAMP', freq='D', sort=True)
gb_df_td = df_wint_lo.groupby(daily_bins)

In [None]:
# rows per daily bin, displayed only for the days that actually contain rows
day_sizes = gb_df_td.size()
mask_gb = day_sizes > 0
day_sizes[mask_gb]

In [None]:
# just testing some concepts: walk the daily groups, print details for the
# non-empty 2004 days and collect those groups
l = []
for n, g in gb_df_td:
    # n is the bin label (its .year is read here), g is that day's rows
    if not g.empty and n.year == 2004:
        print(type(n))
        print(n)
        print(type(g))
        print(g['901DW.PV'].mean())
        l.append(g)
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the same columns when no 2004 day was collected
df_l = pd.concat(l) if l else df_wint_lo.iloc[0:0]

In [None]:
# display the row holding the highest temperature reading
hottest_idx = df[temp].idxmax()
df.loc[hottest_idx]