In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from datetime import datetime
from copy import deepcopy
from dataloader import *

In [2]:
def format_sitegts(site_start,site_freqs,duration_len):
    """Lay out back-to-back index windows, one per calibrated frequency.

    The windows approximate which dataframe indices belong to each frequency
    during calibration, assuming every frequency is calibrated for the same
    number of rows (e.g. a two-hour window).

    Parameters
    ----------
    site_start : int
        Index in the all-site-data dataframe where calibration begins.
    site_freqs : list of int
        Frequencies being calibrated, listed in the order they are calibrated.
    duration_len : int
        Number of indices/timestamps each frequency's calibration spans.
        Assumes the dataframe rows are in chronological order.

    Returns
    -------
    dict
        Maps frequency -> [start_idx, end_idx]. Each window starts one index
        after the previous window's end, so windows never share an index.
    """
    windows = {}
    cursor = site_start
    for freq in site_freqs:
        window_end = cursor + duration_len
        windows[freq] = [cursor, window_end]
        # the next frequency begins on the index right after this window
        cursor = window_end + 1
    return windows

def plot_ts_gt(sitegts, freq_data, site_id, zoom_margin=200):
    """Plot the frequency time series with one site's estimated ground truth overlaid.

    Two figures are shown:
      1. the full time series with each frequency's estimated window highlighted;
      2. a zoomed view covering `zoom_margin` indices before the first window
         through `zoom_margin` indices after the last window, for visual clarity.

    Note: Audrey will switch to plotly.express in the future.

    Parameters
    ----------
    sitegts : dict
        Maps site_id -> {frequency: [start_idx, end_idx]}, see format_sitegts().
    freq_data : sequence
        The frequency column (series) of the dataframe; it is sliced
        positionally with the indices stored in `sitegts`.
    site_id : int
        Key into `sitegts` selecting the site to plot.
    zoom_margin : int, optional
        Number of indices of context shown on either side in the zoomed plot.

    Returns
    -------
    None
    """
    # This is as many different colors as you can assign to frequencies in
    # calibration; add more as needed. Colors are reused cyclically beyond that.
    colors = ['c','m','y','r','g','b','lime','violet']
    site_gt = sitegts[site_id]
    n_samples = len(freq_data)

    def _overlay_windows():
        # Draw each frequency's estimated window on the current figure.
        for i, freq in enumerate(site_gt.keys()):
            # Clamp to the data so x and y always have the same length.
            curr_start = max(site_gt[freq][0], 0)
            curr_end = min(site_gt[freq][1], n_samples)
            plt.plot(np.arange(curr_start, curr_end),
                     freq_data[curr_start:curr_end],
                     c=colors[i % len(colors)],
                     label=str(freq))

    # Full time series
    plt.figure(figsize=(30,4))
    plt.plot(np.arange(n_samples), freq_data, label="Frequency TS")
    _overlay_windows()
    plt.legend()
    plt.grid()
    plt.show()

    freqs = list(site_gt.keys())
    if not freqs:
        # nothing was calibrated for this site, so there is no window to zoom into
        return

    # Zoom In
    # A negative start would wrap around when slicing and an end past the data
    # would make x longer than y, so keep the zoom range inside the data.
    range_start = max(site_gt[freqs[0]][0] - zoom_margin, 0)
    range_end = min(site_gt[freqs[-1]][1] + zoom_margin, n_samples)
    plt.figure(figsize=(20,4))
    plt.plot(np.arange(range_start, range_end), freq_data[range_start:range_end], label="Frequency TS")
    _overlay_windows()
    plt.legend()
    plt.grid()
    plt.show()
    return


def find_closest_time(data, query_date, query_time='00:00:00'):
    """Find the dataframe entry closest to the query date and time.

    Only rows whose timestamp falls on `query_date` are considered, so the
    match never crosses midnight into a neighbouring day.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe of the site you're looking at; must have a 'timestamp'
        column of strings in "%Y-%m-%d %H:%M:%S" format.
    query_date : str
        Date in %Y-%m-%d format.
    query_time : str, optional
        Time in %H:%M:%S format (defaults to midnight).

    Returns
    -------
    (datetime, index label)
        The closest timestamp as a datetime object and the dataframe index
        label of that row. On ties, the row that appears first in `data` wins.

    Raises
    ------
    IndexError
        If no row of `data` has a timestamp on `query_date`.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    # Literal (non-regex) match; rows with a missing timestamp are skipped.
    on_query_date = data['timestamp'].str.contains(query_date, regex=False, na=False)
    date_subset = data[on_query_date]
    query_datetime = datetime.strptime(query_date+" "+query_time, timestamp_format)
    if date_subset.empty:
        raise IndexError("no rows in data have a timestamp on " + str(query_date))

    closest_timestamp = None
    closest_timeidx = None
    # Pair each row label with its timestamp directly; this avoids building a
    # Series per row (iterrows) and guarantees the returned index is a row label.
    for row_label, raw_timestamp in zip(date_subset.index, date_subset['timestamp']):
        curr_date = datetime.strptime(raw_timestamp, timestamp_format)
        if (closest_timestamp is None
                or abs(query_datetime-curr_date) < abs(query_datetime-closest_timestamp)):
            closest_timestamp = curr_date
            closest_timeidx = row_label
    return closest_timestamp, closest_timeidx

In [None]:
 # load cached bison data; replace filepath/dataloading as appropriate
BISON_CACHE_CSV = "syncdatabase_011725.csv"
data = cached_bison_data(BISON_CACHE_CSV)

In [None]:
# Number of samples in a two-hour calibration window at each sampling interval
twohr_rate_1_min = 120 # 2 hours at one sample per minute
twohr_rate_5_min = 24 # 2 hours at one sample every five minutes

# NOTES
# union city: 33404, one minute -- '2024-12-17 12:00:00'
# siegrist: 33467, five minutes -- '2024-12-17 10:00:00'
# canadian: 57740, one minute -- '2024-12-13 12:00:00'
# calumet: 33614, five minutes -- '2024-12-13 12:00:00' 69969 is ~midnight sept 1 2024

# OLD -- found starting times based on the above notes by hand
# unioncity_start = 259943 
# seigrist_start = 66050 
# canadian_start = 254170 
# calumet_start = 100021

# Stage 1: Estimate time stamps for when each stage of calibration is happening given a label for when (date, time) calibration begins for a site
# 1. begin with your site id
# 2. specify your query start date and time for calibration 
# 3. find the closest time and date match in the dataframe to your query (default time is 00:00:00 if no time is specified)
# 4. hardcode the different frequencies you know are being calibrated # (Stage 2: automate this based on info given in steps 2 and 3 here)
# 5. format the estimated ground truth for calibration
# 6. plot it

unioncity_id = 33404 # 1
unioncity_startdate = '2024-12-17' # 2
unioncity_startime = '12:00:00'
unioncity_timestamp, unioncity_start = find_closest_time(data[data['site_id']==unioncity_id], unioncity_startdate, query_time=unioncity_startime) # 3
print(unioncity_timestamp)
unioncity_freqs = [46,48,50,52,54,56] # 4
# Union City is sampled every minute (see NOTES above), so its two-hour window is
# twohr_rate_1_min samples long, not twohr_rate_5_min.
unioncity = format_sitegts(unioncity_start,unioncity_freqs,twohr_rate_1_min) # 5

siegrist_id = 33467
seigrist_startdate = '2024-12-17'
seigrist_starttime = '10:00:00'
seigrist_timestamp, seigrist_start = find_closest_time(data[data['site_id']==siegrist_id], seigrist_startdate, query_time=seigrist_starttime)
seigrist_freqs = [47,49,51,53,55,57] # future: automate based on the above given timestamp
print(seigrist_timestamp)
siegrist = format_sitegts(seigrist_start,seigrist_freqs,twohr_rate_5_min) 

canadian_id = 57740
canadian_startdate = '2024-12-13'
canadian_starttime = '12:00:00'
canadian_timestamp, canadian_start = find_closest_time(data[data['site_id']==canadian_id], canadian_startdate, query_time=canadian_starttime)
print(canadian_timestamp)
canadian_freqs = [44,47,49,51,53,55,57,59] # future: automate based on the above given timestamp
canadian = format_sitegts(canadian_start,canadian_freqs,twohr_rate_1_min)

calumet_id = 33614
calumet_startdate = '2024-12-13'
calumet_starttime = '12:00:00'
calumet_timestamp, calumet_start = find_closest_time(data[data['site_id']==calumet_id], calumet_startdate, query_time=calumet_starttime)
print(calumet_timestamp)
calumet_freqs = [49,51,53,55] # future: automate based on the above given timestamp
calumet = format_sitegts(calumet_start,calumet_freqs,twohr_rate_5_min)

# site_id -> {frequency: [start_idx, end_idx]} for every site labelled above
sitegts = {unioncity_id:unioncity, siegrist_id:siegrist, canadian_id:canadian, calumet_id:calumet}

# QUICK COMMANDS -- feel free to ignore
# print(data[data['site_id']==33614]) # look at the data for a given site
# s_temp = 259943 
# print(data[s_temp:s_temp+twohr_rate_1_min]) # quickly check the two hour duration based off a start index & given sampling rate
# print(data[data['timestamp']=='2024-12-17 12:00:02'])

In [None]:
# 6 -- plot the estimated calibration windows of every labelled site.
# The same full-frame frequency column is passed for each site; the window
# indices are used as positions into it (assumes the frame's default index -- TODO confirm).
freq_data = data['frequency']
for site_id in sitegts:
    print(site_id)
    plot_ts_gt(sitegts, freq_data, site_id)

In [21]:
# we can rename these functions later

def ryan_format(data, ryan_sites_info, audrey_sitegts):
    """Convert Audrey's index-based calibration windows to Ryan's timestamp format.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe of all site info; must have a 'timestamp' column. The indices
        in `audrey_sitegts` are used as row positions in this frame.
    ryan_sites_info : list of dict
        Copied from https://github.com/neuralix-ai/dev_RyanMercer/blob/dev/notebooks/Customers/Bison/2024-12-29_Bison_PumpCurve_x-axisHealth_calibrated_alert.ipynb
        Each entry must have a 'site_id' key. The input is not modified.
    audrey_sitegts : dict
        Audrey's format: site_id -> {frequency: [start_idx, end_idx]}.

    Returns
    -------
    list of dict
        A deep copy of `ryan_sites_info` in which every site's
        'calibration_stages' is replaced by that site's own list of
        {'frequency', 'start_time', 'end_time'} dicts; all other keys
        (enable, num_pumps, etc.) are kept as they were.

    Raises
    ------
    KeyError
        If a site in `ryan_sites_info` has no entry in `audrey_sitegts`.
    """
    # Keep the rest of the structure of Ryan's sites_info (enable, num_pumps, etc.)
    ryan_format_sitegts = deepcopy(ryan_sites_info)
    timestamps = data['timestamp']
    for site in ryan_format_sitegts:
        # find this site's estimated gt in Audrey's idx-format dictionary
        curr_site_gts = audrey_sitegts[site['site_id']]
        # Build a fresh list per site so stages from one site never leak into
        # (or get shared with) another site's 'calibration_stages'.
        site['calibration_stages'] = [
            {
                'frequency': freq,
                'start_time': timestamps.iloc[start_and_end[0]],
                'end_time': timestamps.iloc[start_and_end[1]],
            }
            for freq, start_and_end in curr_site_gts.items()
        ]
    return ryan_format_sitegts


def ryan_site_info_to_audrey_format(data, ryan_cached_gt,sampling_rates):
    """Convert Ryan's timestamp-based ground truth to Audrey's index-based format.

    For each site, the start time of the FIRST listed calibration stage is
    matched to the closest row of that site's data, and equal-length windows
    are then laid out from that index for every listed frequency (see
    format_sitegts()). Stages are assumed to be listed in calibration order.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe of all site info; must have 'site_id' and 'timestamp' columns.
    ryan_cached_gt : list of dict
        Copied from https://github.com/neuralix-ai/dev_RyanMercer/blob/dev/notebooks/Customers/Bison/2024-12-29_Bison_PumpCurve_x-axisHealth_calibrated_alert.ipynb
        Each entry has 'site_id' and 'calibration_stages'; each stage has
        'frequency' and a 'start_time' string of the form "<date> <time>".
    sampling_rates : dict
        Maps site_id -> number of rows per calibration window (hardcoded).

    Returns
    -------
    dict
        Audrey-format dictionary: site_id -> {frequency: [start_idx, end_idx]}.
        A site with no calibration stages maps to an empty dict.
    """
    audrey_format = {}
    for site in ryan_cached_gt:
        site_id = site['site_id']
        stages = site['calibration_stages']
        if not stages:
            # nothing to lay out; avoids reusing a start index from another site
            audrey_format[site_id] = {}
            continue
        frequencies = [stage['frequency'] for stage in stages]
        # Calibration begins with the first stage, so the windows must be laid
        # out from its start index (not from the last stage's start).
        listed_start = stages[0]['start_time'].split(" ")
        startdate = listed_start[0]
        starttime = listed_start[1]
        _, start_index = find_closest_time(data[data['site_id']==site_id], startdate, query_time=starttime)
        audrey_format[site_id] = format_sitegts(start_index,frequencies,sampling_rates[site_id])
    return audrey_format

In [None]:
# copied from sites_info: 
# https://github.com/neuralix-ai/dev_RyanMercer/blob/dev/notebooks/Customers/Bison/2024-12-29_Bison_PumpCurve_x-axisHealth_calibrated_alert.ipynb
# Ryan's cached ground truth, shown as loaded
ryan_cached_gt = cached_site_info()
print(ryan_cached_gt)

# Convert a private copy of the index-based estimates into Ryan's timestamp format
audrey_sitegts = deepcopy(sitegts)
replace_this_every_week_ryan = ryan_format(data, ryan_cached_gt, audrey_sitegts)
print(replace_this_every_week_ryan)

# examine how the timestamps change from "my rough guess was about noon" --> "the precise time in the dataframe is 12:00:19"

In [None]:
# Rows per two-hour calibration window for each site id
sampling_rates = {
    33404: twohr_rate_1_min, # union city
    33467: twohr_rate_5_min, # siegrist
    57740: twohr_rate_1_min, # canadian
    33614: twohr_rate_5_min, # calumet
}

# Ryan's cached ground truth -> Audrey's index format
ryan_cached_to_audrey_format = ryan_site_info_to_audrey_format(data, ryan_cached_gt, sampling_rates)
print(ryan_cached_to_audrey_format)

# Audrey -> Ryan -> Audrey round trip of the estimates built in this notebook
replace_every_week_to_audrey_format = ryan_site_info_to_audrey_format(data, replace_this_every_week_ryan, sampling_rates)
print(replace_every_week_to_audrey_format)

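# These two results are a tad different because format_sitegts() makes the index windows
# mutually exclusive per calibration frequency (each window starts one index after the previous
# one ends), whereas in the cached timestamps one calibration stage could end at 14:00:00 AND
# the next could begin at that same 14:00:00.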