In [None]:
# Observation day to summarise. It is converted with int() and passed to
# day_obs_dataframe() and day_obs_report() in the cells below.
date_obs = 20240202

In [None]:
%config InlineBackend.figure_format = 'retina'

import re

import numpy as np
import pandas as pd

import awkward as ak
import awkward_pandas as akpd

from astropy.time import Time

from lsst.summit.utils.blockUtils import BlockParser
from lsst.summit.utils.blockUtils import BlockInfo, ScriptStatePoint
from lsst.summit.utils.efdUtils import efdTimestampToAstropy, getEfdData, makeEfdClient
from lsst.summit.utils.enums import ScriptState
from lsst.summit.utils.tmaUtils import TMAEventMaker

In [None]:
## Set the day_obs list
#day_obs_list = range(20240205, 20240205 + 11)

# For the TMA events
#event_maker = TMAEventMaker()

In [None]:
# For each day_obs in the list determine which blocks were run and put
# the list of blocks into the block_list.

#block_list = []

#for day_obs in day_obs_list:
#    block_parser = BlockParser(day_obs)
#    blocks = block_parser.getBlockNums()
#    block_list.append(blocks)

# Put the variable length nested list into an awkward array and then
# put that into a pandas dataframe with the awkward array extension
# so that the list of blocks is shown in a column.

#blocks = ak.Array({"day_obs": day_obs_list, "blocks": block_list})
#series = akpd.from_awkward(blocks)
#pandas_df = series.ak.to_columns(extract_all=True)
#pandas_df

In [None]:

import time
import logging

class BlockParser:
    """A class to parse BLOCK data from the EFD.

    Information on executed blocks is stored in the EFD (Electronic Facilities
    Database) in the ``lsst.sal.Script.logevent_state`` topic. This class
    parses that topic and provides methods to get information on the blocks
    which were run on a given dayObs. It also provides methods to get the
    events which occurred during a given block, and also to get the block in
    which a specified event occurred, if any.

    After construction ``self.data`` always carries the columns ``blockNum``,
    ``blockId``, ``blockDayObs`` and ``blockSeqNum``; the three numeric columns
    use the nullable ``Int64`` dtype so rows without a value hold ``pd.NA``.

    Parameters
    ----------
    dayObs : `int`
        The dayObs to get the block data for.
    client : `lsst_efd_client.efd_client.EfdClient`, optional
        The EFD client to use. If not specified, a new one is created.
    """

    # Patterns applied to the ``lastCheckpoint`` text of each row.
    _BLOCK_PATTERN = r"BLOCK-(\d+)"
    _BLOCK_ID_PATTERN = r"BL\d+(?:_\w+)+"
    # Positions of the dayObs and seqNum fields in an underscore-split blockId.
    _DAY_OBS_FIELD = 2
    _SEQ_NUM_FIELD = 3

    def __init__(self, dayObs, client=None):
        self.log = logging.getLogger("lsst.summit.utils.blockUtils.BlockParser")
        self.dayObs = dayObs

        self.client = client
        if client is None:
            self.client = makeEfdClient()

        t0 = time.time()
        self.getDataForDayObs()
        self.log.debug(f"Getting data took {(time.time()-t0):.2f} seconds")
        t0 = time.time()
        self.augmentData()
        self.log.debug(f"Parsing data took {(time.time()-t0):.5f} seconds")

    def getDataForDayObs(self):
        """Retrieve the data for the specified dayObs from the EFD."""
        # Tiago thinks no individual block seqNums should take more than an
        # hour to run, so pad the dayObs by 1.5 hours to make sure we catch
        # any blocks which might span the end of the day.
        padding = 1.5 * 60 * 60
        data = getEfdData(
            self.client, "lsst.sal.Script.logevent_state", dayObs=self.dayObs, postPadding=padding
        )
        self.data = data

    @staticmethod
    def _emptyIntColumn(nRows):
        """Return an all-missing nullable-integer array of length ``nRows``."""
        return pd.array([pd.NA] * nRows, dtype="Int64")

    def _addEmptyBlockColumns(self):
        """Add the four block columns to ``self.data`` with no values set.

        Used when the block information cannot be parsed, so that downstream
        code finds the same columns and dtypes as after a successful parse.
        """
        data = self.data
        nRows = len(data)
        data["blockNum"] = self._emptyIntColumn(nRows)
        data["blockId"] = pd.Series([None] * nRows, index=data.index, dtype=object)
        data["blockDayObs"] = self._emptyIntColumn(nRows)
        data["blockSeqNum"] = self._emptyIntColumn(nRows)

    def _warnNoCheckpointColumn(self):
        """Log that the data has no ``lastCheckpoint`` column to parse."""
        nRows = len(self.data)
        self.log.warning(
            f"Found {nRows} rows of data and no 'lastCheckpoint' column was in the data,"
            " so block data cannot be parsed."
        )

    def augmentDataSlow(self):
        """Parse each row in the data frame individually, pulling the
        information out into its own columns.

        If the data has no ``lastCheckpoint`` column a warning is logged and
        empty block columns are added instead of raising.
        """
        if "lastCheckpoint" not in self.data.columns:
            self._warnNoCheckpointColumn()
            self._addEmptyBlockColumns()
            return

        data = self.data
        blockNums = []
        blockIds = []
        blockDayObss = []
        blockSeqNums = []

        # Collect one value per row and assign whole columns afterwards; this
        # is positional, so rows sharing an index label cannot clobber each
        # other the way per-cell ``.loc[index]`` writes would.
        for rowStr in data["lastCheckpoint"]:
            blockNum = blockId = blockDayObs = blockSeqNum = None

            if isinstance(rowStr, str):  # missing checkpoints are not strings
                blockMatch = re.search(self._BLOCK_PATTERN, rowStr)
                if blockMatch:
                    blockNum = int(blockMatch.group(1))

                blockIdMatch = re.search(self._BLOCK_ID_PATTERN, rowStr)
                if blockIdMatch:
                    blockId = blockIdMatch.group(0)
                    parts = blockId.split("_")
                    if len(parts) > self._DAY_OBS_FIELD:
                        blockDayObs = int(parts[self._DAY_OBS_FIELD])
                    if len(parts) > self._SEQ_NUM_FIELD:
                        blockSeqNum = int(parts[self._SEQ_NUM_FIELD])

            blockNums.append(blockNum)
            blockIds.append(blockId)
            blockDayObss.append(blockDayObs)
            blockSeqNums.append(blockSeqNum)

        data["blockNum"] = pd.array(blockNums, dtype="Int64")
        data["blockId"] = pd.Series(blockIds, index=data.index, dtype=object)
        data["blockDayObs"] = pd.array(blockDayObss, dtype="Int64")
        data["blockSeqNum"] = pd.array(blockSeqNums, dtype="Int64")

    def _splitFieldAsInt(self, blockIdSplit, position):
        """Return one field of the split blockIds as a nullable integer column.

        Parameters
        ----------
        blockIdSplit : `pandas.DataFrame`
            The result of splitting the ``blockId`` column on ``"_"`` with
            ``expand=True``.
        position : `int`
            The field to extract.

        Returns
        -------
        column : `pandas.Series` or `pandas.arrays.IntegerArray`
            The field as ``Int64``; all missing if no blockId has that field.
        """
        if position in blockIdSplit.columns:
            return blockIdSplit[position].astype(float).astype(pd.Int64Dtype())
        return self._emptyIntColumn(len(blockIdSplit))

    def augmentData(self):
        """Parse the dataframe using vectorized methods, pulling the
        information out into its own columns.

        This method is much faster for large dataframes than augmentDataSlow,
        but is also much harder to maintain/debug, as the vectorized regexes
        are hard to work with, and to know which row is causing problems.
        """
        if "lastCheckpoint" not in self.data.columns:
            self._warnNoCheckpointColumn()
            # add the columns that would have been added for consistency
            self._addEmptyBlockColumns()
            return

        data = self.data
        col = data["lastCheckpoint"]
        data["blockNum"] = (
            col.str.extract(self._BLOCK_PATTERN, expand=False).astype(float).astype(pd.Int64Dtype())
        )
        # str.extract needs a capture group, so wrap the whole id pattern
        data["blockId"] = col.str.extract(f"({self._BLOCK_ID_PATTERN})", expand=False)

        # Each field is looked up on its own: checking only the highest column
        # label would raise a KeyError for ids that have a dayObs field but no
        # seqNum field.
        blockIdSplit = data["blockId"].str.split("_", expand=True)
        data["blockDayObs"] = self._splitFieldAsInt(blockIdSplit, self._DAY_OBS_FIELD)
        data["blockSeqNum"] = self._splitFieldAsInt(blockIdSplit, self._SEQ_NUM_FIELD)

    def _listColumnValues(self, column, removeNone=True):
        """Get all the different values for the specified column, as a list.

        Parameters
        ----------
        column : `str`
            The column to get the values for.
        removeNone : `bool`
            Kept for interface compatibility. Missing values (including
            ``None``) are always dropped before the values are collected, so
            this flag has no effect on the result.

        Returns
        -------
        values : `list`
            The sorted, de-duplicated values for the specified column.
        """
        return sorted(set(self.data[column].dropna()))

    def getBlockNums(self):
        """Get the block numbers which were run on the specified dayObs.

        Returns
        -------
        blockNums : `list` of `int`
            The blocks which were run on the specified dayObs.
        """
        return self._listColumnValues("blockNum")

    def getSeqNums(self, block):
        """Get the seqNums for the specified block.

        Note that, unlike `getRows`, this does not filter on ``blockDayObs``,
        so it can include sequence numbers for which `getBlockInfo` finds no
        rows on this dayObs.

        Parameters
        ----------
        block : `int`
            The block number to get the events for.

        Returns
        -------
        seqNums : `list` of `int`
            The sequence numbers for the specified block.
        """
        isBlock = (self.data["blockNum"] == block).fillna(False)
        seqNums = self.data.loc[isBlock, "blockSeqNum"]
        # block header rows have no blockId or seqNum, but do have a blockNum
        # so appear here, so drop the nans as they don't relate to an actual
        # run of a block
        return sorted(set(seqNums.dropna()))

    def getRows(self, block, seqNum=None):
        """Get all rows of data which relate to the specified block.

        If the seqNum is specified, only the rows for that sequence number are
        returned, otherwise all the rows relating to any block execution that
        day are returned. If the specified seqNum doesn't occur on the current
        day, an empty dataframe is returned.

        Parameters
        ----------
        block : `int`
            The block number to get the events for.
        seqNum : `int`, optional
            The sequence number, if specified, to get the row data for. If not
            specified, all data for the specified block is returned.

        Returns
        -------
        data : `pandas.DataFrame`
            The row data.
        """
        # Because we query for a whole dayObs, but BLOCKs can overlap the day
        # start/end, it's possible for the block's blockDayObs not to be the
        # same as self.dayObs around the beginning or end of the day, so
        # require blockDayObs == self.dayObs when getting the relevant rows.
        data = self.data
        mask = (data["blockNum"] == block) & (data["blockDayObs"] == self.dayObs)
        # comparisons against missing values are missing; treat them as False
        rowsForBlock = data[mask.fillna(False)]
        if rowsForBlock.empty:
            self.log.warning(f"No rows found for {block=} on dayObs={self.dayObs}")
        if seqNum is None:
            return rowsForBlock
        return rowsForBlock[(rowsForBlock["blockSeqNum"] == seqNum).fillna(False)]

    def printBlockEvolution(self, block, seqNum=None):
        """Display the evolution of the specified block.

        If the seqNum is specified, the evolution of that specific block
        execution is displayed, otherwise all executions of that block are
        printed.

        Parameters
        ----------
        block : `int`
            The block number to get the events for.
        seqNum : `int`, optional
            The sequence number, if specified, to print the evolution of. If
            not specified, all sequence numbers for the block are printed.
        """
        if seqNum is None:
            seqNums = self.getSeqNums(block)
        else:
            seqNums = [seqNum]
        print(f"Evolution of BLOCK {block} for dayObs={self.dayObs} {seqNum=}:")
        for currentSeqNum in seqNums:
            blockInfo = self.getBlockInfo(block, currentSeqNum)
            if blockInfo is None:  # getBlockInfo has already reported this
                continue
            print(blockInfo, "\n")

    def getBlockInfo(self, block, seqNum):
        """Get the block info for the specified block.

        Parses the rows relating to this block execution, and returns
        the information as a ``BlockInfo`` instance.

        Parameters
        ----------
        block : `int`
            The block number.
        seqNum : `int`
            The sequence number.

        Returns
        -------
        blockInfo : `lsst.summit.utils.blockUtils.BlockInfo` or `None`
            The block info, or `None` (after printing a message) if there are
            no rows for this block and seqNum on this dayObs.

        Raises
        ------
        RuntimeError
            Raised if the rows for this execution carry more than one blockId.
        """
        rows = self.getRows(block, seqNum=seqNum)
        if rows.empty:
            print(f"No {seqNum=} on dayObs={self.dayObs} for {block=}")
            return None

        blockIds = set()
        tickets = set()
        salIndices = set()
        statePoints = []
        sitcomPattern = r"SITCOM-(\d+)"

        for _, row in rows.iterrows():
            salIndices.add(row["salIndex"])
            blockIds.add(row["blockId"])

            tickets.update(re.findall(sitcomPattern, row["lastCheckpoint"]))

            statePoints.append(
                ScriptStatePoint(
                    time=efdTimestampToAstropy(row["private_efdStamp"]),
                    state=ScriptState(row["state"]),
                    reason=row["reason"],
                )
            )

        # a single block execution must map onto exactly one blockId
        if len(blockIds) > 1:
            raise RuntimeError(f"Found multiple blockIds ({blockIds}) for {seqNum=}")
        blockId = blockIds.pop()

        blockInfo = BlockInfo(
            blockNumber=block,
            blockId=blockId,
            dayObs=self.dayObs,
            seqNum=seqNum,
            begin=efdTimestampToAstropy(rows.iloc[0]["private_efdStamp"]),
            end=efdTimestampToAstropy(rows.iloc[-1]["private_efdStamp"]),
            salIndices=sorted(salIndices),
            tickets=[f"SITCOM-{ticket}" for ticket in sorted(tickets)],
            states=statePoints,
        )

        return blockInfo

    def getEventsForBlock(self, events, block, seqNum):
        """Get the events which occurred during the specified block.

        Parameters
        ----------
        events : `list` of `lsst.summit.utils.tmaUtils.TMAEvent`
            The list of candidate events.
        block : `int`
            The block number to get the events for.
        seqNum : `int`
            The sequence number to get the events for.

        Returns
        -------
        events : `list` of `lsst.summit.utils.tmaUtils.TMAEvent`
            The events. Empty if the block/seqNum has no rows on this dayObs.
        """
        blockInfo = self.getBlockInfo(block, seqNum)
        if blockInfo is None:
            # no such execution on this dayObs, so no window to match against
            return []
        begin = blockInfo.begin
        end = blockInfo.end

        # each event's end being past the begin time and their
        # starts being before the end time means we get all the
        # events in the window and also those that overlap the
        # start/end too
        return [e for e in events if e.end >= begin and e.begin <= end]

In [None]:
# Example of grabbing manual information on one block/sequence

#block_parser = BlockParser(20240205)
#print("Sequences Run:", block_parser.getSeqNums(219))
#block_parser.getBlockInfo(219,1)

In [None]:
# Produce a Pandas data frame with block information.  More information can be
# added as needed by modifying the function and following the pattern
# in the code.

def day_obs_dataframe(day_obs):
    """Summarise every block execution of one day as a pandas dataframe.

    Loops over the blocks run on ``day_obs`` and the sequence numbers of each
    block, and records one row per execution.

    Parameters
    ----------
    day_obs : `int`
        The dayObs to summarise.

    Returns
    -------
    data_frame : `pandas.DataFrame`
        One row per block execution, with the columns ``Block``, ``Sequence``,
        ``Start``, ``Stop`` (both taken from the ``.iso`` attribute of the
        block's begin/end) and ``Completion Status`` (the last entry of the
        block's ``states``).
    """
    columns = ['Block', 'Sequence', 'Start', 'Stop', 'Completion Status']
    entry_list = []

    block_parser = BlockParser(day_obs)

    for block_id in block_parser.getBlockNums():
        for seq_id in block_parser.getSeqNums(block_id):
            info = block_parser.getBlockInfo(block_id, seq_id)
            if info is None:
                # getSeqNums is not restricted to this dayObs but getBlockInfo
                # is, so a sequence can be listed without having any rows on
                # this day; there is nothing to report for it.
                continue

            final_state = info.states[-1]
            entry_list.append(
                [block_id, seq_id, info.begin.iso, info.end.iso, final_state]
            )

    return pd.DataFrame(entry_list, columns=columns)


In [None]:
# Show an example data frame: the block/sequence summary for the date_obs
# set at the top of the notebook.

day_obs_dataframe(int(date_obs))


In [None]:
def day_obs_report(day_obs, event_maker=None):
    """Print a report of the blocks and sequences run on one day.

    For every block execution the block number, sequence number and its
    states are printed, followed by any TMA events which overlap it.

    Parameters
    ----------
    day_obs : `int`
        The dayObs to report on.
    event_maker : `lsst.summit.utils.tmaUtils.TMAEventMaker`, optional
        The event maker used to get the TMA events. If not specified, a new
        one is created, so the function does not depend on a notebook-level
        variable having been defined in an earlier cell.
    """
    if event_maker is None:
        event_maker = TMAEventMaker()

    block_parser = BlockParser(day_obs)
    tma_events = event_maker.getEvents(day_obs)

    print(f'SUMMARY REPORT FOR DAYOBS: {day_obs} \n')

    for block_id in block_parser.getBlockNums():
        print('BLOCK:SEQ \t STATES')

        for seq_id in block_parser.getSeqNums(block_id):
            info = block_parser.getBlockInfo(block_id, seq_id)
            if info is None:
                # getSeqNums is not restricted to this dayObs but getBlockInfo
                # is, so a listed sequence may have no rows on this day.
                continue

            state_string = ' '.join(str(state) for state in info.states)
            print(f'{info.blockNumber}:{info.seqNum} \t\t {state_string}')

            # Also print any TMA events for this block/sequence
            events = block_parser.getEventsForBlock(tma_events, block_id, seq_id)
            if events:
                print(events)

        print('\n')

The HTML magic below stops the example report from line-wrapping, since each line of the report can be quite long.

In [None]:
%%html
<style>
/* Keep text in <pre> elements inside div.jp-OutputArea-output on one line,
   so the long report lines printed below are not wrapped. */
div.jp-OutputArea-output pre {
    white-space: pre;
}
</style>

In [None]:
# Print the block/sequence report for the date_obs set at the top of the
# notebook.
day_obs_report(int(date_obs))