# This notebook is very much a work in progress, containing several separate attempts and fragments that still need to be integrated together.

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import astropy.units as u

import warnings

from adler.objectdata.AdlerPlanetoid import AdlerPlanetoid

from adler.science.AvgMagModel import AvgMagModel
from adler.science.PhaseCurve import PhaseCurve

from adler.objectdata.Observations import Observations
from adler.objectdata.MPCORB import MPCORB
from adler.objectdata.SSObject import SSObject
from adler.objectdata.AdlerData import (
    AdlerData,
    FilterDependentAdler,
    AdlerSourceFlags,
    VALID_AVG_MAG_MODELS,
    VALID_PHASE_MODELS,
)
from adler.objectdata.objectdata_utilities import get_data_table, get_from_table, mpc_file_preprocessing
import time

In [None]:
# Per-filter plotting styles, each keyed by filter name in the order u, g, r, i, z, y.

# Hex colour per filter (the name indicates these are intended for a white background)
plot_filter_colors_white_background = dict(
    u="#1600ea",
    g="#31de1f",
    r="#b52626",
    i="#370201",
    z="#ba52ff",
    y="#61a2b3",
)

# Marker symbol per filter
plot_symbols = dict(u="o", g="^", r="v", i="s", z="*", y="p")

# Line style per filter: either a style string or a dash tuple of the form (offset, (on, off, ...))
plot_linestyles = dict(
    u="--",
    g=(0, (3, 1, 1, 1)),
    r="-.",
    i="-",
    z=(0, (3, 1, 1, 1, 1, 1)),
    y=":",
)

In [None]:
# Locate the input database relative to the directory that contains the "lsst-adler"
# checkout, derived from the current working directory.
# root_filepath = "/home/astro-sobrien/"
# root_filepath = "/Volumes/astro-sobrien/home/astro-sobrien/"
# root_filepath = "/Users/seanobrien/Documents/Adler/"
current_path = os.getcwd()
# Everything before the first occurrence of "lsst-adler" in the working directory
root_filepath = current_path.split("lsst-adler")[0]

# mpc_test_db_filename = f"{root_filepath}lsst-adler/tests/data/mpc_obs_sbn_testing_database.sqlite"
# rubin_sql_filename_oct = f"{root_filepath}rubin_251002.sqlite"
# rubin_sql_filename_nov = f"{root_filepath}rubin_251105.sqlite"
# rubin_sql_filename_nov2 = f"{root_filepath}rubin_251111.sqlite"
# rubin_sql_filename_nov3 = f"{root_filepath}rubin_251115.sqlite"

input_sql_file = f"{root_filepath}lsst-adler/tests/data/mpc_obs_sbn_testing_database.sqlite"
schema = "MPC"

# input_sql_file = f"{root_filepath}lsst-adler/tests/data/testing_database.db"
# schema = 'dp03_catalogs_10yr'

# sqlite3.connect() creates a new, empty database when the file does not exist, which
# would only surface later as "no such table" errors. Fail early with a clear message.
if not os.path.isfile(input_sql_file):
    raise FileNotFoundError(
        f"Input database not found: {input_sql_file} "
        "(is the notebook being run from inside the lsst-adler repository?)"
    )

input_conn = sqlite3.connect(input_sql_file)
input_cur = input_conn.cursor()

In [None]:
import logging
from astropy.time import Time

# Timestamp for this run's log file; ":" is replaced with "_" before it is used in a filename
log_timestamp = Time.now().isot.replace(":", "_")

# Build the log path once so the configured file and the confirmation message cannot disagree,
# and make sure the log directory exists (the file handler cannot be opened otherwise).
log_dir = os.path.join(root_filepath, "adler_test_logs")
os.makedirs(log_dir, exist_ok=True)
log_filename = os.path.join(log_dir, f"adler_test_{log_timestamp}.log")

# --- Reset logging system (important for Jupyter) ---
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

# --- Configure root logger to log only to a file ---
logging.basicConfig(
    filename=log_filename,
    filemode="w",
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,  # capture INFO, DEBUG, etc.
)

# Optional confirmation (reports the path that was actually configured above)
logging.getLogger().info("Root logging configured to write to %s", log_filename)

logger = logging.getLogger(__name__)

# Attempt to consolidate things

In [None]:
from tqdm import tqdm
import adler.utilities.science_utilities as sci_utils
import math


# Observation columns holding the magnitude and its uncertainty
# (author's note: reduced_mag is not currently populated for the MPC file format)
mag_col, magErr_col = "reduced_mag", "magErr"

# Filters to analyse (only ugri are present currently, with very few u observations)
# filter_list=['u', 'g', 'r', 'i', 'z', 'y']
filter_list = ["g", "r"]

# Whether diagnostic plots are produced during processing
make_plots = True

# Queries returning the earliest and latest observation time, for each supported schema
_obstime_bounds_sql = {
    "MPC": (
        "SELECT MIN(mjd_tai) FROM obs_sbn",
        "SELECT MAX(mjd_tai) FROM obs_sbn",
    ),
    "dp03_catalogs_10yr": (
        "SELECT MIN(midPointMjdTai) FROM diaSource",
        "SELECT MAX(midPointMjdTai) FROM diaSource",
    ),
}

if schema not in _obstime_bounds_sql:
    print("Schema not recognised")
    raise ValueError("Schema not recognised")

# Round the time span outwards to whole days
_earliest_sql, _latest_sql = _obstime_bounds_sql[schema]
min_obstime_mjd = math.floor(pd.read_sql_query(_earliest_sql, input_conn).iloc[0].values[0])
max_obstime_mjd = math.ceil(pd.read_sql_query(_latest_sql, input_conn).iloc[0].values[0])

print(min_obstime_mjd)
print(max_obstime_mjd)

from astropy.stats import sigma_clip as astropy_sigma_clip

# Sigma threshold used when sigma clipping observations
sig_clip_val = 3

# Thresholds for magnitude changes.
# diff_cuts_arr = np.array([1, 2, 3])
# std_cuts_arr = np.array([5, 6])
# Single thresholds are used and the residual value is recorded, to simplify things. Users can
# always check their own threshold if it is higher than these (or rerun at a lower threshold).
diff_cut, std_cut = 1.5, 5

# Total number of nights of data to retrieve.
# By default both the previous 30 nights and the previous 7 nights are used, allowing
# different checks for outliers.
data_timespan_arr = np.array([30, 7])

# Number of nights treated as "new observations", allowing checks for
# "sustained outliers/outbursts"
n_new_nights_arr = np.array([1, 3])

In [None]:
# Mark the start of a new processing run in the log
for _banner_line in (
    "----------------------------------------",
    "New loop started",
    "----------------------------------------",
):
    logger.info(_banner_line)

# MJD of the night to process, for each supported schema
_process_mjd_by_schema = {
    "MPC": 60799.5,
    "dp03_catalogs_10yr": 61590.5,
}

if schema not in _process_mjd_by_schema:
    raise ValueError("Schema not recognised")

process_mjd_arr = np.array([_process_mjd_by_schema[schema]])

# The same model is used for both schemas
# model_name = "HG12_Pen16" #Not currently all working for PhaseCurves
model_name = "median"

In [None]:
# Consolidated outlier search for a single processing night.
#
# For every object observed during the night ending at process_mjd, and for each combination
# of data window (data_timespan_arr), filter and number of "new" nights (n_new_nights_arr):
#   1. fit the chosen model to the sigma-clipped previous observations,
#   2. flag individual new observations with sci_utils.outlier_diff and write them to the
#      output database as AdlerSourceFlags,
#   3. if the new observations span more than one night, compare the average magnitude of the
#      new observations against the model to look for a sustained change,
#   4. write the AdlerData summary row and (optionally) save a diagnostic plot.

# for process_mjd in process_mjd_arr: #option to loop through multiple days
process_mjd = process_mjd_arr[0]
start_of_night_mjd = process_mjd - 1

# Objects with at least one observation during the night being processed.
# The time bounds are bound as query parameters (plain floats) instead of being interpolated
# into the SQL as quoted strings, so the comparison never depends on text-to-number coercion.
night_bounds = (float(start_of_night_mjd), float(process_mjd))

if schema == "MPC":
    obj_df = pd.read_sql_query(
        "SELECT DISTINCT provid FROM obs_sbn WHERE mjd_tai BETWEEN ? AND ?",
        input_conn,
        params=night_bounds,
    )
    unique_obj_ids = obj_df.provid.to_numpy()
elif schema == "dp03_catalogs_10yr":
    obj_df = pd.read_sql_query(
        "SELECT DISTINCT ssObjectId FROM diaSource WHERE midPointMjdTai BETWEEN ? AND ?",
        input_conn,
        params=night_bounds,
    )
    unique_obj_ids = obj_df.ssObjectId.to_numpy()
else:
    print("Schema not recognised")
    raise ValueError("Schema not recognised")

logger.info(f"{len(unique_obj_ids)} objects to analyze")

if len(unique_obj_ids) == 0:
    logger.info(f"No objects to process for {process_mjd}")
    raise ValueError(f"No objects to process for {process_mjd}")
    # continue

output_dir = f"outputs_consolidated_{schema}_{process_mjd}"

os.makedirs(output_dir, exist_ok=True)

output_db = f"{output_dir}/adler_output.sqlite"

# Remove the output DB if it exists
# TODO probably remove this once testing done?
if os.path.exists(output_db):
    os.remove(output_db)

for ssObjectId in tqdm(unique_obj_ids, desc=f"Objects to process for {process_mjd}"):
    for data_timespan in data_timespan_arr:
        # NOTE(review): the MPC constructor is used regardless of `schema`; the
        # dp03_catalogs_10yr case presumably needs a different constructor - confirm.
        planetoid = AdlerPlanetoid.construct_from_mpc_obs_sbn(
            ssObjectId=ssObjectId,
            sql_filename=input_sql_file,
            filter_list=filter_list,
            date_range=[process_mjd - data_timespan, process_mjd],
        )

        for filt in planetoid.filter_list:
            df_obs = sci_utils.get_df_obs_filt(planetoid, filt=filt)

            err_flag = df_obs[magErr_col].isnull().all()
            if err_flag:
                logger.info("All magErr values are NaNs, proceed with caution")
            else:
                # Remove observations with large errorbars
                magErr_mask = sci_utils.large_magErr_mask(df_obs)
                df_obs = df_obs[magErr_mask]

            for n_new_nights in n_new_nights_arr:
                # Set the modelId here, described by the bare minimum information for replication (I hope)
                planetoid.AdlerData.modelId = (
                    f"{ssObjectId}_{process_mjd:.1f}_{data_timespan}n_{n_new_nights}n_{model_name}"
                )
                # Split into previous observations and observations from the most recent night(s)
                df_obs_old, df_obs_new, *_ = sci_utils.split_obs(
                    df_obs, process_mjd=process_mjd, n_new_nights=n_new_nights
                )
                logger.info(
                    "Previous observations (date < {}): {}".format(
                        process_mjd - n_new_nights, len(df_obs_old)
                    )
                )
                logger.info(
                    "New observations ({} <= date < {}): {}".format(
                        process_mjd - n_new_nights, process_mjd, len(df_obs_new)
                    )
                )
                if len(df_obs_old) < 2:
                    logger.info(
                        "Insufficient number of previous observations, continuing to next band/object"
                    )
                    continue
                if len(df_obs_new) == 0:
                    logger.info(f"No new observations in {filt}, continuing to next band/object")
                    continue

                # Sigma clip old observations
                sig_clip_mask = astropy_sigma_clip(df_obs_old[mag_col], sigma=sig_clip_val).mask
                df_obs_old = df_obs_old[~sig_clip_mask].copy()
                if len(df_obs_old) < 2:
                    logger.info(
                        "Insufficient number of previous observations after sigma clipping, continuing to next band/object"
                    )
                    continue

                # Populate summary AdlerData params for this filter and particular model
                ad_params = {}
                ad_params["phaseAngle_min"] = np.amin(df_obs_old["phaseAngle"])  # * u.d
                ad_params["phaseAngle_range"] = np.ptp(df_obs_old["phaseAngle"])  # * u.d
                ad_params["observationTime_max"] = np.amax(df_obs_old["midPointMjdTai"])  # * u.d
                ad_params["arc"] = np.ptp(df_obs_old["midPointMjdTai"])  # * u.d
                ad_params["nobs"] = len(df_obs_old)
                ad_params["modelFitMjd"] = Time.now().mjd

                # Fit model and compute the residuals of the new observations against it
                if model_name in VALID_AVG_MAG_MODELS:
                    model = AvgMagModel().InitModelObs(
                        mag=df_obs_old[mag_col], magErr=df_obs_old[magErr_col], model_name=model_name
                    )
                    ad_params.update(model.__dict__)  # store model values in ad_params
                    planetoid.AdlerData.populate_avg_mag_parameters(filt, **ad_params)

                    res = np.array(df_obs_new[mag_col]) - model.avg_mag
                elif model_name in VALID_PHASE_MODELS:
                    # TODO once SSObject is populated we can take the absolute magnitude from there as the initial guess (this is not currently the case for the MPC obs_sbn file)
                    model = PhaseCurve(H=np.amin(df_obs_old["reduced_mag"]) * u.mag, model_name=model_name)
                    pc_fit = model.FitModel(
                        phase_angle=np.array(df_obs_old["phaseAngle"]) * u.deg,
                        reduced_mag=np.array(df_obs_old["reduced_mag"]) * u.mag,
                        # 1-D, matching phase_angle and reduced_mag (previously wrapped in an
                        # extra list, which produced a (1, N) array)
                        mag_err=np.array(df_obs_old[magErr_col]) * u.mag,
                    )
                    model = model.InitModelSbpy(pc_fit)
                    ad_params.update(model.__dict__)  # store model values in ad_params
                    planetoid.AdlerData.populate_phase_parameters(filt, **ad_params)

                    res = np.array(df_obs_new[mag_col]) - model.ReducedMag(df_obs_new["phaseAngle"])
                else:
                    logger.error(f"Model '{model_name}' not recognised")
                    raise ValueError(f"Model '{model_name}' not recognised")

                filter_index = planetoid.AdlerData.filter_list.index(filt)

                # Check for individual outlying observations
                # Simple magnitude difference
                diff_cut_outlier_arr = sci_utils.outlier_diff(res, diff_cut=diff_cut)
                df_obs_new["mag_diff"] = np.zeros(shape=len(df_obs_new), dtype=float)
                df_obs_new.loc[diff_cut_outlier_arr, "mag_diff"] = res[diff_cut_outlier_arr]
                planetoid.AdlerData.filter_dependent_values[filter_index].n_outlier = np.count_nonzero(
                    diff_cut_outlier_arr
                )
                # Capture the outlier rows now, while the mask still lines up with df_obs_new.
                # df_obs_new is sorted and sigma clipped further down, after which the
                # positional mask no longer matches its rows.
                df_outliers = df_obs_new.loc[diff_cut_outlier_arr].copy()
                source_flags_obj = AdlerSourceFlags.construct_from_data_table(
                    planetoid.ssObjectId,
                    filt,
                    planetoid.AdlerData.modelId,
                    df_outliers,
                )
                source_flags_obj.write_flags_to_database(output_db)
                # I've written a populate_source_flags function that integrates these into the AdlerData.FilterDependentAdler.source_flags list
                # planetoid.AdlerData.populate_source_flags(filt, planetoid.AdlerData.modelId, df_obs_new.loc[diff_cut_outlier_arr])

                # TODO std_diff
                # Sigma_diff check looking for how many uncertainties a given point is from the model
                # Need to figure out the terminology, the check will be using science_utilities.outlier_sigma_diff

                # Sustained outburst checks
                # TODO how will sustained checks work with PhaseCurve models
                # NOTE(review): the code below reads model.avg_mag, which is only known to exist
                # for the AvgMagModel branch above - confirm before enabling phase-curve models.

                # Identify timegaps in case there's only one night of new data (in the case where we select >1 new night)
                df_obs_new.sort_values(by="midPointMjdTai", inplace=True)
                time_gaps = sci_utils.apparition_gap_finder(df_obs_new.midPointMjdTai.to_numpy(), dx=0.5)
                if len(time_gaps) == 0:
                    # If there is only one night of new data, we continue to the next band/object
                    # NOTE(review): this also skips write_row_to_database below, so no AdlerData
                    # row is written for this modelId in that case - confirm this is intended.
                    logger.info(
                        "Insufficient number of nights with new observations, continuing to next band/object"
                    )
                    continue

                # Sigma clip new observations
                sig_clip_mask_new = astropy_sigma_clip(df_obs_new[mag_col], sigma=sig_clip_val).mask
                df_obs_new = df_obs_new[~sig_clip_mask_new].copy()

                if len(df_obs_new) == 0:
                    logger.info(
                        f"No new observations in {filt} after sigma clipping, continuing to next band/object"
                    )
                    continue

                # Calculate average magnitude of the new observations (slightly obtuse to use the model here but future proofing maybe)
                new_obs_model = AvgMagModel().InitModelObs(
                    mag=df_obs_new[mag_col], magErr=df_obs_new[magErr_col], model_name=model_name
                )
                mag_change = np.abs(new_obs_model.avg_mag - model.avg_mag)
                # Difference in magnitude space
                sustained_diff_cut_flag = mag_change > diff_cut
                # Check if sustained outlier detected and record the difference between the old and new avg_mag values in AdlerData
                if sustained_diff_cut_flag:
                    planetoid.AdlerData.filter_dependent_values[filter_index].sustained_outlier = mag_change
                else:
                    logger.info("No sustained outlier detected")

                # TODO sustained difference in sigma space

                # Write out summary AdlerData information
                # TODO should this also have an option to write out source_flags instead of the call above so it's a one-liner?
                planetoid.AdlerData.write_row_to_database(output_db, write_model_data=False)

                # TODO make optional plotting routine into a function
                if make_plots:
                    # Up-to-date for the mag_diff/sustained_mag_diff checks
                    # Identify any outliers detected
                    # tmp_master_outlier_flag = (diff_cut_outlier_arr) | (std_cut_outlier_arr)
                    fig, ax = plt.subplots()
                    ax.errorbar(
                        df_obs_old["midPointMjdTai"],
                        df_obs_old[mag_col],
                        df_obs_old[magErr_col],
                        ls="",
                        marker=".",
                        color="k",
                        label="Previous observations",
                    )
                    ax.errorbar(
                        df_obs_new["midPointMjdTai"],
                        df_obs_new[mag_col],
                        df_obs_new[magErr_col],
                        ls="",
                        marker=".",
                        color="c",
                        label="New observations",
                    )
                    # Plot the outlier rows captured at detection time (the same rows that were
                    # written to AdlerSourceFlags) rather than re-indexing the sorted and
                    # clipped df_obs_new with a stale mask.
                    ax.errorbar(
                        df_outliers["midPointMjdTai"],
                        df_outliers[mag_col],
                        df_outliers[magErr_col],
                        ls="",
                        marker="x",
                        color="b",
                        label="Outliers",
                    )
                    ax.axhline(model.avg_mag, c="k", ls="-")
                    ax.axhline(new_obs_model.avg_mag, c="c", ls="--")
                    ax.invert_yaxis()
                    ax.set_xlabel("Time [MJD]")
                    ax.set_ylabel(f"{filt}-band Reduced Magnitude")
                    fig.savefig(
                        f"{output_dir}/{planetoid.AdlerData.modelId}_{filt}_outliers.png",
                        bbox_inches="tight",
                        pad_inches=0.05,
                    )
                    plt.close(fig)

# TODO Function for summary stats of all outliers detected across all objects
# TODO multi-band outliers identified in this step?

In [None]:
# Display the array of object identifiers currently held in unique_obj_ids
unique_obj_ids

In [None]:
# Single-object example: load one object's observations and pull out one filter
obj_id, filt = "2025 MX40", "r"

# process_mjd and data_timespan are whatever values earlier cells left behind
_window_start = process_mjd - data_timespan
planetoid = AdlerPlanetoid.construct_from_mpc_obs_sbn(
    ssObjectId=obj_id,
    sql_filename=input_sql_file,
    filter_list=filter_list,
    date_range=[_window_start, process_mjd],
)

# Fresh AdlerData container for this object, using the planetoid's filter list
adler_data = AdlerData(obj_id, planetoid.filter_list)

# Observations of the object in the chosen filter
df_obs = sci_utils.get_df_obs_filt(planetoid, filt=filt)

In [None]:
# Build an average-magnitude model from the reduced magnitudes; only the magnitudes are
# passed, so every other InitModelObs argument is left at its default
model = AvgMagModel().InitModelObs(mag=df_obs["reduced_mag"])

In [None]:
# Display the attributes stored on the fitted model instance
model.__dict__

# Expanded format:

TODO new format:

(currently run as a loop over nights, but once scripted it will be given a single night to process)

include flag for MPC vs DP0.3 (for testing)

for object in objects:
    for filt in filters:
        for n_new_nights in [1,3,7]:
            sigma clip old observations, calculate median
            check for 1/2/3 mag outliers
            check for 3/4/5 sigma outliers


calculate summary stats;
N objects with outliers
N outliers
distribution of outliers per object and per filter

other things to consider:
should consecutive outliers in sigma space trigger an additional check? (e.g. are 95% of the new points outliers?)
are the outliers consistent across multiple filters
minimum number of data points (different for different checks)
minimum number of nights (different for different checks)
if a path to an existing database is provided, then populate AdlerData and the outliers (from AdlerSourceFlags) during processing, and write to/update that database


In [None]:
# Number of days of data to retrieve (i.e. previous 30 nights)
data_timespan = 30

from tqdm import tqdm
import adler.utilities.science_utilities as sci_utils
import math


# Set column to use for magnitude (we don't currently have reduced_mag populated for MPC file format)
mag_col = "reduced_mag"
magErr_col = "magErr"

# Set filter list (only ugri present currently, very few u, but keeping in for completeness)
filter_list = ["u", "g", "r", "i", "z", "y"]
# filter_list=['g', 'r', 'i']

# Whether diagnostic plots are produced during processing
make_plots = True

# Earliest and latest observation time in the input database, rounded outwards to whole days
if schema == "MPC":
    min_obstime_mjd = math.floor(
        pd.read_sql_query("SELECT MIN(mjd_tai) FROM obs_sbn", input_conn).iloc[0].values[0]
    )
    max_obstime_mjd = math.ceil(
        pd.read_sql_query("SELECT MAX(mjd_tai) FROM obs_sbn", input_conn).iloc[0].values[0]
    )
elif schema == "dp03_catalogs_10yr":
    min_obstime_mjd = math.floor(
        pd.read_sql_query("SELECT MIN(midPointMjdTai) FROM diaSource", input_conn).iloc[0].values[0]
    )
    max_obstime_mjd = math.ceil(
        pd.read_sql_query("SELECT MAX(midPointMjdTai) FROM diaSource", input_conn).iloc[0].values[0]
    )
else:
    # Stop here rather than carrying on with min_obstime_mjd/max_obstime_mjd undefined
    print("Schema not recognised")
    raise ValueError("Schema not recognised")

In [None]:
from astropy.stats import sigma_clip as astropy_sigma_clip

# Value passed as the `sigma` argument of astropy_sigma_clip when clipping observations
sig_clip_val = 3

In [None]:
# Outlier thresholds: magnitude differences (diff_cuts_arr) and sigma multiples (std_cuts_arr)
# std_cuts_arr = np.array([3, 4, 5])
diff_cuts_arr, std_cuts_arr = np.array([1, 2, 3]), np.array([5, 6])

# Numbers of most recent nights to treat as "new observations," which allows checking
# for "sustained outliers/outbursts"
n_new_nights_arr = np.array([1, 3, 7])

In [None]:
# Templates for the outlier flag column names; "{}" is filled with the number of new nights
night_mag_string = "outlier_{}night_mag"
night_std_string = "outlier_{}night_sigma"

# One magnitude-difference column and one sigma column per n_new_nights value, in that order
outlier_cols_list = []
for n_new_nights in n_new_nights_arr:
    outlier_cols_list.extend(
        (night_mag_string.format(n_new_nights), night_std_string.format(n_new_nights))
    )

# TODO include this in function for initialising the output Adler DB
# Column definitions for the flag columns, each prefixed with ", " so they can be appended
# directly after the fixed columns in the CREATE TABLE statement
sql_outlier_cols = "".join(f", {col} INTEGER" for col in outlier_cols_list)

# midPointMjdTai is declared REAL (it was INTEGER) because it stores fractional MJD values
adler_flags_create_sql = f"CREATE TABLE AdlerSourceFlags(ssObjectId TEXT, filter_name TEXT, diaSourceId TEXT, midPointMjdTai REAL{sql_outlier_cols})"

In [None]:
# Expanded-format outlier search.
#
# For each processing night: find the objects observed that night, rebuild the output
# database tables, flag outlying new observations per object/filter/n_new_nights at each
# threshold in diff_cuts_arr and std_cuts_arr, write the flags and per-filter nobs to the
# output database, then log summary statistics and save a histogram of observations per band.
logger.info("----------------------------------------")
logger.info("New loop started")
logger.info("----------------------------------------")

# process_mjd_arr = np.arange(min_obstime_mjd-0.5, max_obstime_mjd+1.5, 1)
# process_mjd_arr = np.arange(60795.5, 60799.5, 1)
process_mjd_arr = np.array([60799.5])

for process_mjd in process_mjd_arr:
    start_of_night_mjd = process_mjd - 1

    # Get list of objects with observations from most recent night preceding the process date.
    # The night is selected on the mjd_tai column with bound parameters. This replaces an
    # obstime string comparison built with mjd_to_utc, which is not defined in this notebook.
    obj_df = pd.read_sql_query(
        "SELECT DISTINCT provid FROM obs_sbn WHERE mjd_tai BETWEEN ? AND ?",
        input_conn,
        params=(float(start_of_night_mjd), float(process_mjd)),
    )
    unique_obj_ids = obj_df.provid.to_numpy()
    logger.info(f"{len(unique_obj_ids)} objects to analyze")

    if len(unique_obj_ids) == 0:
        logger.info(f"No objects to process for {process_mjd}")
        continue

    output_dir = f"outputs_lesscols_update_full_{process_mjd}"

    os.makedirs(output_dir, exist_ok=True)

    adler_output_filename = f"{output_dir}/adler_output.sqlite"
    conn_adler_out = sqlite3.connect(adler_output_filename)

    # TODO tidy this up into function
    cur_adler_out = conn_adler_out.cursor()

    # Start from empty output tables for this night
    cur_adler_out.execute("DROP TABLE IF EXISTS AdlerData;")
    cur_adler_out.execute("DROP TABLE IF EXISTS  AdlerSourceFlags;")

    cur_adler_out.execute("CREATE TABLE AdlerData(ssObjectId TEXT, timestamp REAL, PRIMARY KEY (ssObjectId))")
    # added creation of AdlerSourceFlags table
    cur_adler_out.execute(adler_flags_create_sql)

    for obj_id in tqdm(unique_obj_ids, desc=f"Objects to process for {process_mjd}"):
        # Retrieve the data_timespan nights of data leading up to process_mjd
        planetoid = AdlerPlanetoid.construct_from_mpc_obs_sbn(
            ssObjectId=obj_id,
            sql_filename=input_sql_file,
            filter_list=filter_list,
            date_range=[process_mjd - data_timespan, process_mjd],
        )

        adler_data = AdlerData(obj_id, planetoid.filter_list)

        for filt in planetoid.filter_list:
            df_obs = sci_utils.get_df_obs_filt(planetoid, filt=filt)

            # Initialise outlier columns in df_obs
            # TODO Consider how we'll load in previous flags if an existing Adler DB is provided
            for col in outlier_cols_list:
                # Populate outlier columns with zeros
                df_obs[col] = np.zeros(shape=len(df_obs), dtype=int)

            # TODO this may change as no longer checking how many datapoints from previous process
            # Number of observations before any magErr cut is applied
            nobs_nomask = len(df_obs)

            err_flag = df_obs.magErr.isnull().all()
            if err_flag:
                logger.info("All magErr values are NaNs, proceed with caution")
            else:
                # Remove observations with large errorbars
                magErr_percentile_cut = 95  # Value (between 0 and 100) to define the percentile above which we cut data with large magErr values
                magErr_mask = df_obs.magErr <= np.nanpercentile(df_obs.magErr, q=magErr_percentile_cut)
                df_obs = df_obs[magErr_mask]

            for n_new_nights in n_new_nights_arr:
                # Flag columns that belong to this choice of n_new_nights
                night_mag_col = night_mag_string.format(n_new_nights)
                night_std_col = night_std_string.format(n_new_nights)

                # Split into previous observations and observations from the most recent night(s)
                mask = df_obs["midPointMjdTai"] < process_mjd - n_new_nights

                df_obs_old = df_obs[mask].copy()
                df_obs_new = df_obs[~mask].copy()
                # Report the boundary that was actually used for the split above
                logger.info(
                    "Previous observations (date < {}): {}".format(
                        process_mjd - n_new_nights, len(df_obs_old)
                    )
                )
                logger.info(
                    "New observations ({} <= date < {}): {}".format(
                        process_mjd - n_new_nights, process_mjd, len(df_obs_new)
                    )
                )

                if len(df_obs_old) < 2:
                    # Taken from adler_demo.py
                    logger.info(
                        "Insufficient number of previous observations, continuing to next band/object"
                    )
                    continue
                if len(df_obs_new) == 0:
                    logger.info(f"No new observations in {filt}, continuing to next band/object")
                    continue

                # TODO consider how this affects writing out to AdlerData (possibly fine with how it's already setup)
                sig_clip_mask = astropy_sigma_clip(df_obs_old[mag_col], sigma=sig_clip_val).mask

                df_obs_old = df_obs_old[~sig_clip_mask].copy()

                if len(df_obs_old) < 2:
                    # Taken from adler_demo.py
                    logger.info(
                        "Insufficient number of previous observations after sigma clipping, continuing to next band/object"
                    )
                    continue

                # Residuals of the new observations against the median of the previous ones
                median_mag = np.median(df_obs_old[mag_col])
                # TODO consider storing relevant res values
                res = np.array(df_obs_new[mag_col]) - median_mag

                # Each threshold overwrites the flag of the rows it selects, so (assuming the
                # thresholds are listed in ascending order) the column ends up holding the
                # largest threshold exceeded, with 0 meaning "not flagged".
                for diff_cut in diff_cuts_arr:
                    diff_cut_outlier_arr = sci_utils.outlier_diff(res, diff_cut=diff_cut)
                    # Populate outlier rows with True returned by outlier_diff with the current diff_cut value
                    df_obs_new.loc[diff_cut_outlier_arr, night_mag_col] = diff_cut

                if len(df_obs_old) < 4:
                    logger.info(
                        "Insufficient number of previous observations to check with outlier_sigma_diff"
                    )
                    # TODO how to handle this case (i.e. do we populate the columns with something other than False?)
                else:
                    if err_flag:
                        logger.info("No measurement errors, can't attempt outlier_sigma_diff")
                        # TODO how to handle this case (i.e. do we populate the columns with something other than False?)
                    else:
                        for std_cut in std_cuts_arr:
                            std_cut_outlier_arr = sci_utils.outlier_sigma_diff(
                                res, df_obs_new[magErr_col], std_sigma=std_cut
                            )
                            df_obs_new.loc[std_cut_outlier_arr, night_std_col] = std_cut

                if make_plots:
                    # Identify any outliers detected: rows flagged at any threshold by either
                    # check. (Combining the last masks from the loops above would only mark
                    # outliers at the final thresholds.)
                    tmp_master_outlier_flag = (df_obs_new[[night_mag_col, night_std_col]] > 0).any(axis=1)
                    fig, ax = plt.subplots()
                    ax.errorbar(
                        df_obs_old["midPointMjdTai"],
                        df_obs_old[mag_col],
                        df_obs_old[magErr_col],
                        ls="",
                        marker=".",
                        color="k",
                        label="Previous observations",
                    )
                    ax.errorbar(
                        df_obs_new["midPointMjdTai"],
                        df_obs_new[mag_col],
                        df_obs_new[magErr_col],
                        ls="",
                        marker=".",
                        color="c",
                        label="New observations",
                    )
                    ax.errorbar(
                        df_obs_new[tmp_master_outlier_flag]["midPointMjdTai"],
                        df_obs_new[tmp_master_outlier_flag][mag_col],
                        df_obs_new[tmp_master_outlier_flag][magErr_col],
                        ls="",
                        marker="x",
                        color="b",
                        label="Outliers",
                    )
                    ax.axhline(median_mag)
                    ax.invert_yaxis()
                    ax.set_xlabel("Time [MJD]")
                    ax.set_ylabel(f"{filt}-band Reduced Magnitude")
                    fig.savefig(
                        f"{output_dir}/{obj_id}_{filt}_{n_new_nights}nights_outliers.png",
                        bbox_inches="tight",
                        pad_inches=0.05,
                    )
                    plt.close(fig)

                # Use pandas.DataFrame.update to add the outlier flags to the df_obs DataFrame
                df_obs.update(df_obs_new)

            # Identify rows that contain at least 1 outlier flag set to a value greater than 0
            master_outlier_flag = df_obs[outlier_cols_list].any(axis=1)
            df_obs.loc[
                master_outlier_flag,
                ["ssObjectId", "filter_name", "diaSourceId", "midPointMjdTai"] + outlier_cols_list,
            ].to_sql("AdlerSourceFlags", con=conn_adler_out, if_exists="append", index=False)

            # TODO consider how to include check and flag for if outliers span multiple consecutive nights
            # Perhaps this is easier now we update df_obs
            # Some kind of resetting cumulative sum to test for sustained outliers that then looks for any value above say 3. [0,1,0,1,2,0] gives [0,1,0,1,3,0]

            # Write nobs to AdlerData
            # TODO write median and standard deviation of previous observations to AdlerData
            # Create AdlerData structure to do the above where we have a particular model that is designed for the median
            adler_data.populate_phase_parameters(filt, nobs=nobs_nomask)
            adler_data.write_row_to_database(adler_output_filename)

            logger.info(f"New information for {obj_id} written to AdlerSourceFlags and AdlerData tables")

    # TODO make summary stats a function
    logger.info(f"Computing summary statistics for {process_mjd}")

    cur_adler_out.execute("SELECT COUNT(*) FROM AdlerData")
    n_obj_analysed = cur_adler_out.fetchone()[0]
    logger.info(f"Number of objects analysed: {n_obj_analysed}")

    # Get column information to check what filters have previously been analysed (and therefore have {filter}_nobs columns in AdlerData)
    cur_adler_out.execute("PRAGMA table_info(AdlerData);")
    adler_out_cols = [row[1] for row in cur_adler_out.fetchall()]
    filter_nobs_columns = [c for c in adler_out_cols if c.endswith("_nobs")]
    current_adlerdata_filters = [c.replace("_nobs", "") for c in filter_nobs_columns]

    # #Manually removing u filter because it has so few observations in comparison
    # filter_nobs_columns.remove('u_nobs')

    # Joining from a list keeps the query valid even when there are no *_nobs columns
    select_columns_sql = ", ".join(["ssObjectId"] + filter_nobs_columns)
    nobs_df = pd.read_sql_query(f"SELECT {select_columns_sql} FROM AdlerData", conn_adler_out)

    for band in current_adlerdata_filters:
        med_obs_band = np.median(nobs_df[f"{band}_nobs"])
        std_obs_band = np.std(nobs_df[f"{band}_nobs"])
        sum_obs_band = np.sum(nobs_df[f"{band}_nobs"])

        logger.info(f"Total number of observations for {band}-band: {sum_obs_band}")
        logger.info(
            f"Observations per object for {band}-band: Median={med_obs_band}, standard deviation={std_obs_band}"
        )

    # Only plot the g/r/i bands that actually have an nobs column, so a band that was not
    # analysed does not raise a KeyError
    filters_to_plot = [band for band in ("g", "r", "i") if band in current_adlerdata_filters]
    if filters_to_plot:
        fig, ax = plt.subplots()
        ax.hist(
            [nobs_df[f"{band}_nobs"] for band in filters_to_plot],
            color=[plot_filter_colors_white_background[band] for band in filters_to_plot],
            label=[f"{band}-band" for band in filters_to_plot],
        )

        ax.set_xlabel("Observations per filter")
        ax.set_ylabel("Number of objects")
        ax.legend()
        fig.savefig(f"{output_dir}/obs_per_band_hists.png", bbox_inches="tight", pad_inches=0.05)
        plt.close(fig)
        logger.info(
            f"Histograms of observations per band per object saved to {output_dir}/obs_per_band_hists.png"
        )
    else:
        logger.info("None of the g, r, i bands have nobs columns in AdlerData, skipping histogram")

    for n_new_nights in n_new_nights_arr:
        logger.info(
            f"Checking for number of outliers when considering last {n_new_nights} night(s) as new observations"
        )
        for diff_cut in diff_cuts_arr:
            cur_adler_out.execute(
                f"SELECT COUNT(*) FROM AdlerSourceFlags WHERE {night_mag_string.format(n_new_nights)}>={diff_cut}"
            )
            logger.info(f"Number of outliers above {diff_cut} magnitude: {cur_adler_out.fetchall()[0][0]}")
        for std_cut in std_cuts_arr:
            cur_adler_out.execute(
                f"SELECT COUNT(*) FROM AdlerSourceFlags WHERE {night_std_string.format(n_new_nights)}>={std_cut}"
            )
            logger.info(f"Number of outliers above {std_cut}-sigma: {cur_adler_out.fetchall()[0][0]}")

    # Release the output database handle before moving on to the next night
    conn_adler_out.close()

    # TODO perhaps add example query to find the outliers in the output database

    # TODO distribution of outliers per object and per filter

### Interactive plotting

In [None]:
# Setup
# obj_id = "2025 MV10"
obj_id = "2025 MX40"
process_mjd = 60799.5
filt = "g"
n_new_nights = 3

#######

planetoid = AdlerPlanetoid.construct_from_mpc_obs_sbn(
    ssObjectId=obj_id,
    sql_filename=input_rubin_sql_file,
    filter_list=filter_list,
    date_range=[process_mjd - data_timespan, process_mjd],
)

adler_data = AdlerData(obj_id, planetoid.filter_list)

df_obs = sci_utils.get_df_obs_filt(planetoid, filt=filt)

# Initialise outlier columns in df_obs
# TODO Consider how we'll load in previous flags if an existing Adler DB is provided
for col in outlier_cols_list:
    # Populate outlier columns with zeros
    df_obs[col] = np.zeros(shape=len(df_obs), dtype=int)

nobs_nomask = len(df_obs)

err_flag = df_obs.magErr.isnull().all()
if err_flag:
    logger.info("All magErr values are NaNs, proceed with caution")
else:
    # Remove observations with large errorbars
    magErr_percentile_cut = 95  # Value (between 0 and 100) to define the percentile above which we cut data with large magErr values
    magErr_mask = df_obs.magErr <= np.nanpercentile(df_obs.magErr, q=magErr_percentile_cut)
    df_obs = df_obs[magErr_mask]

# Split into previous observations and observations from the most recent night(s)
mask = df_obs["midPointMjdTai"] < process_mjd - n_new_nights

df_obs_old = df_obs[mask].copy()
df_obs_new = df_obs[~mask].copy()
logger.info("Previous observations (date < {}): {}".format(process_mjd - 1, len(df_obs_old)))
logger.info("New observations ({} <= date < {}): {}".format(process_mjd - 1, process_mjd, len(df_obs_new)))

# if len(df_obs_old)<2:
#     #Taken from adler_demo.py
#     logger.info("Insufficient number of previous observations, continuing to next band/object")
#     continue
# if len(df_obs_new)==0:
#     logger.info(f"No new observations in {filt}, continuing to next band/object")
#     continue

# TODO consider how this affects writing out to AdlerData (possibly fine with how it's already setup)
sig_clip_mask = astropy_sigma_clip(df_obs_old[mag_col], sigma=sig_clip_val).mask

df_obs_old = df_obs_old[~sig_clip_mask].copy()

# if len(df_obs_old)<2:
#     #Taken from adler_demo.py
#     logger.info("Insufficient number of previous observations after sigma clipping, continuing to next band/object")
#     continue

median_mag = np.median(df_obs_old[mag_col])
# TODO consider storing relevant res values
res = np.array(df_obs_new[mag_col]) - median_mag

for diff_cut in diff_cuts_arr:
    diff_cut_outlier_arr = sci_utils.outlier_diff(res, diff_cut=diff_cut)
    # Populate outlier rows with True returned by outlier_diff with the current diff_cut value
    df_obs_new.loc[diff_cut_outlier_arr, night_mag_string.format(n_new_nights)] = diff_cut

std_cut_outlier_arr = [False] * len(df_obs_new)
if len(df_obs_old) < 4:
    logger.info("Insufficient number of previous observations to check with outlier_sigma_diff")
    # TODO how to handle this case (i.e. do we populate the columns with something other than False?)
else:
    if err_flag:
        logger.info(f"No measurement errors, can't attempt outlier_sigma_diff")
        # TODO how to handle this case (i.e. do we populate the columns with something other than False?)
    else:
        for std_cut in std_cuts_arr:
            std_cut_outlier_arr = sci_utils.outlier_sigma_diff(res, df_obs_new[magErr_col], std_sigma=std_cut)
            df_obs_new.loc[std_cut_outlier_arr, night_std_string.format(n_new_nights)] = std_cut

%matplotlib widget
# Identify any outliers detected
tmp_master_outlier_flag = (diff_cut_outlier_arr) | (std_cut_outlier_arr)
fig, ax = plt.subplots(figsize=(10, 6))
ax.errorbar(
    df_obs_old["midPointMjdTai"],
    df_obs_old[mag_col],
    df_obs_old[magErr_col],
    ls="",
    marker=".",
    color="k",
    label="Previous observations",
)
ax.errorbar(
    df_obs_new["midPointMjdTai"],
    df_obs_new[mag_col],
    df_obs_new[magErr_col],
    ls="",
    marker=".",
    color="c",
    label="New observations",
)
ax.errorbar(
    df_obs_new[tmp_master_outlier_flag]["midPointMjdTai"],
    df_obs_new[tmp_master_outlier_flag][mag_col],
    df_obs_new[tmp_master_outlier_flag][magErr_col],
    ls="",
    marker="x",
    color="b",
    label="Outliers",
)
ax.axhline(median_mag)
ax.invert_yaxis()
ax.set_xlabel("Time [MJD]")
ax.set_ylabel(f"{filt}-band Reduced Magnitude")

# Approach: compare the median of the new observations with the median of the old observations

In [None]:
# Input observations database (the commented line is an alternative input file)
# input_rubin_sql_file = f"{root_filepath}rubin_251105.sqlite"
input_rubin_sql_file = f"{root_filepath}lsst-adler/tests/data/mpc_obs_sbn_testing_database.sqlite"

# Connection and cursor on the input database, reused by the cells below
input_conn = sqlite3.connect(input_rubin_sql_file)
input_cur = input_conn.cursor()

# Number of days of data to retrieve (i.e. previous 30 nights)
data_timespan = 30

from tqdm import tqdm
import adler.utilities.science_utilities as sci_utils
import math


# Columns of the observations DataFrame holding the magnitude and its uncertainty
# (original remark: we don't currently have reduced_mag populated for MPC file format)
# NOTE(review): reduced_mag is nevertheless the column selected here - confirm it is populated
mag_col = "reduced_mag"
magErr_col = "magErr"

# Set filter list (only ugri present currently, very few u, but keeping in for completeness)
filter_list = ["u", "g", "r", "i", "z", "y"]
# filter_list=['g', 'r', 'i']

# Toggle for saving per-object/per-filter figures in the processing loops
make_plots = True

# TODO implement schema switching
schema = "MPC"
# schema = "dp03_catalogs_10yr"

# Whole-MJD bounds of the observations in obs_sbn: the earliest mjd_tai rounded down and the
# latest rounded up. One single-value query is issued per bound.
min_obstime_mjd, max_obstime_mjd = (
    rounder(pd.read_sql_query(f"SELECT {aggregate}(mjd_tai) FROM obs_sbn", input_conn).iloc[0, 0])
    for rounder, aggregate in ((math.floor, "MIN"), (math.ceil, "MAX"))
)

In [None]:
# astropy's sigma_clip is imported under an explicit alias for use in the processing cells
from astropy.stats import sigma_clip as astropy_sigma_clip

# Value passed as the `sigma` argument of astropy_sigma_clip when clipping magnitudes
sig_clip_val = 3

In [None]:
# Thresholds used when testing for magnitude changes.
# diff_cuts_arr: magnitude-difference cuts (0 included due to way we currently write to the output DB)
diff_cuts_arr = np.asarray((0, 1, 2, 3))
# std_cuts_arr: sigma cuts; alternative set kept for reference
# std_cuts_arr = np.array([3, 4, 5])
std_cuts_arr = np.asarray((5, 6))

# Number of most recent nights treated as "new observations", which allows checking for
# "sustained outliers/outbursts"
# n_new_nights_arr = np.array([3])

n_new_nights = 3

In [None]:
# #Define the strings that will be used as column headers and populated later
# night_mag_string = "outlier_{}night_mag"
# night_std_string = "outlier_{}night_sigma"

# outlier_cols_list = []
# for n_new_nights in n_new_nights_arr:
#     outlier_cols_list.append(night_mag_string.format(n_new_nights))
#     outlier_cols_list.append(night_std_string.format(n_new_nights))

# #TODO include this in function for initialising the output Adler DB
# sql_outlier_cols=""
# for col in outlier_cols_list:
#     sql_outlier_cols += f", {col} INTEGER"

# adler_flags_create_sql = f"CREATE TABLE AdlerSourceFlags(ssObjectId TEXT, filter_name TEXT, diaSourceId TEXT, midPointMjdTai INTEGER{sql_outlier_cols})"

In [None]:
logger.info("----------------------------------------")
logger.info("New loop started")
logger.info("----------------------------------------")

# Sustained-shift check. For every processing date, and for every object/filter observed on
# the preceding night, compare the median magnitude of the most recent n_new_nights of
# observations with the median of the earlier observations, then store the largest
# diff_cuts_arr threshold exceeded in the "<filter>_<n>night_sustained" column of AdlerData.
# Per-observation outlier flagging (AdlerSourceFlags) is not performed in this cell.

# process_mjd_arr = np.arange(min_obstime_mjd-0.5, max_obstime_mjd+1.5, 1)
# process_mjd_arr = np.arange(60795.5, 60799.5, 1)
process_mjd_arr = np.array([60799.5])

for process_mjd in process_mjd_arr:
    start_of_night_mjd = process_mjd - 1
    # Boundary between "previous" and "new" observations for this processing date
    new_obs_start_mjd = process_mjd - n_new_nights

    # Get list of objects with observations from most recent night preceding the process date.
    # The MJD limits are bound as numeric parameters instead of being interpolated as quoted text.
    # (Append "LIMIT 20" to the query for a quick test run.)
    obj_df = pd.read_sql_query(
        "SELECT DISTINCT provid FROM obs_sbn WHERE mjd_tai BETWEEN ? AND ?",
        input_conn,
        params=(float(start_of_night_mjd), float(process_mjd)),
    )
    unique_obj_ids = obj_df.provid.to_numpy()
    logger.info(f"{len(unique_obj_ids)} objects to analyze")

    if len(unique_obj_ids) == 0:
        logger.info(f"No objects to process for {process_mjd}")
        continue

    output_dir = f"outputs_sustained_checks_v2_{process_mjd}"

    os.makedirs(output_dir, exist_ok=True)

    adler_output_filename = f"{output_dir}/adler_output.sqlite"
    # NOTE(review): this connection is never closed, so one stays open per processing date.
    # It is left open in case later cells query it - confirm and close it once it is not needed.
    conn_adler_out = sqlite3.connect(adler_output_filename)

    # TODO tidy this up into function
    cur_adler_out = conn_adler_out.cursor()

    # Start from empty output tables on every run
    cur_adler_out.execute("DROP TABLE IF EXISTS AdlerData;")
    cur_adler_out.execute("DROP TABLE IF EXISTS  AdlerSourceFlags;")

    # One sustained-shift column per requested filter, so the UPDATE further down has a target
    # column for every band in filter_list (previously only g, r, i and u were created).
    # Assumes planetoid.filter_list only contains filters from filter_list - TODO confirm.
    sustained_cols_sql = ", ".join(f"{band}_{n_new_nights}night_sustained" for band in filter_list)
    cur_adler_out.execute(
        f"CREATE TABLE AdlerData(ssObjectId TEXT, timestamp REAL, {sustained_cols_sql}, PRIMARY KEY (ssObjectId))"
    )
    # The AdlerSourceFlags table is intentionally not created in this cell

    for obj_id in tqdm(unique_obj_ids, desc=f"Objects to process for {process_mjd}"):
        # Load the previous data_timespan days of observations up to the processing date
        planetoid = AdlerPlanetoid.construct_from_mpc_obs_sbn(
            ssObjectId=obj_id,
            sql_filename=input_rubin_sql_file,
            filter_list=filter_list,
            date_range=[process_mjd - data_timespan, process_mjd],
        )

        adler_data = AdlerData(obj_id, planetoid.filter_list)

        for filt in planetoid.filter_list:
            df_obs = sci_utils.get_df_obs_filt(planetoid, filt=filt)

            # TODO Consider how we'll load in previous flags if an existing Adler DB is provided

            # TODO this may change as no longer checking how many datapoints from previous process
            # Number of observations before any masking is applied
            nobs_nomask = len(df_obs)

            err_flag = df_obs.magErr.isnull().all()
            if err_flag:
                logger.info("All magErr values are NaNs, proceed with caution")
            else:
                # Remove observations with large errorbars
                magErr_percentile_cut = 95  # Value (between 0 and 100) to define the percentile above which we cut data with large magErr values
                magErr_mask = df_obs.magErr <= np.nanpercentile(df_obs.magErr, q=magErr_percentile_cut)
                df_obs = df_obs[magErr_mask]

            # Split into previous observations and observations from the most recent night(s)
            mask = df_obs["midPointMjdTai"] < new_obs_start_mjd

            # TODO update the splitting and checking of previous nights (at least 2 nights in previous 3-5 nights to do sustained check
            df_obs_old = df_obs[mask].copy()
            df_obs_new = df_obs[~mask].copy()
            # Report the boundary that was actually used for the split
            logger.info(f"Previous observations (date < {new_obs_start_mjd}): {len(df_obs_old)}")
            logger.info(
                f"New observations ({new_obs_start_mjd} <= date < {process_mjd}): {len(df_obs_new)}"
            )

            if len(df_obs_old) < 2:
                # Taken from adler_demo.py
                logger.info("Insufficient number of previous observations, continuing to next band/object")
                continue
            if len(df_obs_new) == 0:
                logger.info(f"No new observations in {filt}, continuing to next band/object")
                continue

            df_obs_new = df_obs_new.sort_values(by="midPointMjdTai")

            # presumably returns the gaps larger than dx days between consecutive observation
            # times - verify against sci_utils.apparition_gap_finder
            time_gaps = sci_utils.apparition_gap_finder(df_obs_new.midPointMjdTai.to_numpy(), dx=0.5)
            if len(time_gaps) == 0:
                # If there is only one night of new data, we continue to the next band/object
                logger.info(
                    "Insufficient number of nights with new observations, continuing to next band/object"
                )
                continue

            # TODO consider how this affects writing out to AdlerData (possibly fine with how it's already setup)
            # Sigma-clip the previous and the new observations independently
            sig_clip_mask = astropy_sigma_clip(df_obs_old[mag_col], sigma=sig_clip_val).mask
            df_obs_old = df_obs_old[~sig_clip_mask].copy()

            sig_clip_mask = astropy_sigma_clip(df_obs_new[mag_col], sigma=sig_clip_val).mask
            df_obs_new = df_obs_new[~sig_clip_mask].copy()

            if len(df_obs_old) < 2:
                # Taken from adler_demo.py
                logger.info(
                    "Insufficient number of previous observations after sigma clipping, continuing to next band/object"
                )
                continue
            if len(df_obs_new) == 0:
                logger.info(
                    f"No new observations in {filt} after sigma clipping, continuing to next band/object"
                )
                continue

            old_median_mag = np.median(df_obs_old[mag_col])
            new_median_mag = np.median(df_obs_new[mag_col])

            # TODO consider storing relevant res values

            # Largest threshold strictly exceeded by the absolute shift between the two medians.
            # When no threshold is exceeded (shift of exactly 0, or a NaN median) the selection
            # is empty; record 0 rather than letting np.max raise on an empty array.
            cuts_exceeded = diff_cuts_arr[np.abs(new_median_mag - old_median_mag) > diff_cuts_arr]
            diff_cut_met = cuts_exceeded.max().item() if cuts_exceeded.size else 0

            # TODO add columns to adlerdata (temporary version) to store these sustained outliers, fix the plot down below

            if make_plots:
                # Previous and new observations with their respective median levels
                fig, ax = plt.subplots()
                ax.errorbar(
                    df_obs_old["midPointMjdTai"],
                    df_obs_old[mag_col],
                    df_obs_old[magErr_col],
                    ls="",
                    marker=".",
                    color="k",
                    label="Previous observations",
                )
                ax.errorbar(
                    df_obs_new["midPointMjdTai"],
                    df_obs_new[mag_col],
                    df_obs_new[magErr_col],
                    ls="",
                    marker=".",
                    color="c",
                    label="New observations",
                )
                ax.axhline(old_median_mag, ls="--", c="k")
                ax.axhline(new_median_mag, ls="--", c="c")
                # Magnitudes: brighter is numerically smaller, so flip the y axis
                ax.invert_yaxis()
                ax.set_xlabel("Time [MJD]")
                ax.set_ylabel(f"{filt}-band Reduced Magnitude")
                # The series above carry labels, so draw the legend that uses them
                ax.legend()
                fig.savefig(
                    f"{output_dir}/{obj_id}_{filt}_{n_new_nights}nights_outliers.png",
                    bbox_inches="tight",
                    pad_inches=0.05,
                )
                plt.close(fig)

            # TODO consider how to include check and flag for if outliers span multiple consecutive nights
            # Perhaps this is easier now we update df_obs
            # Some kind of resetting cumulative sum to test for sustained outliers that then looks for any value above say 3. [0,1,0,1,2,0] gives [0,1,0,1,3,0]

            # Write nobs to AdlerData
            # TODO write median and standard deviation of previous observations to AdlerData
            # Create AdlerData structure to do the above where we have a particular model that is designed for the median
            adler_data.populate_phase_parameters(filt, **{"nobs": nobs_nomask})
            adler_data.write_row_to_database(adler_output_filename)

            # Record the threshold for this object/filter. The column name cannot be a bound
            # parameter, but the values are bound so the object id is never spliced into the SQL.
            cur_adler_out.execute(
                f"UPDATE AdlerData SET {filt}_{n_new_nights}night_sustained = ? WHERE ssObjectId = ?",
                (diff_cut_met, str(obj_id)),
            )
            conn_adler_out.commit()

            # Only AdlerData is written in this cell
            logger.info(f"New information for {obj_id} written to AdlerData table")

    # TODO make summary stats a function
    # logger.info(f"Computing summary statistics for {process_mjd}")

    # cur_adler_out.execute("SELECT COUNT(*) FROM AdlerData")
    # n_obj_analzyed = cur_adler_out.fetchall()[0][0]
    # logger.info(f"Number of objects analysed: {n_obj_analzyed}")

    # # Get column information to check what filters have previously been analysed (and therefore have {filter}_nobs columns in AdlerData)
    # cur_adler_out.execute("PRAGMA table_info(AdlerData);")
    # adler_out_cols = [row[1] for row in cur_adler_out.fetchall()]
    # filter_nobs_columns = [c for c in adler_out_cols if c.endswith('_nobs')]
    # current_adlerdata_filters = [c.replace('_nobs', '') for c in filter_nobs_columns]

    # # #Manually removing u filter because it has so few observations in comparison
    # # filter_nobs_columns.remove('u_nobs')

    # filter_columns_sql = ', '.join(filter_nobs_columns)
    # nobs_df = pd.read_sql_query(f"SELECT ssObjectId, {filter_columns_sql} FROM AdlerData", conn_adler_out)

    # for band in current_adlerdata_filters:
    #     med_obs_band = np.median(nobs_df[f"{band}_nobs"])
    #     std_obs_band = np.std(nobs_df[f"{band}_nobs"])
    #     sum_obs_band = np.sum(nobs_df[f"{band}_nobs"])

    #     logger.info(f"Total number of observations for {band}-band: {sum_obs_band}")
    #     logger.info(f"Observations per object for {band}-band: Median={med_obs_band}, standard deviation={std_obs_band}")

    # filters_to_plot = ['g', 'r', 'i']
    # fig, ax= plt.subplots()
    # ax.hist((nobs_df['g_nobs'], nobs_df['r_nobs'], nobs_df['i_nobs']),
    #         color=[plot_filter_colors_white_background[filt] for filt in filters_to_plot],
    #         label=[f"{filt}-band" for filt in filters_to_plot])

    # ax.set_xlabel("Observations per filter")
    # ax.set_ylabel("Number of objects")
    # ax.legend()
    # fig.savefig(f"{output_dir}/obs_per_band_hists.png", bbox_inches='tight', pad_inches=0.05)
    # plt.close(fig)
    # logger.info(f"Histograms of observations per band per object saved to {output_dir}/obs_per_band_hists.png")

    # for n_new_nights in n_new_nights_arr:
    #     logger.info(f"Checking for number of outliers when considering last {n_new_nights} night(s) as new observations")
    #     for diff_cut in diff_cuts_arr:
    #         cur_adler_out.execute(f"SELECT COUNT(*) FROM AdlerSourceFlags WHERE {night_mag_string.format(n_new_nights)}>={diff_cut}")
    #         logger.info(f"Number of outliers above {diff_cut} magnitude: {cur_adler_out.fetchall()[0][0]}")
    #     for std_cut in std_cuts_arr:
    #         cur_adler_out.execute(f"SELECT COUNT(*) FROM AdlerSourceFlags WHERE {night_std_string.format(n_new_nights)}>={std_cut}")
    #         logger.info(f"Number of outliers above {std_cut}-sigma: {cur_adler_out.fetchall()[0][0]}")

    # #TODO perhaps add example query to find the outliers in the output database

    # #TODO distribution of outliers per object and per filter

# Trying to use new class structures

In [None]:
logger.info("----------------------------------------")
logger.info("New loop started")
logger.info("----------------------------------------")

# process_mjd_arr = np.arange(min_obstime_mjd-0.5, max_obstime_mjd+1.5, 1)
# process_mjd_arr = np.arange(60795.5, 60799.5, 1)
process_mjd_arr = np.array([60799.5])

for process_mjd in process_mjd_arr:
    process_date = mjd_to_utc(process_mjd)
    start_of_night = mjd_to_utc(process_mjd - 1)

    # Get list of objects with observations from most recent night preceding the process date
    # obj_df = pd.read_sql_query(f"SELECT DISTINCT provid FROM obs_sbn WHERE obstime BETWEEN '{start_of_night}' AND '{process_date}' LIMIT 20", input_conn)
    obj_df = pd.read_sql_query(
        f"SELECT DISTINCT provid FROM obs_sbn WHERE obstime BETWEEN '{start_of_night}' AND '{process_date}'",
        input_conn,
    )
    unique_obj_ids = obj_df.provid.to_numpy()
    logger.info(f"{len(unique_obj_ids)} objects to analyze")

    if len(unique_obj_ids) == 0:
        logger.info(f"No objects to process for {process_mjd}")
        continue

    output_dir = f"outputs_lesscols_update_full_{process_mjd}"

    os.makedirs(output_dir, exist_ok=True)

    adler_output_filename = f"{output_dir}/adler_output.sqlite"
    conn_adler_out = sqlite3.connect(adler_output_filename)

    # TODO tidy this up into function
    cur_adler_out = conn_adler_out.cursor()

    cur_adler_out.execute("DROP TABLE IF EXISTS AdlerData;")
    cur_adler_out.execute("DROP TABLE IF EXISTS  AdlerSourceFlags;")

    cur_adler_out.execute("CREATE TABLE AdlerData(ssObjectId TEXT, timestamp REAL, PRIMARY KEY (ssObjectId))")
    # added creation of AdlerSourceFlags table
    cur_adler_out.execute(adler_flags_create_sql)

    for obj_id in tqdm(unique_obj_ids, desc=f"Objects to process for {process_mjd}"):
        # Taking all data, no time constraint
        planetoid = AdlerPlanetoid.construct_from_mpc_obs_sbn(
            ssObjectId=obj_id,
            sql_filename=input_rubin_sql_file,
            filter_list=filter_list,
            date_range=[process_mjd - data_timespan, process_mjd],
        )

        adler_data = AdlerData(obj_id, planetoid.filter_list)

        for filt in planetoid.filter_list:
            df_obs = sci_utils.get_df_obs_filt(planetoid, filt=filt)

            # Initialise outlier columns in df_obs
            # TODO Consider how we'll load in previous flags if an existing Adler DB is provided
            for col in outlier_cols_list:
                # Populate outlier columns with zeros
                df_obs[col] = np.zeros(shape=len(df_obs), dtype=int)

            # TODO this may change as no longer checking how many datapoints from previous process
            nobs_nomask = len(df_obs)

            err_flag = df_obs.magErr.isnull().all()
            if err_flag:
                logger.info("All magErr values are NaNs, proceed with caution")
            else:
                # Remove observations with large errorbars
                magErr_percentile_cut = 95  # Value (between 0 and 100) to define the percentile above which we cut data with large magErr values
                magErr_mask = df_obs.magErr <= np.nanpercentile(df_obs.magErr, q=magErr_percentile_cut)
                df_obs = df_obs[magErr_mask]

            for n_new_nights in n_new_nights_arr:
                # Split into previous observations and observations from the most recent night(s)
                mask = df_obs["midPointMjdTai"] < process_mjd - n_new_nights

                df_obs_old = df_obs[mask].copy()
                df_obs_new = df_obs[~mask].copy()
                logger.info("Previous observations (date < {}): {}".format(process_mjd - 1, len(df_obs_old)))
                logger.info(
                    "New observations ({} <= date < {}): {}".format(
                        process_mjd - 1, process_mjd, len(df_obs_new)
                    )
                )

                if len(df_obs_old) < 2:
                    # Taken from adler_demo.py
                    logger.info(
                        "Insufficient number of previous observations, continuing to next band/object"
                    )
                    continue
                if len(df_obs_new) == 0:
                    logger.info(f"No new observations in {filt}, continuing to next band/object")
                    continue

                # TODO consider how this affects writing out to AdlerData (possibly fine with how it's already setup)
                sig_clip_mask = astropy_sigma_clip(df_obs_old[mag_col], sigma=sig_clip_val).mask

                df_obs_old = df_obs_old[~sig_clip_mask].copy()

                if len(df_obs_old) < 2:
                    # Taken from adler_demo.py
                    logger.info(
                        "Insufficient number of previous observations after sigma clipping, continuing to next band/object"
                    )
                    continue

                median_mag = np.median(df_obs_old[mag_col])
                # TODO consider storing relevant res values
                res = np.array(df_obs_new[mag_col]) - median_mag

                for diff_cut in diff_cuts_arr:
                    diff_cut_outlier_arr = sci_utils.outlier_diff(res, diff_cut=diff_cut)
                    # Populate outlier rows with True returned by outlier_diff with the current diff_cut value
                    df_obs_new.loc[diff_cut_outlier_arr, night_mag_string.format(n_new_nights)] = diff_cut

                std_cut_outlier_arr = [False] * len(df_obs_new)
                if len(df_obs_old) < 4:
                    logger.info(
                        "Insufficient number of previous observations to check with outlier_sigma_diff"
                    )
                    # TODO how to handle this case (i.e. do we populate the columns with something other than False?)
                else:
                    if err_flag:
                        logger.info(f"No measurement errors, can't attempt outlier_sigma_diff")
                        # TODO how to handle this case (i.e. do we populate the columns with something other than False?)
                    else:
                        for std_cut in std_cuts_arr:
                            std_cut_outlier_arr = sci_utils.outlier_sigma_diff(
                                res, df_obs_new[magErr_col], std_sigma=std_cut
                            )
                            df_obs_new.loc[std_cut_outlier_arr, night_std_string.format(n_new_nights)] = (
                                std_cut
                            )

                if make_plots:
                    # Identify any outliers detected
                    tmp_master_outlier_flag = (diff_cut_outlier_arr) | (std_cut_outlier_arr)
                    fig, ax = plt.subplots()
                    ax.errorbar(
                        df_obs_old["midPointMjdTai"],
                        df_obs_old[mag_col],
                        df_obs_old[magErr_col],
                        ls="",
                        marker=".",
                        color="k",
                        label="Previous observations",
                    )
                    ax.errorbar(
                        df_obs_new["midPointMjdTai"],
                        df_obs_new[mag_col],
                        df_obs_new[magErr_col],
                        ls="",
                        marker=".",
                        color="c",
                        label="New observations",
                    )
                    ax.errorbar(
                        df_obs_new[tmp_master_outlier_flag]["midPointMjdTai"],
                        df_obs_new[tmp_master_outlier_flag][mag_col],
                        df_obs_new[tmp_master_outlier_flag][magErr_col],
                        ls="",
                        marker="x",
                        color="b",
                        label="Outliers",
                    )
                    ax.axhline(median_mag)
                    ax.invert_yaxis()
                    ax.set_xlabel("Time [MJD]")
                    ax.set_ylabel(f"{filt}-band Reduced Magnitude")
                    fig.savefig(
                        f"{output_dir}/{obj_id}_{filt}_{n_new_nights}nights_outliers.png",
                        bbox_inches="tight",
                        pad_inches=0.05,
                    )
                    plt.close(fig)

                # Use pandas.DataFrame.update to add the outlier flags to the df_obs DataFrame
                df_obs.update(df_obs_new)

            # Identify rows that contain at least 1 outlier flag set to a value greater than 0
            # (row-wise mask: True where any of the outlier flag columns is truthy, i.e. non-zero)
            master_outlier_flag = df_obs[outlier_cols_list].any(axis=1)
            # Write only the flagged rows, keeping the identifying columns plus every flag column.
            # if_exists="append" adds these rows to any existing AdlerSourceFlags table.
            # NOTE(review): nothing visible here de-duplicates, so re-processing the same
            # observations would presumably append the same rows again - confirm upstream.
            df_obs.loc[
                master_outlier_flag,
                ["ssObjectId", "filter_name", "diaSourceId", "midPointMjdTai"] + outlier_cols_list,
            ].to_sql("AdlerSourceFlags", con=conn_adler_out, if_exists="append", index=False)

            # TODO consider how to include check and flag for if outliers span multiple consecutive nights
            # Perhaps this is easier now we update df_obs
            # Some kind of resetting cumulative sum to test for sustained outliers that then looks for any value above say 3. [0,1,0,1,2,0] gives [0,1,0,1,3,0]

            # Write nobs to AdlerData
            # TODO write median and standard deviation of previous observations to AdlerData
            # Create AdlerData structure to do the above where we have a particular model that is designed for the median
            # The dict unpacking is equivalent to passing nobs=nobs_nomask as a keyword argument
            adler_data.populate_phase_parameters(filt, **{"nobs": nobs_nomask})
            adler_data.write_row_to_database(adler_output_filename)

            logger.info(f"New information for {obj_id} written to AdlerSourceFlags and AdlerData tables")

    # TODO make summary stats a function
    # Summary statistics: log the number of rows in AdlerData and, for every filter that
    # has a {filter}_nobs column, the total, median and standard deviation of the
    # per-object observation counts.
    logger.info(f"Computing summary statistics for {process_mjd}")

    cur_adler_out.execute("SELECT COUNT(*) FROM AdlerData")
    # COUNT(*) returns exactly one row holding one value
    n_obj_analzyed = cur_adler_out.fetchone()[0]
    logger.info(f"Number of objects analysed: {n_obj_analzyed}")

    # Get column information to check what filters have previously been analysed (and therefore have {filter}_nobs columns in AdlerData)
    cur_adler_out.execute("PRAGMA table_info(AdlerData);")
    # Field 1 of each table_info row is the column name
    adler_out_cols = [row[1] for row in cur_adler_out.fetchall()]
    nobs_suffix = "_nobs"
    filter_nobs_columns = [c for c in adler_out_cols if c.endswith(nobs_suffix)]
    # Strip only the trailing suffix; str.replace would also remove any earlier occurrence
    current_adlerdata_filters = [c[: -len(nobs_suffix)] for c in filter_nobs_columns]

    # #Manually removing u filter because it has so few observations in comparison
    # filter_nobs_columns.remove('u_nobs')

    filter_columns_sql = ", ".join(filter_nobs_columns)
    # Join ssObjectId together with the nobs columns so the SELECT stays valid SQL even
    # when no {filter}_nobs column exists (otherwise it would read "SELECT ssObjectId,  FROM")
    select_columns_sql = ", ".join(["ssObjectId"] + filter_nobs_columns)
    nobs_df = pd.read_sql_query(f"SELECT {select_columns_sql} FROM AdlerData", conn_adler_out)

    for band in current_adlerdata_filters:
        band_nobs = nobs_df[f"{band}_nobs"]
        med_obs_band = np.median(band_nobs)
        std_obs_band = np.std(band_nobs)
        sum_obs_band = np.sum(band_nobs)

        logger.info(f"Total number of observations for {band}-band: {sum_obs_band}")
        logger.info(
            f"Observations per object for {band}-band: Median={med_obs_band}, standard deviation={std_obs_band}"
        )

    # Histogram of the per-object observation counts, one dataset per requested filter.
    filters_to_plot = ["g", "r", "i"]
    # Only plot the requested filters that have a {filter}_nobs column in nobs_df.
    # Indexing a column that is absent (a filter that has not been analysed yet) would
    # raise a KeyError, so such filters are left out instead.
    hist_filters = [filt for filt in filters_to_plot if f"{filt}_nobs" in nobs_df.columns]

    if hist_filters:
        fig, ax = plt.subplots()
        ax.hist(
            tuple(nobs_df[f"{filt}_nobs"] for filt in hist_filters),
            color=[plot_filter_colors_white_background[filt] for filt in hist_filters],
            label=[f"{filt}-band" for filt in hist_filters],
        )

        ax.set_xlabel("Observations per filter")
        ax.set_ylabel("Number of objects")
        ax.legend()
        fig.savefig(f"{output_dir}/obs_per_band_hists.png", bbox_inches="tight", pad_inches=0.05)
        # Close the figure once it is on disk so it does not stay open in pyplot
        plt.close(fig)
        logger.info(
            f"Histograms of observations per band per object saved to {output_dir}/obs_per_band_hists.png"
        )
    else:
        logger.info(
            f"No observation counts available for filters {filters_to_plot}; skipping obs_per_band_hists.png"
        )

    # For each "last N nights" window, log how many AdlerSourceFlags rows have a flag value
    # greater than or equal to each magnitude-difference cut, then each sigma cut.
    for n_new_nights in n_new_nights_arr:
        logger.info(
            f"Checking for number of outliers when considering last {n_new_nights} night(s) as new observations"
        )

        # Flag column for this window, built from the night_mag_string template
        mag_flag_column = night_mag_string.format(n_new_nights)
        for diff_cut in diff_cuts_arr:
            mag_count_query = f"SELECT COUNT(*) FROM AdlerSourceFlags WHERE {mag_flag_column}>={diff_cut}"
            cur_adler_out.execute(mag_count_query)
            # COUNT(*) returns a single row with a single value
            n_flagged = cur_adler_out.fetchone()[0]
            logger.info(f"Number of outliers above {diff_cut} magnitude: {n_flagged}")

        # Same count, using the flag column built from the night_std_string template
        std_flag_column = night_std_string.format(n_new_nights)
        for std_cut in std_cuts_arr:
            std_count_query = f"SELECT COUNT(*) FROM AdlerSourceFlags WHERE {std_flag_column}>={std_cut}"
            cur_adler_out.execute(std_count_query)
            n_flagged = cur_adler_out.fetchone()[0]
            logger.info(f"Number of outliers above {std_cut}-sigma: {n_flagged}")

    # TODO perhaps add example query to find the outliers in the output database

    # TODO distribution of outliers per object and per filter