In [1]:
import sqlite3
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt

In [7]:
# root_filepath = "/home/astro-sobrien/"
# root_filepath = "/Volumes/astro-sobrien/home/astro-sobrien/"
# root_filepath = "/Users/seanobrien/Documents/Adler/"
# Everything in the working directory path that precedes the first 'lsst-adler' is used as the root folder
current_path = os.getcwd()
root_filepath = current_path.partition('lsst-adler')[0]

# Testing database that lives inside the lsst-adler checkout
test_db_filename = root_filepath + "lsst-adler/tests/data/testing_database.db"

# Rubin observation databases, expected directly under the root folder
rubin_sql_filename_oct = root_filepath + "rubin_251002.sqlite"
rubin_sql_filename_nov = root_filepath + "rubin_251105.sqlite"
rubin_sql_filename_nov2 = root_filepath + "rubin_251111.sqlite"

# Database that this notebook writes its AdlerData/AdlerFlags tables to
adler_output_filename = root_filepath + "adler_output_251111_v1.sqlite"

In [8]:
import logging
from astropy.time import Time
# Timestamp used in the log file name; ':' is replaced so the name is valid on all filesystems
log_timestamp = Time.now().isot.replace(":", '_')
# Build the log path once so the file handler and the confirmation message below cannot disagree
log_filename = f"{root_filepath}adler_test_{log_timestamp}.log"

# --- Reset logging system (important for Jupyter) ---
# Handlers are closed as well as removed so that re-running this cell does not leak open log files
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    handler.close()

# --- Configure root logger to log only to a file ---
logging.basicConfig(
    filename=log_filename,
    filemode='w',  # 'w' to overwrite each run, 'a' to append
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG  # capture INFO, DEBUG, etc.
)

# Optional confirmation (reports the exact path handed to basicConfig)
logging.getLogger().info(f"Root logging configured to write to {log_filename}")

In [9]:
import warnings

from adler.objectdata.AdlerPlanetoid import AdlerPlanetoid

from adler.objectdata.Observations import Observations
from adler.objectdata.MPCORB import MPCORB
from adler.objectdata.SSObject import SSObject
from adler.objectdata.AdlerData import AdlerData
from adler.objectdata.objectdata_utilities import get_data_table, get_from_table, utc_to_mjd, mjd_to_utc

# Notebook-level logger, named after the current module (__name__); used by the cells and helper functions below
logger = logging.getLogger(__name__)

In [10]:
# Attributes read from each per-filter entry of an AdlerData object (column names are "<filter>_<key>")
FILTER_DEPENDENT_KEYS = [
    "phaseAngle_min",
    "phaseAngle_range",
    "nobs",
    "arc",
]

# Attributes read from each per-model entry (column names are "<filter>_<model>_<key>"):
# every fitted quantity is followed by its "_err" counterpart, then the fit epoch comes last
MODEL_DEPENDENT_KEYS = [
    f"{quantity}{suffix}"
    for quantity in ("H", "phase_parameter_1", "phase_parameter_2")
    for suffix in ("", "_err")
] + ["modelFitMjd"]

# The six filter names, one character each
ALL_FILTER_LIST = list("ugrizy")

def tmp_get_row_data_and_columns(self):
    """Flatten the contents of the AdlerData object into one database row.

    The row starts with the object identifier and the current time (MJD), followed, for every
    filter, by the filter-dependent values and then the values of each model fitted in that filter.

    Returns
    -----------
    row_data : list
        The values making up the row. ssObjectId is passed through unchanged (no int() cast).

    required_columns : list of str
        The column name for each entry of row_data, in the same order.

    """
    required_columns = ["ssObjectId", "timestamp"]
    # ssObjectId is deliberately not wrapped in int() here
    row_data = [self.ssObjectId, Time.now().mjd]

    for filter_index, filter_name in enumerate(self.filter_list):
        filter_values = self.filter_dependent_values[filter_index]

        # Per-filter columns: "<filter>_<key>"
        for filter_key in FILTER_DEPENDENT_KEYS:
            required_columns.append("_".join([filter_name, filter_key]))
            row_data.append(getattr(filter_values, filter_key))

        # Per-model columns: "<filter>_<model>_<key>"
        for model_index, model_name in enumerate(filter_values.model_list):
            model_values = filter_values.model_dependent_values[model_index]
            for model_key in MODEL_DEPENDENT_KEYS:
                required_columns.append("_".join([filter_name, model_name, model_key]))
                row_data.append(getattr(model_values, model_key))

    return row_data, required_columns

def tmp_get_database_connection(self, filepath, create_new=False):
    """Open a connection to the output SQL database, optionally creating it first.

    Parameters
    -----------
    filepath : path-like object
        Location of the output SQL database.

    create_new : Boolean
        If True and no file exists at filepath, a new database is created containing empty
        AdlerData and AdlerFlags tables. Default is False.

    Returns
    ----------
    con : sqlite3 Connection object
        The connection to the output database.

    Raises
    ----------
    ValueError
        If no file exists at filepath and create_new is False.

    """
    # The existence check has to happen before sqlite3.connect, which creates the file if it is missing
    if os.path.isfile(filepath):
        return sqlite3.connect(filepath)

    if not create_new:
        logger.error("ValueError: Database cannot be found at given filepath.")
        raise ValueError("Database cannot be found at given filepath.")

    # Brand-new database: create the tables with their starter columns
    con = sqlite3.connect(filepath)
    cursor = con.cursor()
    # ssObjectId (which is actually the provid) is the primary key and is stored as TEXT
    cursor.execute("CREATE TABLE AdlerData(ssObjectId TEXT, timestamp REAL, PRIMARY KEY (ssObjectId))")
    # Table holding the per-observation outlier flags
    cursor.execute("CREATE TABLE AdlerFlags(ssObjectId TEXT, filter_name TEXT, diaSourceId TEXT, midPointMjdTai INTEGER, outlier INTEGER)")
    return con

# Monkey-patch AdlerData so that its private helpers are replaced by the notebook-local versions defined above
# (ssObjectId passed through without int(), TEXT primary key, and the AdlerFlags table created with a new database)
AdlerData._get_row_data_and_columns = tmp_get_row_data_and_columns
AdlerData._get_database_connection = tmp_get_database_connection

In [11]:
# Connection and cursor for the first Rubin observations file (rubin_sql_filename_oct)
conn_oct = sqlite3.connect(rubin_sql_filename_oct)
cur_oct = conn_oct.cursor()

# Connection and cursor for the second Rubin observations file (rubin_sql_filename_nov)
conn_nov = sqlite3.connect(rubin_sql_filename_nov)
cur_nov = conn_nov.cursor()

In [13]:
# Open a connection (and cursor) to the adler output database
conn_adler_out = sqlite3.connect(adler_output_filename)
cur_adler_out = conn_adler_out.cursor()

# WARNING: destructive reset - any existing AdlerData/AdlerFlags tables and their contents are dropped
cur_adler_out.execute("DROP TABLE IF EXISTS AdlerData;")
cur_adler_out.execute("DROP TABLE IF EXISTS  AdlerFlags;")

# Recreate both tables empty, with the same column definitions used in tmp_get_database_connection
cur_adler_out.execute("CREATE TABLE AdlerData(ssObjectId TEXT, timestamp REAL, PRIMARY KEY (ssObjectId))")
# added creation of AdlerFlags table
cur_adler_out.execute("CREATE TABLE AdlerFlags(ssObjectId TEXT, filter_name TEXT, diaSourceId TEXT, midPointMjdTai INTEGER, outlier INTEGER)")

<sqlite3.Cursor at 0x1765c9940>

In [8]:
#First time setup alterations to the newer Rubin sqlite file
# # Remove the L before filter names used by the MPC to distinguish the LSST bands
# # We do this for consistency with adler and the RSP
# cur_nov.execute("UPDATE obs_sbn SET band=substr(band, 2) WHERE band LIKE 'L%';")

# #Need to commit changes before we can operate on the database again
# conn_nov.commit()

# #Rename mpc_orbits to MPCORB for consistency with adler and the RSP
# cur_nov.execute("ALTER TABLE mpc_orbits RENAME TO MPCORB;")
# conn_nov.commit()

In [9]:
# Summary statistics of the obs_sbn table: each query is run on the first file, then on the second,
# so the two results print one after the other for easy comparison
summary_queries = (
    "SELECT MIN(obstime) FROM obs_sbn",
    "SELECT MAX(obstime) FROM obs_sbn",
    "SELECT COUNT(*) FROM obs_sbn",
    "SELECT COUNT(DISTINCT provid) FROM obs_sbn",
    "SELECT DISTINCT(band), count(*) FROM obs_sbn GROUP BY band",
)

for summary_query in summary_queries:
    for summary_conn in (conn_oct, conn_nov):
        print(pd.read_sql_query(summary_query, summary_conn))

                 MIN(obstime)
0  2024-11-24T02:52:28.634000
                 MIN(obstime)
0  2024-11-24T02:52:28.634000
                 MAX(obstime)
0  2025-05-05T02:05:32.780000
                 MAX(obstime)
0  2025-05-05T02:05:32.780000
   COUNT(*)
0    345989
   COUNT(*)
0    458132
   COUNT(DISTINCT provid)
0                    2196
   COUNT(DISTINCT provid)
0                    2565
  band  count(*)
0    g    109973
1    i     72380
2    r    163521
3    u       115
  band  count(*)
0    g    142927
1    i     96860
2    r    206867
3    u     11478


# Main code

In [14]:
from tqdm import tqdm
import adler.utilities.science_utilities as sci_utils

# Set threshold for magnitude jumps
# diff_cut is passed to sci_utils.outlier_diff (in magnitudes) when no measurement errors are available;
# std_cut is passed to sci_utils.outlier_sigma_diff as std_sigma when they are
diff_cut = 1.0
std_cut = 3.0
# Set column to use for magnitude (we don't currently have reduced_mag populated for MPC file format)
mag_col = 'mag'
magErr_col = 'magErr'
# Number of days of data to retrieve (i.e. previous 30 nights)
# data_timespan=30
# data_timespan=15*365

# Set filter list (only ugri present currently, very few u, but keeping in for completeness)
filter_list=['u', 'g', 'r', 'i', 'z', 'y']
# filter_list=['g', 'r', 'i']

# Establish DB connection to the adler output database
# NOTE(review): this rebinds conn_adler_out to a NEW connection, while cur_adler_out (created in the
# earlier reset cell) still belongs to the previous connection - statements executed through
# cur_adler_out are therefore not committed by conn_adler_out.commit()
conn_adler_out = sqlite3.connect(adler_output_filename)

## Analysing first obs file in bulk

In [12]:
# Every distinct object designation (provid) present in the first observations file
distinct_objs_df_oct = pd.read_sql_query(sql="SELECT DISTINCT provid FROM obs_sbn", con=conn_oct)
unique_obj_ids = distinct_objs_df_oct["provid"].to_numpy()
logger.info("{} objects to analyze".format(len(unique_obj_ids)))

In [None]:
make_plots=True

if make_plots:
    # savefig does not create missing folders, so make sure the plot directory exists up front
    os.makedirs("plots", exist_ok=True)

for obj_id in tqdm(unique_obj_ids):
    # Taking all data, no time constraint
    planetoid = AdlerPlanetoid.construct_from_mpc_obs_sbn(ssObjectId=obj_id,
                                                        sql_filename=rubin_sql_filename_oct,
                                                        filter_list=filter_list)

    # First file, so we know there is nothing in the AdlerData table yet and do not try to populate from the DB
    adler_data = AdlerData(obj_id, planetoid.filter_list)

    for filt in planetoid.filter_list:
        df_obs = sci_utils.get_df_obs_filt(planetoid, filt=filt)

        # Number of observations in this filter BEFORE any masking (this is what gets stored as nobs below)
        nobs_nomask = len(df_obs)

        err_flag = df_obs[magErr_col].isnull().all()
        if err_flag:
            logger.info("All magErr values are NaNs, proceed with caution")
        else:
            # Remove observations with large errorbars
            magErr_percentile_cut = 95    #Value (between 0 and 100) to define the percentile above which we cut data with large magErr values
            magErr_mask = (df_obs[magErr_col] <= np.nanpercentile(df_obs[magErr_col], q=magErr_percentile_cut))
            df_obs = df_obs[magErr_mask]

        # Work on an independent copy so that adding the "outlier" column never writes to
        # (or raises SettingWithCopyWarning about) a slice of the frame returned by the helper
        df_obs = df_obs.copy()

        # First loop so there are no previous outliers
        df_obs["outlier"] = [False] * len(df_obs)

        # The "model" is currently just the median magnitude; residuals are measured relative to it
        median_mag = np.median(df_obs[mag_col])
        res = np.array(df_obs[mag_col]) - median_mag

        if err_flag:
            logger.info(f"No measurement errors, using outlier_diff to check for outliers more than {diff_cut} magnitudes away")
            outlier_flag = sci_utils.outlier_diff(res, diff_cut=diff_cut)
        else:
            logger.info(f"Measurement errors available, using outlier_sigma_diff to check for outliers more than {std_cut} standard deviation away")
            outlier_flag = sci_utils.outlier_sigma_diff(res, df_obs[magErr_col], std_sigma=std_cut)

        logger.info("{} outliers detected".format(np.sum(outlier_flag)))

        # Write only outlier flags (rows flagged True) to the AdlerFlags table
        df_obs.loc[:,"outlier"] = outlier_flag
        df_obs.loc[outlier_flag, ['ssObjectId', 'filter_name', 'diaSourceId', 'midPointMjdTai', 'outlier']].to_sql('AdlerFlags', con=conn_adler_out, if_exists='append', index=False)
        logger.info(f"Flags for {obj_id} written to AdlerFlags table")

        # # Write all flags (True and False)
        # df_obs.loc[:,"outlier"] = outlier_flag
        # df_obs[['ssObjectId', 'filter_name', 'diaSourceId', 'midPointMjdTai', 'outlier']].to_sql('AdlerFlags', con=conn_adler_out, if_exists='append', index=False)

        # Populate nobs parameter
        # FIXME Currently using the number of observations in the filter before masking out values with large errorbars
        # This is so that the comparison with the new observations file can be made to assess what objects have new observations
        # This may need to change (possibly populate a new parameter) as technically this value should be the number of values used in the fit
        # But currently the "fit" is just taking the median anyway, rather than any kind of phase curve
        # Possible solution is below in the new observations file loop, we could load in the observations for each object and then just continue to next object if no new observations (doing a check on diaSourceId to guard against if weird stuff happens with the masking)
        adler_data.populate_phase_parameters(filt, **{'nobs':nobs_nomask})
        adler_data.write_row_to_database(adler_output_filename)

        if make_plots:
            # All retained observations, with the flagged outliers over-plotted in a second colour
            df_outliers = df_obs[outlier_flag]
            fig, ax = plt.subplots()
            ax.errorbar(df_obs['midPointMjdTai'], df_obs[mag_col], df_obs[magErr_col], ls='', marker='.')
            ax.errorbar(df_outliers['midPointMjdTai'], df_outliers[mag_col], df_outliers[magErr_col], ls='', marker='.')
            ax.axhline(median_mag)
            ax.invert_yaxis()
            ax.set_xlabel("Time [MJD]")
            ax.set_ylabel(f"{filt}-band Magnitude")
            fig.savefig(f"plots/{obj_id}_{filt}_outliers.png", bbox_inches='tight', pad_inches=0.05)
            plt.close(fig)

100%|██████████| 2196/2196 [08:46<00:00,  4.17it/s]


## Loop for processing new file now that database has been established

In [14]:
# Attach the adler output database to the new observations database with sqlite so we can join tables
# The attached database is given the schema name adler_db on the conn_nov connection
# NOTE(review): re-running this cell on the same connection will presumably fail because adler_db is already attached - confirm
cur_nov.execute(f"ATTACH DATABASE '{adler_output_filename}' AS adler_db;")

<sqlite3.Cursor at 0x171cd3ec0>

In [15]:
# Get column information to check what filters have previously been analysed (and therefore have {filter}_nobs columns in AdlerData)
cur_adler_out.execute("PRAGMA table_info(AdlerData);")
adler_out_cols = [row[1] for row in cur_adler_out.fetchall()]
nobs_suffix = '_nobs'
filter_nobs_columns = [c for c in adler_out_cols if c.endswith(nobs_suffix)]
# Strip only the trailing suffix to recover the filter name
current_adlerdata_filters = [c[:-len(nobs_suffix)] for c in filter_nobs_columns]

# Build CASE section of SQL query that counts observations in each filter for each object
query_cases_list = [f"SUM(CASE WHEN band = '{b}' THEN 1 ELSE 0 END) AS {b}_nobs" for b in current_adlerdata_filters]
current_filters_joined = "', '".join(current_adlerdata_filters)
# Include a special case that catches any observations in new filters
query_cases_list.append(f"SUM(CASE WHEN band NOT IN ('{current_filters_joined}') THEN 1 ELSE 0 END) AS new_band_count")
cases_sql = ",\n       ".join(query_cases_list) # Join these all together

# Build WHERE clause for SQL query
# Check for objects with more observations in filters populated in AdlerData.
# COALESCE is required because a stored {filter}_nobs can be NULL for an object that had no observations
# in that filter when it was last analysed; "count > NULL" evaluates to NULL (not true) in SQL, which
# would silently skip objects whose first observations in an already-known filter have just arrived
comparison_parts = [
    f"obs_counts.{b}_nobs > COALESCE(adler_data.{b}_nobs, 0)"
    for b in current_adlerdata_filters
]
# Check for objects with observations in filters not previously populated in AdlerData
comparison_parts.append("obs_counts.new_band_count > 0")
# Check for entirely new objects not previously analyzed (no matching AdlerData row in the LEFT JOIN)
comparison_parts.append("adler_data.SSObjectId IS NULL")
where_clause = "\nOR ".join(comparison_parts) #Join these all together

# Create full SQL query: per-object observation counts from obs_sbn, compared against the stored AdlerData counts
new_obj_sql_query = f"""
SELECT obs_counts.provid
FROM (
    SELECT provid,
        {cases_sql}
    FROM obs_sbn
    GROUP BY provid
) AS obs_counts
LEFT JOIN AdlerData AS adler_data
    ON obs_counts.provid = adler_data.SSObjectId
WHERE {where_clause};
"""

logger.info(f"Executing {new_obj_sql_query} on {rubin_sql_filename_nov}")

# Runs on the new observations connection, which has the adler output database attached
cur_nov.execute(new_obj_sql_query)
# Forcing dtype object here as otherwise the values become np.str_ type, not sure if this is good or bad
obj_list = np.array([i[0] for i in cur_nov.fetchall()], dtype=object)

In [None]:
make_plots=True

if make_plots:
    # savefig does not create missing folders, so make sure the plot directory exists up front
    os.makedirs("plots", exist_ok=True)

for obj_id in tqdm(obj_list):
    # Set first_analysis flag to False by default (object could have new observations so would still appear in AdlerData/AdlerFlags)
    first_analysis = False
    # Taking all data, no time constraint
    planetoid = AdlerPlanetoid.construct_from_mpc_obs_sbn(ssObjectId=obj_id,
                                                        sql_filename=rubin_sql_filename_nov,
                                                        filter_list=filter_list)

    adler_data = AdlerData(obj_id, planetoid.filter_list)
    try:
        adler_data.populate_from_database(adler_output_filename)
    except ValueError:
        logger.info(f"No AdlerData entry for {obj_id}, proceeding with empty AdlerData object")
        first_analysis = True

    for filt in planetoid.filter_list:
        df_obs = sci_utils.get_df_obs_filt(planetoid, filt=filt)

        # Number of observations in this filter BEFORE any masking (this is what gets stored as nobs below)
        nobs_nomask = len(df_obs)

        err_flag = df_obs[magErr_col].isnull().all()
        if err_flag:
            logger.info("All magErr values are NaNs, proceed with caution")
        else:
            # Remove observations with large errorbars
            magErr_percentile_cut = 95    #Value (between 0 and 100) to define the percentile above which we cut data with large magErr values
            magErr_mask = (df_obs[magErr_col] <= np.nanpercentile(df_obs[magErr_col], q=magErr_percentile_cut))
            df_obs = df_obs[magErr_mask]

        if first_analysis:
            # Object not analysed before, so there are no previous outliers to load
            logger.info("First analysis set to True so no outlier flags available to load")
            # Independent copy so the new column is not assigned onto a slice of the helper's frame
            df_obs = df_obs.copy()
            df_obs["outlier"] = [False] * len(df_obs)
        else:
            logger.info("Loading previous outlier flags")
            # Values are bound as parameters rather than pasted into the SQL text
            sql_query = "SELECT * FROM AdlerFlags WHERE ssObjectId=? AND filter_name=?"
            flags_df = pd.read_sql_query(sql_query, conn_adler_out, params=(str(obj_id), str(filt)))
            # A diaSourceId can have several rows in AdlerFlags (a flag cleared with outlier=0 and later
            # appended again); keep one row per source, preferring outlier=1, so that the merge below
            # cannot duplicate observations
            flags_df = flags_df.sort_values('outlier').drop_duplicates(subset='diaSourceId', keep='last')
            logger.info("{} previous outliers found".format(len(flags_df)))

            # Include the outlier flags in the observations dataframe
            df_obs = df_obs.merge(flags_df[['diaSourceId', 'outlier']],
                                on='diaSourceId',
                                how='left')
            # Observations without a stored flag are not outliers; finish with a plain bool column
            df_obs['outlier'] = df_obs['outlier'].astype('boolean').fillna(False).astype(bool)

        # The "model" is the median magnitude of the observations not previously flagged as outliers
        median_mag = np.median(df_obs.loc[~df_obs['outlier'], mag_col])
        res = np.array(df_obs[mag_col]) - median_mag

        if err_flag:
            logger.info(f"No measurement errors, using outlier_diff to check for outliers more than {diff_cut} magnitudes away")
            outlier_flag = sci_utils.outlier_diff(res, diff_cut=diff_cut)
        else:
            logger.info(f"Measurement errors available, using outlier_sigma_diff to check for outliers more than {std_cut} standard deviation away")
            outlier_flag = sci_utils.outlier_sigma_diff(res, df_obs[magErr_col], std_sigma=std_cut)

        logger.info("{} outliers detected".format(np.sum(outlier_flag)))

        # Check if any outliers swapped from True to False and update in AdlerFlags
        true_to_false_mask = (df_obs['outlier']) & (~outlier_flag)
        logger.info("{} previous outliers now determined to be not significant".format(np.sum(true_to_false_mask)))
        if np.sum(true_to_false_mask)>0:
            # .tolist() yields plain Python values that sqlite3 can bind; one "?" placeholder per id
            cleared_ids = df_obs.loc[true_to_false_mask, 'diaSourceId'].tolist()
            placeholders = ", ".join("?" * len(cleared_ids))
            update_query = f"UPDATE AdlerFlags SET outlier=0 WHERE diaSourceId IN ({placeholders})"
            logger.info(f"Executing {update_query} with {len(cleared_ids)} diaSourceId values on {adler_output_filename}")
            # Execute on the same connection that is committed: cur_adler_out belongs to an earlier
            # connection object than the one conn_adler_out now refers to, so using that cursor here
            # would leave the UPDATE uncommitted
            conn_adler_out.execute(update_query, cleared_ids)
            conn_adler_out.commit()

        # Check for new outliers (i.e. have swapped from False to True)
        false_to_true_mask = (~df_obs['outlier']) & (outlier_flag)
        logger.info("{} new outliers detected".format(np.sum(false_to_true_mask)))
        df_obs.loc[:,"outlier"] = outlier_flag
        df_obs.loc[false_to_true_mask, ['ssObjectId', 'filter_name', 'diaSourceId', 'midPointMjdTai', 'outlier']].to_sql('AdlerFlags', con=conn_adler_out, if_exists='append', index=False)
        logger.info(f"New flags for {obj_id} written to AdlerFlags table")

        # # Write all flags (True and False)
        # df_obs.loc[:,"outlier"] = outlier_flag
        # df_obs[['ssObjectId', 'filter_name', 'diaSourceId', 'midPointMjdTai', 'outlier']].to_sql('AdlerFlags', con=conn_adler_out, if_exists='append', index=False)

        # Populate nobs parameter
        # FIXME Currently using the number of observations in the filter before masking out values with large errorbars
        # This is so that the comparison with the new observations file can be made to assess what objects have new observations
        # This may need to change (possibly populate a new parameter) as technically this value should be the number of values used in the fit
        # But currently the "fit" is just taking the median anyway, rather than any kind of phase curve
        # Possible solution is in the new observations file loop, we could load in the observations for each object and then just continue to next object if no new observations (doing a check on diaSourceId to guard against if weird stuff happens with the masking)
        adler_data.populate_phase_parameters(filt, **{'nobs':nobs_nomask})
        adler_data.write_row_to_database(adler_output_filename)

        if make_plots:
            # All retained observations, with the flagged outliers over-plotted in a second colour
            df_outliers = df_obs[outlier_flag]
            fig, ax = plt.subplots()
            ax.errorbar(df_obs['midPointMjdTai'], df_obs[mag_col], df_obs[magErr_col], ls='', marker='.')
            ax.errorbar(df_outliers['midPointMjdTai'], df_outliers[mag_col], df_outliers[magErr_col], ls='', marker='.')
            ax.axhline(median_mag)
            ax.invert_yaxis()
            ax.set_xlabel("Time [MJD]")
            ax.set_ylabel(f"{filt}-band Magnitude")
            fig.savefig(f"plots/{obj_id}_{filt}_outliers.png", bbox_inches='tight', pad_inches=0.05)
            plt.close(fig)

100%|██████████| 369/369 [02:17<00:00,  2.69it/s]


# Specified processing date

### If a processing date is specified, only the last N nights of data up to that date are considered

In [None]:
# TODO: implement the ability to specify a processing date
# Processing date as an MJD; it is the upper end of the date_range passed to
# construct_from_mpc_obs_sbn in the loop below
process_mjd = 60800.5
# presumably the same date as a UTC value (via adler's mjd_to_utc) - not used by the cells shown below; TODO confirm
process_date = mjd_to_utc(process_mjd)

# Number of days of data to retrieve (i.e. previous 30 nights)
data_timespan=30

# Open question: what is the plan for writing out/reading in in this situation?

# Open question: should we be considering the model as the previous nights and then check for outliers in the new night?

In [None]:
# TODO: write the results to a new file; the idea is that these functions and the general functionality can be adapted to whatever database is eventually created

In [None]:
make_plots=True

if make_plots:
    # savefig does not create missing folders, so make sure the plot directory exists up front
    os.makedirs("plots", exist_ok=True)

for obj_id in tqdm(obj_list):
    # Set first_analysis flag to False by default (object could have new observations so would still appear in AdlerData/AdlerFlags)
    first_analysis = False
    # Only request the data_timespan days of data leading up to the processing date
    planetoid = AdlerPlanetoid.construct_from_mpc_obs_sbn(ssObjectId=obj_id,
                                                        sql_filename=rubin_sql_filename_nov,
                                                        filter_list=filter_list,
                                                        date_range=[process_mjd-data_timespan, process_mjd])

    adler_data = AdlerData(obj_id, planetoid.filter_list)
    try:
        adler_data.populate_from_database(adler_output_filename)
    except ValueError:
        logger.info(f"No AdlerData entry for {obj_id}, proceeding with empty AdlerData object")
        first_analysis = True

    for filt in planetoid.filter_list:
        df_obs = sci_utils.get_df_obs_filt(planetoid, filt=filt)

        # Number of observations in this filter BEFORE any masking (this is what gets stored as nobs below)
        nobs_nomask = len(df_obs)

        err_flag = df_obs[magErr_col].isnull().all()
        if err_flag:
            logger.info("All magErr values are NaNs, proceed with caution")
        else:
            # Remove observations with large errorbars
            magErr_percentile_cut = 95    #Value (between 0 and 100) to define the percentile above which we cut data with large magErr values
            magErr_mask = (df_obs[magErr_col] <= np.nanpercentile(df_obs[magErr_col], q=magErr_percentile_cut))
            df_obs = df_obs[magErr_mask]

        if first_analysis:
            # Object not analysed before, so there are no previous outliers to load
            logger.info("First analysis set to True so no outlier flags available to load")
            # Independent copy so the new column is not assigned onto a slice of the helper's frame
            df_obs = df_obs.copy()
            df_obs["outlier"] = [False] * len(df_obs)
        else:
            logger.info("Loading previous outlier flags")
            # Values are bound as parameters rather than pasted into the SQL text
            sql_query = "SELECT * FROM AdlerFlags WHERE ssObjectId=? AND filter_name=?"
            flags_df = pd.read_sql_query(sql_query, conn_adler_out, params=(str(obj_id), str(filt)))
            # A diaSourceId can have several rows in AdlerFlags (a flag cleared with outlier=0 and later
            # appended again); keep one row per source, preferring outlier=1, so that the merge below
            # cannot duplicate observations
            flags_df = flags_df.sort_values('outlier').drop_duplicates(subset='diaSourceId', keep='last')
            logger.info("{} previous outliers found".format(len(flags_df)))

            # Include the outlier flags in the observations dataframe
            df_obs = df_obs.merge(flags_df[['diaSourceId', 'outlier']],
                                on='diaSourceId',
                                how='left')
            # Observations without a stored flag are not outliers; finish with a plain bool column
            df_obs['outlier'] = df_obs['outlier'].astype('boolean').fillna(False).astype(bool)

        # The "model" is the median magnitude of the observations not previously flagged as outliers
        median_mag = np.median(df_obs.loc[~df_obs['outlier'], mag_col])
        res = np.array(df_obs[mag_col]) - median_mag

        if err_flag:
            logger.info(f"No measurement errors, using outlier_diff to check for outliers more than {diff_cut} magnitudes away")
            outlier_flag = sci_utils.outlier_diff(res, diff_cut=diff_cut)
        else:
            logger.info(f"Measurement errors available, using outlier_sigma_diff to check for outliers more than {std_cut} standard deviation away")
            outlier_flag = sci_utils.outlier_sigma_diff(res, df_obs[magErr_col], std_sigma=std_cut)

        logger.info("{} outliers detected".format(np.sum(outlier_flag)))

        # Check if any outliers swapped from True to False and update in AdlerFlags
        true_to_false_mask = (df_obs['outlier']) & (~outlier_flag)
        logger.info("{} previous outliers now determined to be not significant".format(np.sum(true_to_false_mask)))
        if np.sum(true_to_false_mask)>0:
            # .tolist() yields plain Python values that sqlite3 can bind; one "?" placeholder per id
            cleared_ids = df_obs.loc[true_to_false_mask, 'diaSourceId'].tolist()
            placeholders = ", ".join("?" * len(cleared_ids))
            update_query = f"UPDATE AdlerFlags SET outlier=0 WHERE diaSourceId IN ({placeholders})"
            logger.info(f"Executing {update_query} with {len(cleared_ids)} diaSourceId values on {adler_output_filename}")
            # Execute on the same connection that is committed: cur_adler_out belongs to an earlier
            # connection object than the one conn_adler_out now refers to, so using that cursor here
            # would leave the UPDATE uncommitted
            conn_adler_out.execute(update_query, cleared_ids)
            conn_adler_out.commit()

        # Check for new outliers (i.e. have swapped from False to True)
        false_to_true_mask = (~df_obs['outlier']) & (outlier_flag)
        logger.info("{} new outliers detected".format(np.sum(false_to_true_mask)))
        df_obs.loc[:,"outlier"] = outlier_flag
        df_obs.loc[false_to_true_mask, ['ssObjectId', 'filter_name', 'diaSourceId', 'midPointMjdTai', 'outlier']].to_sql('AdlerFlags', con=conn_adler_out, if_exists='append', index=False)
        logger.info(f"New flags for {obj_id} written to AdlerFlags table")

        # # Write all flags (True and False)
        # df_obs.loc[:,"outlier"] = outlier_flag
        # df_obs[['ssObjectId', 'filter_name', 'diaSourceId', 'midPointMjdTai', 'outlier']].to_sql('AdlerFlags', con=conn_adler_out, if_exists='append', index=False)

        # Populate nobs parameter
        # FIXME Currently using the number of observations in the filter before masking out values with large errorbars
        # This is so that the comparison with the new observations file can be made to assess what objects have new observations
        # This may need to change (possibly populate a new parameter) as technically this value should be the number of values used in the fit
        # But currently the "fit" is just taking the median anyway, rather than any kind of phase curve
        # Possible solution is in the new observations file loop, we could load in the observations for each object and then just continue to next object if no new observations (doing a check on diaSourceId to guard against if weird stuff happens with the masking)
        # Does LinearPhaseFunc satisfy our needs? Possibly not because it expects absolute magnitude, but maybe that works?
        # NOTE(review): with date_range set, nobs_nomask only counts the observations inside the requested
        # window, and that value is what gets written to the AdlerData row here - confirm this is intended
        adler_data.populate_phase_parameters(filt, **{'nobs':nobs_nomask})
        adler_data.write_row_to_database(adler_output_filename)

        if make_plots:
            # All retained observations, with the flagged outliers over-plotted in a second colour
            df_outliers = df_obs[outlier_flag]
            fig, ax = plt.subplots()
            ax.errorbar(df_obs['midPointMjdTai'], df_obs[mag_col], df_obs[magErr_col], ls='', marker='.')
            ax.errorbar(df_outliers['midPointMjdTai'], df_outliers[mag_col], df_outliers[magErr_col], ls='', marker='.')
            ax.axhline(median_mag)
            ax.invert_yaxis()
            ax.set_xlabel("Time [MJD]")
            ax.set_ylabel(f"{filt}-band Magnitude")
            fig.savefig(f"plots/{obj_id}_{filt}_outliers.png", bbox_inches='tight', pad_inches=0.05)
            plt.close(fig)

In [19]:
# Total number of rows in obs_sbn as seen through the conn_nov connection
pd.read_sql_query("SELECT COUNT(*) FROM obs_sbn", conn_nov)

Unnamed: 0,COUNT(*)
0,458132
