# EFD data consistency checks

In [None]:
import pandas as pd
from lsst_efd_client import EfdClient

## Reliability check
Are all messages being recorded? Use the private sequence number field (`private_seqNum`) added by SAL to look for gaps.

In [None]:
async def reliability_check(env, date, duration="24h"):
    """Report topics whose SAL sequence numbers show gaps in a time window.

    For every topic returned by the EFD instance ``env``, query
    ``private_seqNum`` over the window that starts at ``date`` and lasts
    ``duration``, then print one line per topic for which the sequence
    numbers advanced by more than one per row in total.

    Parameters
    ----------
    env : str
        EFD instance name handed to ``EfdClient``.
    date : str
        Start of the window; interpolated into the InfluxQL query as a
        quoted time literal.
    duration : str, optional
        InfluxQL duration literal added to ``date`` to form the window end.

    Returns
    -------
    None
        Findings are printed, not returned.
    """
    efd = EfdClient(env)

    topic_names = await efd.get_topics()
    print(f"Found {len(topic_names)} topics at {env}.")

    for topic_name in topic_names:
        influxql = (
            f'SELECT private_seqNum FROM "{topic_name}" '
            f"WHERE time > '{date}' AND time < '{date}' + {duration} "
        )
        result = await efd.influx_client.query(influxql)

        # Presumably the client returns a dict when the query yields no
        # data (confirm against the client); there is nothing to analyse.
        if type(result) is dict:
            continue

        # Row-to-row increment minus one is the number of messages skipped
        # between two consecutive rows. The first row has no predecessor
        # (NaN) and is ignored by sum().
        # NOTE(review): the terms telescope to last - first - (rows - 1), so
        # a sequence-number reset inside the window can cancel real gaps.
        skipped_per_row = result - result.shift(periods=1) - 1
        missing = skipped_per_row.sum().iloc[0]
        if missing > 0:
            print(f"Found {int(missing)} message(s) missing from {topic_name} on {date}.")


In [None]:
async def plot_seqnum(env, topic, date, duration="24h"):
    """Plot ``private_seqNum`` of a single topic over a time window.

    Parameters
    ----------
    env : str
        EFD instance name handed to ``EfdClient``.
    topic : str
        Topic (InfluxDB measurement) to query.
    date : str
        Start of the window; interpolated into the InfluxQL query as a
        quoted time literal.
    duration : str, optional
        InfluxQL duration literal added to ``date`` to form the window end.

    Returns
    -------
    None
        The plot is drawn as a side effect. When the query returns a dict
        instead of a DataFrame, a message is printed and nothing is plotted.
    """
    client = EfdClient(env)

    query = f'''SELECT private_seqNum FROM "{topic}" WHERE time > '{date}' AND time < '{date}' + {duration} '''
    df = await client.influx_client.query(query)

    # A dict result has no .plot(); presumably it means the query matched
    # no data (confirm against the client). Say so instead of raising
    # AttributeError.
    if isinstance(df, dict):
        print(f"No private_seqNum data found for {topic} at {env} on {date}.")
        return

    df.plot()

In [None]:
# Run the sequence-number check on the Summit EFD for the default 24h window starting 2023-08-23.
await reliability_check("summit_efd", "2023-08-23")

In [None]:
# Run the same sequence-number check on the USDF EFD for the same window.
await reliability_check("usdf_efd", "2023-08-23")

## Data consistency check

Do the Summit EFD and the USDF EFD have the same data? Count messages in each environment and compare.

In [None]:
async def consitency_check(env1, env2, date, duration="24h", interval="1h"):
    """Compare per-topic message counts between two EFD instances.

    For every topic listed by ``env1``, count ``private_sndStamp`` values in
    both instances over the window that starts at ``date`` and lasts
    ``duration`` (grouped by ``interval`` and then summed), and print a line
    for each topic whose totals differ.

    Parameters
    ----------
    env1 : str
        Reference EFD instance; its topic list drives the comparison and
        its total is the denominator of the reported percentage.
    env2 : str
        EFD instance compared against ``env1``.
    date : str
        Start of the window; interpolated into the InfluxQL query as a
        quoted time literal.
    duration : str, optional
        InfluxQL duration literal added to ``date`` to form the window end.
    interval : str, optional
        InfluxQL duration used in ``GROUP BY time(...)``.

    Returns
    -------
    None
        Differences are printed, not returned.
    """
    # Function-scope import so this cell does not depend on another cell.
    import asyncio

    client1 = EfdClient(env1)
    client2 = EfdClient(env2)

    topics = await client1.get_topics()

    for topic in topics:
        query = f'''SELECT count(private_sndStamp) FROM "{topic}" WHERE time > '{date}' AND time < '{date}' + {duration} GROUP BY time({interval}) '''

        # The two instances are independent, so query them concurrently
        # instead of waiting for one round trip after the other.
        df1, df2 = await asyncio.gather(
            client1.influx_client.query(query),
            client2.influx_client.query(query),
        )

        # A dict result is treated as "no data"; without data at env1 there
        # is nothing to compare for this topic.
        if isinstance(df1, dict):
            continue

        counts1 = df1.sum().iloc[0]
        # No data at env2 counts as zero messages.
        counts2 = 0 if isinstance(df2, dict) else df2.sum().iloc[0]

        if counts1 == counts2:
            continue

        diff = int(counts1 - counts2)
        print(
            f"{topic}: {int(counts1)} message(s) at {env1} and {int(counts2)} messages at {env2}. "
            f"Difference is {diff} message(s) or {round(diff/counts1*100,3)}%."
        )



In [None]:
# Compare message counts between the Summit and USDF EFDs for the default 24h window starting 2023-08-23.
await consitency_check("summit_efd", "usdf_efd", "2023-08-23")

In [None]:
async def find_missing_data_events(env1, env2, topic, date, duration="24h", interval="15m"):
    """Plot and print the intervals in which ``env2`` has fewer messages than ``env1``.

    Counts ``private_sndStamp`` per ``interval`` for ``topic`` in both EFD
    instances, subtracts the ``env2`` counts from the ``env1`` counts, plots
    the difference and prints the rows where it is positive.

    Parameters
    ----------
    env1 : str
        Reference EFD instance.
    env2 : str
        EFD instance checked for missing messages.
    topic : str
        Topic (InfluxDB measurement) to compare.
    date : str
        Start of the window; interpolated into the InfluxQL query as a
        quoted time literal.
    duration : str, optional
        InfluxQL duration literal added to ``date`` to form the window end.
    interval : str, optional
        InfluxQL duration used in ``GROUP BY time(...)``.

    Returns
    -------
    None
        Results are plotted and printed, not returned.
    """
    client1 = EfdClient(env1)
    client2 = EfdClient(env2)

    query = f'''SELECT count(private_sndStamp) FROM "{topic}" WHERE time > '{date}' AND time < '{date}' + {duration} GROUP BY time({interval}) '''
    df1 = await client1.influx_client.query(query)
    df2 = await client2.influx_client.query(query)

    # A dict result is treated as "no data". With nothing at env1 there is
    # no reference to compare against.
    if isinstance(df1, dict):
        print(f"No data found for {topic} at {env1} on {date}; nothing to compare.")
        return

    if isinstance(df2, dict):
        # Nothing recorded at env2: every message counted at env1 is missing.
        diff = df1
    else:
        # fill_value=0 makes an interval present in only one frame count as
        # zero in the other, instead of becoming NaN and dropping out of
        # the "> 0" filter below.
        diff = df1.sub(df2, fill_value=0)

    diff.plot()

    print(f"Missing data events: {diff[diff['count']>0]}")
        

In [None]:
# Locate the 15-minute intervals (default) in which the USDF EFD has fewer MTM1M3 accelerometerData messages than the Summit EFD.
await find_missing_data_events("summit_efd", "usdf_efd", "lsst.sal.MTM1M3.accelerometerData", "2023-08-23")