In [1]:
from pathlib import Path

import pandas as pd

from agage_archive.data_selection import read_release_schedule
from agage_archive.io import read_ale_gage
from agage_archive.io_other_formats import read_wang
from agage_archive.util import tz_local_to_utc


In [39]:
def read_files(species, site, network):
    """Load the Wang record and the ALE/GAGE record for one species and site.

    Args:
        species: Species identifier, forwarded unchanged to both readers.
        site: Site code, forwarded unchanged to both readers.
        network: Network name, forwarded unchanged to both readers.

    Returns:
        A tuple ``(df, df_wang)``:

        * ``df`` is the result of ``read_ale_gage(..., utc=False,
          scale="SIO-05")`` converted with ``.to_pandas()``.
        * ``df_wang`` is the result of ``read_wang`` with its ``"mf"`` column
          renamed to ``"mf_wang"`` so both can sit side by side later.
    """
    # Rename up front so the Wang mole fractions cannot collide with "mf"
    # from the archive reader when the two frames are combined.
    wang_frame = read_wang(species, site, network).rename(columns={"mf": "mf_wang"})

    archive_frame = read_ale_gage(
        species,
        site,
        network,
        utc=False,
        scale="SIO-05",
    ).to_pandas()

    return archive_frame, wang_frame

def check_timestamps(df, df_wang):
    """Report index labels present in only one of the two dataframes.

    Any labels found on one side only are printed. Independently of that,
    the row counts of the two dataframes are compared.

    Args:
        df: First dataframe; only its index and length are used.
        df_wang: Second dataframe; only its index and length are used.

    Returns:
        A tuple of two sets: labels found only in ``df.index`` and labels
        found only in ``df_wang.index``.

    Raises:
        ValueError: If ``len(df) != len(df_wang)``.
    """
    labels_df = set(df.index)
    labels_wang = set(df_wang.index)

    missing_from_wang = labels_df - labels_wang
    missing_from_df = labels_wang - labels_df

    if missing_from_wang:
        print(f"Timestamps only in df: {missing_from_wang}")

    if missing_from_df:
        print(f"Timestamps only in df_wang: {missing_from_df}")

    # The set comparison above ignores repeated labels, so the row counts are
    # compared separately (after reporting) to catch that case.
    if len(df) == len(df_wang):
        return missing_from_wang, missing_from_df

    raise ValueError("Dataframes are different lengths. Check for duplicates")

def compare_scales(df, df_wang, plot = False):
    """Print the mean ratio of ``mf`` to ``mf_wang`` after aligning both frames.

    The two dataframes are concatenated column-wise (``axis=1``), and the
    element-wise ratio ``mf / mf_wang`` is formed from the combined frame.

    Args:
        df: Dataframe providing the ``"mf"`` column.
        df_wang: Dataframe providing the ``"mf_wang"`` column.
        plot: When true, also plot the ratio with point markers and a
            y-range of 0.95 to 1.05.

    Returns:
        None. The mean ratio is printed.
    """
    side_by_side = pd.concat([df, df_wang], axis=1)
    ratio = side_by_side["mf"] / side_by_side["mf_wang"]

    if plot:
        ratio.plot(marker = ".", ylim = [0.95, 1.05], ylabel = "mf/mf_wang")

    print(f"Scale difference Rigby/Wang: {ratio.mean()}")


def list_missing_periods(file_handle, flagged_in_wang, df, site, species, network):
    """Write one data-exclusion line per run of consecutive Wang-flagged points.

    Rows of ``df`` whose ``"mf"`` is NaN are ignored. Among the remaining rows,
    every maximal run of consecutive rows whose index label is in
    ``flagged_in_wang`` becomes one line of the form::

        {species},{network},{start},{end},Flagged by Ray Wang

    where ``start``/``end`` are the first/last timestamps of the run passed
    through ``tz_local_to_utc(..., site)`` and widened by one minute on each
    side, formatted as ``YYYY-MM-DD HH:MM``. Each line is printed and written
    to ``file_handle`` followed by a newline.

    Args:
        file_handle: Writable text object; only ``.write(str)`` is used.
        flagged_in_wang: Index labels of ``df`` to treat as flagged. Labels
            must exist in ``df.index``.
        df: Dataframe with an ``"mf"`` column. The index supplies the
            timestamps; its name does not matter.
        site: Site code, passed to ``tz_local_to_utc``.
        species: Species identifier, written as the first field of each line.
        network: Network name, written as the second field of each line.

    Returns:
        None.
    """
    points = df[["mf"]].copy()
    points["flag"] = False
    points.loc[flagged_in_wang, "flag"] = True
    # Drop rows without a measurement so that "consecutive" means consecutive
    # among the points that are actually present.
    points = points.dropna(subset=["mf"])

    # Positional (0..n-1) boolean series: avoids label alignment problems and
    # removes any dependence on the name of the time index.
    is_flagged = points["flag"].astype(bool).reset_index(drop=True)
    before = is_flagged.shift(1, fill_value=False).astype(bool)
    after = is_flagged.shift(-1, fill_value=False).astype(bool)

    # A run starts where a flagged row follows an unflagged row (or the start
    # of the data) and ends where a flagged row precedes an unflagged row (or
    # the end of the data). Starts and ends therefore pair up in order.
    run_starts = (is_flagged & ~before).to_numpy()
    run_ends = (is_flagged & ~after).to_numpy()

    times = points.index
    flagged_ranges = list(zip(times[run_starts], times[run_ends]))

    for range_start, range_end in flagged_ranges:
        # Convert to UTC, since exclusion applied after UTC conversion.
        # Add a minute before and after to ensure data are removed.
        flagged_start = tz_local_to_utc(range_start, site) - pd.Timedelta(minutes=1)
        flagged_end = tz_local_to_utc(range_end, site) + pd.Timedelta(minutes=1)

        data_exclude_string = f"{species},{network},"
        data_exclude_string += f"{flagged_start.strftime('%Y-%m-%d %H:%M')},"
        data_exclude_string += f"{flagged_end.strftime('%Y-%m-%d %H:%M')},"
        data_exclude_string += "Flagged by Ray Wang"

        print(data_exclude_string)
        file_handle.write(data_exclude_string + "\n")


def flagged(df, df_wang, utc = False, site = None, verbose = True):
    """Locate points that are NaN in one record but present in the other.

    Args:
        df: Dataframe with an ``"mf"`` column.
        df_wang: Dataframe with an ``"mf_wang"`` column.
        utc: When true, non-empty result indices are passed through
            ``tz_local_to_utc(indices, site)`` before being printed/returned.
        site: Site code forwarded to ``tz_local_to_utc``; only used when
            ``utc`` is true.
        verbose: When true, print each located timestamp as
            ``YYYY-MM-DD HH:MM``. The section headers and separators are
            printed regardless.

    Returns:
        A tuple ``(flagged_in_wang, flagged_in_rigby)`` of index objects:
        labels where ``mf_wang`` is NaN but ``mf`` is not, and labels where
        ``mf`` is NaN but ``mf_wang`` is not.
    """

    def nan_only_in_first(first, second):
        """Return the index labels where ``first`` is NaN and ``second`` is not."""
        mask = first.isna() & second.notna()
        hits = mask[mask].index

        if utc and len(hits) > 0:
            hits = tz_local_to_utc(hits, site)

        if verbose:
            for stamp in hits:
                print(stamp.strftime("%Y-%m-%d %H:%M"))

        return hits

    separator = "---------"

    print("Wang flagged:")
    flagged_in_wang = nan_only_in_first(df_wang["mf_wang"], df["mf"])
    print(separator)

    print("Rigby flagged:")
    flagged_in_rigby = nan_only_in_first(df["mf"], df_wang["mf_wang"])
    print(separator)

    return flagged_in_wang, flagged_in_rigby

In [40]:
# Compare the archive record against the Wang record for every species/site
# combination in the release schedule, and collect the periods flagged only in
# the Wang record as data-exclusion lines in a text file.
network = "GAGE"
rs = read_release_schedule(network)

# Resolve the output file from the current user's home directory instead of a
# machine-specific absolute path, so the cell runs on any account.
OUTPUT_PATH = Path.home() / "Downloads" / "gage_wang.txt"

# NOTE(review): the saved output of this cell ends with
# "ValueError: 'SIO-93' is not in list"; the call that raises it is not visible
# from this cell, so the loop is left unguarded rather than hiding the error.
with open(OUTPUT_PATH, "w") as f:

    # Columns of the release schedule are iterated as sites, its index as species.
    for site in rs.columns:
        # Section header in the output file, one per site.
        f.write(f"{site}................................\n")
        for species in rs.index:
            print(species, site)
            df, df_wang = read_files(species, site, network)
            check_timestamps(df, df_wang)
            compare_scales(df, df_wang, plot=False)
            # verbose=False: individual timestamps are summarised as ranges by
            # list_missing_periods instead of being printed one by one.
            flagged_in_wang, flagged_in_rigby = flagged(df, df_wang, verbose = False)
            list_missing_periods(f, flagged_in_wang, df, site, species, network)
            print("      ")
            print("**********************")

cfc-11 MHD
Scale difference Rigby/Wang: 1.0000001599390493
Wang flagged:
---------
Rigby flagged:
---------
cfc-11,GAGE,1987-03-01 03:34,1987-03-01 03:36,Flagged by Ray Wang
cfc-11,GAGE,1987-03-01 09:21,1987-03-01 11:19,Flagged by Ray Wang
cfc-11,GAGE,1987-06-27 16:35,1987-06-27 18:37,Flagged by Ray Wang
cfc-11,GAGE,1987-11-18 02:28,1987-11-18 02:30,Flagged by Ray Wang
cfc-11,GAGE,1987-11-18 12:27,1987-11-18 12:29,Flagged by Ray Wang
cfc-11,GAGE,1990-07-04 18:39,1990-07-04 18:41,Flagged by Ray Wang
cfc-11,GAGE,1990-12-21 02:02,1990-12-21 02:04,Flagged by Ray Wang
cfc-11,GAGE,1992-09-06 09:42,1992-09-06 09:44,Flagged by Ray Wang
cfc-11,GAGE,1994-01-26 20:57,1994-01-26 20:59,Flagged by Ray Wang
cfc-11,GAGE,1994-02-19 00:38,1994-02-19 00:40,Flagged by Ray Wang
cfc-11,GAGE,1994-06-30 23:24,1994-06-30 23:26,Flagged by Ray Wang
      
**********************
cfc-12 MHD
Scale difference Rigby/Wang: 1.0000000366504016
Wang flagged:
---------
Rigby flagged:
---------
cfc-12,GAGE,1987-03-01 05:29

ValueError: 'SIO-93' is not in list