## QARTOD workflow


Access data (data sample: PNBOIA buoy)

In [100]:
import numpy as np
import pandas as pd
from ioos_qc.config import Config
from ioos_qc.streams import PandasStream
import datetime
from ioos_qc.config import QcConfig
import matplotlib.pyplot as plt
import scipy.stats



In [101]:
def apply_qc(inp_aux, tinp_aux, zinp_aux, config):
    qc = QcConfig(qc_config)
    
    qc_results = qc.run(
        inp=inp,
        tinp=tinp,
        zinp=zinp)

    return qc_results

In [102]:
# Open buoy
ds = pd.read_csv('noronha.csv')
ds.columns

Index(['date_time', 'id', 'buoy_id', 'lat', 'lon', 'battery', 'compass',
       'flag_compass', 'flood', 'rh', 'flag_rh', 'pres', 'flag_pres', 'atmp',
       'flag_atmp', 'dewpt', 'flag_dewpt', 'wspd', 'flag_wspd', 'wdir',
       'flag_wdir', 'gust', 'flag_gust', 'arad', 'flag_arad', 'sst',
       'flag_sst', 'cspd1', 'flag_cspd1', 'cdir1', 'flag_cdir1', 'cspd2',
       'flag_cspd2', 'cdir2', 'flag_cdir2', 'cspd3', 'flag_cspd3', 'cdir3',
       'flag_cdir3', 'swvht1', 'flag_swvht1', 'swvht2', 'flag_swvht2',
       'mxwvht1', 'flag_mxwvht1', 'tp1', 'flag_tp1', 'tp2', 'flag_tp2',
       'wvdir1', 'flag_wvdir1', 'wvdir2', 'flag_wvdir2', 'wvspread1',
       'flag_wvspread1', 'pk_dir', 'flag_pk_dir', 'pk_wvspread',
       'flag_pk_wvspread', 'mean_tp', 'flag_mean_tp'],
      dtype='object')

In [103]:
# Rename columns
ds = ds.rename(columns={'swvht1': 'wvht', 'tp1': 'tp', 'wvdir1': 'wvdir'})

In [104]:
ds

Unnamed: 0,date_time,id,buoy_id,lat,lon,battery,compass,flag_compass,flood,rh,...,wvdir2,flag_wvdir2,wvspread1,flag_wvspread1,pk_dir,flag_pk_dir,pk_wvspread,flag_pk_wvspread,mean_tp,flag_mean_tp
0,2022-08-17 22:27:49+00:00,58698,28,-3.798067,-32.371533,,,,,,...,,,46.0,0,127.0,0,44.0,0,5.9,0
1,2022-08-17 21:57:49+00:00,58697,28,-3.798067,-32.371533,,,,,,...,,,45.0,0,124.0,0,40.0,0,5.8,0
2,2022-08-17 21:27:49+00:00,58690,28,-3.798050,-32.371533,,,,,,...,,,47.0,0,137.0,0,39.0,0,5.8,0
3,2022-08-17 20:57:49+00:00,58689,28,-3.798050,-32.371517,,,,,,...,,,47.0,0,112.0,0,38.0,0,6.0,0
4,2022-08-17 20:27:49+00:00,58682,28,-3.798067,-32.371583,,,,,,...,,,49.0,0,117.0,0,41.0,0,6.1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,2022-08-13 00:27:49+00:00,57770,28,-3.798117,-32.371700,,,,,,...,,,49.0,0,114.0,0,39.0,0,5.1,0
237,2022-08-12 23:57:49+00:00,57769,28,-3.798117,-32.371700,,,,,,...,,,52.0,0,115.0,0,37.0,0,6.5,0
238,2022-08-12 23:27:49+00:00,57762,28,-3.798117,-32.371700,,,,,,...,,,52.0,0,119.0,0,35.0,0,6.1,0
239,2022-08-17 23:27:49+00:00,58706,28,-3.798167,-32.371717,,,,,,...,,,40.0,0,132.0,0,35.0,0,5.8,0


In [105]:
ds.columns

Index(['date_time', 'id', 'buoy_id', 'lat', 'lon', 'battery', 'compass',
       'flag_compass', 'flood', 'rh', 'flag_rh', 'pres', 'flag_pres', 'atmp',
       'flag_atmp', 'dewpt', 'flag_dewpt', 'wspd', 'flag_wspd', 'wdir',
       'flag_wdir', 'gust', 'flag_gust', 'arad', 'flag_arad', 'sst',
       'flag_sst', 'cspd1', 'flag_cspd1', 'cdir1', 'flag_cdir1', 'cspd2',
       'flag_cspd2', 'cdir2', 'flag_cdir2', 'cspd3', 'flag_cspd3', 'cdir3',
       'flag_cdir3', 'wvht', 'flag_swvht1', 'swvht2', 'flag_swvht2', 'mxwvht1',
       'flag_mxwvht1', 'tp', 'flag_tp1', 'tp2', 'flag_tp2', 'wvdir',
       'flag_wvdir1', 'wvdir2', 'flag_wvdir2', 'wvspread1', 'flag_wvspread1',
       'pk_dir', 'flag_pk_dir', 'pk_wvspread', 'flag_pk_wvspread', 'mean_tp',
       'flag_mean_tp'],
      dtype='object')

In [116]:
flags_dict={}
pd_flags={}

for var in ['wvht', 'tp', 'wvdir']:
    
    if var == 'wvht':
        fail_span = [0.1, 19.9]
        suspect_span = [0.2, 16]
        #spike_test
        tolerance = 7
        suspect_threshold_spike = 25
        fail_threshold_spike = 30
        
        # flat_test (confirm values)
        suspect_threshold_flat = 2
        fail_threshold_flat = 3

    elif var == 'tp':
        fail_span = [1.7, 30]
        suspect_span = [1.9, 25]
        
        #spike_test
        tolerance = 7
        suspect_threshold_spike = 40
        fail_threshold_spike = 50
        
        # flat_test (confirm values)
        suspect_threshold_flat = 2
        fail_threshold_flat = 3


    qc_config = {
        "qartod": 
            {
            "gross_range_test": {"fail_span": fail_span, "suspect_span": suspect_span},

            "flat_line_test": {"tolerance": tolerance, "suspect_threshold": suspect_threshold_spike, "fail_threshold": fail_threshold_spike},

            "spike_test": {"suspect_threshold": suspect_threshold_flat, "fail_threshold": fail_threshold_flat},

           # "climatology_test": {},

            #"attenuated_signal_test": {},

            #"density_inversion_test": {},

            #"location_test": {},

            #"rate_of_change_test": {}

        }
    }

    
    if var == 'wvdir':
        fail_span = [0, 360]
        suspect_span = [0, 360]


        qc_config = {
            "qartod": 
                {
                "gross_range_test": {"fail_span": fail_span, "suspect_span": suspect_span},
            }
        }
    
    
    inp=ds[var]
    tinp=ds['date_time']
    zinp=ds[var]*0
    
    flags_dict[var] = apply_qc(inp,tinp,zinp, qc_config)
    
    ## Flag  grouping: if scores the same for all tests final flag is mantained, otherwise is labelled as bad data (e.g. 111 = 1, 999=9, 141 = 4)
    # 0 flag won't be applicable
    pd_flags[var] = pd.DataFrame(flags_dict[var]['qartod'])

    if var is not 'wvdir':
        aux = pd_flags[var].apply(lambda x: x.gross_range_test == x.flat_line_test == x.spike_test, axis = 1)
        pd_flags[var]['final_flags'] = 4
        for ind in np.where(aux==True)[0]:
            pd_flags[var]['final_flags'][ind] = int(pd_flags[var].gross_range_test[ind])
    
    else:
        pd_flags[var]['final_flags']=pd_flags[var].gross_range_test
        



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [122]:
df = pd.DataFrame()

df['wvht'] = ds.wvht
df['wvht_flag_pnboia'] = ds.flag_swvht1
df['wvht_flag_qartod'] = pd_flags['wvht']['final_flags']

df['tp'] = ds.tp
df['tp_flag_pnboia'] = ds.flag_tp1
df['tp_flag_qartod'] = pd_flags['tp']['final_flags']

df['wvdir'] = ds.wvdir
df['wvdir_flag_pnboia'] = ds.flag_wvdir1
df['wvdir_flag_qartod'] = pd_flags['wvdir']['final_flags']
df

Unnamed: 0,wvht,wvht_flag_pnboia,wvht_flag_qartod,tp,tp_flag_pnboia,tp_flag_qartod,wvdir,wvdir_flag_pnboia,wvdir_flag_qartod
0,2.10,2,4,8.5,0,4,129.0,0,1
1,2.06,2,4,8.5,0,4,129.0,0,1
2,1.98,2,4,8.5,0,4,129.0,0,1
3,2.19,2,4,7.9,0,4,122.0,0,1
4,2.12,2,4,8.5,0,4,124.0,0,1
...,...,...,...,...,...,...,...,...,...
236,1.63,2,4,7.9,0,4,126.0,0,1
237,1.98,2,4,7.9,0,4,123.0,0,1
238,1.86,2,4,7.9,0,4,123.0,0,1
239,1.94,2,4,8.5,0,4,131.0,0,1


In [8]:
def plot_results(data, variable_name, results, title, test_name):
    time = data.cf["time"]
    obs = data[variable_name]
    qc_test = results["qartod"][test_name]

    qc_pass = np.ma.masked_where(qc_test != 1, obs)
    qc_suspect = np.ma.masked_where(qc_test != 3, obs)
    qc_fail = np.ma.masked_where(qc_test != 4, obs)
    qc_notrun = np.ma.masked_where(qc_test != 2, obs)

    fig, ax = plt.subplots(figsize=(15, 3.75))
    fig.set_title = f"{test_name}: {title}"
    
    ax.set_xlabel("Time")
    ax.set_ylabel("Observation Value")

    kw = {"marker": "o", "linestyle": "none"}
    ax.plot(time, obs,  label="obs", color="#A6CEE3")
    ax.plot(time, qc_notrun, markersize=2, label="qc not run", color="gray", alpha=0.2, **kw)
    ax.plot(time, qc_pass, markersize=4, label="qc pass", color="green", alpha=0.5, **kw)
    ax.plot(time, qc_suspect, markersize=4, label="qc suspect", color="orange", alpha=0.7, **kw)
    ax.plot(time, qc_fail, markersize=6, label="qc fail", color="red", alpha=1.0, **kw)
    ax.grid(True)