In [1]:
def clean_anomalies(filterMin=-1, filterMax=-1):
    """
    Load a saildrone dataset and blank out invalid/unreported salinity (SSS) values,
    optionally also restricting SAL_CTD_MEAN to the open interval (<filterMin>, <filterMax>).

    The dataset is read from the last '*.nc' file that glob returns for
    '../saildrone_data/'. Rejected samples are replaced with NaN; no rows are dropped.
    Author: Austin
    Args:
      filterMin <float>: The minimum cutoff point for all SSS data. -1 (default) means
                         "no lower cutoff".
      filterMax <float>: The maximum cutoff point for all SSS data. -1 (default) means
                         "no upper cutoff".
    Returns:
      <xr.DataSet> The saildrone dataset with SAL_CTD_MEAN set to NaN wherever it is
                   not strictly between 0 and 9999, or outside the requested cutoffs.
    Raises:
      FileNotFoundError: if '../saildrone_data/' contains no '.nc' file.
    """

    # import only the packages this function actually uses
    import glob
    import warnings

    import numpy as np
    import xarray as xr

    # NOTE: this silences every warning for the rest of the session (kept from the
    # original implementation so existing notebooks behave the same).
    warnings.simplefilter('ignore')

    # locate the saildrone .nc files
    ddir = '../saildrone_data/'
    fns = glob.glob(ddir + '*.nc')
    if not fns:
        raise FileNotFoundError("no .nc files found in " + ddir)

    # Only the last file was ever kept by the original loop, so open just that one
    # instead of opening (and never closing) every file along the way.
    sail = xr.open_dataset(fns[-1])

    def filterSAL(lower, upper):
        # Keep SAL_CTD_MEAN only where lower < value < upper; everything else -> NaN.
        sal = sail['SAL_CTD_MEAN']
        sail['SAL_CTD_MEAN'] = sal.where((sal > lower) & (sal < upper), np.nan)

    # Remove Any Error Data
    filterSAL(0, 9999)

    # Apply the caller's cutoffs. -1 is the "not provided" sentinel, so a single
    # cutoff may be given and the missing side is left unbounded.
    if filterMin != -1 or filterMax != -1:
        lower = filterMin if filterMin != -1 else -np.inf
        upper = filterMax if filterMax != -1 else np.inf
        print("Filtered between " + str(lower) + " and " + str(upper) + " PPU")
        filterSAL(lower, upper)

    return sail

In [2]:
# Pass the salinity cutoffs explicitly so the "Filtered between 32 and 34" output below is reproducible.
clean_anomalies(32, 34)

Filtered between 32 and 34 PPU


In [5]:
#Author: William Gilmore
#Isolates data on the westcoast
def westcoast(upperLat, lowerLat):
    """
    Compile every saildrone voyage into one dataset and keep only near-shore
    west-coast data between two latitudes.

    Samples that fail any of the filters below are masked (set to NaN via
    ``Dataset.where``); nothing is dropped, so the dataset keeps its length.

    Filters applied:
      * longitude strictly between -130 and -115, latitude strictly between 27 and 52
      * ``dist_land`` <= 300 (km, per the original author's comment)
      * not inside either of the two San Francisco Bay bounding boxes
      * latitude strictly between <lowerLat> and <upperLat>

    Args:
      upperLat: Upper acceptable latitude (exclusive).
      lowerLat: Lower acceptable latitude (exclusive).
    Returns:
      <xr.Dataset> The compiled dataset with all rejected samples masked.
    Raises:
      FileNotFoundError: if no '.nc' file is found in '../saildrone_data'.
    """
    # import only the packages this function actually uses
    import glob
    import warnings

    import matplotlib.dates as mdates
    import xarray as xr

    # NOTE: this silences every warning for the rest of the session (kept from the
    # original implementation so existing notebooks behave the same).
    warnings.simplefilter('ignore')

    #Author: Paul
    #compiles all voyages
    #I've changed all instances of "sail to ds" and changed ddir to be a relative file path -will
    def Compile_Datasets(fn_list_in):
        """
        Open one or more saildrone files and concatenate them along "time".

        fn_list_in: list of strings with the file names, or filename(string), or "all"
                    ("all" means every '.nc' file in '../saildrone_data').
        returns: the compiled dataset. Each voyage is tagged with "relativeID" (its
                 position in the file list), "realID" (the integer "id" attribute, or
                 the file name when that attribute is missing or not an integer) and
                 "duration" (last time stamp minus first).
        """
        ddir = "../saildrone_data"

        # Make sure the fn_list_in is formatted correctly
        if isinstance(fn_list_in, str):
            if fn_list_in == "all":
                fn_list = glob.glob(ddir + '/*.nc')
            else:
                fn_list = [fn_list_in]
        elif isinstance(fn_list_in, (list, tuple)) and all(isinstance(fn, str) for fn in fn_list_in):
            fn_list = list(fn_list_in)
        else:
            raise TypeError("first argument to 'Compile_Datasets' function must be; a list of file names, a file name, or \"all\"")

        if not fn_list:
            raise FileNotFoundError("no saildrone .nc files to compile (looked in " + ddir + ")")

        def load_voyage(index, fn):
            # Open a single voyage and attach the per-voyage bookkeeping variables.
            voyage = xr.open_dataset(fn)
            voyage = voyage.drop_vars("trajectory", errors='ignore')

            # give the dataset a relative ID so all datasets can be differentiated
            voyage["relativeID"] = index

            # take the actual cruise ID from the dataset attributes; fall back to the
            # file name when the attribute is absent or cannot be read as an integer
            try:
                voyage["realID"] = int(voyage.attrs["id"])
            except (KeyError, TypeError, ValueError):
                voyage["realID"] = fn

            # total time span covered by this voyage
            voyage["duration"] = voyage["time"][-1] - voyage["time"][0]
            return voyage

        voyages = [load_voyage(index, fn) for index, fn in enumerate(fn_list)]

        if len(voyages) == 1:
            sail = voyages[0]
        else:
            # One concat over all voyages instead of re-concatenating the growing
            # dataset once per file.
            sail = xr.concat(voyages, dim="time")
            # As before, the first dataset is left open and the others are closed.
            for voyage in voyages[1:]:
                voyage.close()

        # reformat dates
        # NOTE(review): the converted values are assigned without explicit dimension
        # names - confirm 'date' ends up aligned with 'time' as intended.
        sail['date'] = mdates.date2num(sail['time'].dt.date)

        return sail

    # Runs Pauls function to gather all saildrone data into 1 dataset
    ds = Compile_Datasets("all")

    # Data inside the west-coast bounding box
    in_westcoast = (ds.lon > -130) & (ds.lon < -115) & (ds.lat > 27) & (ds.lat < 52)

    # Data within 300 km of shore
    near_shore = ds.dist_land <= 300

    # The two bounding boxes used to cut out SF bay data (northern and southern box)
    in_bay_north_box = (
        (ds.lon > -122.5938) & (ds.lon < -122.2506620424831)
        & (ds.lat > 37.72783) & (ds.lat < 38.094658646550556)
    )
    in_bay_south_box = (
        (ds.lon > -122.38678630116495) & (ds.lon < -121.99799777841487)
        & (ds.lat > 37.430464705762226) & (ds.lat < 37.81408437558721)
    )
    # A sample is bay data if it lies in EITHER box. (The previous `~A | ~B` form
    # only rejected samples lying in both boxes at once, i.e. their small overlap.)
    outside_sf_bay = ~(in_bay_north_box | in_bay_south_box)

    # Data between lowerLat and upperLat
    in_lat_band = (ds.lat > lowerLat) & (ds.lat < upperLat)

    # Mask everything that fails any of the filters in a single pass
    ds = ds.where(in_westcoast & near_shore & outside_sf_bay & in_lat_band)

    return ds


In [9]:
# Request west-coast data with latitude strictly between 20 and 50.
westcoast(upperLat=50, lowerLat=20)




In [10]:
# The final result is a dataset containing the cleaned west-coast data.