# Create netcdf from cdp data

Status: to be turned into a script

- read_chunky_csv OK
- resolve_date OK


TODO: 
- imports 
- read_nav
- cdp_df_to_netcdf
- update metadata
- separate paths and struct information for reuse
- pull the metadata definitions outside of this program

In [1]:
# imports from packages
import pandas as pd
import xarray as xr
import numpy as np
import warnings
import glob # allows for wildcards in filemanagement
import os # get a list of all directories/files
import re
from datetime import datetime

# imports from files
from utils.flight_utils import get_safire_flightid
from utils.nc_utils import read_chunky_csv, binned_cdp_to_xds, cdp_df_to_netcdf, cdp_to_df, add_cdp_df_to_xds
from utils.func_nc import resolve_date, floor_to_sec_res
import utils.read_nav as read_nav # get navigational data from nav-file

# Suppress the UserWarning about timezone-less np.datetime64 values
# (matched on the warning message text).
warnings.filterwarnings("ignore", message="no explicit representation of timezones available for np.datetime64")
# Suppress the UserWarning raised by boolean Series indexing
# (seen when creating a dataframe with null values).
warnings.filterwarnings("ignore", message="Boolean Series key will be reindexed to match DataFrame index.")


In [2]:
# --- Read in data to dataframes

# -- Locations of the input data and of the processed output
main_path = '/home/ninalar/Documents/MC2/2022-islas/'  # local disk path to nav data
pads_path = '/microphy/pads/'  # path to pads (CIP and CDP data)
cdp_main_path = f'{main_path}{pads_path}'
path_store = '/home/ninalar/Documents/MC2/Results_2022-islas/Processed/CDP_processed/'  # where to store the netcdfs

# Glob patterns of the file names, keyed by file type (for access)
file_struct = dict(
    cip='/*CIP.nc',
    cdp='/02CDP*.csv',
    nav_tdyn='/*_TDYN_*.nc',
    nav_nav='/*_NAV_*.nc',
    flight_rep='/*MAIN*.csv',  # flight report file name
)

# safire_to_islas is indexed by safire flight id further down; an entry is a
# list when one safire id covers more than one islas id.
flights, safire_to_islas = get_safire_flightid(main_path)

# -- read nav
# extra_info includes limits for the plots (campaign_cood_limits) and extra landing and takeoff times (not used here)
#nav_df, nav_stats_dict, extra_info = read_nav.read_nav(flights) 

In [3]:
# Build the navigation dataset and read the CDP data for one flight.
# - need to loop over each flight and extract only the dimensions and the coordinates, and attributes from the nav file
# - add islasid as an attribute
# - for safireid with more than one islasid, separate into two

flight = 'as220007'

# -- Get NAV files
# get the nav file from the given flight
nav_pattern = main_path + flight + file_struct['nav_tdyn']
nav_file = glob.glob(nav_pattern)
if not nav_file:
    # fail with a clear message instead of an IndexError on nav_file[0]
    raise FileNotFoundError(f'No TDYN nav file found for flight {flight}: {nav_pattern}')

nav_xds = xr.open_dataset(nav_file[0])  # the nav file xarray

# NAV preparations: drop duplicate time steps
# (np.unique returns the index of the first occurrence of each unique time)
index = np.unique(nav_xds.time, return_index=True)[1]
nav_xds = nav_xds.isel(time=index)
nav_xds = floor_to_sec_res(nav_xds, 'time')  # floor the times to sec for easier joining

# drop variables from nav
nav_xds = nav_xds[['TAS1']]  # only keeps TAS

# update attributes of data variables and coordinates with original file id;
# the id is read once, before the global attributes are filtered below
source_comment = f'source id: {nav_xds.attrs["id"]}'

for var_name in nav_xds.data_vars:
    nav_xds[var_name].attrs['comment'] = source_comment  # update data variables

for coord_name in nav_xds.coords:
    nav_xds[coord_name].attrs['comment'] = source_comment  # update coordinates

# filter out relevant attributes for resulting dataset
attrs_relevant = ['flight_id', 'project', 'platform', 'source', 'product_version', 'Conventions']

filtered_attrs = {k: v for k, v in nav_xds.attrs.items() if k in attrs_relevant}  # new dictionary of relevant attributes
nav_xds.attrs = filtered_attrs  # set new attrs

# ---- Get CDP data
# path to CDP data
path_in = main_path + flight + pads_path

# Get a list of all the CDP files in the directory (also look in subdirectories)
filelist = glob.glob(path_in + '**' + file_struct['cdp'], recursive=True)

cdp_df, filenames, meta_df, chan_list, pads_df, bins_df = cdp_to_df(filelist, flight)


# separate out the CDP_Bin columns and the time for separate handling
is_bin_column = cdp_df.columns.str.startswith('CDP Bin')
cdp_bin_df = cdp_df.loc[:, is_bin_column | (cdp_df.columns == 'time')]

# separate out the other columns
cdp_df = cdp_df.loc[:, ~is_bin_column]



Reading: /home/ninalar/Documents/MC2/2022-islas/as220007/microphy/pads/20220324080247/02CDP 20220322105458.csv


In [4]:
# Check if two islasids for the safireid

# If the safire id covers two islasids, split the nav and CDP data in two;
# otherwise build the complete dataset directly.
if isinstance(safire_to_islas[flight], list):
    # Get the flight report for the given flight.
    # The glob pattern is pulled out of the f-string: reusing the enclosing quote
    # type inside an f-string is a SyntaxError before Python 3.12.
    flight_rep_glob = file_struct['flight_rep']
    pattern = os.path.join(main_path, f'{flight}/CRvol{flight_rep_glob}')
    file = glob.glob(pattern)
    if not file:
        raise FileNotFoundError(f'No flight report found for flight {flight}: {pattern}')

    fr_list = read_chunky_csv(file[0])

    # store flightreport entries as dataframe
    # (the first row of the chunk is used as the column headers)
    report_chunk = pd.DataFrame(fr_list[1])
    headers = report_chunk.iloc[0]
    report_df = pd.DataFrame(report_chunk.values[1:], columns=headers)

    # find the landing and takeoff entries
    landings = report_df[report_df['title'] == 'landing']
    takeoffs = report_df[report_df['title'] == 'takeoff']
    if landings.empty or len(takeoffs) < 2:
        raise ValueError(
            f'Flight report {file[0]} needs at least one landing and two takeoffs '
            f'to split flight {flight} (found {len(landings)} and {len(takeoffs)})'
        )

    # get times to split the dataset on: first landing and second takeoff
    split_land = datetime.strptime(landings.iloc[0].date, "%Y-%m-%dT%H:%M:%S.%fZ")
    split_take = datetime.strptime(takeoffs.iloc[1].date, "%Y-%m-%dT%H:%M:%S.%fZ")

    # separate the nav_xds into two (label-based slices include both end points):
    nav_0_xds = nav_xds.sel(time=slice(None, split_land))
    nav_1_xds = nav_xds.sel(time=slice(split_take, None))

    # split cdp data on the same conditions (index of cdp_df is time);
    # inclusive comparisons so the boundary samples match the nav slices above
    flight_sep0_cond = cdp_df.index <= split_land
    flight_sep1_cond = cdp_df.index >= split_take

    # separate the cdp_df into two:
    cdp_0_df = cdp_df[flight_sep0_cond]
    cdp_1_df = cdp_df[flight_sep1_cond]

    # NOTE: full_cdp_xds is not built in this branch yet; the split nav/CDP
    # parts still have to be turned into one dataset per islasid.
else:
    # only one islasid for safireid, go directly to generating the full_xds
    # Process for creating complete xarray:
    #
    #  adding dataframe to the xarray
    cdp_xds = add_cdp_df_to_xds(nav_xds, cdp_df, meta_df, pads_df)

    # Turning the binned information into an xarray to be added to dataset
    bins_xds = binned_cdp_to_xds(bins_df, cdp_bin_df)

    # merge the binned variables with the rest of the cdp data
    full_cdp_xds = xr.merge([cdp_xds, bins_xds])

#print(nav_0_xds.time.values.max())
#print(nav_1_xds.time.values.min())
#print(f'First flight, min: {cdp_0_df.time.min()}, max: {cdp_0_df.time.max()}')
#print(f'Second flight, min: {cdp_1_df.time.min()}, max: {cdp_1_df.time.max()}')

orig 14358
todataset: 14358
merged: 14358


In [5]:
# Display the merged CDP dataset. full_cdp_xds is only assigned in the
# single-islasid (else) branch of the previous cell.
full_cdp_xds

In [None]:
#Move to different location 


# Fix the sample time and sample volume metadata for later use
# sample area from the meta information is given in mm^2; convert to m^2 by dividing by 10^6
sa = float(meta_df.loc[meta_df['Metadata'] == 'Sample Area (mm^2)', 'Value'].iloc[0]) / (1000 * 1000)
st_text = meta_df.loc[meta_df['Metadata'] == 'Sample Time', 'Value'].iloc[0]
# find seconds from first number in text
pattern = r'\d+'
match = re.search(pattern, st_text)
if match is None:
    # without a number the sample volume cannot be computed; fail here instead
    # of leaving st undefined (or stale from an earlier run)
    raise ValueError(f'No sample time found in metadata value: {st_text!r}')
st = int(match.group())


# calculate the sample volume (sample area * true air speed * sample time)
islas_cdp_df['SV (m^3)'] = sa * islas_cdp_df['TAS (m/s)'] * st


# Get variable information from header of dataframe
# (only columns whose name ends with ')', i.e. 'Name (unit)')
var_df = pd.DataFrame(islas_cdp_df.columns[islas_cdp_df.columns.str.endswith(')')], columns=['Variable'])
var_df['unit'] = var_df['Variable'].apply(lambda x: x.split('(')[1].strip())
var_df['Variable'] = var_df['Variable'].apply(lambda x: x.split('(')[0].strip())
var_df = var_df.replace(r'\)', '', regex=True)  # removing remaining ) in units

In [None]:
# Saving df as NetCDF: write one NetCDF file per islasid found in meta_dict

# find_unique_listkey is called through the module name below, so the module
# itself has to be in scope in this cell.
import utils.func_nc as func_nc

# Dictionary to hold counts of second level entries
second_level_counts = {}

for key in meta_dict:
    entries = meta_dict[key]
    # Count entries in the second level for each first-level key
    second_level_counts[key] = len(entries)
    if second_level_counts[key] == 1:
        # Keep metadata as is and extract all information for the given key and islasid
        source_files = [entries[0]['filename']]
        chan_list = entries[0]['channel info']
        meta_df = entries[0]['instrument info']
        cdp_list = entries[0]['pads info']
        flightid = entries[0]['islasids'][0]
        # filter dataframe with all data on flightid
        cdp_nav_flight_df = cdp_nav_df[cdp_nav_df['flightid'] == flightid]
        # create netcdf for this flight:
        print(f'creating netcdf for flight {flightid}')
        ds = cdp_df_to_netcdf(cdp_nav_flight_df, cdp_list, meta_df, chan_list, bins_df, source_files, path_store)
        print('-----')
    else:
        # if more than one file, check and consolidate metadata
        # first: check if the files have different islasid
        unique_ids = func_nc.find_unique_listkey(entries, 'islasids')

        if len(unique_ids) == 1:  # if all files have the same islasid
            # TODO: if all the islasids from the dict is the same, check if the metadata is the same
            # meta_dict['as220012'][0]['channel info']==meta_dict['as220012'][1]['channel info']
            # For now: use the first entry as the default
            source_files = [entry['filename'] for entry in entries.values()]  # all filenames
            chan_list = entries[0]['channel info']
            meta_df = entries[0]['instrument info']
            # stored as cdp_list (not cdp_df) so the call below gets this
            # flight's pads info and the data frame cdp_df is not overwritten
            cdp_list = entries[0]['pads info']
            flightid = entries[0]['islasids'][0]
            cdp_nav_flight_df = cdp_nav_df[cdp_nav_df['flightid'] == flightid]
            # create netcdf for this flight:
            print(f'creating netcdf for flight {flightid}')
            ds = cdp_df_to_netcdf(cdp_nav_flight_df, cdp_list, meta_df, chan_list, bins_df, source_files, path_store)
        else:
            # separate by islasid: Loop through the different unique ids
            for islas_id in unique_ids:
                # subset dictionary only containing the entries for this islasid
                subset_dict = {
                    entry_key: entry
                    for entry_key, entry in entries.items()
                    if entry['islasids'] == islas_id
                }
                source_files = [entry['filename'] for entry in subset_dict.values()]  # all filenames

                # get the metadata items from the first entry of the subset
                # TODO: double check that the metadata is the same for these (same flight different file)
                first_entry = next(iter(subset_dict.values()))
                chan_list = first_entry['channel info']
                meta_df = first_entry['instrument info']
                cdp_list = first_entry['pads info']
                flightid = islas_id
                cdp_nav_flight_df = cdp_nav_df[cdp_nav_df['flightid'] == flightid]
                # create netcdf for this flight:
                print(f'creating netcdf for flight {flightid}')
                ds = cdp_df_to_netcdf(cdp_nav_flight_df, cdp_list, meta_df, chan_list, bins_df, source_files, path_store)
                

creating netcdf for flight IS22-10
../Results_2022-islas/Processed/CDP_processed/CDP_updated_IS22-10.nc
-----
creating netcdf for flight IS22-05
../Results_2022-islas/Processed/CDP_processed/CDP_updated_IS22-05.nc
-----
creating netcdf for flight IS22-06
../Results_2022-islas/Processed/CDP_processed/CDP_updated_IS22-06.nc
-----
creating netcdf for flight IS22-02
../Results_2022-islas/Processed/CDP_processed/CDP_updated_IS22-02.nc
-----
creating netcdf for flight IS22-11
../Results_2022-islas/Processed/CDP_processed/CDP_updated_IS22-11.nc
-----
creating netcdf for flight IS22-04
../Results_2022-islas/Processed/CDP_processed/CDP_updated_IS22-04.nc
creating netcdf for flight IS22-03
../Results_2022-islas/Processed/CDP_processed/CDP_updated_IS22-03.nc
creating netcdf for flight IS22-09
../Results_2022-islas/Processed/CDP_processed/CDP_updated_IS22-09.nc
-----
creating netcdf for flight IS22-07
../Results_2022-islas/Processed/CDP_processed/CDP_updated_IS22-07.nc
-----
creating netcdf for fl

In [None]:
# Open one of the written NetCDF files to check how it looks

path_store = '../Results_2022-islas/Processed/CDP_processed/'  # where to store the netcdfs
filepath = path_store + 'CDP_updated_IS22-11.nc'

cdp_ds = xr.open_dataset(filepath)

In [8]:
# Display the dataset opened from the NetCDF file in the previous cell.
cdp_ds