Re-calculate the bimodal parameters for each CCN observation window (we want NSD1_sum and NSD2_sum for each window).

In [1]:
import numpy as np
import os
import pandas as pd


In [2]:
# Load the input tables.
obs_dir = '../input_data'

# Fitted bimodal parameters. They are recalculated in this notebook because
# this file does not contain scaled number concentration values.
bimodal_params = pd.read_csv(
    os.path.join(obs_dir, 'Bimodal_parameters.csv'),
)

# Bimodal parameters scaled to aerosol observations at 10min resolution.
NSD_params_all = pd.read_csv(
    os.path.join(obs_dir, 'NSD_PARAMS_SCALED.CSV'),
    parse_dates=['datetime'],
)

# Observed CCN data at 2hr resolution; contains the CCN-obs window
# information (start and end time).
CCN_all = pd.read_csv(
    os.path.join(obs_dir, 'CCN_all.csv'),
    parse_dates=['datetime', 'start_time', 'end_time'],
)

In [3]:
# function to calculate the median absolute deviation (MAD):

def mad(series):
    """Return the median absolute deviation (MAD) of ``series``, floored at 0.01.

    Parameters
    ----------
    series : pandas.Series
        Numeric values. Missing values (NaN) are ignored.

    Returns
    -------
    float
        ``median(|x - median(x)|)``, or 0.01 when that value is smaller.
        NaN when ``series`` contains no non-missing values.
    """
    # Use the NaN-skipping pandas median for both steps. np.median propagates
    # NaN, so a single missing value would otherwise turn the whole result
    # into NaN even though Series.median() ignored it for the centre.
    deviation = (series - series.median()).abs().median()
    if np.isnan(deviation):
        # No usable data: report NaN explicitly rather than relying on how
        # max() happens to order NaN comparisons.
        return deviation
    return max(deviation, 0.01)  # ensure MAD is not zero

In [14]:
# Re-calculate the median bimodal parameters for each 2 hr CCN window, using
# the scaled number concentrations.

NSD_params = NSD_params_all.copy(deep=True)

# CCN windows from 2016-08-16 onwards. Filter once and reuse the result for
# all three window columns.
ccn_windows = CCN_all.loc[CCN_all['datetime'] >= '2016-08-16 00:00:00'].reset_index(drop=True)
tw_start = ccn_windows['start_time']  # time window start
tw_end = ccn_windows['end_time']  # time window end
tw_mid = ccn_windows['datetime']  # time window mid
window_idx = 0

# Create the label column up front so that the dropna below still works when
# no window contains any data (otherwise the column would never be created).
NSD_params['CCN_window'] = np.nan

# Label every row with the time window it belongs to (start inclusive, end
# exclusive). window_idx only advances for windows that contain data.
# NOTE: the mask is evaluated against the live 'datetime' column, so rows that
# were already relabelled carry their window mid-point here, not their
# original timestamp.
for start, end, mid in zip(tw_start, tw_end, tw_mid):
    mask = (NSD_params['datetime'] >= start) & (NSD_params['datetime'] < end)

    if mask.any():
        NSD_params.loc[mask, 'CCN_window'] = window_idx  # index of the time window, used for grouping
        NSD_params.loc[mask, 'datetime'] = mid  # assign the mid-point of the time window to the datetime column
        window_idx += 1

# Drop rows with NaN in the CCN_window column (rows outside every window):
NSD_params = NSD_params.dropna(subset=['CCN_window'])

# Drop rows whose datetime does not match the original datetimes in bimodal_params:
NSD_params = NSD_params[NSD_params['datetime'].isin(pd.to_datetime(bimodal_params['datetime']))].reset_index(drop=True)

# THERE IS ONE OBS WINDOW WHERE THE NSD PARAMETERS ARE VERY STRANGE AND RESULT IN UNREALISTIC MASS VALUES:
# here we filter these out. The window number refers to the window_idx
# assigned in the loop above. .copy() makes the result an independent frame so
# that adding 'window_id' further down does not raise SettingWithCopyWarning.
NSD_params = NSD_params[~((NSD_params['CCN_window'] == 8564.0) & (NSD_params['mode2_d'] > 100))].copy()

# Group once; every per-window statistic below comes from this grouping, so
# all resulting frames share the same row order (sorted by CCN_window).
grouped = NSD_params.groupby('CCN_window')

# Number of measurements in each time window:
n_per_window = grouped['mode1_d'].count().reset_index(drop=True)

# Median of the parameters within each time window:
NSD_params_medians = grouped.median().reset_index(drop=True)
# Median absolute deviation of the numeric parameters within each time window:
NSD_params_mads = grouped.apply(lambda x: x.select_dtypes(include='number').agg(mad), include_groups=False).reset_index(drop=True)
NSD_params_mads = NSD_params_mads.rename(columns=lambda x: x + '_mad')  # rename columns to indicate MAD

# Merge the median and MAD dataframes side by side:
NSD_params_windows = pd.concat([NSD_params_medians, NSD_params_mads], axis=1)
NSD_params_windows['n_measurements'] = n_per_window

# Additional dataframes with the max / min of each parameter per time window:
NSD_params_maxs = grouped.max().reset_index(drop=True)
NSD_params_maxs['n_measurements'] = n_per_window

NSD_params_mins = grouped.min().reset_index(drop=True)
NSD_params_mins['n_measurements'] = n_per_window

# Create a unique ID for each time window in the row-level dataset
# (IDs are numbered in order of first appearance of each CCN_window value):
NSD_params['window_id'] = pd.factorize(NSD_params['CCN_window'])[0]

In [17]:
# Write the results to CSV files inside obs_dir: the three per-window summary
# tables first, then the row-level table carrying the window labels.
for csv_name, table in (
    ('bimodal_params_medians.csv', NSD_params_windows),
    ('bimodal_params_maxs.csv', NSD_params_maxs),
    ('bimodal_params_mins.csv', NSD_params_mins),
    ('NSD_params_withwindows.csv', NSD_params),
):
    table.to_csv(os.path.join(obs_dir, csv_name))