In [56]:
import pandas as pd
import xarray as xr
import numpy as np
import glob
import dask
dask.config.set(**{'array.slicing.split_large_chunks': True})
import sys
import matplotlib.pyplot as plt
import matplotlib as mpl
from my_library.track_analyses import helpers
import pathlib
import logging

# Directory holding the result tables (e.g. system_validity.csv) used by this notebook.
datadir = pathlib.Path(
    '/work/bb1153/b382635/plots/tracked_results_2025/dataset_paper/results_data/acp_submission/'
)

In [106]:
# Display the validity table currently stored on disk, indexed by system_id.
current_csv = datadir / 'system_validity.csv'
pd.read_csv(current_csv, index_col='system_id')

Unnamed: 0_level_0,hits_boundary,n_cores,relative_size,n_cores_above_freezing,lifetime_mins
system_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,True,91,,,2850.0
3,False,1,0.225000,0.0,570.0
4,False,41,,,1410.0
5,False,1,0.329167,1.0,600.0
11,True,1,,,135.0
...,...,...,...,...,...
121747,False,1,0.024167,0.0,345.0
121969,False,1,0.020833,1.0,105.0
122074,True,1,,,105.0
122108,False,1,0.026667,0.0,165.0


In [97]:
# Load the per-system data-filtering tables written during tracking and join them
# into a single frame `df` indexed by system_id.

# i) systems that hit boundaries
df1 = pd.read_csv(
    '/work/bb1153/b382635/data/final_tracks/updraft_ice_only/amazon/data_filtering_stats/system_hits_boundary.csv',
    index_col='system_id',
)
invalid = df1.index[df1['hits_boundary'].eq(True)]

# ii) number of cores
df2 = pd.read_csv(
    '/work/bb1153/b382635/data/final_tracks/updraft_ice_only/amazon/data_filtering_stats/system_n_cores.csv',
    index_col='system_id',
)

# lifetime: the value column is stored under the header '0'; rename it, add the
# duration in minutes, and keep only the systems listed in df2 (in df2's order).
df3 = (
    pd.read_csv(
        '/work/bb1153/b382635/data/final_tracks/updraft_ice_only/amazon/data_filtering_stats/system_lifetime.csv',
        index_col='system_id',
    )
    .rename(columns={'0': 'lifetime'})
    .assign(lifetime_mins=lambda t: pd.to_timedelta(t['lifetime']).dt.total_seconds() / 60)
    .loc[df2.index]
)

# collect the three tables side by side and list the ids of boundary-hitting systems
df = pd.concat([df1, df2, df3], axis='columns')
hits_bndry = df.index[df['hits_boundary']]

In [10]:
# load statistics to calculate (iii) system size relative to the domain and (iv) whether the first core arose below the freezing level

In [44]:
# Select whether to iterate over batches or load a small interactive sample.
# `iterate` doubles as the batch number: 0 disables batching, any non-zero value
# selects that batch with 250 systems per batch.
iterate = 0
if iterate:
    # The former `int(sys.argv[1])` / `int(sys.argv[2])` reads were removed: their
    # results were overwritten by the two assignments below, and inside a Jupyter
    # kernel sys.argv typically holds the kernel launcher arguments, so parsing
    # them as integers fails before the overrides are reached.
    batch = iterate
    size = 250
    n_clouds = size
else:
    batch = size = None
    n_clouds = 100
data_params = dict(batch=batch, size=size, n_clouds=n_clouds)

# Directory with the system-wise statistics files.
fdir = '/work/bb1153/b382635/data/track_statistics/updraft_ice_only/amazon/system-wise/fcsfirst/'
# Load the cloud_area and core_bh statistics; the ids of boundary-hitting systems are
# passed as sidx_ignore (presumably so they are skipped — confirm in helpers.load_stats).
ds = helpers.load_stats(fdir, ['cloud_area', 'core_bh'], sidx_ignore=hits_bndry, **data_params)

In [45]:
# (iii) largest cloud area reached by each system, as a percentage of the domain area
n_cells = 300 * 400  # number of grid cells in the domain
cell_area = 11000**2  # m2
domain_area = n_cells * cell_area  # m2
peak_area = ds['cloud_area'].max(dim='time')
rel_size = (100 * (peak_area / domain_area)).to_dataframe(name='relative_size')

In [46]:
# (iv) per system, count the cores whose lowest core_bh over time lies above 4000
# (the threshold used here for the freezing level)
n_cores_above_freezing = (
    (ds['core_bh'].min(dim='time') > 4000)
    .sum(dim='core')
    .to_dataframe(name='n_cores_above_freezing')
    .astype(int)
)

In [47]:
# results: merge this run's values into the table already stored on disk
previous = pd.read_csv(datadir / 'system_validity.csv', index_col='system_id')
new = pd.concat([df, rel_size, n_cores_above_freezing], axis='columns')
# Stack the old rows above the new ones; first() then takes, per system and column,
# the first non-null value, so existing results are kept and new ones are appended.
stacked = pd.concat([previous, new])
final = stacked.groupby(level=0).first()
final.index.name = 'system_id'

In [31]:
# save: write the merged table to system_validity.csv inside datadir
final.to_csv(datadir.joinpath('system_validity.csv'))

In [23]:
# Log how many systems had a relative size computed in this run.
n_new = rel_size.index.size
logging.info(f'saved {n_new} new data points')

INFO:root:saved 250 new data points


In [43]:
# Are there any valid clouds (systems that do not hit the boundary) without stats calculated?
import re
fdir = '/work/bb1153/b382635/data/track_statistics/updraft_ice_only/amazon/system-wise/fcsfirst/'
valid = df.index[~df.hits_boundary]

# The system id is parsed from file names ending in "cloud_<id>.nc". Any other entry
# in the directory is skipped instead of raising AttributeError on a failed match.
stats_file_pattern = re.compile(r'cloud_(\d+)\.nc$')
stats_exist_for = []
for f in glob.glob(fdir+'*'):
    match = stats_file_pattern.search(pathlib.Path(f).name)
    if match is not None:
        stats_exist_for.append(int(match.group(1)))

# A set makes each membership test O(1) instead of scanning the whole list per system.
stats_exist_lookup = set(stats_exist_for)
missing_stats = [x for x in valid if x not in stats_exist_lookup]
# Display the ids of valid systems that have no statistics file.
missing_stats

[4, 12, 14571, 27069, 49853, 64193]

In [49]:
# Show the filtering statistics of the valid systems that lack a statistics file.
df.loc[missing_stats, :]

Unnamed: 0_level_0,hits_boundary,n_cores
system_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4,False,41
12,False,67
14571,False,51
27069,False,73
49853,False,36
64193,False,49
