# DSSIM Gathering Notebook

This notebook opens original and compressed NetCDF files at a given data path, computes the DSSIM on the compressed files for specified time steps, and stores the values in a CSV file in the lcr/data/ directory.

In [4]:
# Make sure you are using the cmip6-2019.10 kernel

# Add ldcpy root to system path (MODIFY FOR YOUR LDCPY CODE LOCATION)
import sys

sys.path.insert(0, '/glade/u/home/apinard/newldcpy/ldcpy')
import ldcpy

# Display output of plots directly in Notebook
%matplotlib inline
# Automatically reload module if it is edited
%reload_ext autoreload
%autoreload 2

# silence warnings
import warnings

warnings.filterwarnings("ignore")

In [5]:
# Start the Dask scheduler on the PBS batch system.
#
# Note: for now this notebook should be run on Cheyenne. Casper mostly works,
# but plotting the LHFLX variable times out there, presumably because of the
# large file sizes for that variable.

# for Cheyenne
from dask_jobqueue import PBSCluster

# Per-job PBS request, gathered in one place so it is easy to tune.
# NOTE(review): resource_spec asks PBS for ncpus=9 while cores=36 is passed to
# dask-jobqueue -- confirm this mismatch is intentional.
pbs_job_options = dict(
    queue="regular",
    walltime="02:00:00",
    project="NIOW0001",
    memory="109GB",
    resource_spec="select=1:ncpus=9:mem=109GB",
    cores=36,
    processes=9,
)
cluster = PBSCluster(**pbs_job_options)

# Scale adaptively between 1 and 30 jobs as needed.
cluster.adapt(minimum_jobs=1, maximum_jobs=30)

# Last expression of the cell: shows the cluster widget.
cluster

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

In [6]:
from dask.distributed import Client

# Attach a client to the cluster created above so that subsequent Dask work
# is sent to the remote workers; the bare name below displays its summary.
client = Client(address=cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/apinard/proxy/{port}/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/apinard/proxy/{port}/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.148.10.19:36705,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/apinard/proxy/{port}/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [7]:
import time  # not used in this cell; kept because other cells may rely on it

data_path = "/glade/p/cisl/asap/CAM_lossy_test_data_31/"
monthly_variables = ["CCN3", "CLOUD", "FLNS", "FLNT", "FSNS", "FSNT", "LHFLX",
            "PRECC", "PRECL", "PS", "QFLX", "SHFLX", "TMQ", "TS", "U"]
daily_variables = ["FLUT", "LHFLX", "PRECT", "TAUX", "TS", "Z500"]

# Sub-directories of <data_path>/bg/ holding the compressed variants; each name
# is also used as the prefix of the matching dataset label.
BG_LEVELS = ["bg_2", "bg_3", "bg_4", "bg_5", "bg_6", "bg_7"]

cols_monthly = {}  # variable name -> collection opened from the monthly (h0) files
cols_daily = {}    # variable name -> collection opened from the daily (h1) files
sets = {}          # variable name -> file paths (original first, then one per level)
levels = {}        # variable name -> labels of the compressed datasets


def open_variable_collection(variable, stream, date_range):
    """Open the original and all compressed files of one variable.

    Records the file paths in ``sets[variable]`` and the compressed-dataset
    labels in ``levels[variable]``, then opens everything with
    ``ldcpy.open_datasets``.

    Parameters
    ----------
    variable : str
        Variable name as it appears in the file names (e.g. "TS").
    stream : str
        History-stream tag in the file name ("h1" for the daily files,
        "h0" for the monthly files used below).
    date_range : str
        Date-range part of the file name (e.g. "200601-201012").

    Returns
    -------
    Whatever ``ldcpy.open_datasets`` returns for the seven files, labelled
    ``orig_<variable>`` followed by ``<level>_<variable>`` for each level.
    """
    file_name = f"b.e11.BRCP85C5CNBDRD.f09_g16.031.cam.{stream}.{variable}.{date_range}.nc"

    levels[variable] = [f"{level}_{variable}" for level in BG_LEVELS]
    # data_path already ends with "/", so these paths contain "//"; this is
    # left unchanged so the strings handed to ldcpy stay exactly as before.
    sets[variable] = [f"{data_path}/orig/{file_name}"] + [
        f"{data_path}/bg/{level}/{file_name}" for level in BG_LEVELS
    ]

    return ldcpy.open_datasets(
        "cam-fv",
        [variable],
        sets[variable],
        [f"orig_{variable}"] + levels[variable],
        chunks={"time": 700},
    )


for variable in daily_variables:
    print(variable)
    cols_daily[variable] = open_variable_collection(variable, "h1", "20060101-20071231")

# LHFLX and TS appear in both lists, so after this loop sets[...] / levels[...]
# for those two variables describe the monthly files; the opened collections
# themselves are kept separately in cols_daily and cols_monthly.
for variable in monthly_variables:
    print(variable)
    cols_monthly[variable] = open_variable_collection(variable, "h0", "200601-201012")



FLUT
dataset size in GB 1.13

LHFLX
dataset size in GB 1.13

PRECT
dataset size in GB 1.13

TAUX
dataset size in GB 1.13

TS
dataset size in GB 1.13

Z500
dataset size in GB 1.13

CCN3
dataset size in GB 2.79

CLOUD
dataset size in GB 2.79

FLNS
dataset size in GB 0.10

FLNT
dataset size in GB 0.10

FSNS
dataset size in GB 0.10

FSNT
dataset size in GB 0.10

LHFLX
dataset size in GB 0.10

PRECC
dataset size in GB 0.10

PRECL
dataset size in GB 0.10

PS
dataset size in GB 0.10

QFLX
dataset size in GB 0.10

SHFLX
dataset size in GB 0.10

TMQ
dataset size in GB 0.10

TS
dataset size in GB 0.10

U
dataset size in GB 2.79



In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Compute and append the metrics for every (variable, time step, compression
# level) combination to ../data/dssims.csv.
#
# Fixes relative to the previous version of this cell:
#   * the collections are looked up in cols_monthly / cols_daily; the old code
#     referenced an undefined name `cols`, which raises NameError on a fresh
#     kernel;
#   * the loop index is no longer called `time`, so it does not rebind the
#     `time` module imported earlier in the notebook.
#
# NOTE(review): LHFLX and TS are processed by both loops with identical labels
# and the same output file -- confirm that the monthly and daily rows can be
# told apart in the CSV.

# Monthly variables, except CCN3, CLOUD and U which are handled in a separate cell.
for variable in ["FLNS", "FLNT", "FSNS", "FSNT", "LHFLX",
            "PRECC", "PRECL", "PS", "QFLX", "SHFLX", "TMQ", "TS"]:
    for time_index in range(0, cols_monthly[variable].dims["time"]):
        for level in ["bg_2", "bg_3", "bg_4", "bg_5", "bg_6", "bg_7"]:
            ldcpy.save_metrics(cols_monthly[variable], variable, f"orig_{variable}", f"{level}_{variable}", time=time_index, location="../data/dssims.csv")

# Daily variables.
for variable in daily_variables:
    for time_index in range(0, cols_daily[variable].dims["time"]):
        for level in ["bg_2", "bg_3", "bg_4", "bg_5", "bg_6", "bg_7"]:
            ldcpy.save_metrics(cols_daily[variable], variable, f"orig_{variable}", f"{level}_{variable}", time=time_index, location="../data/dssims.csv")



In [None]:
# Append the metrics for the monthly variables CCN3, CLOUD and U to
# ../data/dssims.csv, one call per (variable, time step, compression level).
# The loop index is named `time_index` rather than `time` so that it does not
# rebind the `time` module imported earlier in the notebook.
for variable in ["CCN3", "CLOUD", "U"]:
    collection = cols_monthly[variable]
    for time_index in range(0, collection.dims["time"]):
        for level in ["bg_2", "bg_3", "bg_4", "bg_5", "bg_6", "bg_7"]:
            ldcpy.save_metrics(collection, variable, f"orig_{variable}", f"{level}_{variable}", time=time_index, location="../data/dssims.csv")

Evaluating 4 metrics for orig_CCN3 data (set1) and bg_2_CCN3 data (set2), time 0 :
Evaluating 4 metrics for orig_CCN3 data (set1) and bg_3_CCN3 data (set2), time 0 :
Evaluating 4 metrics for orig_CCN3 data (set1) and bg_4_CCN3 data (set2), time 0 :
Evaluating 4 metrics for orig_CCN3 data (set1) and bg_5_CCN3 data (set2), time 0 :
Evaluating 4 metrics for orig_CCN3 data (set1) and bg_6_CCN3 data (set2), time 0 :
