# DINEOF Preprocessing of data

In this script we will:

* Read initial data file
* Create land-sea mask
* Eliminate pixels/images that are too often cloudy
* Create time variable
* Write results to a new file to be used in DINEOF

In [1]:
#import Pkg; Pkg.add("Missings")
using NCDatasets
#using PyPlot
using Missings
using Dates
using Statistics
include("/home/DINEOF/Scripts/Julia/dineof_scripts.jl")

coverage (generic function with 1 method)

In [2]:
# print current directory
pwd()

"/mnt/d/Dropbox/R_projects/SV_phenology/R"

In [4]:
#We will download the data combining current directory with test_coarsened\CHL_3M_coarsened.nc"
# path = pwd() * "/data/test_coarsened/"
path = "/home/DINEOF/"
filename = path * "test_chl3daysPY.nc";

isfile(filename)


true

In [9]:
#Reading Sentinel-3A  SST L3 data as downloaded from CMEMS site
#ds = Dataset("METEOFRANCE-EUR-SST_L3MULTISENSOR_NRT-OBS_FULL_TIME_SERIE_1634307629210_adriatic_long.nc");

#A
ds = Dataset(filename);

tmp = nomissing(ds["CHL"][:],NaN);
sst  = tmp 
time = ds["time"][:]; 
lat = ds["latitude"][:];
lon = ds["longitude"][:];
mask = ds["mask"][:];


#Size of SST dataset
@show size(tmp)
close(ds);

size(tmp) = (1056, 1248, 62)


In [30]:
#Start and end dates of our dataset

@show time[1]
@show time[end];
@show typeof(sst);
@show lat[1]

time[1] = 2010-03-21T00:00:00
time[end] = 2010-09-20T00:00:00
typeof(sst) = Array{Float32,3}
lat[1] = 84.99479166666667


84.99479166666667

In [24]:
time

62-element Array{DateTime,1}:
 2010-03-21T00:00:00
 2010-03-24T00:00:00
 2010-03-27T00:00:00
 2010-03-30T00:00:00
 2010-04-02T00:00:00
 2010-04-05T00:00:00
 2010-04-08T00:00:00
 2010-04-11T00:00:00
 2010-04-14T00:00:00
 2010-04-17T00:00:00
 ⋮
 2010-08-27T00:00:00
 2010-08-30T00:00:00
 2010-09-02T00:00:00
 2010-09-05T00:00:00
 2010-09-08T00:00:00
 2010-09-11T00:00:00
 2010-09-14T00:00:00
 2010-09-17T00:00:00
 2010-09-20T00:00:00

In [25]:
# convert time in days since 2017-01-01 00:00:00 (is a )

# Define the base date
base_date = DateTime(2017, 1, 1)

# Convert the DateTime array into a Float64 array representing days since the base date
time_in_days = Float64.([(t - base_date) / Millisecond(1) / 1000 / 60 / 60 / 24 for t in time])
time_in_days

62-element Array{Float64,1}:
 -2478.0
 -2475.0
 -2472.0
 -2469.0
 -2466.0
 -2463.0
 -2460.0
 -2457.0
 -2454.0
 -2451.0
     ⋮
 -2319.0
 -2316.0
 -2313.0
 -2310.0
 -2307.0
 -2304.0
 -2301.0
 -2298.0
 -2295.0

In [13]:
#transform time variable (in miliseconds) into year-day 
mdate = time

#create a first land-sea mask. This mask will be "refined" by eliminating pixels that are 
# covered more than 98% of the time and images that are covered more than 98% in space
#mask = nanmean(sst,3);
#mask[.!isnan.(mask)].=1;
#mask[isnan.(mask)].=0;

covT = coverage(sst,mask,"tm"); #calculate average % of missing data in time

#Visualise % of missing data in time

println("Average amount of missing data in your dataset: $(mean(covT)) %");

Average amount of missing data in your dataset: 80.01961609463166 %


In [15]:
#There are images with almost no data. We will eliminate those

i=findall(covT.<98); #identify images with more than 95% of missing data
sstb = sst[:,:,i]; #remove those images
mdateb = mdate[i]; #remove those dates from the time vector

#old and new temporal size of the SST matrix
@show(size(sst))
@show size(sstb);

size(sst) = (1056, 1248, 62)
size(sstb) = (1056, 1248, 61)


The regions most affected by missing data are the coastal ones, especially along the Croatian coast due to the large amount of islands.

In [27]:
close(output)

closed NetCDF NCDataset

In [28]:
#write down the results into a new netCDF file
output = Dataset("test_chl3daysJU.nc","c");
defDim(output,"lon",size(mask,1))
defDim(output,"lat",size(mask,2))
defDim(output,"time",size(sst,3))

ncCHL = defVar(output,"CHL",Float32,("lon","lat","time");fillvalue=-9999.f0);
sst[isnan.(sst)].=-9999.;
ncCHL[:] = sst;

ncTime = defVar(output,"time",Float32,("time",));
ncTime[:] = time_in_days;

ncMask = defVar(output,"mask",Float32,("lon","lat"));
ncMask[:] = mask;

ncLat = defVar(output,"lat",Float32,("lat",));
ncLat[:] = lat;

ncLon = defVar(output,"lon",Float32,("lon",));
ncLon[:] = lon;

close(output)

closed NetCDF NCDataset

In [23]:

# Create a new NetCDF file
output = Dataset("test_chl3daysJU.nc", "c")

# Define dimensions
defDim(output, "lon", size(mask, 1))
defDim(output, "lat", size(mask, 2))
defDim(output, "time", size(sst, 3))

# Define the CHL variable
ncCHL = defVar(output, "CHL", Float32, ("lon", "lat", "time"), fillvalue=-9999.0f0)
sst[isnan.(sst)] .= -9999.0
ncCHL[:] = sst

# Define the time variable with units and calendar attributes
ncTime = defVar(output, "time", Float32, ("time",))
ncTime.attrib["units"] = "days since 2017-01-01"
ncTime.attrib["calendar"] = "standard"
ncTime[:] = time

# Define the mask variable
ncMask = defVar(output, "mask", Float32, ("lon", "lat"))
ncMask[:] = mask

# Define the latitude variable
ncLat = defVar(output, "lat", Float32, ("lat",))
ncLat[:] = lat

# Define the longitude variable
ncLon = defVar(output, "lon", Float32, ("lon",))
ncLon[:] = lon

# Close the NetCDF file
close(output)

NCDatasets.NetCDFError: NetCDF error: Time units and calendar must be defined during defVar and cannot change (NetCDF error code: -1)

In [36]:
#Choose cross-validation points in the form of real clouds
#You should run changing last argument until % of added clouds is about 3%
#output is in clouds_index.nc, but should be renamed to avoid overwriting it.
include("/home/DINEOF/Scripts/Julia/dineof_scripts.jl")
dineof_cvp("CHLA_prova.nc#SST","CHLA_prova.nc#mask",".",3);

file = "CHLA_prova.nc"
7.66791236810518 % of cloud cover added




In [3]:
# print directory
run(`ls -l`)

total 265168
-rwxrwxrwx 1 matzuc matzuc      2134 Mar  6 19:26 00.1_monthly_chla_download.knit.md
-rwxrwxrwx 1 matzuc matzuc      4738 Mar  8 11:33 00.1_monthly_chla_download.qmd
-rwxrwxrwx 1 matzuc matzuc      4930 Mar  9 14:55 00.2.1_monthly_gaps_fill_from_daily.qmd
-rwxrwxrwx 1 matzuc matzuc      5109 Mar  9 16:44 00.2.2_monthly_gaps_fill_from_dailyTEST.qmd
-rwxrwxrwx 1 matzuc matzuc   1988328 Mar  9 17:55 00.2.3_monthly_gaps_fill_from_daily_TESTaggregation.html
-rwxrwxrwx 1 matzuc matzuc   1415900 Mar  9 16:49 00.2.3_monthly_gaps_fill_from_daily_TESTaggregation.ipynb
-rwxrwxrwx 1 matzuc matzuc   1395764 Mar 10 17:04 00.2.4.1_test_opendap_accessPyhton.ipynb
-rwxrwxrwx 1 matzuc matzuc   1658423 Mar 10 23:42 00.2.4.2_subset_nc.ipynb
-rwxrwxrwx 1 matzuc matzuc   4578015 Mar 28 14:22 00.2.4.3_subset_nc_ALL_DAILY.ipynb
-rwxrwxrwx 1 matzuc matzuc     40964 Mar 30 16:52 00.2.4.4_subset_nc_ALL_DAILY_coarsened.ipynb
-rwxrwxrwx 1 matzuc matzuc   5277470 Mar 31 16:32 00.2.4.6.2_dineof_ready_py

Process(`[4mls[24m [4m-l[24m`, ProcessExited(0))

In [11]:
# change directory to /home/DINEOF
cd("/home/DINEOF")

In [12]:
# print files in current directory
run(`ls -l`)

total 984940
-rw-r--r-- 1 root   root   190494888 Mar 29 11:43 CHLA_L4_dineof.nc
-rwxr-xr-x 1 root   root    96320056 Mar 29 10:41 CHLA_prova.nc
drwxr-xr-x 2 root   root        4096 Mar 28 11:47 Compilers
-rw-r--r-- 1 root   root       18046 Mar 28 11:47 LICENSE
-rw-r--r-- 1 root   root        3300 Mar 28 11:47 Makefile
-rw-r--r-- 1 root   root        2108 Mar 28 11:47 README.md
-rw-r--r-- 1 root   root        7368 Mar 28 11:47 ReadMatrix.F90
-rw-r--r-- 1 root   root         500 Mar 28 11:47 ReadMatrix.h
-rw-r--r-- 1 root   root       17896 Mar 28 16:01 ReadMatrix.o
drwxr-xr-x 4 root   root        4096 Mar 28 11:47 Scripts
drwxr-xr-x 4 root   root        4096 Mar 28 11:47 SmallExample
-rw-r--r-- 1 root   root        2247 Mar 28 11:47 appveyor.yml
-rwxr-xr-x 1 root   root     3035712 Mar 29 10:42 clouds_index.nc
-rw-r--r-- 1 root   root        3983 Mar 28 15:52 config.mk
-rw-r--r-- 1 root   root        3983 Mar 28 11:47 config.mk.template
-rw-r--r-- 1 root   root        5879 Mar 28 11:4

Process(`[4mls[24m [4m-l[24m`, ProcessExited(0))

In [14]:
include("/home/DINEOF/Scripts/Julia/dineof_scripts.jl")
dineof_cvp("/home/DINEOF/test_chl3days.nc#CHL","CHLA_prova.nc#mask",".",3);

file = "/home/DINEOF/test_chl3days.nc"


UndefVarError: UndefVarError: Dataset not defined