In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import pathlib

In [3]:
from datetime import datetime, timedelta

from dateutil.relativedelta import relativedelta

In [4]:
import numpy as np
import pandas as pd
import xarray as xr

In [5]:
from dask.diagnostics import ProgressBar

In [None]:
from ICU_Water_Watch import utils, domains

### Parameters 

In [6]:
# directories holding the daily MSWEP netCDF files ("Past" and "NRT" archives)
dpath_MSWEP_Past = "/media/nicolasf/END19101/ICU/data/glo2ho/MSWEP280/Past/Daily/"
dpath_MSWEP_NRT = "/media/nicolasf/END19101/ICU/data/glo2ho/MSWEP280/NRT/Daily/"
# name of the variable read from the datasets
varname = "precipitation"
# key used to look up the extraction domain in `domains.domains`
domain_name = 'Water_Watch'

In [None]:
# domain definition registered under `domain_name`; passed to `domains.extract_domain` further down
domain = domains.domains[domain_name]

### casts the paths from string to pathlib `Path` objects 

In [7]:
# wrap both directory strings in `pathlib.Path` so `.glob` can be used on them
dpath_MSWEP_Past, dpath_MSWEP_NRT = (
    pathlib.Path(dpath_MSWEP_Past),
    pathlib.Path(dpath_MSWEP_NRT),
)

In [8]:
# every netCDF file found directly in the Past directory (unsorted at this point)
lfiles_Past = [*dpath_MSWEP_Past.glob("*.nc")]

In [9]:
# every netCDF file found directly in the NRT directory (unsorted at this point)
lfiles_NRT = [*dpath_MSWEP_NRT.glob("*.nc")]

In [10]:
# in-place sort by path; with the `YYYYDOY.nc` naming this should also be chronological order
lfiles_Past.sort()

In [11]:
# in-place sort by path; with the `YYYYDOY.nc` naming this should also be chronological order
lfiles_NRT.sort()

### get the actual datetime dates corresponding to each file (each file has the format `YYYYDOY.nc`)

In [12]:
# first 4 characters of the file name are the year, the next 3 the day of year
lfiles_NRT_dates = list(
    map(
        lambda nc_file: datetime.strptime(
            f"{nc_file.name[:4]} {nc_file.name[4:7]}", "%Y %j"
        ),
        lfiles_NRT,
    )
)

In [13]:
# first 4 characters of the file name are the year, the next 3 the day of year
lfiles_Past_dates = list(
    map(
        lambda nc_file: datetime.strptime(
            f"{nc_file.name[:4]} {nc_file.name[4:7]}", "%Y %j"
        ),
        lfiles_Past,
    )
)

### read the station coordinates 

In [14]:
# the CSV is read without a header row (`header=None`); the columns are named a few cells below
station_coords = pd.read_csv("./StationsForNico.csv", index_col=None, header=None)

In [15]:
# drop every row that has at least one missing value (the original row labels are kept)
station_coords = station_coords.dropna()

In [16]:
# name the four columns of the station table
station_coords.columns = ["station_name", "country", "lat", "lon"]

In [17]:
# preview the first rows of the station table
station_coords.head()

Unnamed: 0,station_name,country,lat,lon
0,Funafuti,Tuvalu,-8.517488,179.20014
1,Nanumea,Tuvalu,-5.673444,176.114522
2,Nui,Tuvalu,-7.247021,177.14718
3,Nukufetau,Tuvalu,-8.027851,178.314435
4,Nukulaelae,Tuvalu,-9.3701,179.808997


In [18]:
from ICU_Water_Watch import utils

### open the NRT dataset 

In [19]:
# open all the NRT files as one dataset; `parallel=True` opens the files in parallel through dask
dset = xr.open_mfdataset(lfiles_NRT, parallel=True)

In [20]:
# NOTE(review): presumably moves the longitudes to another convention (e.g. 0-360) — confirm in `utils.roll_longitudes`
dset = utils.roll_longitudes(dset)

In [None]:
# order the dataset along `lat` (ascending by default)
dset = dset.sortby('lat')

In [None]:
# NOTE(review): presumably subsets the dataset to the bounds held in `domain` — confirm in `domains.extract_domain`
dset = domains.extract_domain(dset, domain)

In [21]:
# pointwise (not outer-product) nearest-neighbour lookup: both indexers share
# the dimension "z", so one grid point is selected per station
station_time_series_NRT_ds = dset[varname].sel(
    {
        "lon": xr.DataArray(station_coords.lon.values, dims="z"),
        "lat": xr.DataArray(station_coords.lat.values, dims="z"),
    },
    method="nearest",
)

In [22]:
# run the deferred (dask) computation, showing a progress bar while it executes
with ProgressBar(): 
    station_time_series_NRT_ds = station_time_series_NRT_ds.compute()

[########################################] | 100% Completed | 99.31 s


In [23]:
# the values were already computed in the previous cell, so this should not trigger new work
station_time_series_NRT_ds = station_time_series_NRT_ds.load()

In [29]:
# convert to pandas; the "z" (station) dimension becomes the columns, which are relabelled in the next cell
station_time_series_NRT_df = station_time_series_NRT_ds.to_pandas()

In [35]:
# label each column with the row label of the matching station in `station_coords`
station_time_series_NRT_df.columns = station_coords.index

In [37]:
# NOTE(review): `dset` has been rebound several times since `open_mfdataset`;
# confirm that closing this derived object actually releases the underlying files
dset.close()

### open the *Past* dataset 

In [38]:
# open all the Past files as one dataset; `parallel=True` opens the files in parallel through dask
dset = xr.open_mfdataset(lfiles_Past, parallel=True)

In [39]:
# NOTE(review): presumably moves the longitudes to another convention (e.g. 0-360) — confirm in `utils.roll_longitudes`
dset = utils.roll_longitudes(dset)

In [None]:
# order the dataset along `lat` (ascending by default)
dset = dset.sortby('lat')

In [None]:
# NOTE(review): presumably subsets the dataset to the bounds held in `domain` — confirm in `domains.extract_domain`
dset = domains.extract_domain(dset, domain)

In [40]:
# pointwise (not outer-product) nearest-neighbour lookup: both indexers share
# the dimension "z", so one grid point is selected per station
station_time_series_Past_ds = dset[varname].sel(
    {
        "lon": xr.DataArray(station_coords.lon.values, dims="z"),
        "lat": xr.DataArray(station_coords.lat.values, dims="z"),
    },
    method="nearest",
)

In [None]:
# run the deferred (dask) computation, showing a progress bar while it executes
with ProgressBar(): 
    station_time_series_Past_ds = station_time_series_Past_ds.compute()

[##############################          ] | 76% Completed | 28m 5sss

In [None]:
# the values were already computed in the previous cell, so this should not trigger new work
station_time_series_Past_ds = station_time_series_Past_ds.load()

In [None]:
# convert to pandas; the "z" (station) dimension becomes the columns, which are relabelled in the next cell
station_time_series_Past_df = station_time_series_Past_ds.to_pandas()

In [None]:
# label each column with the row label of the matching station in `station_coords`
station_time_series_Past_df.columns = station_coords.index

In [None]:
# NOTE(review): `dset` has been rebound several times since `open_mfdataset`;
# confirm that closing this derived object actually releases the underlying files
dset.close()

### Below is the function used in an earlier version of this notebook to loop over files and perform extraction 

In [25]:
def extract_station_time_series(
    netcdf_lfiles, station_coords, varname="precipitation", dataframe=True
):
    """
    Extract, file by file, the grid point nearest to each station.

    Each file is opened in turn, passed through `utils.roll_longitudes`,
    sorted along `lat`, and sampled with nearest-neighbour selection at the
    (lat, lon) of every row of `station_coords`. The per-station selections
    are concatenated along a new `station` dimension (labelled
    "<station_name>, <country>"), and the per-file results along `time`.

    Parameters
    ----------
    netcdf_lfiles : iterable
        Paths of the netCDF files, each one passed to `xr.open_dataset`, in
        the order in which they are concatenated along `time`.
    station_coords : pandas.DataFrame
        One row per station, with the columns `station_name`, `country`,
        `lat` and `lon`.
    varname : str, optional
        Variable returned when `dataframe` is True.
    dataframe : bool, optional
        If True (default) return `varname` converted with `.to_pandas()` and
        transposed; otherwise return the concatenated xarray Dataset.

    Returns
    -------
    pandas object or xarray.Dataset
        See `dataframe`.

    Raises
    ------
    ValueError
        If `netcdf_lfiles` is empty.
    """
    # materialise once so that generators can be tested for emptiness
    netcdf_lfiles = list(netcdf_lfiles)

    if not netcdf_lfiles:
        raise ValueError(
            "netcdf_lfiles is empty: there is no file to extract the stations from"
        )

    sub_stations_time_series = []

    for filename in netcdf_lfiles:

        # The context manager closes the dataset that owns the file handle,
        # including when the extraction raises. Calling `.close()` on the
        # rolled / sorted copy afterwards does not give that guarantee.
        with xr.open_dataset(filename) as dset_on_disk:

            dset = utils.roll_longitudes(dset_on_disk)

            dset = dset.sortby("lat")

            sub_stations = []

            for _, row in station_coords.iterrows():

                sub = dset.sel(lat=row.lat, lon=row.lon, method="nearest")

                sub = sub.expand_dims(
                    {"station": [f"{row.station_name}, {row.country}"]}
                )

                sub_stations.append(sub)

            # pull the selected values into memory while the file is still open
            sub_stations = xr.concat(sub_stations, dim="station").load()

        sub_stations_time_series.append(sub_stations)

    sub_stations_time_series = xr.concat(sub_stations_time_series, dim="time")

    if dataframe:
        return sub_stations_time_series[varname].to_pandas().T

    return sub_stations_time_series

### extract only Tuvalu

In [None]:
# The stations are the columns of the frame (labelled with `station_coords.index`
# a few cells up) and its index holds the time steps, so the Tuvalu stations are
# selected by country through `station_coords` instead of matching text in the index.
station_time_series_NRT_df_Tuvalu = station_time_series_NRT_df.loc[
    :, station_coords.index[station_coords.country.str.contains("Tuvalu")]
]

In [None]:
# display the Past frame (pandas truncates the rendering of long frames)
station_time_series_Past_df

In [None]:
# Keep the NRT frame as (time x station), like the Past frame: the per-station
# thresholds and `plot(subplots=True)` used below both work column by column.
# Rebinding the name to its transpose would put the time steps in the columns
# (and flip back every time the cell is re-run), so only preview the transpose.
station_time_series_NRT_df.T.head()

In [None]:
# one panel per column of the frame; the trailing `;` hides the text repr of the axes
station_time_series_NRT_df.plot(subplots=True, figsize=(10, 20));

### calculate the quantile 0.99

In [None]:
# 0.99 quantile of every column of the Past frame
q99 = station_time_series_Past_df.quantile(0.99)

In [None]:
# display the per-column quantiles
q99

### we use 5 × the 0.99 quantile as a filter: anything above that threshold is probably rubbish

In [None]:
# per-column rejection threshold: 5 times the 0.99 quantile of the Past data
threshs = q99 * 5

In [None]:
# replace by NaN every value above the threshold of its column
station_time_series_Past_df = station_time_series_Past_df.mask(
    station_time_series_Past_df > threshs
)

In [None]:
# same filter on the NRT data, using the thresholds derived from the Past data;
# `threshs` is indexed by the Past frame's column labels, so the NRT frame must
# carry the same labels on its columns for the comparison to line up
station_time_series_NRT_df = station_time_series_NRT_df.mask(
    station_time_series_NRT_df > threshs
)

In [None]:
# NRT series after filtering, one panel per column
station_time_series_NRT_df.plot(subplots=True, figsize=(10, 20));

In [None]:
# Past series after filtering, one panel per column
station_time_series_Past_df.plot(subplots=True, figsize=(10, 20));