In [1]:
# NOTE(review): the original order is kept on purpose -- `datetime` is rebound by
# successive imports below (module first, then the class), and the star imports
# at the end may override any of these names.
import numpy as np
import pandas as pd  # used as `pd` further down; previously only reachable via the star imports
import xarray as xr
import netCDF4
import os
import datetime
import matplotlib.pyplot as plt
from matplotlib import cm
import shutil
from datetime import datetime, timedelta
import glob
import datetime as dt
from os import path
import os
import cartopy.crs as ccrs
import fsspec
import git
import json
import sys
import yaml
import requests
import cdsapi
from datetime import timezone
from functions_verification import *
from functions_rf import *


Using TensorFlow backend.


In [2]:
# Folder configuration: adapt the YAML location below to your own setup.
# NOTE(review): yaml.safe_load would probably suffice if the file only holds plain mappings -- confirm.
directories_yaml = r"/Volumes/lexplore_hd/scripts/folder_gap_filling.yaml"
with open(directories_yaml, "r") as folder_file:
    directories = yaml.load(folder_file, Loader=yaml.FullLoader)

In [3]:
# Boundary conditions (e.g. the focus period): adapt the YAML location below to your own setup.
# NOTE(review): yaml.safe_load would probably suffice if the file only holds plain mappings -- confirm.
conditions_yaml = r"/Volumes/lexplore_hd/scripts/boundary_conditions_gap_filling.yaml"
with open(conditions_yaml, "r") as conditions_file:
    conditions = yaml.load(conditions_file, Loader=yaml.FullLoader)

In [4]:

# Time window of interest, read from the boundary-conditions YAML; it is used
# further down to slice the data before writing the netCDF file.
focus_start, focus_end = (
    conditions["focus_period_start"],
    conditions["focus_period_end"],
)

Date: October 2023

Author: Martin Wegmann

Contact: martinwegmann@pm.me



This notebook will postprocess the MeteoSwiss station data of the Pully station. Data has been downloaded through IDAWEB (https://www.meteoswiss.admin.ch/services-and-publications/service/weather-and-climate-products/data-portal-for-teaching-and-research.html)

We look at one dataset here:

* The automatic weather station data from the MeteoSwiss station Pully

The postprocessing and data reduction includes:

* Selecting a certain time frame

* Storing data in netcdf format

#### Metadata provided by MeteoSwiss:

Data provider:
    
Bundesamt fuer Meteorologie und Klimatologie, MeteoSchweiz
Operation Center 1
Postfach 257
8058 Zuerich-Flughafen

Station metadata:
    
* stn: PUY
    
* Name: Pully
    
* Latitude: 46°31' 

* Longitude: 6°40'

* Elevation: 455 meter above sea level

* Time in UTC

* Start Date Data Order: 2019-01-01

* End Date Data Order: 2023-07-31

Variables:
    
* gre000h0  [W/m^2]  Globalstrahlung; Stundenmittel

* prestah0  [hPa]    Luftdruck auf Barometerhoehe (QFE); Stundenmittel

* tre200h0  [°C]    Lufttemperatur 2 m ueber Boden; Stundenmittel

* rre150h0  [mm]     Niederschlag; Stundensumme

* ure200h0  [%]      Relative Luftfeuchtigkeit 2 m ueber Boden; Stundenmittel

* sre000h0  [min]    Sonnenscheindauer; Stundensumme

* fkl010h0  [m/s]    Windgeschwindigkeit skalar; Stundenmittel

* dkl010h0  [°]      Windrichtung; Stundenmittel

## Note:


The original data order used `-` as the indicator for missing data. I replaced `;-;` with `;-9999;` in the provided data before reading it in here.

Original quote from MeteoSwiss: Fehlende Messwerte sind gekenntzeichnet mit '-' (English: missing measurements are marked with '-')

### Folder Setup

In [5]:
# Make sure every folder listed in the folder YAML exists.
# exist_ok=True replaces the separate exists() check: it is a single call,
# avoids the check-then-create race, and is safe to re-run.
for folder in directories.values():
    os.makedirs(folder, exist_ok=True)

In [6]:
# Folder shortcuts, one per entry of the folder YAML.
# Paths are later joined by plain string concatenation, so the configured
# values presumably end with a path separator -- verify in the YAML.

# Used in this notebook: MeteoSwiss input and netCDF output.
ms_folder = directories["ms_folder"]
output_folder = directories["g2s_input_folder"]

# Not referenced in the visible part of this notebook.
tchain_folder = directories["tchain_folder"]
thetis_folder = directories["thetis_folder"]
meteo_folder = directories["meteo_folder"]
idronaut_folder = directories["idronaut_folder"]
scripts_folder = directories["scripts_folder"]
era5_folder = directories["era5_folder"]
era5_land_folder = directories["era5_land_folder"]

### Read MeteoSwiss data

In [7]:
# Read the semicolon-separated MeteoSwiss export for the Pully station.
# os.path.join builds the same path whether or not ms_folder ends with a
# separator, unlike plain string concatenation.
df1 = pd.read_csv(os.path.join(ms_folder, "PUY_meteoswiss_data.txt"), delimiter=";")

In [8]:
# Inspect the column labels as delivered in the file, before they are renamed.
df1.columns

Index(['stn', 'time', 'gre000h0', 'prestah0', 'tre200h0', 'rre150h0',
       'ure200h0', 'sre000h0', 'fkl010h0', 'dkl010h0'],
      dtype='object')

#### rename columns

In [9]:
# Replace the MeteoSwiss parameter codes with short names.
# The assignment is positional, so the order must match the file's columns.
df1.columns = [
    "station",  # stn
    "time",     # time
    "rad",      # gre000h0: global radiation [W/m^2], hourly mean
    "sp",       # prestah0: air pressure at barometer level (QFE) [hPa], hourly mean
    "t2m",      # tre200h0: air temperature 2 m above ground [°C], hourly mean
    "precip",   # rre150h0: precipitation [mm], hourly sum
    "rh",       # ure200h0: relative humidity 2 m above ground [%], hourly mean
    "sun",      # sre000h0: sunshine duration [min], hourly sum
    "ws",       # fkl010h0: scalar wind speed [m/s], hourly mean
    "wd",       # dkl010h0: wind direction [°], hourly mean
]

In [10]:
# Display the frame to check the renamed columns and the first/last records.
df1

Unnamed: 0,station,time,rad,sp,t2m,precip,rh,sun,ws,wd
0,PUY,2019010100,2,978.1,5.7,0.0,85.5,0,0.7,104
1,PUY,2019010101,3,978.0,5.7,0.0,83.4,0,1.3,59
2,PUY,2019010102,3,978.3,5.5,0.0,82.0,0,1.9,31
3,PUY,2019010103,3,978.2,5.0,0.0,85.2,0,2.6,54
4,PUY,2019010104,3,977.6,4.2,0.0,88.8,0,2.2,35
...,...,...,...,...,...,...,...,...,...,...
40147,PUY,2023073119,21,960.2,23.2,0.0,51.1,0,0.7,27
40148,PUY,2023073120,1,960.3,22.6,0.0,48.7,0,0.9,36
40149,PUY,2023073121,1,960.1,22.1,0.0,51.6,0,0.7,39
40150,PUY,2023073122,1,959.6,21.9,0.0,53.7,0,1.3,10


#### create a data frame with a nice time axis

In [11]:
# Keep only the measurement columns (everything after the first two columns,
# i.e. station id and timestamp).
# .copy() makes df2 an independent frame: the index and column assignments
# that follow would otherwise act on a slice of df1 (SettingWithCopyWarning,
# ambiguous view/copy semantics).
df2 = df1.iloc[:, 2:].copy()

In [12]:
# First and last hourly time stamp of the MeteoSwiss data order.
start_date = dt.datetime(year=2019, month=1, day=1, hour=0)
end_date = dt.datetime(year=2023, month=7, day=31, hour=23)

# The same two instants rendered as "YYYY-MM-DD HH" strings.
start_date_string = start_date.strftime('%Y-%m-%d %H')
end_date_string = end_date.strftime('%Y-%m-%d %H')

In [13]:
# Regular hourly axis spanning the ordered period (both end points included).
hourly_axis = pd.date_range(start_date_string, end_date_string, freq="1H")

# Cross-check the synthetic axis against the timestamps stored in the file
# (integers of the form YYYYMMDDHH). Without this, a file with gaps, duplicates
# or a different period could be silently mislabelled.
file_times = pd.DatetimeIndex(
    pd.to_datetime(df1["time"].astype(str), format="%Y%m%d%H")
)
if not hourly_axis.equals(file_times):
    raise ValueError(
        "Timestamps in the MeteoSwiss file do not match the expected hourly axis "
        f"{start_date_string} .. {end_date_string} "
        f"({len(file_times)} records vs {len(hourly_axis)} expected hours)."
    )

df2.index = hourly_axis

#### turn rh and precip into floats, just in case

In [14]:
# Force precipitation and relative humidity to float64 ("just in case" they
# were parsed as another dtype). A non-mutating astype returns a new frame
# instead of assigning columns on a possible slice of df1, so it does not
# trigger SettingWithCopyWarning and the cell stays safe to re-run.
df2 = df2.astype({"precip": "float64", "rh": "float64"})

#### create an xarray dataset

In [15]:
# Build a 1-D xarray dataset: one variable per measurement column, all sharing
# the hourly "time" dimension taken from the frame's index.
# The leading "..." REPL prompts of the original cell were removed; they only
# worked because IPython strips pasted prompts and are invalid in plain Python.
df2_ds = xr.Dataset(
    data_vars={
        name: (["time"], df2[name].values)
        for name in ("rad", "sp", "t2m", "precip", "rh", "sun", "ws", "wd")
    },
    coords={"time": df2.index},
    attrs={"description": "For metadata see jupyter notebook get_ms_data_for_gapfill"},
)

#### turn the missing values into actual nan

In [16]:
# Missing measurements carry the placeholder -9999; keep every other value
# and let xarray fill the masked positions with NaN.
valid_mask = df2_ds != -9999.
df2_ds_nan = df2_ds.where(valid_mask)

In [17]:
# Display the dataset after the -9999 placeholders have been masked.
df2_ds_nan

#### write the data to netcdf

In [18]:
# Cut the dataset down to the configured focus period and store it as netCDF.
focus_period = slice(focus_start, focus_end)
df2_ds_focus = df2_ds_nan.sel(time=focus_period)
df2_ds_focus.to_netcdf(output_folder+"ms_pully_1hr_g2s.nc")

In [19]:
# Read the file back as a quick check of what was written.
# load_dataset reads everything into memory and closes the file again, whereas
# open_dataset keeps the file handle open, which can block overwriting the
# file when the write cell is re-run.
xr.load_dataset(output_folder+"ms_pully_1hr_g2s.nc")