In [20]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 19 14:13:29 2020

@author: jessicaruijsch

Updated on 25 April 2022 
@mikmagni vectorized functions for parallel
"""
#========================================================================
#
# * This script extracts values from all netCDF files in a folder and 
#   outputs the values as a csv file. 
# * The values at different locations are saved in different csv files 
#   in the output file path, with location names indicated at the end of the file names.
#
#========================================================================

from multiprocess import Pool
import xarray as xr
import pandas as pd
import netCDF4
import numpy as np
import os 
import glob
import re
import tqdm

In [21]:
# Data root. The working directory is switched to this folder, so the relative
# paths built further down (input folder, output folder, station table) are
# resolved against it.
# Set the SATELLITE_DATA_DIR environment variable to run the notebook on another
# machine without editing it; the original location remains the default.
directory = os.environ.get(
    'SATELLITE_DATA_DIR',
    '/Users/niekcollotdescury/Desktop/Applied data science/Thesis/R_code/data/satellite_data/',
)
os.chdir(directory)

In [22]:
def check_dir_or_make(path):
    """Make sure the directory ``path`` exists, creating missing parents as needed.

    ``exist_ok=True`` lets ``os.makedirs`` handle an already existing directory
    itself, so there is no gap between a separate "does it exist?" test and the
    creation in which another process could create the directory and make the
    call fail.

    Parameters
    ----------
    path : str or os.PathLike
        Directory to create if it is not there yet.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

In [44]:
# Choose satellite product to filter.
# Every product uses its own netCDF variable name, so the variable is looked up
# from the product name instead of being commented in and out by hand, which
# made it possible to pair a product with the wrong variable.
SAT_VARIABLES = {
    'ESA': 'sm',
    'MODIS': 'sc',
    'GRACE': 'lwe',
}

sat = 'GRACE'  # must be one of the keys of SAT_VARIABLES
sat_var = SAT_VARIABLES[sat]

In [45]:
# Input folder holding the netCDF files and output folder for the per-station
# csv files (both relative to the current working directory).
filePath = f'{sat}/upstream/'
outputPath = f'{sat}/upstream_station_all/'
check_dir_or_make(outputPath)

# Station table; the columns 'lon', 'lat' and 'grdc_no' are read further down.
loc = pd.read_csv('../stationLatLon.csv')
# Optional filter on the wmo_reg column:
# loc = loc[loc.wmo_reg == 6]

# Build the pattern from filePath so the two cannot drift apart, and sort the
# result so the file order (and the choice of sample file) does not depend on
# the order in which the file system happens to list the folder.
fileName = sorted(glob.glob(filePath + '*.nc'))
if not fileName:
    # Fail with a clear message instead of an IndexError on fileName[0].
    raise FileNotFoundError(f'no netCDF files found in {filePath!r}')

# One file is kept open as the source of the grid coordinates.
nc_sample = netCDF4.Dataset(fileName[0])



In [46]:
def near(array, value):
    """Return the position of the element of ``array`` that is closest to ``value``.

    Closeness is measured as the absolute difference; if several elements are
    equally close, the first of them is reported.
    """
    distance = np.abs(array - value)
    return distance.argmin()

In [47]:
def get_latlon():
    """Locate, for every station, the nearest grid cell of the sample netCDF file.

    Reads the module-level ``loc`` table (columns ``lon`` and ``lat``) and the
    ``x`` / ``y`` coordinate variables of ``nc_sample``. Nothing is returned;
    the results are stored on the function object itself:

    * ``get_latlon.ix`` -- numpy array with one index into ``x`` per station
    * ``get_latlon.iy`` -- numpy array with one index into ``y`` per station
    """
    # real-life station coordinates
    station_lon = np.array(loc['lon'])
    station_lat = np.array(loc['lat'])

    # coordinates of the netCDF grid
    grid_lon = nc_sample.variables['x'][:]
    grid_lat = nc_sample.variables['y'][:]

    # nearest grid index for each station, one axis at a time
    get_latlon.ix = np.array([near(grid_lon, x) for x in station_lon])
    get_latlon.iy = np.array([near(grid_lat, y) for y in station_lat])

In [48]:
def read_write_statevars(station):
    """Extract the series of one station from all netCDF files and save it as csv.

    For every path in the module-level ``fileName`` list, the value of variable
    ``sat_var`` at the station's grid cell is read; a masked cell is stored as
    NaN. Each value is labelled with the date parsed from the file path, the
    rows are sorted by date and the table is written to
    ``<outputPath>/<grdc_no>_<sat_var>.csv``.

    Parameters
    ----------
    station : int
        Positional index of the station in ``loc`` and in ``get_latlon.ix`` /
        ``get_latlon.iy``. ``get_latlon()`` must have been called beforehand.

    Returns
    -------
    None
    """
    # The grid indices depend only on the station, so they are looked up once
    # instead of once per file.
    ix = get_latlon.ix[station]
    iy = get_latlon.iy[station]

    values = []
    dates = []
    for path in fileName:
        # The context manager closes the file again; without it every worker
        # keeps one open handle per file it has visited.
        with netCDF4.Dataset(path) as nc:
            # Read just the station's cell rather than loading the full grid
            # only to inspect a single mask entry.
            cell = nc.variables[sat_var][iy, ix]

        # is_masked also copes with data whose mask is the scalar "no mask"
        # marker, where indexing into .mask would fail.
        values.append(np.nan if np.ma.is_masked(cell) else np.ma.getdata(cell))

        # The path contains '<digits>_<digits>' (presumably year_month -- TODO
        # confirm); turn it into '<digits>-<digits>-01' so it parses as a date.
        stamp = re.search(r'\d+_\d+', path)[0] + '-01'
        dates.append(stamp.replace('_', '-'))

    upstream = pd.DataFrame({'datetime': dates, sat_var: np.array(values)})
    upstream['datetime'] = pd.to_datetime(upstream['datetime'])
    upstream = upstream.sort_values(by=['datetime'])

    station_no = str(loc['grdc_no'].iloc[station])
    upstream.to_csv(os.path.join(outputPath, f'{station_no}_{sat_var}.csv'), index=False)



In [49]:
# Precompute the grid indices of all stations. The results are stored in
# get_latlon.ix / get_latlon.iy, which read_write_statevars reads.
get_latlon()

In [50]:
# Positional index of every station; each index is one task for the pool.
station_idx = np.arange(len(loc))

# Using the pool as a context manager shuts the worker processes down once all
# stations are done, instead of leaving them alive until the kernel exits.
with Pool(processes=36) as pool:  # number of worker processes
    # imap_unordered hands back each result as soon as a station finishes, so
    # tqdm can show live progress; the results themselves (None) are discarded.
    for _ in tqdm.tqdm(pool.imap_unordered(read_write_statevars, station_idx), total=len(station_idx)):
        pass

100%|███████████████████████████████████████| 2287/2287 [00:51<00:00, 44.07it/s]
