In [1]:
import glob
from netCDF4 import Dataset
import datetime
import numpy as np
import pandas as pd
import os

In [2]:
def write_bui(fname, data_array, locations, timestepsize, startday, length):
    """
    write a .bui from datainput
    """
    with open(fname, 'w') as f: 
        f.write('*Name of this f: \[sobekversion]\FIXED\\{}\n'.format(fname)) 
        date = "{:%d-%m-%Y %H:%M:%S}".format(datetime.datetime.now())
        f.write('*Date and time of construction: {}\n'.format(date))
        f.write('*Gebruik de default dataset (1) of de volledige reeks (0) voor overige invoer\n')
        f.write('0\n')
        f.write('*Aantal stations\n')
        f.write(str(len(locations))+'\n')
        f.write('*Namen van stations\n')
        for location in locations:
            f.write("'" + location + "'\n")
        f.write('*Aantal gebeurtenissen (omdat het 1 bui betreft is dit altijd 1)\n')
        f.write('*en het aantal seconden per waarnemingstijdstap\n')
        #timestepsize = 3600*24
        f.write(' 1  '+ str(timestepsize)+'\n')
        f.write('*Elke commentaarregel wordt begonnen met een * (asteriks).\n')
        f.write('*Eerste record bevat startdatum en -tijd, lengte van de gebeurtenis in dd hh mm ss\n')
        f.write('*Het format is: yyyymmdd:hhmmss:ddhhmmss\n')
        f.write('*Daarna voor elk station de neerslag in mm per tijdstap.\n')                

        init_date = datetime.datetime(startday.year, startday.month, startday.day)# + datetime.timedelta(day)
        #length = len(f.variables['time'])
        f.write(init_date.strftime("%Y %m %d %H %M %S") +' '+str(length) + ' 0 0 0\n' )
        f.write('\n'.join(' '.join('%0.2f' %x for x in y) for y in data_array))                
    return 

In [3]:
def write_evp(fname, df_evp):
    """
    * Evaporation file generated by METEO at 19/10/2017 12:04:57
    * 
    * Meteo data: Evaporation stations; for each station: evaporation intensity in mm
    *First record: start date, data in mm/day
    *Datum (year month day), verdamping (mm/dag) voor elk weerstation
    *jaar maand dag verdamping[mm]
    1998 2 7 .2
    1998 2 8 .6
    1998 2 9 .7    
    """
    date = "{:%d-%m-%Y %H:%M:%S}".format(datetime.datetime.now())
    with open(fname, 'w') as f:     

        f.write('* Evaporation file generated by METEO at {}\n'.format(date))
        f.write('*\n')
        f.write('* Meteo data: Evaporation stations; for each station: evaporation intensity in mm\n')
        f.write('*First record: start date, data in mm/day\n')
        f.write('*Datum (year month day), verdamping (mm/dag) voor elk weerstation\n')
        f.write('*jaar maand dag verdamping[mm]\n')
        df_evp.index= ['{}'.format(x.strftime("%Y %m %d")) for x in df_evp.index]
        df_evp.reset_index(inplace=True)
        #df_evp.iloc[:,1] = df_evp.iloc[:,1].round(decimals=2)
        f.write('\n'.join(' '.join('%s' %x for x in y) for y in df_evp.values))
    return

In [4]:
def write_tmp(fname, avg_tmp, startday, length):
    date = "{:%d-%m-%Y %H:%M:%S}".format(datetime.datetime.now())
    with open(fname, 'w') as f:

        f.write('*Name of this file: \[sobekversion]\FIXED\\'+fname+'.tmp\n') 
        f.write('*Date and time of construction: {}\n'.format(date))
        f.write('*Enige algemene wenken:\n')
        f.write('*Gebruik de default dataset (1) of de volledige reeks (0) voor overige invoer\n')
        f.write('1\n')
        f.write('*Aantal stations\n')
        f.write('1\n')
        f.write("*Namen van stations\n'KNMI TEMP'\n")
        f.write('*Aantal gebeurtenissen (omdat het 1 bui betreft is dit altijd 1)\n')
        f.write('*en het aantal seconden per waarnemingstijdstap\n')
        f.write(' 1  86400\n')
        f.write('*Elke commentaarregel wordt begonnen met een * (asteriks).\n')
        f.write('*Eerste record bevat startdatum en -tijd, lengte van de gebeurtenis in dd hh mm ss\n')
        f.write('*Het format is: yyyymmdd:hhmmss:ddhhmmss\n')
        f.write('*Daarna voor elk station de neerslag in mm per tijdstap.\n')
        init_date = datetime.datetime(startday.year, startday.month, startday.day)# + datetime.timedelta(day)
        #length = len(f.variables['time'])
        f.write(init_date.strftime("%Y %m %d %H %M %S") +' '+str(length) + ' 0 0 0\n' )
        f.write('\n'.join('%0.2f' %x for x in avg_tmp))
#         for i in f.variables['air_temperature']:
#             avg_tmp = np.average(i)
#             file.write(str(avg_tmp)+'\n')


In [5]:
def get_idx_start_end(in_files, date_start, date_end, idx_file):
    """
    do function
    """
    
    # print( 'START_FILE')
    # first check in which file need to look for start_date
    while True:   
        # print( 'init idx_file: {}'.format(idx_file))
        # print( 'nc-file: {}'.format(in_files[idx_file]))
        f_start = Dataset(in_files[idx_file])
        var_date = f_start.variables['date'][:]
        # print( 'looking for date_start: {}'.format(date_start))
        idx_array_start = np.where(var_date == int(date_start))        

        # check if value exist    
        if idx_array_start[0].size == 0:
            # print( 'idx_start NOT found: {}'.format(idx_array_start[0]))
            # start_date not found add idx value
            idx_file += 1            
            # print( 'set new idx_file: {}'.format(idx_file))
            
            # safety check
            if idx_file > 25:
                # print( 'not in files, set idx_start to none')                
                idx_array_start = 'none'
                break
        else:
            # print( 'idx_start found: {}'.format(idx_array_start[0]))
            idx_file_start = idx_file
            # start_date found, break out while statement
            break

    # print( 'END_FILE')            
    # then check in which file need to look for end_date
    # idx is always after start_date
    while True:     
        # print( 'init idx_file: {}'.format(idx_file))
        # print( 'nc-file: {}'.format(in_files[idx_file]))
        f_end = Dataset(in_files[idx_file])
        var_date = f_end.variables['date'][:]
        # print( 'looking for date_end: {}'.format(date_end))
        idx_array_end = np.where(var_date == int(date_end))

        # check if value exist    
        if idx_array_end[0].size == 0:
            # print( 'idx_array_end NOT found: {}'.format(idx_array_end[0]))            
            # end_date not found add idx value
            idx_file += 1
            # print( 'set new idx_file: {}'.format(idx_file))            

            # safety check            
            if idx_file > 25:
                # print( 'not in files, set idx_end to none')
                idx_array_end = 'none'
                break
        else:
            # print( 'idx_end found: {}'.format(idx_array_end[0]))
            idx_file_end = idx_file
            # end_date found, break out while statement            
            break     
    return f_start, f_end, idx_file_start, idx_file_end, idx_array_start, idx_array_end

In [6]:
def get_prc_evp_tmp(
        f_start, f_end, idx_file_start, idx_file_end, idx_array_start, idx_array_end):
    
    # possibility 1 both idx_start and idx_end in same file
    if idx_file_start == idx_file_end:
        # print( 'enter poss:1')        
        
        # get prec
        rainfall_array = f_start.variables['rainfall_rate'][idx_array_start[0][0]:idx_array_end[0][0]]
        # get evap
        evaporat_array = f_start.variables['reference_evaporation_rate'][idx_array_start[0][0]:idx_array_end[0][0]]
        # get tmp
        temperat_array = f_start.variables['air_temperature'][idx_array_start[0][0]:idx_array_end[0][0]]
        
        # print( 'rainfall array shape: {}'.format(rainfall_array.shape))
    
    # possibility 2 idx_start in nth-nc file and idx_end in n+1-nc file
    elif idx_file_end == (idx_file_start+1):
        # print( 'enter poss:2')
        
        rainfall_array_start = f_start.variables['rainfall_rate'][idx_array_start[0][0]::]
        rainfall_array_end = f_end.variables['rainfall_rate'][:idx_array_end[0][0]]

        # concatenate start and end arrays
        rainfall_array = np.concatenate(
            (rainfall_array_start, rainfall_array_end), 
            axis=0)              
        
        # evaporation
        evaporat_array_start = f_start.variables['reference_evaporation_rate'][idx_array_start[0][0]::]
        evaporat_array_end = f_end.variables['reference_evaporation_rate'][:idx_array_end[0][0]]   
        
        # concatenate start and end arrays
        evaporat_array = np.concatenate(
            (evaporat_array_start, evaporat_array_end), 
            axis=0)   
        
        # air temperature
        temperat_array_start = f_start.variables['air_temperature'][idx_array_start[0][0]::]
        temperat_array_end = f_end.variables['air_temperature'][:idx_array_end[0][0]]   
        
        # concatenate start and end arrays
        temperat_array = np.concatenate(
            (temperat_array_start, temperat_array_end), 
            axis=0)           
  
        # print( 'rainfall array shape: {}'.format(rainfall_array.shape))
    
    # possibilty 3 idx_end not in file, as batch is too big:
    else:
        # print( 'enter poss:3')
        rainfall_array = f_start.variables['rainfall_rate'][idx_array_start[0][0]::]       

        evaporat_array = f_start.variables['reference_evaporation_rate'][idx_array_start[0][0]::]               

        temperat_array = f_start.variables['air_temperature'][idx_array_start[0][0]::]                       
        # print( 'rainfall array shape: {}'.format(rainfall_array.shape))  
    return rainfall_array, evaporat_array, temperat_array

In [28]:
in_dir = r'D:\jupyter notebooks\3694.10 vecht\20171130_GRADE Neerslag from Deltares FTP\\'
in_files = glob.glob('{}*'.format(in_dir))
out_dir = r'D:\Projects\Pr\3694.10'
start_event = '1999-01-01'
readme_file = os.path.join(out_dir,'overview_batching.csv')

In [8]:
# make a complete list of all batches
# goes from 2001-01-01 till 50001-01-01
# start with idx_file = 0
idx_file = 0
year_batch = 0

year_from = []
year_to = []

batch_from = []
batch_to = []

batch_years = []
for year in range(2001, 52001, 100):
    year_start = year
    year_end = year + 101    
    date_start = str(year_start)+'0101'
    date_end = str(year_end)+'1231'
    print('from {0} to {1}'.format(date_start, date_end))
    
    try:
        # determine which nc-files need to be used for the year batch
        f_start, f_end, idx_file_start, idx_file_end, idx_array_start, idx_array_end = get_idx_start_end(
            in_files, date_start, date_end, idx_file)
    except (ValueError, IndexError) as e:
        raise e        
    
    # first get column names    
    station_names = [name.tostring().decode().replace('\x00', '') for name in f_start.variables['station_id']]
    # print( station_names[0:2])     

    # get rainfall, evaporation and temperature array
    rainfall_array, evaporat_array, temperat_array = get_prc_evp_tmp(
        f_start, f_end, idx_file_start, idx_file_end, idx_array_start, idx_array_end)
   
    # create pandas dataframe not necesssary, but useful in debugging    
    # date_range gebeurtenis
    event_date_range = pd.date_range(
        start=start_event, 
        end=None, 
        periods=rainfall_array.shape[0], 
        freq='{0}S'.format(60*60*24)
    )
    df_prc = pd.DataFrame(data=rainfall_array, index=event_date_range, columns=station_names)
    
    batch_no = in_files[idx_file_start][-6:-3]
    year_bat = str(year_batch).zfill(3)
    out_file = '{0}_{1}'.format(batch_no, year_bat)    
    fname = os.path.join(out_dir,'{}.bui'.format(out_file))
    # print( fname)
    
    # prepare input array for bui-file
    data_array = df_prc.values
    locations = df_prc.columns
    timestepsize = 60*60*24
    startday = df_prc.index[0]
    length = df_prc.shape[0]
    write_bui(fname, data_array, locations, timestepsize, startday, length)

    # prepare evaporation file
    avg_evp = evaporat_array.mean(axis=1)
    df_evp = pd.DataFrame(avg_evp, index= event_date_range)
    fname_evp = os.path.join(out_dir,'{}.evp'.format(out_file))
    write_evp(fname_evp, df_evp)    

    # prepare air temperature
    avg_tmp = temperat_array.mean(axis=1)
    fname_tmp = os.path.join(out_dir,'{}.tmp'.format(out_file))
    write_tmp(fname_tmp, avg_tmp, startday, length)
    
    year_from.append(year_start)
    year_to.append(year_end)

    batch_from.append(idx_file_start)
    batch_to.append(idx_file_end)

    batch_years.append(year_batch)
    
    # go back one index file and go to next round    
    if idx_file > 0:
        idx_file -= 1
    # add a number to the year batch
    year_batch += 1
    # print( 'go to next 100 year loop')

from 20010101 to 21021231
from 21010101 to 22021231
from 22010101 to 23021231
from 23010101 to 24021231
from 24010101 to 25021231
from 25010101 to 26021231
from 26010101 to 27021231
from 27010101 to 28021231
from 28010101 to 29021231
from 29010101 to 30021231
from 30010101 to 31021231
from 31010101 to 32021231
from 32010101 to 33021231
from 33010101 to 34021231
from 34010101 to 35021231
from 35010101 to 36021231
from 36010101 to 37021231
from 37010101 to 38021231
from 38010101 to 39021231
from 39010101 to 40021231
from 40010101 to 41021231
from 41010101 to 42021231
from 42010101 to 43021231
from 43010101 to 44021231
from 44010101 to 45021231
from 45010101 to 46021231
from 46010101 to 47021231
from 47010101 to 48021231
from 48010101 to 49021231
from 49010101 to 50021231
from 50010101 to 51021231
from 51010101 to 52021231
from 52010101 to 53021231
from 53010101 to 54021231
from 54010101 to 55021231
from 55010101 to 56021231
from 56010101 to 57021231
from 57010101 to 58021231
from 5801010

IndexError: list index out of range

In [1]:
# collects metadata from the batch and write to csv-file
df_readme = pd.DataFrame(np.column_stack([year_from,year_to,batch_from,batch_to,batch_years]), 
                         columns=['year_from', 'year_to', 'batch_from','batch_to','batch_years'])

df_readme.loc[:,'batch_from'] = df_readme['batch_from'].astype(str).str.zfill(3)
df_readme.loc[:,'batch_to'] = df_readme['batch_to'].astype(str).str.zfill(3)
df_readme.loc[:,'batch_years'] = df_readme['batch_years'].astype(str).str.zfill(3)

df_readme.to_csv(readme_file)

NameError: name 'pd' is not defined