# Example Script: Working with NYS Mesonet data 

### Available time range: 01/01/2017-12/31/2020

### NOTE: 2017 is not complete (field stations were still being installed in that year)
### work with years 2018, 2019, 2020
### Temporal resolution: hourly averages

The time format is overall easy to work with, except for the switch between 
Eastern Standard Time (EST) and Eastern Daylight Time (EDT).
This is more than annoying for data analysis 
[(see here for ways how to deal with it in Python)](https://towardsdev.com/giant-mess-dealing-with-timezones-and-daylight-saving-time-in-python-7222d37658cf)

### Locations: 126 stations


### Meteorological observed variables include:
 - temperature
 - relative humidity
 - precipitation
 - wind 
 - solar insolation
 
### Data format: 
Data are arranged in form of a spreadsheet table.
Rows are used to store the temporal data samples, columns are used to
organize the observations into meteorological variables.


# Example: Tropical nights (tmin > 20.0 deg C)


- Process all stations and create daily tmin time series for summer months June-July-August of the selected year (2018, 2019, or 2020).
- Create a new DataFrame with daily tmin in rows, stations organized in columns.
- Summary statistics: exceedance rates tmin > 20.0 deg C




### We import a support package called pandas. 
More about package imports comes later.
Here we just use it to give us access to Mesonet data in Python.

In [None]:
# code cell with import statements
import matplotlib.pyplot as plt
import numpy as np
# for the reading and handling of the Mesonet data
import pandas as pd
# for supporting date and time 
import datetime as dt 

In [None]:
def time_support(timestring,daily=False):
    """helper function to deal with the EST and EDT time zone problem

    The time zone label found in the string decides which fixed UTC offset
    is attached to the returned datetime: "EST" -> UTC-5, "EDT" -> UTC-4.
    The label is matched case-insensitively.

    Input parameter:
        timestring (string): date string (obtained from csv files).
                         The first 19 characters are parsed as
                         'YYYY-MM-DD HH:MM:SS' (hourly mode) or the first
                         10 characters as 'YYYY-MM-DD' (daily mode).
        daily (Boolean): keyword parameter toggles between daily and hourly
                         datetime string support. Default is hourly.
                         If True, daily is used, it does not read the
                         hour/min information (time is set to 00:00:00).
    Return value:
        datetime object that includes the time zone information
    Raises:
        ValueError: if the string contains neither "EST" nor "EDT", or if
                    the leading date/time part does not match the format.
    """
    label = timestring.upper()
    if "EST" in label:
        # Standard time: UTC offset -5
        utc_offset_hours = -5
    elif "EDT" in label:
        # Daylight savings time: UTC offset -4
        utc_offset_hours = -4
    else:
        # without this branch the time zone variable stayed undefined and
        # the function failed later with a confusing UnboundLocalError
        raise ValueError(
            "no EST/EDT time zone label found in time string: " + repr(timestring)
        )
    ny_tz = dt.timezone(dt.timedelta(hours=utc_offset_hours))

    if daily:
        # date only, time of day is set to midnight
        t = dt.datetime.strptime(timestring[0:10], '%Y-%m-%d')
    else:
        # hour information (hourly data)
        t = dt.datetime.strptime(timestring[0:19], '%Y-%m-%d %H:%M:%S')
    # attach the fixed-offset time zone to the parsed (naive) datetime
    return t.replace(tzinfo=ny_tz)

In [None]:
def get_time(dfg,column='time_end'):
    """Convert the time strings of one column into datetime objects.

    Input parameter:
        dfg: table object that provides .shape and column access with
             positional .iloc indexing (the notebook passes the data frame
             of one station)
        column (string): name of the column with the time strings
                         (default 'time_end')
    Return value:
        list with one time_support() result per row, in row order
    """
    nrows = dfg.shape[0]  # number of 1 hour observations
    stamps = dfg[column]
    # hourly mode of time_support is used for every row
    return [time_support(stamps.iloc[k]) for k in range(nrows)]

In [None]:
# get for each day mean 
def get_daily_mean(time,data,startindex=0,test=False):
    """calculates for all days the mean value
    
    The hourly data are analyzed in 24 hour intervals
    and the mean data values are calculated using 24 time windows.
    The 24 period depends on the start position in the arrays. 
    Use the optional parameter to adjust the 24-hour intervals to 
    the preferred  day ranges.
    
    Input parameter:
        time (1-d numpy array): array with datetime values
        data (1-d numpy array): array with corresponding data values
        startindex (integer): optional parameter to change the start position in the arrays.
                        The samples before this index are ignored and the first
                        24-hour window starts at time[startindex].
        test (boolean): if True then this function prints some diagnostics to the screen
                        (default it is set False)
    Returns:
        day, mean:  numpy arrays (new size) with the dates (days) and daily mean.
                    The date is the center time of each 24h window.
                    A window without any sample gets the value np.nan.
                    Two empty arrays are returned if there are no samples.
    """    
    # accept lists as well as arrays and apply the requested start position
    time=np.asarray(time)[startindex:]
    data=np.asarray(data)[startindex:]
    if time.size==0:
        # nothing to analyze (empty input or startindex beyond the end)
        return np.array([]), np.array([])
    one_day=dt.timedelta(1)
    d0=time[0]
    d1=d0+one_day
    tlast=time[-1]
    time_ret=[]
    mean_ret=[]
    while d0<=tlast:
        # samples inside the half-open window [d0, d1)
        ifind=np.logical_and(time>=d0,time<d1)
        mtime=d0+(d1-d0)/2 # center time of the 24h window
        # a gap in the time series can leave a window empty
        mdata=data[ifind].mean() if ifind.any() else np.nan
        time_ret.append(mtime)
        mean_ret.append(mdata)
        if test:
            print("date-range used: ")
            print(d0.strftime("%Y-%m-%d %H:%M:%S") + " to " + d1.strftime("%Y-%m-%d %H:%M:%S"))
            print(mtime.strftime("%Y-%m-%d"),np.round(mdata,4))
        # increment start and end dates by +24h
        d0=d1
        d1=d1+one_day
    return np.array(time_ret), np.array(mean_ret)
    
    

In [None]:
# get from each day the min value
def get_daily_min(time,data,startindex=0,test=False):
    """calculates for all days the min value
    
    The hourly data are analyzed in 24 hour intervals
    and the min data values are calculated using 24 time windows.
    The 24 period depends on the start position in the arrays. 
    Use the optional parameter to adjust the 24-hour intervals to 
    the preferred  day ranges.
    
    Input parameter:
        time (1-d numpy array): array with datetime values
        data (1-d numpy array): array with corresponding data values
        startindex (integer): optional parameter to change the start position in the arrays.
                        The samples before this index are ignored and the first
                        24-hour window starts at time[startindex].
        test (boolean): if True then this function prints some diagnostics to the screen
                        (default it is set False)
    Returns:
        day, min:  numpy arrays (new size) with the dates (days) and daily min.
                   The date is the center time of each 24h window.
                   A window without any sample gets the value np.nan.
                   Two empty arrays are returned if there are no samples.
    """    
    # accept lists as well as arrays and apply the requested start position
    time=np.asarray(time)[startindex:]
    data=np.asarray(data)[startindex:]
    if time.size==0:
        # nothing to analyze (empty input or startindex beyond the end)
        return np.array([]), np.array([])
    one_day=dt.timedelta(1)
    d0=time[0]
    d1=d0+one_day
    tlast=time[-1]
    time_ret=[]
    min_ret=[]
    while d0<=tlast:
        # samples inside the half-open window [d0, d1)
        ifind=np.logical_and(time>=d0,time<d1)
        mtime=d0+(d1-d0)/2 # center time of the 24h window
        # min() of an empty selection raises ValueError, so a window
        # without samples (gap in the time series) is filled with nan
        mdata=data[ifind].min() if ifind.any() else np.nan
        time_ret.append(mtime)
        min_ret.append(mdata)
        if test:
            print("date-range used: ")
            print(d0.strftime("%Y-%m-%d %H:%M:%S") + " to " + d1.strftime("%Y-%m-%d %H:%M:%S"))
            print(mtime.strftime("%Y-%m-%d"),np.round(mdata,4))
        # increment start and end dates by +24h
        d0=d1
        d1=d1+one_day
    return np.array(time_ret), np.array(min_ret)
    
    

In [None]:
# get from each day the max value
def get_daily_max(time,data,startindex=0,test=False):
    """calculates for all days the max value
    
    The hourly data are analyzed in 24 hour intervals
    and the max data values are calculated using 24 time windows.
    The 24 period depends on the start position in the arrays. 
    Use the optional parameter to adjust the 24-hour intervals to 
    the preferred  day ranges.
    
    Input parameter:
        time (1-d numpy array): array with datetime values
        data (1-d numpy array): array with corresponding data values
        startindex (integer): optional parameter to change the start position in the arrays.
                        The samples before this index are ignored and the first
                        24-hour window starts at time[startindex].
        test (boolean): if True then this function prints some diagnostics to the screen
                        (default it is set False)
    Returns:
        day, max:  numpy arrays (new size) with the dates (days) and daily max.
                   The date is the center time of each 24h window.
                   A window without any sample gets the value np.nan.
                   Two empty arrays are returned if there are no samples.
    """    
    # accept lists as well as arrays and apply the requested start position
    time=np.asarray(time)[startindex:]
    data=np.asarray(data)[startindex:]
    if time.size==0:
        # nothing to analyze (empty input or startindex beyond the end)
        return np.array([]), np.array([])
    one_day=dt.timedelta(1)
    d0=time[0]
    d1=d0+one_day
    tlast=time[-1]
    time_ret=[]
    max_ret=[]
    while d0<=tlast:
        # samples inside the half-open window [d0, d1)
        ifind=np.logical_and(time>=d0,time<d1)
        mtime=d0+(d1-d0)/2 # center time of the 24h window
        # max() of an empty selection raises ValueError, so a window
        # without samples (gap in the time series) is filled with nan
        mdata=data[ifind].max() if ifind.any() else np.nan
        time_ret.append(mtime)
        max_ret.append(mdata)
        if test:
            print("date-range used: ")
            print(d0.strftime("%Y-%m-%d %H:%M:%S") + " to " + d1.strftime("%Y-%m-%d %H:%M:%S"))
            print(mtime.strftime("%Y-%m-%d"),np.round(mdata,4))
        # increment start and end dates by +24h
        d0=d1
        d1=d1+one_day
    return np.array(time_ret), np.array(max_ret)
    
    

In [None]:
# location of the Mesonet csv files
shared_data_folder="/home11/staff/timm/Public/Data/"
subfolder="MESONET/"

# file names are in format YYYYMM.csv with YYYY the 4-digit year
# and MM the two digit month with leading zeros

# creates list with year and months represented as strings
# ('2017' ... '2020' and '01' ... '12')
years=[f"{2017+i:04d}" for i in range(4)]
months=[f"{m:02d}" for m in range(1,13)]

# do one year at a time - better for daily stats on concatenated data frames
# jumps in time stepping between years not handled by the functions
# that create daily statistics.
print(">>>> USER INPUT <<<<")
# strip accidental blanks so that the file names are built correctly
yr=input("enter a year you want to analyze (2018,2019,or 2020):").strip()
if yr not in years:
    # fail here with a clear message instead of later when a file is not found
    raise ValueError("unknown year '"+yr+"': choose one of "+", ".join(years))

# summer season: months[5:8] are June, July, August ('06','07','08')
filelist=[yr+m+'.csv' for m in months[5:8]]
filelist




## We use the methods and objects provided in package Pandas to import spreadsheet table data (text files in CSV format)



In [None]:
# One file contains hourly data for one month from all 126 stations in NY.
# Each monthly file is opened as a 'big' spreadsheet-like data object;
# the selection of single stations is done later.

folder=shared_data_folder+subfolder
frames=[]
for filename in filelist:
    csv_path=folder+filename
    print("open file "+csv_path)
    df0=pd.read_csv(csv_path)
    frames.append(df0.copy())
# join the monthly tables: season data in one data frame.
df=pd.concat(frames)

## Here we select now one station after another to create daily time series


In [None]:
# show the first couple of rows of the spreadsheet table (top part)
# NOTE: a notebook only displays the value of the last expression in a cell,
# so this call has no visible effect at this position.
df.head()
# split the season table into one group of rows per station
dfg=df.groupby("station")

# To see all available station ID strings:
# you can use the list station_ids

station_ids=dfg.groups.keys()
# create a column oriented table (126 stations in columns)
buffer={} # use dictionary to give columns their station ID names
for i,sid in enumerate(station_ids):
    # progress display: "\r" returns to the line start so that the
    # counter and station ID are overwritten in place
    print(f"\r {i} {sid} \t",end="")
    dfg1=dfg.get_group(sid)
    # time strings of this station converted to datetime objects
    time1=np.array(get_time(dfg1))
    # we select one data column with meteorological observations:
    # 2m air temperature (1 hour minimum values)
    # and reduce it to one minimum value per 24-hour window
    dtime1, dtmin1=get_daily_min(time1,dfg1["temp_2m_min [degC]"].values)
    buffer[sid]=dtmin1
# The time coordinates are not stored in buffer; after the loop dtime1
# holds the daily times of the last processed station
#(without further checking we assume all stations
# report at the same time)
print("Done with processing hourly data ...")

In [None]:
# The time coordinate could be added as one more column
# (buffer['time']=dtime1), but for summary statistics on the
# station columns we can leave it out.
dfout=pd.DataFrame(buffer)
dfout.shape

# file extension corrected from ".cvs" to ".csv" so that the exported
# file is recognized as a CSV file
outfile="tmin_jja_"+yr+".csv"
dfout.to_csv(outfile)
print("exported daily min temp data from JJA season to CSV file "+outfile)
dfout.head()

In [None]:
# calculate number of station days with daily min temp > 20 deg C (tropical nights)
# we need to do two important data process selections
# on the temperature arrays for each station
# (1) check for nan values and subsample the good data from array temp
# (2) find the good data values where the critical temperature is exceeded

In [None]:
tcrit=20.0 # critical temperature (deg C) for the daily minimum
itotal=0   # counter for station days above tcrit
# collect all hot temperature values in list
t_list = []
record_max=-300 # used to find record max temperature
for c in dfout.columns:
    temp=dfout[c].values.flatten()
    # remove np.nan
    iuse=np.logical_not(np.isnan(temp))
    tuse=temp[iuse]
    if tuse.size==0:
        # station without any valid value: nothing to count, and
        # np.max would raise ValueError on the empty array
        continue
    ihot=tuse>tcrit
    itotal=itotal+np.sum(ihot)
    #print (c,tuse[ihot],itotal)
    # highest daily minimum of this station compared with the record so far
    station_max=np.max(tuse)
    if station_max>record_max:
        record_max=station_max
        record_station=c
    if any(ihot):
        t_list.extend(tuse[ihot])
        print (t_list)
t_array=np.array(t_list)

In [None]:
# summary of the exceedance statistics, framed by two separator lines
separator="="*80
print(separator)
print(f"total station days with daily min temp > {tcrit} : {len(t_list)}")
print(f"highest tmin: {record_max} deg C")
print("observed at station "+record_station)
print(separator)
