# Example Script: Working with NYS Mesonet data 

### Available time range: 01/01/2017-12/31/2020

### Temporal resolution: hourly averages

The time format is, overall, good to work with, except for the switch between
Eastern Standard Time (EST) and Eastern Daylight Time (EDT).
This is more than annoying for data analysis
[(see here for ways to deal with it in Python)](https://towardsdev.com/giant-mess-dealing-with-timezones-and-daylight-saving-time-in-python-7222d37658cf)

### Locations: 126 stations


### Meteorological observed variables include:
 - temperature
 - relative humidity
 - precipitation
 - wind 
 - solar insolation
 
### Data format: 
Data are arranged in the form of a spreadsheet table.
Rows are used to store the temporal data samples, columns are used to
organize the observations into meteorological variables.


### We import a support package called pandas. 
More about package imports comes later.
Here we just use it to give us access to Mesonet data in Python.

In [None]:
# code cell with import statements
import matplotlib.pyplot as plt
import numpy as np
# for the reading and handling of the Mesonet data
import pandas as pd
# for supporting date and time 
import datetime as dt 

In [None]:
# build the name of the data file that is read in the next cell
shared_data_folder="/home11/staff/timm/Public/Data/"
subfolder="MESONET/"

# open one example file: month December 2020 (202012)
# file names are in the format YYYYMM.csv with YYYY the 4-digit year
# and MM the two-digit month with leading zeros

# create lists with the years (2017-2020) and the months (01-12)
# represented as strings
years=['%04d' % yr for yr in range(2017, 2021)]
months=['%02d' % m for m in range(1, 13)]

# you can create a filename this way
# (list indices start at 0: years[3] is '2020', months[11] is '12')
filename=years[3]+months[11]+'.csv'
print(filename)



In [None]:
# full folder path of the Mesonet files, built from the two folder strings
# shared_data_folder and subfolder
folder=shared_data_folder+subfolder 
print("open file "+folder+filename)
# we use the methods and objects provided in the package Pandas
# to import spreadsheet table data (text files in CSV format)

# open the file and create a 'big' spreadsheet-like data object
# (the result of pd.read_csv is stored in the variable df)
df=pd.read_csv(folder+filename)

# show the first couple of rows of the spreadsheet table (top part)
df.head()

In [None]:
# this table contains data from all 126 stations in NY.
# we just want to get data from one station, Ballston Spa
# ("BSPA" is the Mesonet station ID string)

# group the rows of the table by the values in the column "station";
# the data of a single station can then be picked with get_group()
dfg=df.groupby("station")

# To see all available station ID strings:
# you can use station_ids (the keys of the groups;
# list(station_ids) turns them into a regular list)

station_ids=dfg.groups.keys()


# select the rows that belong to the station with the ID "BSPA"
bspa=dfg.get_group("BSPA")

# display the table of the selected station
bspa

In [None]:
def time_support(timestring):
    """helper function to deal with the EST and EDT time zone problem

    The time zone label in the string is translated into a fixed UTC offset
    (EST: -5 hours, EDT: -4 hours) that is attached to the returned datetime
    object.

    Input parameter:
        timestring (string): date string (obtained from csv files).
            The first 19 characters must have the form 'YYYY-MM-DD HH:MM:SS'
            and the string must contain the label 'EST' or 'EDT'
            (upper or lower case).

    Return value:
        datetime object that includes the time zone information

    Raises:
        ValueError: if the string contains neither 'EST' nor 'EDT',
            or if the date part does not match the expected format
    """
    label=timestring.upper()
    if "EST" in label:
        # Standard time: set timezone with UTC offset -5
        utc_offset_hours=-5
    elif "EDT" in label:
        # Daylight savings time: set timezone with UTC offset -4
        utc_offset_hours=-4
    else:
        # without this check the time zone variable would be undefined below
        raise ValueError("no EST or EDT time zone label found in: "+repr(timestring))
    ny_tz=dt.timezone(dt.timedelta(hours=utc_offset_hours))
    # only the first 19 characters contain the date and the time of day
    t=dt.datetime.strptime(timestring[0:19],'%Y-%m-%d %H:%M:%S')
    # attach the time zone without changing the clock time
    return t.replace(tzinfo=ny_tz)

In [None]:
# convert the time strings in the column 'time_end' of the station table
# into datetime objects with time zone information
ntime=bspa.shape[0] # number of 1 hour observations
# the helper function time_support is applied to every time string
# (row by row, in the order of the table)
mytime=[time_support(timestring) for timestring in bspa['time_end']]
# numpy array version of the list, used by the code below
time=np.array(mytime)


In [None]:
# select three data columns with 2m air temperature observations
# (1 hour maximum, 1 hour minimum, and 1 hour average values)
tmax, tmin, tavg = (
    bspa[column]
    for column in (
        "temp_2m_max [degC]",
        "temp_2m_min [degC]",
        "temp_2m_avg [degC]",
    )
)

# The time information could be taken from the spreadsheet directly with
#     time=pd.to_datetime(bspa["time_end"])
# but better use the time obtained from the code above!



In [None]:
%matplotlib inline

fig=plt.figure(figsize=(8,6))
plt.plot(time,tmax,color='orange',label='tmax')
plt.plot(time,tmin,color='cyan',label='on my time')
plt.title("Ballston Spa hourly temperature Dec 2020")
plt.legend()

In [None]:
# Bokeh is another nice plotting package
from bokeh.plotting import figure, output_file, output_notebook, show

# draw the plot inside the notebook
# (alternative: output_file("lines.html") writes a static HTML file)
output_notebook()

# new plot with a title and axis labels; the x-axis shows dates
p = figure(
    x_axis_type="datetime",
    title="Mesonet station example plot in Bokeh ",
    x_axis_label='time',
    y_axis_label='temperature [deg C]',
)

# one line renderer per variable, each with legend entry and line thickness
p.line(x=time, y=tmax, color='orange', line_width=2, legend_label="tmax")
p.line(x=time, y=tmin, color='cyan', line_width=2, legend_label="tmin")

# show the results
show(p)

In [None]:
# get for each day mean 
def get_daily_mean(time,data,startindex=0,test=False):
    """calculates for all days the mean value

    The hourly data are analyzed in 24 hour intervals
    and the mean data values are calculated using 24-hour time windows.
    The first window starts at time[startindex]; every following window
    starts 24 hours later, until the last time value is passed.
    Use the optional parameter startindex to adjust the 24-hour intervals
    to the preferred day ranges. Samples before the start position are
    not used (the time values are expected in increasing order).

    Input parameter:
        time (1-d numpy array): array with datetime values
        data (1-d numpy array): array with corresponding data values
        startindex (integer): optional parameter to change the start position in the arrays
        test (boolean): if True then this function prints some diagnostics to the screen
                        (default: it is set to False)
    Returns:
        day, mean:  numpy arrays (new size) with the center times of the
                    24h windows and the mean values. Windows without any
                    data sample get the value NaN. Empty input returns
                    two empty arrays.
    """
    # nothing to do for empty input (time[startindex] would fail)
    if len(time)==0:
        return np.array([]), np.array([])
    one_day=dt.timedelta(days=1)
    tlast=time[-1]
    # start and end of the first 24h window
    d0=time[startindex]
    d1=d0+one_day
    time_ret=[]
    mean_ret=[]
    while d0<=tlast:
        ifind=np.logical_and(time>=d0,time<d1)
        mtime=d0+(d1-d0)/2 # center time of the 24h window
        # a gap in the record can leave a window without samples
        if ifind.any():
            mdata=data[ifind].mean()
        else:
            mdata=np.nan
        time_ret.append(mtime)
        mean_ret.append(mdata)
        if test:
            print("date-range used: ")
            print(d0.strftime("%Y-%m-%d %H:%M:%S") + " to " + d1.strftime("%Y-%m-%d %H:%M:%S"))
            print(mtime.strftime("%Y-%m-%d"),np.round(mdata,4))
        # increment start and end dates by +24h
        d0=d1
        d1=d1+one_day
    return np.array(time_ret), np.array(mean_ret)
    
    

In [None]:
# get from each day the min value
def get_daily_min(time,data,startindex=0,test=False):
    """calculates for all days the min value

    The hourly data are analyzed in 24 hour intervals
    and the min data values are calculated using 24-hour time windows.
    The first window starts at time[startindex]; every following window
    starts 24 hours later, until the last time value is passed.
    Use the optional parameter startindex to adjust the 24-hour intervals
    to the preferred day ranges. Samples before the start position are
    not used (the time values are expected in increasing order).

    Input parameter:
        time (1-d numpy array): array with datetime values
        data (1-d numpy array): array with corresponding data values
        startindex (integer): optional parameter to change the start position in the arrays
        test (boolean): if True then this function prints some diagnostics to the screen
                        (default: it is set to False)
    Returns:
        day, min:  numpy arrays (new size) with the center times of the
                   24h windows and the min values. Windows without any
                   data sample get the value NaN. Empty input returns
                   two empty arrays.
    """
    # nothing to do for empty input (time[startindex] would fail)
    if len(time)==0:
        return np.array([]), np.array([])
    one_day=dt.timedelta(days=1)
    tlast=time[-1]
    # start and end of the first 24h window
    d0=time[startindex]
    d1=d0+one_day
    time_ret=[]
    min_ret=[]
    while d0<=tlast:
        ifind=np.logical_and(time>=d0,time<d1)
        mtime=d0+(d1-d0)/2 # center time of the 24h window
        # a gap in the record can leave a window without samples;
        # the minimum of an empty numpy array would raise an error
        if ifind.any():
            mdata=data[ifind].min()
        else:
            mdata=np.nan
        time_ret.append(mtime)
        min_ret.append(mdata)
        if test:
            print("date-range used: ")
            print(d0.strftime("%Y-%m-%d %H:%M:%S") + " to " + d1.strftime("%Y-%m-%d %H:%M:%S"))
            print(mtime.strftime("%Y-%m-%d"),np.round(mdata,4))
        # increment start and end dates by +24h
        d0=d1
        d1=d1+one_day
    return np.array(time_ret), np.array(min_ret)
    
    

In [None]:
# get from each day the max value
def get_daily_max(time,data,startindex=0,test=False):
    """calculates for all days the max value

    The hourly data are analyzed in 24 hour intervals
    and the max data values are calculated using 24-hour time windows.
    The first window starts at time[startindex]; every following window
    starts 24 hours later, until the last time value is passed.
    Use the optional parameter startindex to adjust the 24-hour intervals
    to the preferred day ranges. Samples before the start position are
    not used (the time values are expected in increasing order).

    Input parameter:
        time (1-d numpy array): array with datetime values
        data (1-d numpy array): array with corresponding data values
        startindex (integer): optional parameter to change the start position in the arrays
        test (boolean): if True then this function prints some diagnostics to the screen
                        (default: it is set to False)
    Returns:
        day, max:  numpy arrays (new size) with the center times of the
                   24h windows and the max values. Windows without any
                   data sample get the value NaN. Empty input returns
                   two empty arrays.
    """
    # nothing to do for empty input (time[startindex] would fail)
    if len(time)==0:
        return np.array([]), np.array([])
    one_day=dt.timedelta(days=1)
    tlast=time[-1]
    # start and end of the first 24h window
    d0=time[startindex]
    d1=d0+one_day
    time_ret=[]
    max_ret=[]
    while d0<=tlast:
        ifind=np.logical_and(time>=d0,time<d1)
        mtime=d0+(d1-d0)/2 # center time of the 24h window
        # a gap in the record can leave a window without samples;
        # the maximum of an empty numpy array would raise an error
        if ifind.any():
            mdata=data[ifind].max()
        else:
            mdata=np.nan
        time_ret.append(mtime)
        max_ret.append(mdata)
        if test:
            print("date-range used: ")
            print(d0.strftime("%Y-%m-%d %H:%M:%S") + " to " + d1.strftime("%Y-%m-%d %H:%M:%S"))
            print(mtime.strftime("%Y-%m-%d"),np.round(mdata,4))
        # increment start and end dates by +24h
        d0=d1
        d1=d1+one_day
    return np.array(time_ret), np.array(max_ret)
    
    

In [None]:
# daily maximum of the hourly tmax values (x: center times of the 24h
# windows, y: max values); test=True prints the date range and the result
# of every window to the screen
x,y= get_daily_max(time,tmax, test=True)

In [None]:
# Bokeh plot: hourly tmax together with the daily values x, y
from bokeh.plotting import figure, output_file, output_notebook, show
# output to static HTML file
#output_file("lines.html")
output_notebook()
# create a new plot with a title and axis labels
p = figure(title="Mesonet station example plot in Bokeh ", 
           x_axis_label='time', x_axis_type="datetime", 
           y_axis_label='temperature [deg C]', )

# add a line renderer with legend and line thickness
p.line(time, tmax, legend_label="tmax", line_width=2,color='orange')
# x, y were calculated with get_daily_max, so the markers show the
# daily max (not the daily mean) of tmax
p.scatter(x, y, marker='o',legend_label="daily max of tmax", line_width=2,color='cyan')

# show the results
show(p)

In [None]:
# calculate mean value for tavg, min values with tmin, and max values with tmax
# we only need the new datetime data from one of the three function calls:
# all three calls receive the same time array and calculate their window
# center times from it in the same way.
# variable dtime is therefore used three times here (it is simply overwritten)
dtime, dmean= get_daily_mean(time,tavg)
dtime, dmin = get_daily_min(time,tmin)
dtime, dmax = get_daily_max(time,tmax)


In [None]:
# Bokeh plot of the daily mean, min, and max temperature
from bokeh.plotting import figure, output_file, output_notebook, show

# draw the plot inside the notebook
# (alternative: output_file("lines.html") writes a static HTML file)
output_notebook()

# new plot with a title and axis labels; the x-axis shows dates
p = figure(
    x_axis_type="datetime",
    title="Mesonet station example plot in Bokeh ",
    x_axis_label='time',
    y_axis_label='temperature [deg C]',
)

# thick black line for the daily mean, thin colored lines for max and min
p.line(x=dtime, y=dmean, color='black', line_width=2, legend_label="daily mean temp")
p.line(x=dtime, y=dmax, color='orange', line_width=1)
p.line(x=dtime, y=dmin, color='cyan', line_width=1)
# markers on top of the thin lines carry the legend entries for max and min
p.scatter(x=dtime, y=dmax, marker='+', color='orange', line_width=2,
          legend_label="daily max of tmax")
p.scatter(x=dtime, y=dmin, marker='o', color='cyan', line_width=2,
          legend_label="daily min of tmin")

# show the results
show(p)

In [None]:
# check the simple daily mean calculation from (tmin+tmax)/2

In [None]:
# simple estimate of the daily mean temperature:
# the average of the daily min and the daily max value
test=(dmin+dmax)/2


In [None]:



# difference between the daily mean based on the hourly data
# and the simple estimate (tmin+tmax)/2
bias=dmean-(dmax+dmin)/2

# scatter plot: both daily mean estimates against each other
plt.figure(figsize=[6,6])
plt.xlabel('1h based daily mean [deg C]')
plt.ylabel('(tmin+tmax) /2 [deg C]' )
plt.scatter(
    x=dmean,
    y=(dmax+dmin)/2,
    s=80,
    marker='+',
    color='orange',
    label="daily mean temperature",
)
# gray 1:1 reference line; points on this line have identical estimates
plt.plot([-20,10],[-20,10],color='gray',linewidth=1)
plt.legend()
plt.show()

# average difference over all days
print("difference:", np.mean(bias))

In [None]:
# Saving data stored in numpy arrays to a local file.
#
# Several options exist, including basic Python file operations,
# see options here
# https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781783553358/5/ch05lvl1sec50/writing-csv-files-with-numpy-and-pandas
output_file="test.csv"
nrow=np.size(dmean)
# The table gets one row per day and four columns:
# day index, daily mean, daily min, and daily max.
# Date and time cause trouble when we want to put them into a 2-d array,
# therefore a simple day index (0, 1, 2, ...) is stored in the first column.
matrix=np.column_stack((np.arange(0,nrow,1),dmean,dmin,dmax)).astype(float)

# write the table as text with 4 decimal places and a header line
np.savetxt(output_file,matrix,delimiter=',',fmt='%.4f',header='time, mean, min, max')


In [None]:
# The same export with a pandas data frame:
# here the datetime values can be stored together with the numbers.
# Note: the variable name df is re-used for the new (small) table.
output_file="test2.csv"
df=pd.DataFrame({
    'time': dtime,
    'daily mean': dmean,
    'daily min': dmin,
    'daily max': dmax,
})

# write the table (including the row index) to the CSV file
df.to_csv(output_file)

