In [1]:
from datetime import datetime, timezone
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import pytz

# Target Variable
## USGS Site AEK201
Site [AEK201](https://cida.usgs.gov/ngwmn/provider/WAECY/site/100018881/) is a well monitored by the Washington State Department of Ecology, which makes data available via their [Environmental Information Management System](https://apps.ecology.wa.gov/eim/search/Eim/EIMSearchResults.aspx?ResultType=TimeSeriesLocationList&EIMSearchResultsFirstPageVisit=false&LocationSystemId=100018881&LocationUserIds=AEK201&LocationUserIdSearchType=Equals&LocationUserIDAliasSearchFlag=True).

For our project, our goal is to forecast:
- Water Levels, in feet below land surface

### Importing and preparing the data
From the raw data, we focus our attention on the following columns:
- `Field_Collection_Date_Time` - The date and time at which the Water Level measurement was recorded
    - Reported in either PST (GMT-8) or PDT (GMT-7)
- `Result_Value` - The Water Level (when `Result_Parameter_Name=='Water level in well (depth below measuring point)'`)
    -  Measured in feet below the land surface

Measurements are reported hourly.

To ensure that our target data lines up with our feature data, we will group the measurements by day, and record their average as the `avg_well_depth`.

To do this we:
- Load the raw data
- Restrict the data to rows where `Result_Parameter_Name=='Water level in well (depth below measuring point)'`
- Restrict our attention to the `Field_Collection_Date_Time`, `Result_Value`, and `Time_Zone` columns
- Construct a localized timestamp for each measurement and store it in a `datetime_recorded` column
- Extract the `date` as a column from the timestamp
- Group measurements recorded on the same date, and compute their average as the daily `avg_well_depth`

### The result

The result is a dataframe called `level_data` with the following columns:
- `date`
- `avg_well_depth`

which summarizes the average well depth measurement, in feet, for every day that we have data.

In [2]:
## Read the raw EIM time-series export for site AEK201.
## low_memory=False makes pandas parse the file in one pass rather than in chunks.
level_data = pd.read_csv(
    '../data-collection/data/EIM-data-AEK201/EIMTimeSeriesResults_2023Oct22_222975.csv',
    low_memory=False,
)

In [3]:
## Keep only the water-level rows, then give Result_Value a more descriptive name
is_water_level = level_data['Result_Parameter_Name'].eq('Water level in well (depth below measuring point)')
level_data = level_data[is_water_level].rename(columns={'Result_Value': 'well_depth'})

In [4]:
## Restrict our attention to the Field_Collection_Date_Time, well_depth (renamed from Result_Value), and Time_Zone columns.
## Take an explicit copy: later cells assign new columns to this frame, and working on a
## selection of the filtered raw data can trigger pandas' SettingWithCopyWarning.
level_data = level_data[['Field_Collection_Date_Time','well_depth','Time_Zone']].copy()

In [5]:
## Construct a localized timestamp for each measurement and store it in a datetime_recorded column

## Map the Time_Zone labels to fixed-offset tz database names.
## The "Etc/GMT+N" / "Etc/GMT-N" names follow the POSIX sign convention, which is inverted
## relative to the usual notation: 'Etc/GMT+7' is UTC-07:00 (PDT) and 'Etc/GMT+8' is UTC-08:00 (PST).
tz_dict = {'PDT - Pacific Daylight Time (GMT-7)':'Etc/GMT+7', 
           'PST - Pacific Standard Time (GMT-8)':'Etc/GMT+8'}

## An unrecognised label raises a KeyError here instead of passing through silently
level_data['Time_Zone']=level_data['Time_Zone'].apply(lambda x: tz_dict[x])

## The format carries an AM/PM marker (%p). That marker only affects the parsed hour when
## the hour itself is read with %I (12-hour clock); with %H it is ignored.
level_data['Field_Collection_Date_Time'] = pd.to_datetime(
    level_data['Field_Collection_Date_Time'], format = '%m/%d/%Y %I:%M:%S %p', utc=False)

## Attach each row's own fixed offset to its naive timestamp
times = level_data.Field_Collection_Date_Time.values
zones = level_data.Time_Zone.values
localized_times = [pd.Timestamp(time).tz_localize(zone) for time, zone in zip(times, zones)]

level_data['datetime_recorded'] = localized_times

## Sort by the timestamps
level_data = level_data.sort_values('datetime_recorded')
level_data = level_data.reset_index(drop=True)

In [6]:
## Derive the calendar date of each reading from its localized timestamp
level_data['date'] = level_data.datetime_recorded.dt.date

In [7]:
## Group measurements recorded on the same date and broadcast the per-day mean
## back onto every row of that day as avg_well_depth
level_data['avg_well_depth'] = (
    level_data['well_depth']
    .groupby(level_data['date'])
    .transform('mean')
)

In [8]:
## Keep the first row of each date, and only the two columns we want, in the order we want
first_row_of_day = ~level_data.duplicated('date')
level_data = level_data.loc[first_row_of_day, ['date', 'avg_well_depth']]

# Feature Variables

## Surface Water Data from the USGS

USGS Site No: 12422500 [link](https://waterdata.usgs.gov/nwis/inventory?site_no=12422500)

This site reports the following data for the Spokane River in Spokane, WA:
- Discharge, cubic feet per second (Mean)
- Gage height, feet (Mean)

### Importing and prepping
- Load the raw data
- Get the columns we want: `datetime_recorded`, `discharge_cfs`, and `gage_ht`
- Make the datatypes make sense
- Extract the `date` as a column from the timestamp
- Break the data into two sets:
    - Gage height:
        - Restrict to 2005 and beyond
        - Fill in missing values with the last non-missing value before the gap
    - Discharge rate:
        - Keep all the data
     
The result is two dataframes:
- `sw_data_gage_ht` with the following columns:
    - `date`
    - `gage_ht`
- `sw_data_discharge_cfs` with the following columns:
    - `date`
    - `discharge_cfs` 

In [9]:
## Load the raw data: tab-separated, with '#'-prefixed lines skipped as comments
sw_data = pd.read_csv(
    '../data-collection/data/USGS-Surface-Water-Site-12422500.tsv',
    sep='\t',
    comment='#',
    low_memory=False,
)

## The first data row (index label 0) carries no measurements, so discard it
sw_data = sw_data.drop(index=0)

In [10]:
## Raw column name -> meaningful column name
headers = {'datetime':'datetime_recorded', '149640_00060_00003':'discharge_cfs', '149641_00065_00003':'gage_ht'}

## Keep just those columns (in the order listed above) and rename them
sw_data = sw_data[list(headers)].rename(columns=headers)

## Give the columns useful dtypes: floats for the measurements, datetimes for the timestamp
sw_data = sw_data.astype({'discharge_cfs': float, 'gage_ht': float})
sw_data['datetime_recorded'] = pd.to_datetime(sw_data['datetime_recorded'])

## Order chronologically and renumber the rows
sw_data = sw_data.sort_values('datetime_recorded').reset_index(drop=True)

In [11]:
## Derive the calendar date of each record from its timestamp
sw_data['date'] = sw_data['datetime_recorded'].dt.date

In [12]:
## Restrict our attention to 2005 and beyond for the gage_ht
sw_data_gage_ht = sw_data.loc[sw_data.datetime_recorded>=datetime(2005,1,1)][['date','gage_ht']].copy()
## Fill missing gage_ht values with the last value before the gap.
## DataFrame.ffill() is the supported spelling; fillna(method='ffill') is deprecated in recent pandas.
sw_data_gage_ht = sw_data_gage_ht.ffill()

## Keep all of the discharge data
sw_data_discharge_cfs = sw_data[['date','discharge_cfs']].copy()

## Weather Data from Openweather.com

Bulk weather history data is available for purchase [here](https://home.openweathermap.org/marketplace)

For Spokane, WA, the following hourly measurements (starting in 1979) are available:
- Temperature (Fahrenheit)
- Min temperature (Fahrenheit)
- Max temperature (Fahrenheit)
- Feels like (Fahrenheit)
- Pressure (hPa)
- Humidity (%)
- Clouds (%)
- Weather conditions
- Rain (mm/h)
- Snow (mm/h)
- Dew point (Fahrenheit)
- Visibility (metres)
- Wind (speed, direction, gust) (miles/hour, degrees, miles/hour)

Of these, we will be keeping:
- Temperature (Fahrenheit)
    - As the average daily temperature `temp_avg`, max daily temperature `temp_max`, and minimum daily temperature `temp_min`
- Pressure (hPa)
    - As the average daily pressure `hPa_avg` 
- Humidity (%)
    - As the average daily `hum_avg`, max daily `hum_max`, min daily `hum_min`
- Rain (mm/h)
    - As the cumulative daily rain total in mm `rain`, calculated from the hourly rain reported
- Snow (mm/h)
    - As the cumulative daily snow total in mm `snow`, calculated from the hourly snow reported
- Wind Speed (avg, max, min mph) `wind_avg`, `wind_max`, `wind_min`
- Wind Gust (avg, max, min mph) `gust_avg`, `gust_max`, `gust_min`

### Importing and Prepping
- Import raw data
- Create localized timestamps
- Add a date column
- Restrict to the columns of interest
- Fill `NaN` with `0`
- Compute `temp_avg`, `temp_max`, `temp_min`, `hPa_avg`, `hum_avg`, `hum_max`, `hum_min`, `rain`, `snow`, `wind_avg`, `wind_max`, `wind_min`, `gust_avg`, `gust_max`, and `gust_min`

The result is a dataframe called `wx_data` with the following columns:
- `date`
- `temp_avg`
- `temp_max`
- `temp_min`
- `hPa_avg`
- `hum_avg`
- `hum_max`
- `hum_min`
- `rain`
- `snow`
- `wind_avg`
- `wind_max`
- `wind_min`
- `gust_avg`
- `gust_max`
- `gust_min`

In [13]:
## Read the raw hourly weather export
wx_data = pd.read_csv(filepath_or_buffer='../data-collection/data/open-weather-spokane.csv')

In [14]:
## Create localized timestamps
def trunc(isodt):
    """Return `isodt` with its last 10 characters removed.

    A string of 10 characters or fewer yields the empty string.
    """
    suffix_len = 10
    return isodt[:-suffix_len]

## Strip the trailing 10 characters from each dt_iso string, then parse the result as UTC
wx_data['dt_iso'] = pd.to_datetime(wx_data['dt_iso'].apply(trunc), utc=True)

## Express the UTC instants in US/Pacific local time
wx_data['datetime_recorded'] = wx_data['dt_iso'].dt.tz_convert('US/Pacific')

## Order chronologically and renumber the rows
wx_data = wx_data.sort_values('datetime_recorded').reset_index(drop=True)

In [15]:
## Derive the local calendar date of each hourly record
wx_data['date'] = wx_data['datetime_recorded'].dt.date

In [16]:
## Keep only the date and the hourly measurements we aggregate later
columns_of_interest = [
    'date',
    'temp',
    'pressure',
    'humidity',
    'wind_speed',
    'wind_gust',
    'rain_1h',
    'snow_1h',
]
wx_data = wx_data.loc[:, columns_of_interest].copy()

In [17]:
## Replace every missing value with zero
wx_data = wx_data.fillna(value=0)

## Fix outliers: overwrite one temperature reading by hand...
wx_data.loc[287040, 'temp'] = 10.09

## ...and zero out five hourly rain readings (rows addressed by index label)
for outlier_label in (134923, 134924, 375773, 398946, 403544):
    wx_data.loc[outlier_label, 'rain_1h'] = 0

In [18]:
'''
Compute the following:
`temp_avg`, `temp_max`, `temp_min`, 
`hPa_avg`, 
`hum_avg`, `hum_max`, `hum_min`, 
`rain`, 
`snow`, 
`wind_avg`, `wind_max`, `wind_min`, 
`gust_avg`, `gust_max`, `gust_min`
''' 
## Output column -> (hourly source column, per-day statistic).
## Entries are listed in the order in which the columns are added to wx_data.
daily_aggregates = {
    'temp_avg': ('temp', 'mean'),
    'temp_max': ('temp', 'max'),
    'temp_min': ('temp', 'min'),
    'hPa_avg':  ('pressure', 'mean'),
    'hum_avg':  ('humidity', 'mean'),
    'hum_max':  ('humidity', 'max'),
    'hum_min':  ('humidity', 'min'),
    'rain':     ('rain_1h', 'sum'),
    'snow':     ('snow_1h', 'sum'),
    'wind_avg': ('wind_speed', 'mean'),
    'wind_max': ('wind_speed', 'max'),
    'wind_min': ('wind_speed', 'min'),
    'gust_avg': ('wind_gust', 'mean'),
    'gust_max': ('wind_gust', 'max'),
    'gust_min': ('wind_gust', 'min'),
}

## transform() broadcasts each day's statistic back onto every hourly row of that day
for daily_col, (hourly_col, statistic) in daily_aggregates.items():
    wx_data[daily_col] = wx_data.groupby('date')[hourly_col].transform(statistic)

In [19]:
## Collapse to one row per date (the first one) and keep only the daily summary columns
daily_columns = [
    'date',
    'temp_avg', 'temp_max', 'temp_min',
    'hPa_avg',
    'hum_avg', 'hum_max', 'hum_min',
    'rain',
    'snow',
    'wind_avg', 'wind_max', 'wind_min',
    'gust_avg', 'gust_max', 'gust_min',
]
wx_data = wx_data.drop_duplicates(subset='date')[daily_columns].copy()

# Merging and Pickling
After merging, the result is a dataframe called `all_data` with the following columns:
- `date` - The date the measurements were recorded
- `avg_well_depth` - The average of the daily well measurements, in feet from the surface
- `gage_ht` - The gage height of the river, in feet
- `discharge_cfs` - The discharge rate of the river in cubic feet per second
- `temp_avg` - The average daily temperature in Fahrenheit
- `temp_max` - The daily maximum temperature in Fahrenheit
- `temp_min` - The daily minimum temperature in Fahrenheit
- `hPa_avg` - The daily average pressure in hectopascals
- `hum_avg` - The average daily humidity in percent
- `hum_max` - The daily maximum humidity in percent
- `hum_min` - The daily minimum humidity in percent
- `rain` - The daily rain total in millimeters
- `snow` - The daily snow total in millimeters
- `wind_avg` - The average daily wind speed in miles per hour
- `wind_max` - The daily maximum (hourly) wind speed in miles per hour
- `wind_min` - The daily minimum (hourly) wind speed in miles per hour
- `gust_avg` - The average daily wind gust speed in miles per hour
- `gust_max` - The daily maximum wind gust speed in miles per hour
- `gust_min` - The daily minimum wind gust speed in miles per hour

In [20]:
## Persist each prepared dataframe to its own pickle file
pickle_targets = {
    '../data-collection/data/level_data.pkl': level_data,
    '../data-collection/data/sw_data_gage_ht.pkl': sw_data_gage_ht,
    '../data-collection/data/sw_data_discharge_cfs.pkl': sw_data_discharge_cfs,
    '../data-collection/data/wx_data.pkl': wx_data,
}
for pkl_path, frame in pickle_targets.items():
    with open(pkl_path, 'wb') as f:
        pickle.dump(frame, f)

In [21]:
## Outer-join the target and feature frames on date, so a date present in any one of them is kept
all_data = level_data.merge(sw_data_gage_ht, how='outer', on='date')
all_data = all_data.merge(sw_data_discharge_cfs, how='outer', on='date')
all_data = all_data.merge(wx_data, how='outer', on='date')

## Order chronologically and renumber the rows of the merged frame.
## The index being reset must be all_data's (the frame that was just sorted), not wx_data's.
all_data = all_data.sort_values('date')
all_data = all_data.reset_index(drop=True)

In [22]:
## Persist the merged dataframe
with open('../data-collection/data/all_data.pkl', mode='wb') as pkl_file:
    pickle.dump(all_data, pkl_file)