In [1]:
import pandas as pd
import datetime

## Output locations for the pickled tables
## The combined feature table comes first, then one (all_data, short) pair per well
X_pickle = '../data/pickled_data/all_feature_data.pkl'
X_AEK201_pickle, X_AEK201_short_pickle = (
    '../data/pickled_data/AEK201_all_data.pkl',
    '../data/pickled_data/AEK201_short.pkl',
)
X_AFL259_pickle, X_AFL259_short_pickle = (
    '../data/pickled_data/AFL259_all_data.pkl',
    '../data/pickled_data/AFL259_short.pkl',
)
X_APK309_pickle, X_APK309_short_pickle = (
    '../data/pickled_data/APK309_all_data.pkl',
    '../data/pickled_data/APK309_short.pkl',
)
X_APK310_pickle, X_APK310_short_pickle = (
    '../data/pickled_data/APK310_all_data.pkl',
    '../data/pickled_data/APK310_short.pkl',
)


## Input locations for the raw data
## Wells (one CSV per well, except AFL259 which is split across two CSVs)
## -----
AEK201_path = '../data/raw_data/EIM-data-AEK201/EIMTimeSeriesResults_2023Oct22_222975.csv'
AFL259_PT1_path, AFL259_PT2_path = (
    '../data/raw_data/EIM-data-AFL259/GroundwaterTimeSeriesResults_2023Nov12_218669_PT1.csv',
    '../data/raw_data/EIM-data-AFL259/GroundwaterTimeSeriesResults_2023Nov12_218669_PT2.csv',
)
APK309_path = '../data/raw_data/EIM-data-APK309/GroundwaterTimeSeriesResults_2023Oct19_96705.csv'
APK310_path = '../data/raw_data/EIM-data-APK310/GroundwaterTimeSeriesResults_2023Nov02_190488.csv'
## Weather
## -------
## openweather export
OPENWX_path = '../data/raw_data/open-weather-spokane.csv'
## noaa export (two files)
NOAA_pt1_path, NOAA_pt2_path = (
    '../data/raw_data/noaa-data.csv',
    '../data/raw_data/noaa-data-2.csv',
)

## Surface water
## -------------
RIVER_path = '../data/raw_data/USGS-Surface-Water-Site-12422500.tsv'

## Load the raw data into dataframes
## Wells
## -----
## low_memory=False makes pandas read each file in one pass rather than in
## chunks, so column dtypes are inferred from the whole column.
raw_AEK201 = pd.read_csv(AEK201_path, low_memory=False)
## AFL259 is delivered as two CSVs; stack them into a single frame
raw_AFL259_PT1 = pd.read_csv(AFL259_PT1_path, low_memory=False)
raw_AFL259_PT2 = pd.read_csv(AFL259_PT2_path, low_memory=False)
raw_AFL259 = pd.concat([raw_AFL259_PT1, raw_AFL259_PT2])
## (APK309 used to be read twice in a row; a single read is sufficient)
raw_APK309 = pd.read_csv(APK309_path, low_memory=False)
raw_APK310 = pd.read_csv(APK310_path, low_memory=False)

## Weather
## -------
## openweather
raw_OPENWX = pd.read_csv(OPENWX_path)
## noaa ('DATE' is parsed to datetime while reading)
raw_NOAA_pt1 = pd.read_csv(NOAA_pt1_path, parse_dates=['DATE'], low_memory=False)
raw_NOAA_pt2 = pd.read_csv(NOAA_pt2_path, parse_dates=['DATE'], low_memory=False)

## Surface Water
## -------------
## Tab-separated file; lines starting with '#' are skipped as comments
raw_RIVER = pd.read_csv(RIVER_path, low_memory=False, delimiter='\t', comment='#')


def add_dty_signal(data, date_col):
    ''' Add integer 'year', 'month' and 'day' columns derived from a datetime column.

        The input frame is left untouched; the columns are added to a copy.
        (Previously the columns were also written into the caller's frame.)

        Parameters
        ----------
        data : pandas.DataFrame
            Frame containing `date_col`.
        date_col : str
            Name of a datetime-typed column (it must support the `.dt` accessor).

        Returns
        -------
        pandas.DataFrame
            A copy of `data` with 'year', 'month' and 'day' columns added
            (or overwritten if they already exist).
    '''
    ## Copy first so the caller's dataframe is never mutated
    out = data.copy()
    stamps = out[date_col].dt
    out['year'] = stamps.year
    out['month'] = stamps.month
    out['day'] = stamps.day
    return out

def preprocess_well_data(data):
    ''' Gather the 'meas_time' and 'well_depth' series from the raw well data.

        Keeps only the rows whose 'Result_Parameter_Name' is the water-level
        parameter, parses their collection timestamps, renames the columns and
        sorts by time. The input frame is not modified (previously its
        'Field_Collection_Date_Time' column was overwritten in place).

        Parameters
        ----------
        data : pandas.DataFrame
            Raw well export with at least the columns
            'Field_Collection_Date_Time', 'Result_Parameter_Name' and
            'Result_Value'.

        Returns
        -------
        pandas.DataFrame
            Columns 'meas_time' (datetime) and 'well_depth', sorted by
            'meas_time' with a fresh 0..n-1 index.
    '''
    ## Gather the rows with water level data
    water_level_name = 'Water level in well (depth below measuring point)'
    is_water_level = data['Result_Parameter_Name'] == water_level_name
    levels = data.loc[is_water_level, ['Field_Collection_Date_Time', 'Result_Value']].copy()

    ## Format `Field_Collection_Date_Time` as datetime.
    ## Only the retained rows are parsed; format='mixed' infers the format
    ## separately for each value.
    levels['Field_Collection_Date_Time'] = pd.to_datetime(
        levels['Field_Collection_Date_Time'], format='mixed')

    ## Rename the columns
    short_names = {'Field_Collection_Date_Time': 'meas_time', 'Result_Value': 'well_depth'}
    levels = levels.rename(columns=short_names)

    ## Sort by measurement time and reset the index
    return levels.sort_values('meas_time').reset_index(drop=True)


def compress_well_data(data):
    ''' Collapse the well readings to one row per calendar day.

        Every reading is tagged with its year/month/day, the mean
        'well_depth' of each day is computed, and only the first row of each
        day is kept.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame with a datetime 'meas_time' column and a 'well_depth' column.

        Returns
        -------
        pandas.DataFrame
            Columns 'date' and 'avg_well_depth', one row per day, indexed
            0..n-1. 'date' is the 'meas_time' of the first row kept for that
            day, so it still carries that row's time of day.
    '''
    day_keys = ['year', 'month', 'day']

    ## Tag each reading with the calendar day it belongs to
    readings = add_dty_signal(data, 'meas_time')

    ## Broadcast the per-day mean onto every row, then keep one row per day
    daily_mean = readings.groupby(day_keys)['well_depth'].transform('mean')
    readings['avg_well_depth'] = daily_mean
    one_per_day = readings.drop_duplicates(day_keys).copy()

    one_per_day['date'] = one_per_day['meas_time']

    result = one_per_day[['date', 'avg_well_depth']]
    return result.reset_index(drop=True).copy()

def preprocess_openwx_data(data):
    ''' Clean the raw openweather export.

        Builds a US/Pacific-localized 'date' column from 'dt_iso', keeps the
        weather columns of interest, sorts by time and replaces missing
        measurements with zero. The input frame is not modified (previously
        its 'dt_iso' column was overwritten in place).

        Parameters
        ----------
        data : pandas.DataFrame
            Raw export with a string column 'dt_iso' and the columns 'temp',
            'pressure', 'humidity', 'wind_speed' and 'wind_gust'.

        Returns
        -------
        pandas.DataFrame
            Columns 'date', 'temp', 'pressure', 'humidity', 'wind_speed' and
            'wind_gust', sorted by 'date' with a fresh 0..n-1 index.
    '''
    weather_cols = ['temp', 'pressure', 'humidity', 'wind_speed', 'wind_gust']

    ## Create localized timestamps.
    ## The last 10 characters of 'dt_iso' are dropped before parsing
    ## (presumably a trailing ' +0000 UTC' marker -- confirm against the raw file).
    utc_times = pd.to_datetime(data['dt_iso'].str[:-10], utc=True)

    ## Restrict to the columns of interest and fill NaN measurements with zeros
    out = data[weather_cols].fillna(0)
    out.insert(0, 'date', utc_times.dt.tz_convert('US/Pacific'))

    out = out.sort_values('date').reset_index(drop=True)

    ## Fix crazy outlier.
    ## The patch targets one specific row of the full export. Guard it so that
    ## a shorter frame does not silently gain an extra, mostly-NaN row
    ## (assigning through .loc to a missing label appends a row).
    outlier_row = 287040
    if outlier_row in out.index:
        out.loc[outlier_row, 'temp'] = 10.09

    return out

def compress_openwx_data(data):
    ''' Collapse the openweather rows to one row of summary statistics per day.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame with a datetime 'date' column plus 'temp', 'pressure',
            'humidity', 'wind_speed' and 'wind_gust'.

        Returns
        -------
        pandas.DataFrame
            One row per calendar day (the first row of each day is kept, so
            'date' is that row's timestamp) with the per-day statistics listed
            in `daily_stats` below. The index is not reset.
    '''
    day_keys = ['year', 'month', 'day']

    ## (output column, source column, statistic), in output order
    daily_stats = [
        ('temp_avg', 'temp', 'mean'),
        ('temp_max', 'temp', 'max'),
        ('temp_min', 'temp', 'min'),
        ('hPa_avg', 'pressure', 'mean'),
        ('hum_avg', 'humidity', 'mean'),
        ('hum_max', 'humidity', 'max'),
        ('hum_min', 'humidity', 'min'),
        ('wind_avg', 'wind_speed', 'mean'),
        ('wind_max', 'wind_speed', 'max'),
        ('gust_avg', 'wind_gust', 'mean'),
        ('gust_max', 'wind_gust', 'max'),
    ]

    frame = add_dty_signal(data, 'date')

    ## Broadcast each per-day statistic onto every row of that day
    for out_col, src_col, stat in daily_stats:
        frame[out_col] = frame.groupby(day_keys)[src_col].transform(stat)

    ## Keep the first row of each day and only the summary columns
    keep = ['date'] + [out_col for out_col, _, _ in daily_stats]
    first_of_day = frame.drop_duplicates(day_keys)
    return first_of_day[keep].copy()

def process_noaa_data(data):
    ''' Extract the precipitation series of station 'USW00024157'.

        Parameters
        ----------
        data : pandas.DataFrame
            Raw NOAA frame with 'STATION', 'DATE' and 'PRCP' columns.

        Returns
        -------
        pandas.DataFrame
            Columns 'date', 'prcp' and 'prcp_lag_45D' for that station only.

        NOTE(review): the lag is a shift by 45 *rows*; it equals 45 days only
        if the station has exactly one row per consecutive day -- confirm.
    '''
    station_rows = data[data['STATION'] == 'USW00024157']
    precip = station_rows[['DATE', 'PRCP']].rename(columns={'DATE': 'date', 'PRCP': 'prcp'})
    precip['prcp_lag_45D'] = precip['prcp'].shift(45)
    return precip

def process_river_data(data):
    ''' Clean the raw river table into 'date', 'gage_ht' and 'discharge_cfs'.

        Parameters
        ----------
        data : pandas.DataFrame
            Raw frame with the columns 'datetime', '149640_00060_00003' and
            '149641_00065_00003', whose row labelled 0 is not a data row.

        Returns
        -------
        pandas.DataFrame
            Columns 'date' (datetime), 'gage_ht' (float, forward-filled) and
            'discharge_cfs' (float), sorted by 'date' with a fresh 0..n-1
            index. A leading run of missing 'gage_ht' values stays NaN because
            there is nothing earlier to fill from.
    '''
    ## Drop meaningless top row (the row labelled 0; presumably the
    ## column-format row that follows the header in the TSV -- confirm)
    data = data.drop(0, axis=0).copy()

    ## Grab the columns we want
    data = data[['datetime','149640_00060_00003','149641_00065_00003']].copy()

    ## Rename the columns to something more meaningful
    headers = {'datetime':'datetime_recorded', '149640_00060_00003':'discharge_cfs', '149641_00065_00003':'gage_ht'}
    data = data.rename(columns=headers)

    ## Make the column datatypes useful
    data['date'] = pd.to_datetime(data['datetime_recorded'])
    data['discharge_cfs'] = data['discharge_cfs'].astype(float)
    data['gage_ht'] = data['gage_ht'].astype(float)

    ## Sort the data by the timestamp
    data = data.sort_values('date').reset_index(drop=True).copy()

    ## Impute missing gage_ht values with the most recent earlier reading.
    ## Series.ffill() replaces the deprecated fillna(method='ffill').
    data['gage_ht'] = data['gage_ht'].ffill()

    return data[['date','gage_ht','discharge_cfs']].copy()

## Process all of the well data
## (raw rows -> water-level readings -> one row per day -> year/month/day merge keys)
AEK201_data = add_dty_signal(compress_well_data(preprocess_well_data(raw_AEK201)),'date')
AFL259_data = add_dty_signal(compress_well_data(preprocess_well_data(raw_AFL259)),'date')
APK309_data = add_dty_signal(compress_well_data(preprocess_well_data(raw_APK309)),'date')
APK310_data = add_dty_signal(compress_well_data(preprocess_well_data(raw_APK310)),'date')

## Process all of the feature data
OPENWX_data = add_dty_signal(compress_openwx_data(preprocess_openwx_data(raw_OPENWX)),'date')
NOAA_pt1 = process_noaa_data(raw_NOAA_pt1)
NOAA_pt2 = process_noaa_data(raw_NOAA_pt2)

## process_noaa_data computes the 45-row precipitation lag separately for each
## file, which leaves the first 45 rows of every file without a lag value even
## when the other file holds the preceding days. Recompute the lag over the
## combined, date-ordered record so only the very start of the record is NaN.
NOAA_data = pd.concat([NOAA_pt1, NOAA_pt2]).sort_values('date', kind='stable').reset_index(drop=True)
NOAA_data['prcp_lag_45D'] = NOAA_data['prcp'].shift(45)
NOAA_data = add_dty_signal(NOAA_data,'date')
RIVER_data = add_dty_signal(process_river_data(raw_RIVER),'date')

## Combine all of the feature data
## Outer merges on the calendar-day keys keep every day seen by any source
feature_data = OPENWX_data.merge(NOAA_data, how='outer', on=['year', 'month', 'day'])
feature_data = feature_data.merge(RIVER_data, how='outer', on=['year', 'month', 'day'])
feature_data = feature_data.sort_values(['year', 'month', 'day']).reset_index(drop=True).copy()

## Select the model features (plus the year/month/day merge keys) in a fixed order
X = feature_data[['temp_avg', 'temp_max', 'temp_min', 'hPa_avg', 'hum_avg',
                  'hum_max', 'hum_min', 'wind_avg', 'wind_max', 'gust_avg', 'gust_max',
                  'year', 'month', 'day', 'prcp', 'prcp_lag_45D',
                  'gage_ht', 'discharge_cfs']].copy()

## Make the dates nice
## Build each row's date from that row's own year/month/day keys. Assigning a
## hard-coded 1900-10-21..2023-10-21 daily range by position raised on any
## length mismatch and would mislabel rows if a day were missing or duplicated.
X['date'] = pd.to_datetime(X[['year', 'month', 'day']])

def _merge_well_target(features, well_daily, first_day, last_day):
    ''' Join one well's daily depths onto the feature table.

        Returns a pair: the full outer-merged table, and the subset of it
        whose 'date' lies in [first_day, last_day] with every row containing
        a NaN dropped.
    '''
    merge_keys = ['year', 'month', 'day']
    joined = features.merge(well_daily, how='outer', on=merge_keys)

    ## Both inputs carry a 'date' column, so the merge suffixes them;
    ## keep the feature-side dates and discard the keys and suffixed columns
    joined['date'] = joined['date_x']
    joined = joined.drop(['date_x', 'date_y'] + merge_keys, axis=1).copy()

    in_window = (joined['date'] >= first_day) & (joined['date'] <= last_day)
    windowed = joined.loc[in_window].copy()
    return joined, windowed.dropna().copy()


## Get the AEK201 data together
min_date, max_date = datetime.datetime(2005,8,22), datetime.datetime(2018,6,6)
X_AEK201, X_AEK201_short = _merge_well_target(X, AEK201_data, min_date, max_date)

## Get the AFL259 data together
min_date, max_date = datetime.datetime(2005,8,21), datetime.datetime(2017,9,28)
X_AFL259, X_AFL259_short = _merge_well_target(X, AFL259_data, min_date, max_date)

## Get the APK309 data together
min_date, max_date = datetime.datetime(2006,6,21), datetime.datetime(2017,9,28)
X_APK309, X_APK309_short = _merge_well_target(X, APK309_data, min_date, max_date)

## Get the APK310 data together
min_date, max_date = datetime.datetime(2006,6,21), datetime.datetime(2017,5,4)
X_APK310, X_APK310_short = _merge_well_target(X, APK310_data, min_date, max_date)

## The merge keys are no longer needed once the per-well tables exist
X = X.drop(columns=['year', 'month', 'day']).copy()

## Pickles!!
## Write every table to its destination, in the same order as before
for frame, destination in (
    (X, X_pickle),
    (X_AEK201, X_AEK201_pickle),
    (X_AEK201_short, X_AEK201_short_pickle),
    (X_AFL259, X_AFL259_pickle),
    (X_AFL259_short, X_AFL259_short_pickle),
    (X_APK309, X_APK309_pickle),
    (X_APK309_short, X_APK309_short_pickle),
    (X_APK310, X_APK310_pickle),
    (X_APK310_short, X_APK310_short_pickle),
):
    frame.to_pickle(destination)

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw_data/EIM-data-AFL259/GroundwaterTimeSeriesResults_2023Nov12_218669_PT1.csv'