In [1]:
import sys
sys.path.insert(0, '../')
from gw_tools import data_prep
from gw_tools.gw_cnn import gw_cnn
from gw_tools.gw_LSTM import gw_LSTM
from gw_tools import model_params

import numpy as np
import pandas as pd
import pickle

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
prediction_path = '../data/prediction_pickles/'
score_path = '../data/prediction_pickles/'
future_pred_path = '../data/prediction_pickles/'
future_score_path = '../data/prediction_pickles/'


score_summary = pd.DataFrame({'Model':['Baseline', 'Linear Regression', 'CNN', 'LSTM'],'RMSE':[0,0,0,0], 'MAE':[0,0,0,0]})
score_summary_future = pd.DataFrame({'Model':['Baseline', 'Linear Regression', 'CNN', 'LSTM'],'RMSE':[0,0,0,0], 'MAE':[0,0,0,0]})

In [3]:
## Define a scaler
scaler_cnn =  StandardScaler(copy=True)
scaler_lstm =  StandardScaler(copy=True)


## note: parameters are stored in model_params, imported above
for well, params, LSTMparams in zip(model_params.wells, 
                                    model_params.CNN_well_params, 
                                    model_params.LSTM_well_params):
    ## Prepare the data for training
    df = data_prep.load_data(well)
    well_dates = df['date'].copy()
    df = data_prep.select_features(df)
    df = data_prep.add_toy_signal(df)
    
    ## LSTM specific prep (requires a warmup, target as a feature)
    # we also need the window size later for the future predictions
    window = LSTMparams['model__WINDOW_SIZE']
    X_tr_L, X_te_L, well_tr_mean, well_tr_std = data_prep.LSTM_data_prep(df,window)
    # Set the mean and standard deviation of the training set as parameters
    LSTMparams['model__tmean'] = well_tr_mean
    LSTMparams['model__tsd'] = well_tr_std
    
    
    ## CNN data
    X_train, X_holdout, y_train, y_holdout, dt_train, dt_holdout = data_prep.prep_data_for_training(df)
    
    
    ## Get average for baseline prediction
    train_mean = y_train.mean()
    
    ## Train the linear regression model
    model_lr = LinearRegression(copy_X=True)
    model_lr.fit(X_train, y_train)
    
    ## Train the CNN
    model_cnn = gw_cnn()
    pipe_cnn = Pipeline([('scaler', scaler_cnn), ('model', model_cnn)])
    pipe_cnn.set_params(**params)
    pipe_cnn.fit(X_train, y_train)

    ## Train the LSTM
    model_LSTM = gw_LSTM()
    pipe_LSTM = Pipeline([('scaler', scaler_lstm), ('model', model_LSTM)])
    pipe_LSTM.set_params(**LSTMparams)
    pipe_LSTM.fit(X_tr_L, y_train)

    
    ## Prepare future data for predictions and scoring
    features = data_prep.load_data('FEATS')
    features = data_prep.select_features(features, no_target=True)
    features = data_prep.add_toy_signal(features)
    future_dates = features.loc[features.date > data_prep.get_end_date(well)]['date'].copy()
    # CNN specific future data
    X_future = features.loc[features.date > data_prep.get_end_date(well)].drop('date', axis=1).copy()
    all_dates = pd.concat([well_dates, future_dates])
    
    ## Keep track of the length of predictions
    len_pred_test = len(y_holdout)
    len_pred_future = all_dates.shape[0]-len_pred_test
    
    ## Make baseline predictions
    pred_avg_train = np.ones(len_pred_test)*train_mean
    pred_avg_future = np.ones(len_pred_future)*train_mean
    pred_avg_full = np.append(pred_avg_train, pred_avg_future)
    
    ## Make linear regression predictions
    pred_lin_train = model_lr.predict(X_train)
    pred_lin_test = model_lr.predict(X_holdout)
    pred_lin_future = model_lr.predict(X_future.values)
    pred_lin_full = np.append(pred_lin_train, 
                              np.append(pred_lin_test, pred_lin_future))
    
    ## Make CNN predictions
    pred_cnn_train = pipe_cnn.predict(X_train)
    pred_cnn_test = pipe_cnn.predict(X_holdout)
    pred_cnn_future = pipe_cnn.predict(X_future.values)
    pred_cnn_full = np.append(pred_cnn_train, 
                              np.append(pred_cnn_test, pred_cnn_future))

    ## Make LSTM prediction
    # the LSTM requires a warmup set, so the first window=WINDOW_SIZE predictions for the training
    # set will just be the actual values
    ytr_warmup = y_train[:window].to_numpy()
    pred_LSTM_train = np.concatenate((ytr_warmup, pipe_LSTM.predict(X_tr_L)))
    pred_LSTM_test = pipe_LSTM.predict(X_te_L)
    # LSTM specific future data (requires warmup set built from the predictions on the test set
    # or from the actual values, tbd)
    # if from actual values, replace pred_LSTM_test with y_holdout in the call of LSTM_future
    X_fut_L = data_prep.LSTM_future(X_future, pred_LSTM_test, X_te_L, window)
    pred_LSTM_future = pipe_LSTM.predict(X_fut_L)
    pred_LSTM_full = np.append(pred_LSTM_train, 
                              np.append(pred_LSTM_test, pred_LSTM_future))
    
    ## Gather the predictions and actual data into a single dataframe
    well_data = data_prep.load_data(well)
    full_predict = well_data[['date', 'avg_well_depth']].merge(all_dates, on='date', how='outer')
    full_predict.rename(columns={'avg_well_depth':'Actual'}, inplace=True)
    full_predict['Baseline'] = pred_avg_full
    full_predict['Linear Reg'] = pred_lin_full
    full_predict['CNN'] = pred_cnn_full
    full_predict['LSTM'] = pred_LSTM_full
    
    ## Isolate the prediction on the holdout set for scoring
    test_predict = full_predict.loc[full_predict['date']<=data_prep.get_end_date(well)][-365:].copy()
    
    ## Compute scores on the holdout set
    RMSE_baseline = np.sqrt(np.mean((test_predict.Actual - test_predict.Baseline)**2))
    MAE_baseline = np.mean(np.abs(test_predict.Actual - test_predict.Baseline))
    
    RMSE_lin_reg = np.sqrt(np.mean((test_predict.Actual - test_predict['Linear Reg'])**2))
    MAE_lin_reg = np.mean(np.abs(test_predict.Actual - test_predict['Linear Reg']))
    
    RMSE_cnn = np.sqrt(np.mean((test_predict.Actual - test_predict.CNN)**2))
    MAE_cnn = np.mean(np.abs(test_predict.Actual - test_predict.CNN))

    RMSE_lstm = np.sqrt(np.mean((test_predict.Actual - test_predict.LSTM)**2))
    MAE_lstm = np.mean(np.abs(test_predict.Actual - test_predict.LSTM))
    
    ## Update the scores in the dataframe
    score_summary.RMSE = [RMSE_baseline, RMSE_lin_reg, RMSE_cnn, RMSE_lstm]
    score_summary.MAE = [MAE_baseline, MAE_lin_reg, MAE_cnn, MAE_lstm]
    
    ## Save the results for this well in a dataframe
    prediction_filename = 'model_predictions_'+well+'.pkl'
    score_filename = 'model_scores_'+well+'.pkl'

    full_predict.to_pickle(prediction_path+prediction_filename)
    score_summary.to_pickle(score_path+score_filename)

2023-11-24 14:26:14.095193: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2023-11-24 14:26:14.095209: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2023-11-24 14:26:14.095213: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2023-11-24 14:26:14.095248: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-24 14:26:14.095261: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/100


2023-11-24 14:26:14.593960: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100




Epoch 1/30


2023-11-24 14:26:28.548440: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-11-24 14:26:28.634410: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-11-24 14:26:28.733915: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

2023-11-24 14:26:53.124201: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [None]:
pred_AEK201 = pd.read_pickle('../data/prediction_pickles/model_predictions_AEK201.pkl')
pred_AFL259 = pd.read_pickle('../data/prediction_pickles/model_predictions_AFL259.pkl')
pred_APK309 = pd.read_pickle('../data/prediction_pickles/model_predictions_APK309.pkl')
pred_APK310 = pd.read_pickle('../data/prediction_pickles/model_predictions_APK310.pkl')

future_AEK201 = pd.read_csv('../data/raw_data/EIM-data-AEK201/EIMDiscreteResults_2023Oct22_39.csv')
future_AEK201 = future_AEK201[['Result_Value','Field_Collection_End_Date']].copy()
future_AEK201['date'] = pd.to_datetime(future_AEK201['Field_Collection_End_Date'])
future_AEK201 = future_AEK201.drop('Field_Collection_End_Date', axis=1).copy()
future_AEK201 = future_AEK201[['date','Result_Value']].sort_values('date').reset_index(drop=True)
future_AEK201 = pred_AEK201[['date', 'Baseline', 'Linear Reg', 'CNN', 'LSTM']].merge(future_AEK201, on='date').copy()
future_AEK201 = future_AEK201.rename(columns={'Result_Value': 'actual_depth'}).copy()
future_AEK201 = future_AEK201.loc[future_AEK201.date > data_prep.get_end_date('AEK201')]

future_AFL259 = pd.read_csv('../data/raw_data/EIM-data-AFL259/GroundwaterLevelsDiscreteResults_2023Nov12_24.csv')
future_AFL259 = future_AFL259[['Water_Level_Value','Field_Collection_Date']].copy()
future_AFL259['date'] = pd.to_datetime(future_AFL259['Field_Collection_Date'])
future_AFL259 = future_AFL259.drop('Field_Collection_Date', axis=1).copy()
future_AFL259 = future_AFL259[['date','Water_Level_Value']].sort_values('date').reset_index(drop=True)
future_AFL259 = pred_AFL259[['date', 'Baseline', 'Linear Reg', 'CNN', 'LSTM']].merge(future_AFL259, on='date').copy()
future_AFL259 = future_AFL259.rename(columns={'Water_Level_Value': 'actual_depth'}).copy()
future_AFL259 = future_AFL259.loc[future_AFL259.date > data_prep.get_end_date('AFL259')]

future_APK309 = pd.read_csv('../data/raw_data/EIM-data-APK309/GroundwaterLevelsDiscreteResults_2023Oct19_19.csv')
future_APK309 = future_APK309[['Water_Level_Value','Field_Collection_Date']].copy()
future_APK309['date'] = pd.to_datetime(future_APK309['Field_Collection_Date'])
future_APK309 = future_APK309.drop('Field_Collection_Date', axis=1).copy()
future_APK309 = future_APK309[['date','Water_Level_Value']].sort_values('date').reset_index(drop=True)
future_APK309 = pred_APK309[['date', 'Baseline', 'Linear Reg', 'CNN', 'LSTM']].merge(future_APK309, on='date').copy()
future_APK309 = future_APK309.rename(columns={'Water_Level_Value': 'actual_depth'}).copy()
future_APK309 = future_APK309.loc[future_APK309.date > data_prep.get_end_date('APK309')]

future_APK310 = pd.read_csv('../data/raw_data/EIM-data-APK310/GroundwaterLevelsDiscreteResults_2023Nov02_19.csv')
future_APK310 = future_APK310[['Water_Level_Value','Field_Collection_Date']].copy()
future_APK310['date'] = pd.to_datetime(future_APK310['Field_Collection_Date'])
future_APK310 = future_APK310.drop('Field_Collection_Date', axis=1).copy()
future_APK310 = future_APK310[['date','Water_Level_Value']].sort_values('date').reset_index(drop=True)
future_APK310 = pred_APK310[['date', 'Baseline', 'Linear Reg', 'CNN', 'LSTM']].merge(future_APK310, on='date').copy()
future_APK310 = future_APK310.rename(columns={'Water_Level_Value': 'actual_depth'}).copy()
future_APK310 = future_APK310.loc[future_APK310.date > data_prep.get_end_date('APK310')]

future_AEK201.to_pickle(future_pred_path+'future_data_compare_AEK201.pkl')
future_AFL259.to_pickle(future_pred_path+'future_data_compare_AFL259.pkl')
future_APK309.to_pickle(future_pred_path+'future_data_compare_APK309.pkl')
future_APK310.to_pickle(future_pred_path+'future_data_compare_APK310.pkl')


future_preds = [future_AEK201, future_AFL259, future_APK309, future_APK310]
for well, pred in zip(wells, future_preds):
    ## Compute scores on the holdout set
    RMSE_baseline = np.sqrt(np.mean((pred.actual_depth - pred.Baseline)**2))
    MAE_baseline = np.mean(np.abs(pred.actual_depth - pred.Baseline))
    
    RMSE_lin_reg = np.sqrt(np.mean((pred.actual_depth - pred.['Linear Reg'])**2))
    MAE_lin_reg = np.mean(np.abs(pred.actual_depth - pred.['Linear Reg']))
    
    RMSE_cnn = np.sqrt(np.mean((pred.actual_depth - pred.CNN)**2))
    MAE_cnn = np.mean(np.abs(pred.actual_depth - pred.CNN))

    RMSE_lstm = np.sqrt(np.mean((pred.actual_depth - pred.LSTM)**2))
    MAE_lstm = np.mean(np.abs(pred.actual_depth - pred.LSTM))
    
    ## Update the scores in the dataframe
    score_summary.RMSE = [RMSE_baseline, RMSE_lin_reg, RMSE_cnn, RMSE_lstm]
    score_summary.MAE = [MAE_baseline, MAE_lin_reg, MAE_cnn, MAE_lstm]
    
    ## Save the results for this well in a dataframe
    score_filename = 'model_future_scores_'+well+'.pkl'
    score_summary.to_pickle(score_path+score_filename)

In [None]:
pd.read_pickle('../data/prediction_pickles/model_scores_AEK201.pkl')

In [None]:
pd.read_pickle('../data/prediction_pickles/model_scores_AFL259.pkl')

In [None]:
pd.read_pickle('../data/prediction_pickles/model_scores_APK309.pkl')

In [None]:
pd.read_pickle('../data/prediction_pickles/model_scores_APK310.pkl')

In [None]:
pd.read_pickle('../data/prediction_pickles/model_future_scores_AEK201.pkl')

In [None]:
pd.read_pickle('../data/prediction_pickles/model_future_scores_AFL259.pkl')

In [None]:
pd.read_pickle('../data/prediction_pickles/model_future_scores_APK309.pkl')

In [None]:
pd.read_pickle('../data/prediction_pickles/model_future_scores_APK310.pkl')