# Data formatting

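In this notebook the data is formatted to fit the different models and variable settings used: the predictors are optionally extended with lagged copies, split into train/test folds together with the target (`q`), standardized, and saved to disk once per fold.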

In [1]:
#set the parameters
#TIME_STEPS: number of consecutive time steps kept per predictor (the current value plus TIME_STEPS - 1 lags)
TIME_STEPS = 60   #positive integer, set to 1 for no lag
#TEST_SPLIT: fraction of the usable observations that forms one test window
TEST_SPLIT = 0.2
#LOCATION: selects the input files (pred_<LOCATION>.csv, q_<LOCATION>.csv) and the output subfolder
LOCATION = 'lobith'

#import libraries
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
from sklearn.preprocessing import StandardScaler

#input and output locations (the output folder is specific to LOCATION)
in_folder = '/content/drive/MyDrive/ADS/Final Thesis Project/data/'
out_folder = f'/content/drive/MyDrive/ADS/Final Thesis Project/temp/formatted_data/{LOCATION}/'

#load the predictors first and then the target for the chosen location
pred, q = (pd.read_csv(in_folder + file_name)
           for file_name in (f'pred_{LOCATION}.csv', f'q_{LOCATION}.csv'))

#settings derived from the parameters above
is_lag = TIME_STEPS > 1                  #lagged predictors are only built for more than one time step
n_obs = len(pred) - (TIME_STEPS - 1)     #rows that remain once the first TIME_STEPS - 1 rows are dropped
test_window = int(n_obs*TEST_SPLIT)      #number of rows in one test window

#define function to find the year day
def date_to_yday(date):
  """Return the day of the year (1-366) for a date string.

  date: string in 'YYYY-MM-DD' format; any other format raises ValueError.
  """
  parsed = datetime.strptime(date, '%Y-%m-%d')
  #%j is the zero-padded day of the year, so converting it to int gives 1..366
  return int(parsed.strftime('%j'))

#add day of the year in predictors
pred['year_day'] = pred['datetime'].apply(date_to_yday)

#remove the first column (date)
#NOTE(review): this drops by position, so it assumes 'datetime' is the first column of the csv
pred = pred.iloc[:,1:]

if is_lag:
  #add the lagged variables to the dataframe.
  #Every variable ends up occupying TIME_STEPS consecutive columns:
  #  var, var_lag_1, var_lag_2, ..., var_lag_{TIME_STEPS - 1}
  #The columns are collected in a list and concatenated once; inserting them
  #one at a time fragments the dataframe and gets slow when there are many lags.
  lagged_columns = []
  for var in pred.columns:
    lagged_columns.append(pred[var])
    for lag in range(1, TIME_STEPS):
      lagged_columns.append(pred[var].shift(lag).rename(f'{var}_lag_{lag}'))
  pred = pd.concat(lagged_columns, axis=1)

  #remove the first TIME_STEPS - 1 rows since they will contain NA values
  #(q is trimmed in the same way so that its rows stay aligned with pred)
  pred = pred.iloc[TIME_STEPS - 1:,:].reset_index(drop=True)
  q = q.iloc[TIME_STEPS - 1:,:].reset_index(drop=True)

#loop for every different testing sample
#NOTE(review): the test windows start at 0, test_window, 2*test_window, ... below n_obs - 1,
#so when n_obs is not a multiple of test_window the last fold gets a shorter test set - confirm this is intended
for i,test_start in enumerate(range(0,n_obs - 1,test_window), start = 1):
  test_end = test_start + test_window

  #split train and test for predictors vars: the test window is cut out and the
  #rows before and after it form the training set.
  #Float copies are requested explicitly because the arrays are scaled in place below:
  #they must not write back into pred and must not truncate values in an integer array.
  X_train = pd.concat([pred.iloc[0:test_start, :],
                       pred.iloc[test_end:, :]]
                      ).to_numpy(dtype=float, copy=True)
  X_test = pred.iloc[test_start:test_end, :].to_numpy(dtype=float, copy=True)

  #split train and test for predicted var (last column of q)
  y_train = pd.concat([q.iloc[0:test_start, -1],
                       q.iloc[test_end:, -1]]
                      ).to_numpy()
  y_test = q.iloc[test_start:test_end, -1].to_numpy()

  #extract the observations (second column of q) for the test window
  obs = q.iloc[test_start:test_end, 1].to_numpy()

  #normalize the input.
  #Columns step, step + TIME_STEPS, step + 2*TIME_STEPS, ... hold the same lag of every variable.
  #The scaler is fitted on the training rows of the columns at offset 0 only and the same
  #statistics are then applied to every offset, for both the train and the test rows.
  n_cols = X_train.shape[1]
  scaler = StandardScaler()
  scaler.fit(X_train[:, np.arange(0, n_cols, TIME_STEPS)])
  for step in range(TIME_STEPS):
    step_cols = np.arange(step, n_cols, TIME_STEPS)
    X_train[:, step_cols] = scaler.transform(X_train[:, step_cols])
    X_test[:, step_cols] = scaler.transform(X_test[:, step_cols])

  #get_params() only returns the constructor settings (copy, with_mean, with_std), so the
  #fitted statistics are stored as well; without them the scaling cannot be reproduced or inverted
  scaler_params = {**scaler.get_params(),
                   'mean_': scaler.mean_,
                   'scale_': scaler.scale_}
  with open(out_folder + 'lagged_'*is_lag + f'scaler_params_{i}.pkl', 'wb') as f:
    pickle.dump(scaler_params, f)

  #save all the data
  np.savez(out_folder + 'no_'*(not is_lag) + f'lag_{i}',
           X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test, obs = obs)


  if self.run_code(code, result):


Subset mode: to use only a fixed subset of the predictor variables, use the code below.

In [None]:
#subset only the defined variables
#NOTE(review): pred_basel is not defined anywhere in the visible notebook (the dataframe
#read in the cell above is called pred) - confirm which dataframe this cell should subset
#NOTE(review): the name vars shadows the Python built-in vars()
vars = ['surfaceWaterStorage', 'snowCoverSWE', 'storUppTotal', 'storGroundwater',
        'snowFreeWater', 'gwRecharge', 'baseflow']
pred_basel = pred_basel[vars]