The goal of this notebook is to run experiments that assess the baseline single-task (per-city) performance on post-COVID data. This gives us an indication of what to improve upon.

In [1]:
# STEP 0 set up train test validation split

# STEP 1: understand baseline, single task performance. (probably in another notebook)
# Regression on
# a) -----
# - Weather data
# - timestamp (fix the 24 hour thing) [Might remove, or use as the time of day]
# - mobility (PC's of the mobility data?) [NEW]
# - past values of y [NEW]
# - past seasonal values of y [NEW]
# - jam into ridge regression and neural network (can be done pretty easily today)
# b) ---- harder: past and past seasonal values of e (can be thought about today)
# - (optional) holiday?

# STEP 2: investigate errors (bias variance)

# STEP 3: PL set up
# representations from just old data without mobility, increasing modality for forward transfer
# same is true for L2N

In [5]:
import numpy as np

from sklearn.utils.validation import check_X_y

In [12]:
# The city_name field can take on "Boston", "Chicago", "Dallas", "Houston", "Phil", "SA", or "Seattle".
def load_data(city_name, standardize = False, verbose = False):
    """Load one city's CSV and split it into features, target and metadata.

    The file 'data/City_Level/<city_name>_mobility_all.csv' is parsed with
    np.genfromtxt. Column 0 is used as the target; the remaining columns are
    sliced using the lengths of the module-level lists `weather_col_names`,
    `timestamp_col_names` and `holiday_col_names`, which must be defined
    elsewhere (they are not defined in this function). `scale` must likewise
    be available when `standardize` is true.

    Parameters
    ----------
    city_name : str
        City identifier substituted into the CSV file name.
    standardize : bool
        If true, the stacked feature matrix X is passed through `scale`.
        The per-group arrays stored in the metadata are left unscaled.
    verbose : bool
        If true, print the sample size and the feature counts.

    Returns
    -------
    X : ndarray
        Weather, holiday and mobility columns stacked horizontally.
    y : ndarray
        Column 0 of the file as floats.
    metadata : dict
        City name, the three feature groups, and the integer timestamps.
    """
    table = np.genfromtxt('data/City_Level/%s_mobility_all.csv' % city_name, delimiter=',')

    # Column boundaries: [0] target | weather | timestamps | ... | mobility.
    n_weather = len(weather_col_names)
    n_holiday = len(holiday_col_names)
    n_timestamp = len(timestamp_col_names)
    weather_end = 1 + n_weather
    timestamp_end = weather_end + n_timestamp
    mobility_start = timestamp_end + n_holiday

    X_weather = table[:, 1:weather_end].astype(float)
    # NOTE(review): the holiday column is read at `mobility_start`, which is
    # also the first column of the mobility slice below, so that column ends
    # up in X twice. If the holiday column(s) sit right after the timestamps,
    # this index should probably be `timestamp_end` -- confirm against the
    # CSV layout before changing.
    X_holiday = table[:, mobility_start].astype(float).reshape(-1, 1)
    X_mobility = table[:, mobility_start:].astype(float)
    X = np.hstack((X_weather, X_holiday, X_mobility))

    if standardize:
        X = scale(X)

    timestamps = table[:, weather_end:timestamp_end].astype(int)
    y = table[:, 0].astype(float)

    metadata = {
        "city_name": city_name,
        "X_weather": X_weather,
        "X_holiday": X_holiday,
        "X_mobility": X_mobility,
        "timestamps": timestamps,
    }

    if verbose:
        n_samples, n_features = X.shape
        print(city_name, "Energy Consumption Data:")
        print("Sample size n =", n_samples)
        print("Number of weather features d_w =", X_weather.shape[1])
        print("Number of holiday features d_h =", X_holiday.shape[1])
        print("Number of mobility features d_m =", X_mobility.shape[1])
        print("Total number features d =", n_features)

    return X, y, metadata

In [27]:
# This function adds past output values (lags of y) as extra input columns.
# 'p_hour', 'p_day' and 'p_week' are the numbers of hourly, daily and weekly lags to append.
def append_past_values(X, y, p_hour, p_day, p_week):
    """Append lagged values of ``y`` to ``X`` as additional feature columns.

    Lags are measured in rows: an "hour" is 1 row, a "day" is 24 rows and a
    "week" is 168 rows. For each retained row t the appended columns are, in
    order, y[t-1], ..., y[t-p_hour], then y[t-24], ..., y[t-24*p_day], then
    y[t-168], ..., y[t-168*p_week].

    The first ``burnin = max(p_hour, 24 * p_day, 168 * p_week)`` rows have no
    complete history and are dropped. Only the new feature matrix is
    returned, so the caller must use ``y[burnin:]`` as the matching targets.

    Parameters
    ----------
    X : array-like of shape (n, d)
        Feature matrix, validated together with ``y`` by ``check_X_y``.
    y : array-like of shape (n,)
        Target series from which the lags are taken.
    p_hour, p_day, p_week : int
        Non-negative number of lags at each interval.

    Returns
    -------
    ndarray of shape (n - burnin, d + p_hour + p_day + p_week)
        If all three lag counts are zero, the validated ``X`` is returned
        unchanged.

    Raises
    ------
    ValueError
        If a lag count is negative, or if the series has no row with a
        complete lag history (``burnin >= n``).
    """
    X, y = check_X_y(X, y)

    lag_counts = (p_hour, p_day, p_week)
    if any(count < 0 for count in lag_counts):
        raise ValueError("p_hour, p_day and p_week must be non-negative.")
    if not any(lag_counts):
        return X

    hour_time = 1
    day_time = hour_time * 24
    week_time = day_time * 7
    lag_steps = (hour_time, day_time, week_time)

    # Take the sample count from the input itself; the previous version read
    # a global `n`, which only worked when a notebook variable happened to
    # match the data being processed.
    n = X.shape[0]
    burnin = max(count * step for count, step in zip(lag_counts, lag_steps))
    if burnin >= n:
        raise ValueError(
            "Need more than %d samples for the requested lags, got %d." % (burnin, n)
        )

    X_new = X[burnin:]

    # Column vector of the retained row positions; subtracting a row vector of
    # lag offsets broadcasts to an (n - burnin, count) index matrix into y.
    rows = np.arange(burnin, n).reshape(-1, 1)
    for count, step in zip(lag_counts, lag_steps):
        if count != 0:
            offsets = step * np.arange(1, count + 1)
            X_new = np.hstack((X_new, y[rows - offsets]))

    return X_new

In [29]:
# Test append_past_values.
n = 300
d = 2

X = np.zeros((n, d))
y = np.arange(n)

p_hour = 5
p_day = 3
p_week = 1

X = append_past_values(X, y, p_hour, p_day, p_week)

# Burn-in is max(5 * 1, 3 * 24, 1 * 168) = 168 rows, leaving 300 - 168 = 132 rows.
# Columns: 2 original + 5 hourly + 3 daily + 1 weekly = 11.
print("Expected X size = (132, 11)")
print("Observed X size =", X.shape)

# Row 0 corresponds to t = 168: hourly lags y[167..163], daily lags
# y[144], y[120], y[96], and the weekly lag y[0].
print("Expected X[0] = [0. 0. 167. 166. 165. 164. 163. 144. 120. 96. 0.]")
print("Observed X[0] =", X[0])

Expected X size = (132, 11)
Observed X size = (132, 11)
Expected X[0] = [0. 0. 167. 166. 165. 144. 120. 0.]
Observed X[0] = [  0.   0. 167. 166. 165. 164. 163. 144. 120.  96.   0.]


In [8]:
# Because the data are time series, this can only be done with one fold.
def train_val_test_split(X, y):
    """Split (X, y) in row order into train, validation and test parts.

    The first int(0.8 * n) rows form the training set. The remaining rows
    are divided between validation and test, with the test set receiving
    the extra row when the remainder is odd. Rows are never shuffled.

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test, y_test
    """
    X, y = check_X_y(X, y)

    total = len(y)
    val_start = int(0.8 * total)
    test_start = val_start + (total - val_start) // 2

    train_rows = slice(None, val_start)
    val_rows = slice(val_start, test_start)
    test_rows = slice(test_start, None)

    return (
        X[train_rows], y[train_rows],
        X[val_rows], y[val_rows],
        X[test_rows], y[test_rows],
    )

In [24]:
# Test the train/validation/test split.
n, d = 50, 10
X = np.zeros((n, d))
y = np.arange(n)

# With n = 50: 40 training rows, 5 validation rows, 5 test rows.
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = train_val_test_split(X, y)

print("Expected y_test = [45 46 47 48 49]")
print("Observed y_test =", y_test)

Expected y_test = [45 46 47 48 49]
Observed y_test = [45 46 47 48 49]


In [None]:
# generate hyper parameters array

In [None]:
# test

In [None]:
# compute relative error

# compute val error

In [None]:
# test

In [None]:
# X, y = load_data
# hyperparams = generate hyper parameters array
# best_hyp = {}
# best val_Err = 1

# for all hyper params (array of kwargs)
#    X, y = append_past_values
#    X, y, Xv, yv, Xt, yt = train test split
#    compute_val error(X, y )
#    if small: update_best

# display best