In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input,LSTM, Dropout, Input, Activation
from tensorflow.keras.initializers import RandomNormal, he_normal
from tensorflow.keras import optimizers

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

In [2]:
'''
Good references:

1. https://stackoverflow.com/questions/54416322/expected-ndim-3-found-ndim-2
2. https://stackoverflow.com/questions/36992855/keras-how-should-i-prepare-input-data-for-rnn/62570576#62570576
3. https://keras.io/examples/timeseries/timeseries_traffic_forecasting/
4. https://stackoverflow.com/questions/67957105/how-to-apply-lstm-to-predict-parking-availability
'''

'\nGood references:\n\n1. https://stackoverflow.com/questions/54416322/expected-ndim-3-found-ndim-2\n2. https://stackoverflow.com/questions/36992855/keras-how-should-i-prepare-input-data-for-rnn/62570576#62570576\n3. https://keras.io/examples/timeseries/timeseries_traffic_forecasting/\n4. https://stackoverflow.com/questions/67957105/how-to-apply-lstm-to-predict-parking-availability\n'

In [3]:
# Reference: https://stackoverflow.com/questions/67957105/how-to-apply-lstm-to-predict-parking-availability
# class Lstm_model(tf.keras.Model):
#     def __init__(self, **kwargs):
#         super(Lstm_model, self).__init__()   
#         self.Lstm1 = tf.keras.layers.LSTM(32,return_sequences=True)
#         self.Lstm2 = tf.keras.layers.LSTM(32,return_sequences=True) 
#         self.Regressor = tf.keras.layers.Dense(1, )

#     def call(self, inputs):

#         input_A=inputs
#         x = self.Lstm1(input_A)
#         x = self.Lstm2(x)
#         pred = self.Regressor(x) 
        
#         return  pred

In [4]:
# df = pd.read_csv("camera_data/Non-covid_data.csv")
# df = pd.read_csv("parkade_data/jerry_sorry.csv")
df = pd.read_csv("parkade_data/jerry_thunderbird_processed.csv")
df = df.rename(columns={'TimestampLocal': 'Timestamp', 'TimestampLocal.total_count': 'total_count'})
df.head(10)

Unnamed: 0,Timestamp,total_count,parkade
0,2017-01-01 0:00,63,Thunderbird
1,2017-01-02 0:00,132,Thunderbird
2,2017-01-03 0:00,786,Thunderbird
3,2017-01-04 0:00,750,Thunderbird
4,2017-01-05 0:00,709,Thunderbird
5,2017-01-06 0:00,886,Thunderbird
6,2017-01-07 0:00,207,Thunderbird
7,2017-01-08 0:00,351,Thunderbird
8,2017-01-09 0:00,1644,Thunderbird
9,2017-01-10 0:00,1698,Thunderbird


In [5]:
print(df["Timestamp"].min())
print(df["Timestamp"].max())
print(df["Timestamp"].count())

2017-01-01 0:00
2023-02-02 0:00
2224


In [6]:
def get_season(month):
    SUMMER_MONTHS = [6, 7, 8] 
    SPRING_MONTHS = [3, 4, 5]
    WINTER_MONTHS = [12, 1, 2]
    FALL_MONTHS = [9, 10, 11]
    if month in WINTER_MONTHS:
        return "Winter"
    elif month in FALL_MONTHS:
        return "Autumn"
    elif month in SUMMER_MONTHS:
        return "Summer"
    else:
        return "Fall"

In [7]:
def create_lag_df(df, lag, cols):
    return df.assign(
        **{f"{col}_{n}_days_ago": df[col].shift(n) for n in range(1, lag + 1) for col in cols}
    )

In [8]:
# Create new features
df['year'] = pd.DatetimeIndex(df['Timestamp']).year
df['month'] = pd.DatetimeIndex(df['Timestamp']).month
df['day'] = pd.DatetimeIndex(df['Timestamp']).day
df['day_of_week'] = pd.DatetimeIndex(df['Timestamp']).day_name()
df = df.assign(season=df["month"].apply(get_season))

df = create_lag_df(df, 7, ['total_count'])

df.head(10)

Unnamed: 0,Timestamp,total_count,parkade,year,month,day,day_of_week,season,total_count_1_days_ago,total_count_2_days_ago,total_count_3_days_ago,total_count_4_days_ago,total_count_5_days_ago,total_count_6_days_ago,total_count_7_days_ago
0,2017-01-01 0:00,63,Thunderbird,2017,1,1,Sunday,Winter,,,,,,,
1,2017-01-02 0:00,132,Thunderbird,2017,1,2,Monday,Winter,63.0,,,,,,
2,2017-01-03 0:00,786,Thunderbird,2017,1,3,Tuesday,Winter,132.0,63.0,,,,,
3,2017-01-04 0:00,750,Thunderbird,2017,1,4,Wednesday,Winter,786.0,132.0,63.0,,,,
4,2017-01-05 0:00,709,Thunderbird,2017,1,5,Thursday,Winter,750.0,786.0,132.0,63.0,,,
5,2017-01-06 0:00,886,Thunderbird,2017,1,6,Friday,Winter,709.0,750.0,786.0,132.0,63.0,,
6,2017-01-07 0:00,207,Thunderbird,2017,1,7,Saturday,Winter,886.0,709.0,750.0,786.0,132.0,63.0,
7,2017-01-08 0:00,351,Thunderbird,2017,1,8,Sunday,Winter,207.0,886.0,709.0,750.0,786.0,132.0,63.0
8,2017-01-09 0:00,1644,Thunderbird,2017,1,9,Monday,Winter,351.0,207.0,886.0,709.0,750.0,786.0,132.0
9,2017-01-10 0:00,1698,Thunderbird,2017,1,10,Tuesday,Winter,1644.0,351.0,207.0,886.0,709.0,750.0,786.0


In [9]:
# split the data into training set and test set
# Non-covid_data.csv splits
# 85-15 spilit
# split_date = "2022-11-15 0:00"

# jerry_sorry.csv splits
# 80-20 split
# split_date = "2022-12-14 0:00"

# jerry_thunderbird_processed.csv splits
split_date = "2021-11-15 0:00"
df_train = df[df["Timestamp"] <= split_date]
df_test = df[df["Timestamp"] > split_date]

assert(len(df_train) + len(df_test) == len(df))

print(df_train.shape)
print(df_test.shape)

(1780, 15)
(444, 15)


In [10]:
def preprocess_features(
    train_df,
    test_df,
    numeric_features,
    categorical_features,
    drop_features,
    target
):
    all_features = set(numeric_features + categorical_features + drop_features + target)
    
    if set(train_df.columns) != all_features:
        print("Missing columns", set(train_df.columns) - all_features)
        print("Extra columns", all_features - set(train_df.columns))
        raise Exception("Columns do not match")

    # Apply imputation and scaling to numerical features 
    numeric_transformer = make_pipeline(
        SimpleImputer(strategy="median"), StandardScaler()
    )
    
    # Apply imputation and One-Hot encoding to categorical features
    categorical_transformer = make_pipeline(
        SimpleImputer(strategy="constant", fill_value="missing"),
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    )
    
    # Define a column transformer
    preprocessor = make_column_transformer(
        (numeric_transformer, numeric_features),
        (categorical_transformer, categorical_features),
        ("drop", drop_features),
    )
    
    preprocessor.fit(train_df)
    ohe_feature_names = (
        preprocessor.named_transformers_["pipeline-2"]
        .named_steps["onehotencoder"]
        .get_feature_names_out(categorical_features)
        .tolist()
    )
    new_columns = numeric_features + ohe_feature_names

    X_train_enc = pd.DataFrame(
        preprocessor.transform(train_df), index=train_df.index, columns=new_columns
    )
    X_test_enc = pd.DataFrame(
        preprocessor.transform(test_df), index=test_df.index, columns=new_columns
    )

    y_train = train_df["total_count"]
    y_test = test_df["total_count"]

    return X_train_enc, y_train, X_test_enc, y_test, preprocessor

In [11]:
# Categorize and preprocess features
numeric_features = ["year", "month", "day", "total_count_1_days_ago", "total_count_2_days_ago", "total_count_3_days_ago", 
                    "total_count_4_days_ago", "total_count_5_days_ago", "total_count_6_days_ago", "total_count_7_days_ago"]
categorical_features = ["day_of_week", "season"]
# drop_features = ["Timestamp"]
drop_features = ["Timestamp", "parkade"]
target = ["total_count"]

X_train_enc, y_train, X_test_enc, y_test, preprocessor = preprocess_features(
    df_train,
    df_test,
    numeric_features,
    categorical_features,
    drop_features,
    target
)

X_train_enc.head(5)

Unnamed: 0,year,month,day,total_count_1_days_ago,total_count_2_days_ago,total_count_3_days_ago,total_count_4_days_ago,total_count_5_days_ago,total_count_6_days_ago,total_count_7_days_ago,...,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,season_Autumn,season_Fall,season_Summer,season_Winter
0,-1.397362,-1.590192,-1.666251,-0.247428,-0.246739,-0.247024,-0.246527,-0.246051,-0.246874,-0.247449,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.397362,-1.590192,-1.552605,-1.467955,-0.246739,-0.247024,-0.246527,-0.246051,-0.246874,-0.247449,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-1.397362,-1.590192,-1.43896,-1.349673,-1.467682,-0.247024,-0.246527,-0.246051,-0.246874,-0.247449,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,-1.397362,-1.590192,-1.325314,-0.228571,-1.34936,-1.468115,-0.246527,-0.246051,-0.246874,-0.247449,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-1.397362,-1.590192,-1.211668,-0.290283,-0.227876,-1.349779,-1.468681,-0.246051,-0.246874,-0.247449,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
print(X_train_enc.shape)
print(y_train.shape)
print(X_test_enc.shape)
print(y_test.shape)

(1780, 21)
(1780,)
(444, 21)
(444,)


In [13]:
# model = Sequential([
#     LSTM(100, input_shape=(1, X_train_enc.shape[1]), return_sequences=True),
#     LSTM(5, input_shape=(1, X_train_enc.shape[1]), return_sequences=True),
#     Activation('relu'),
#     Dense(1)
# ])

# model = Sequential()
# model.add(LSTM(100, input_shape=(1, X_train_enc.shape[1]), return_sequences=True))
# model.add(LSTM(5, input_shape=(1, X_train_enc.shape[1]), return_sequences=True))

# TODO: Need to tune # of neurons
model = Sequential()
model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(1, X_train_enc.shape[1])))
model.add(LSTM(50, activation='relu'))
model.add(Dense(1))

In [14]:
X_train_enc = X_train_enc.to_numpy().reshape(-1, 1, X_train_enc.shape[1])
X_test_enc  = X_test_enc.to_numpy().reshape(-1, 1, X_test_enc.shape[1])
y_train = y_train.to_numpy().reshape(-1, 1, 1)
y_test = y_test.to_numpy().reshape(-1, 1, 1)

print(X_train_enc.shape)
print(y_train.shape)
print(X_test_enc.shape)
print(y_test.shape)

(1780, 1, 21)
(1780, 1, 1)
(444, 1, 21)
(444, 1, 1)


In [15]:
# lstms_ = Lstm_model()
# lstms_.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())
# lstms_.fit(X_train_enc, y_train, epochs=50)

In [16]:
model.compile(loss="mean_absolute_error", optimizer="adam", metrics= [tf.keras.metrics.MeanAbsolutePercentageError()])
model.fit(X_train_enc, y_train, epochs=100, validation_data=(X_test_enc, y_test))

Epoch 1/100


2023-02-09 16:46:47.196302: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x2814fa6e0>