In [24]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.utils import shuffle
%matplotlib inline




In [25]:
# Load the hourly dataset from disk and print its first five rows as a
# quick sanity check of the columns and values.
data = pd.read_csv(filepath_or_buffer='data/hour.csv')
print(data.head(n=5))

   instant      dteday  season  yr  mnth  hr  holiday  weekday  workingday  \
0        1  2011-01-01       1   0     1   0        0        6           0   
1        2  2011-01-01       1   0     1   1        0        6           0   
2        3  2011-01-01       1   0     1   2        0        6           0   
3        4  2011-01-01       1   0     1   3        0        6           0   
4        5  2011-01-01       1   0     1   4        0        6           0   

   weathersit  temp   atemp   hum  windspeed  casual  registered  cnt  
0           1  0.24  0.2879  0.81        0.0       3          13   16  
1           1  0.22  0.2727  0.80        0.0       8          32   40  
2           1  0.22  0.2727  0.80        0.0       5          27   32  
3           1  0.24  0.2879  0.75        0.0       3          10   13  
4           1  0.24  0.2879  0.75        0.0       0           1    1  


In [26]:
# Feature definitions and removal of the columns the model will not use.

# Column the model is trained to predict.
target_column = 'cnt'

# Continuous inputs, kept as-is.
numerical_columns = ['temp', 'atemp', 'hum', 'windspeed']

# Discrete inputs that are one-hot encoded in a later cell.
categorical_columns = [
    'season', 'yr', 'mnth', 'hr',
    'holiday', 'weekday', 'workingday', 'weathersit',
]

# Columns excluded from the model input.
# NOTE(review): in the preview rows printed above, casual + registered equals
# cnt, so keeping them would presumably leak the target -- confirm on full data.
columns_to_drop = ['instant', 'dteday', 'casual', 'registered']

# errors='ignore' makes the drop succeed even when a listed column is absent.
data_cleaned = data.drop(columns_to_drop, axis='columns', errors='ignore')



In [27]:
# One-hot encoding of the categorical features.

# drop_first=True omits the first level of every encoded column, so each
# categorical feature with k levels becomes k-1 indicator columns.
X_processed_df = pd.get_dummies(
    data_cleaned,
    columns=categorical_columns,
    drop_first=True,
)

# pop() returns the target column and removes it from X_processed_df in the
# same step, leaving only feature columns behind.
Y_series = X_processed_df.pop(target_column)

# X_df is a second name for the same (now target-free) feature frame.
X_df = X_processed_df

# No reset_index is applied: rows have not been filtered or reordered since
# the CSV was read, so the index is unchanged.

In [28]:
# 60/20/20 train/validation/test split.
# Rows are sliced in their existing order (no shuffling): the first 60% go to
# training, the next 20% to validation and the final 20% to test.

m = len(X_df)
train_end, val_end = int(m*0.6), int(m*0.8)

# Feature frames for each partition.
X_train_df, X_val_df, X_test_df = (
    X_df.iloc[:train_end],
    X_df.iloc[train_end:val_end],
    X_df.iloc[val_end:],
)

# Targets for each partition, converted straight to float32 NumPy arrays.
y_train, y_val, y_test = (
    Y_series.iloc[:train_end].values.astype('float32'),
    Y_series.iloc[train_end:val_end].values.astype('float32'),
    Y_series.iloc[val_end:].values.astype('float32'),
)

# Keras consumes NumPy arrays; cast every feature frame to float32 as well.
X_train_processed, X_val_processed, X_test_processed = (
    frame.values.astype('float32')
    for frame in (X_train_df, X_val_df, X_test_df)
)

In [29]:
# Deep learning model: a small fully connected network for regression.

# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created, so that weight initialisation and training are repeatable when
# the notebook is re-run from a fresh kernel.
# NOTE(review): some GPU kernels may still be non-deterministic.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

input_shape = X_train_processed.shape[1]  # number of feature columns

model = Sequential(
    [
        tf.keras.Input(shape=(input_shape,)),        # one value per feature
        Dense(50, activation='relu', name="L1"),     # first hidden layer
        Dense(25, activation='relu', name="L2"),     # second hidden layer
        Dense(1, activation='linear', name="L3"),    # single unbounded output
    ]
)

# Mean squared error is the training loss; mean absolute error is tracked as
# an additional metric.
model.compile(
    loss="mse",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=["mae"],
)

model.summary()

In [30]:
# Training: fit the network on the training split and evaluate on the
# validation split after every epoch. The returned History object is kept in
# `history`.

history = model.fit(
    x=X_train_processed,
    y=y_train,
    batch_size=32,
    epochs=100,
    validation_data=(X_val_processed, y_val),
    # Pass verbose=0 here to hide the per-epoch progress log.
)

print("\nTraining finished (eindelijk)")

Epoch 1/100
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 34245.0469 - mae: 133.7070 - val_loss: 44557.4609 - val_mae: 158.0275
Epoch 2/100
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 11705.1475 - mae: 83.7550 - val_loss: 26428.7852 - val_mae: 121.0235
Epoch 3/100
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 6840.6816 - mae: 59.8026 - val_loss: 19001.5977 - val_mae: 105.5898
Epoch 4/100
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5774.6138 - mae: 54.4068 - val_loss: 17490.3086 - val_mae: 103.5163
Epoch 5/100
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5452.8120 - mae: 52.2888 - val_loss: 16866.2070 - val_mae: 100.2439
Epoch 6/100
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5302.2861 - mae: 51.5101 - val_loss: 16211.5352 - val_mae: 96.5865
Epoch 7/100
[1m326/

In [32]:
# Evaluation: compute the test-set RMSE of the network and print it next to
# the RMSE of the linear regression model.

# RMSE of the other (linear regression) model. It is hard-coded here rather
# than computed in this notebook -- TODO confirm the value against its source.
rmse_linear_regression = 144

# Predict on the held-out test features and take the square root of the mean
# squared error to get RMSE in the same units as the target.
Y_pred = model.predict(X_test_processed)
rmse_nn = np.sqrt(mean_squared_error(y_true=y_test, y_pred=Y_pred))

print(f"Deep Learning Model Test Set RMSE: {rmse_nn:.2f} rentals ")

print("\n            Comparison")
print(f"Linear Regression RMSE: {rmse_linear_regression:.2f} rentals ")
print(f"Deep Learning RMSE:     {rmse_nn:.2f} rentals")

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Deep Learning Model Test Set RMSE: 78.05 rentals 

            Comparison
Linear Regression RMSE: 144.00 rentals 
Deep Learning RMSE:     78.05 rentals
