# Time-series Forecasting For Weather Station - Stacked Recurrent Network
- About: 14 weather attributes recorded at 10-minute intervals over several years (2009–2016).

## 1. Data Preprocessing

In [1]:
import os
# Location of the Jena climate CSV, relative to the notebook's working directory.
fname = os.path.join("data", "jena_climate_2009_2016.csv")

# Read the whole file into memory as a single string.
with open(fname) as f:
    data = f.read()

# splitlines() copes with both "\n" and "\r\n" endings. Blank rows (such as the
# one a trailing newline would otherwise produce) are dropped so that every
# remaining row can be parsed as a full record later on.
lines = [line for line in data.splitlines() if line.strip()]
if not lines:
    raise ValueError(f"{fname} contains no rows")

header = lines[0].split(",")  # column names, still wrapped in their CSV quotes
lines = lines[1:]  # keep only the data rows
print(header)
print(len(lines))

['"Date Time"', '"p (mbar)"', '"T (degC)"', '"Tpot (K)"', '"Tdew (degC)"', '"rh (%)"', '"VPmax (mbar)"', '"VPact (mbar)"', '"VPdef (mbar)"', '"sh (g/kg)"', '"H2OC (mmol/mol)"', '"rho (g/m**3)"', '"wv (m/s)"', '"max. wv (m/s)"', '"wd (deg)"']
420451


In [2]:
import numpy as np

n_rows = len(lines)
n_features = len(header) - 1  # every column except the leading "Date Time"

# Target series: one temperature reading per data row.
temperature = np.zeros((n_rows,))

# Feature matrix: one row per record, one column per non-date field.
raw_data = np.zeros((n_rows, n_features))

for row_idx, row in enumerate(lines):
    # Drop the date field and convert the remaining fields to floats.
    fields = row.split(",")[1:]
    values = list(map(float, fields))
    temperature[row_idx] = values[1]  # second remaining column is "T (degC)"
    raw_data[row_idx, :] = values  # all columns, temperature included

In [3]:
total_rows = len(raw_data)

# Split by row order: first half for training, the next quarter for
# validation, and whatever remains for testing.
num_train_samples = int(0.5 * total_rows)
num_val_samples = int(0.25 * total_rows)
num_test_samples = total_rows - (num_train_samples + num_val_samples)

print(f"num_train_samples: {num_train_samples}")
print(f"num_val_samples: {num_val_samples}")
print(f"num_test_samples: {num_test_samples}")

num_train_samples: 210225
num_val_samples: 105112
num_test_samples: 105114


In [4]:
# Normalisation statistics come from the training rows only; the same shift
# and scale are then applied in place to every row of raw_data.
train_rows = raw_data[:num_train_samples]  # a view, so it tracks the in-place updates

mean = train_rows.mean(axis=0)
raw_data -= mean

std = train_rows.std(axis=0)  # measured on the already-centred training rows
raw_data /= std

In [6]:
from tensorflow import keras

sampling_rate = 6  # keep one reading out of every six, i.e. one per hour
sequence_length = 120  # 120 hourly readings = 5 days of history
# Row offset from a window's first reading to its target, which lies 24 hours
# after the window's last sampled reading.
delay = sampling_rate * (sequence_length + 24 - 1)
batch_size = 256

# Inputs stop `delay` rows early and targets start `delay` rows late, so the
# two arrays stay aligned index-for-index.
window_inputs = raw_data[:-delay]
window_targets = temperature[delay:]

# Settings shared by the three splits.
common_kwargs = dict(
    targets=window_targets,
    sampling_rate=sampling_rate,
    sequence_length=sequence_length,
    batch_size=batch_size,
)

# Row indices at which the validation and test splits begin.
val_start = num_train_samples
test_start = num_train_samples + num_val_samples

train_dataset = keras.utils.timeseries_dataset_from_array(
    window_inputs,
    shuffle=True,
    start_index=0,
    end_index=val_start,
    **common_kwargs,
)

val_dataset = keras.utils.timeseries_dataset_from_array(
    window_inputs,
    shuffle=True,
    start_index=val_start,
    end_index=test_start,
    **common_kwargs,
)

# The test split is left unshuffled and runs to the end of the data.
test_dataset = keras.utils.timeseries_dataset_from_array(
    window_inputs,
    shuffle=False,
    start_index=test_start,
    **common_kwargs,
)

2024-03-12 22:45:36.843412: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-12 22:45:36.845020: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-12 22:45:36.874649: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-12 22:45:36.875000: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [7]:
# Peek at a single training batch to confirm the tensor shapes.
for samples, targets in train_dataset.take(1):
    print(f"samples shape: {samples.shape}")
    print(f"targets shape: {targets.shape}")

samples shape: (256, 120, 14)
targets shape: (256,)


2024-03-12 22:45:47.196978: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_28' with dtype int32 and shape [209506]
	 [[{{node Placeholder/_28}}]]
2024-03-12 22:45:47.197584: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype double and shape [210225,14]
	 [[{{node Placeholder/_0}}]]


In [None]:
from tensorflow.keras import layers

# Single source of truth for where the best model is written and re-read.
checkpoint_path = "models/jena_stacked_gru_dropout_optimized.keras"
feature_count = raw_data.shape[-1]

# Two stacked GRU layers with recurrent dropout, then dropout and a single
# linear unit that outputs the predicted temperature.
inputs = keras.Input(shape=(sequence_length, feature_count))
gru_sequence = layers.GRU(32, recurrent_dropout=0.5, return_sequences=True)(inputs)
gru_summary = layers.GRU(32, recurrent_dropout=0.5)(gru_sequence)
regularized = layers.Dropout(0.5)(gru_summary)
outputs = layers.Dense(1)(regularized)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])

# Write a checkpoint only when the monitored quantity improves.
callbacks = [keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True)]

history = model.fit(
    train_dataset,
    epochs=15,
    validation_data=val_dataset,
    callbacks=callbacks,
)

# Score the saved checkpoint rather than the weights from the final epoch.
model = keras.models.load_model(checkpoint_path)
test_mae = model.evaluate(test_dataset)[1]  # index 1 is the "mae" metric
print(f"Test MAE: {test_mae:.2f}")

Epoch 1/15
 27/819 [..............................] - ETA: 1:42 - loss: 103.1662 - mae: 8.5395

In [None]:
import plotly.graph_objs as go
from tensorflow import keras

# Step 1: Load the checkpoint written during training.
model = keras.models.load_model("models/jena_stacked_gru_dropout_optimized.keras")

# Step 2: Work out how many consecutive test samples cover the requested span.
# test_dataset is unshuffled and built with the default stride of one raw row,
# so neighbouring samples are one 10-minute reading apart, and `sampling_rate`
# raw rows make up one hour.
num_days_to_predict = 7  # Number of days to plot
samples_per_day = 24 * sampling_rate
num_predictions = num_days_to_predict * samples_per_day
num_batches = -(-num_predictions // batch_size)  # ceiling division

# Step 3: Predict batch by batch, keeping predictions and targets aligned.
# The loop variables are deliberately not called `inputs`, so the model's
# input tensor defined in the training cell is not overwritten.
predicted_temperatures = []
actual_temperatures = []
for batch_inputs, batch_targets in test_dataset.take(num_batches):
    batch_predictions = model.predict(batch_inputs, verbose=0).flatten()
    predicted_temperatures.extend(batch_predictions)
    actual_temperatures.extend(batch_targets.numpy())

# The last batch usually overshoots the requested span; trim both series to the
# same length. If the test split is shorter than requested, plot what exists.
predicted_temperatures = predicted_temperatures[:num_predictions]
actual_temperatures = actual_temperatures[:num_predictions]
num_predictions = len(actual_temperatures)

# Shared x axis, expressed in days from the first plotted target.
time_in_days = [step / samples_per_day for step in range(num_predictions)]

# Step 4: Plot actual vs. predicted temperature using Plotly
actual_trace = go.Scatter(
    x=time_in_days,
    y=actual_temperatures,
    mode='lines',
    name='Actual Temperature'
)

predicted_trace = go.Scatter(
    x=time_in_days,
    y=predicted_temperatures,
    mode='lines',
    name='Predicted Temperature'
)

# Targets come from the unnormalised `temperature` array, so the y axis is in
# degrees Celsius rather than normalised units.
layout = go.Layout(
    title='Actual vs Predicted Temperature',
    xaxis={'title': 'Time (days)'},
    yaxis={'title': 'Temperature (degC)'}
)

fig = go.Figure(data=[actual_trace, predicted_trace], layout=layout)
fig.show()