You should build an end-to-end machine learning pipeline using a recurrent neural network model. In particular, you should do the following:


- Load the Jena Climate dataset using Pandas. You can find this dataset in the keras repository.
- Preprocess this time series dataset.
- Split the dataset into training, validation, and test sets. Note that you cannot split time series using Scikit-Learn.
- Build an end-to-end machine learning pipeline, including a recurrent neural network model.
- Optimize your pipeline by validating your design decisions.
- Test the best pipeline on the test set and report various evaluation metrics.
- Check the documentation to identify the most important hyperparameters, attributes, and methods of the model. Use them in practice.

In [None]:
import os
import zipfile
import pandas as pd 
import sklearn.metrics 
import sklearn.preprocessing 
import matplotlib.pyplot as plt 
import tensorflow.keras as keras 
import plotly.graph_objects as go

In [None]:
# Download the zipped Jena Climate CSV through the Keras download helper.
uri = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip"
zip_path = keras.utils.get_file(fname="jena_climate_2009_2016.csv.zip", origin=uri)

# Extract into the current working directory. The context manager closes the
# archive handle once extraction is done (a bare ZipFile(...) leaves it open).
with zipfile.ZipFile(zip_path) as zip_file:
    zip_file.extractall()

# Read the extracted CSV and preview the first rows.
csv_path = "jena_climate_2009_2016.csv"
df = pd.read_csv(csv_path)
df.head()


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip


Unnamed: 0,Date Time,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
0,01.01.2009 00:10:00,996.52,-8.02,265.4,-8.9,93.3,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3
1,01.01.2009 00:20:00,996.57,-8.41,265.01,-9.28,93.4,3.23,3.02,0.21,1.89,3.03,1309.8,0.72,1.5,136.1
2,01.01.2009 00:30:00,996.53,-8.51,264.91,-9.31,93.9,3.21,3.01,0.2,1.88,3.02,1310.24,0.19,0.63,171.6
3,01.01.2009 00:40:00,996.51,-8.31,265.12,-9.07,94.2,3.26,3.07,0.19,1.92,3.08,1309.19,0.34,0.5,198.0
4,01.01.2009 00:50:00,996.51,-8.27,265.15,-9.04,94.1,3.27,3.08,0.19,1.92,3.09,1309.0,0.32,0.63,214.3


In [None]:
# Preprocessing
# Keep a single reading per day (the 12:00:00 row) and index the frame by date.
# NOTE(review): the wind-speed columns of this dataset are reported to contain
# -9999 placeholder values - confirm whether any survive the noon filter.

# "Date Time" holds "<date> <time>"; split it into its two components.
date_time_parts = df["Date Time"].str.split(" ", expand=True)
df[["Date", "Time"]] = date_time_parts

is_noon = df["Time"] == "12:00:00"
df = (
    df.drop(columns="Date Time")
    .loc[is_noon]
    .drop(columns="Time")
    .set_index("Date")
)
df.head()

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
01.01.2009,1000.3,-6.87,266.27,-8.28,89.6,3.64,3.27,0.38,2.03,3.26,1306.98,1.84,2.63,184.4
02.01.2009,998.91,-3.12,270.13,-4.5,90.1,4.85,4.37,0.48,2.73,4.38,1286.47,1.54,2.0,127.0
03.01.2009,999.02,-5.96,267.28,-8.72,80.7,3.91,3.16,0.75,1.97,3.16,1300.9,1.31,1.88,213.9
04.01.2009,988.94,-1.21,272.82,-2.36,91.8,5.59,5.13,0.46,3.23,5.19,1264.3,1.99,3.88,225.8
05.01.2009,990.42,-3.65,270.26,-5.54,86.6,4.66,4.04,0.62,2.54,4.08,1278.21,5.49,8.38,17.44


In [None]:
# Supervised label: the temperature recorded in the following row.
next_temperature = df["T (degC)"].shift(-1)
df = df.assign(target=next_temperature)

# The final row has no successor, so its label is NaN; drop that row.
df = df.drop(index=df.tail(1).index)
df.head()

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg),target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
01.01.2009,1000.3,-6.87,266.27,-8.28,89.6,3.64,3.27,0.38,2.03,3.26,1306.98,1.84,2.63,184.4,-3.12
02.01.2009,998.91,-3.12,270.13,-4.5,90.1,4.85,4.37,0.48,2.73,4.38,1286.47,1.54,2.0,127.0,-5.96
03.01.2009,999.02,-5.96,267.28,-8.72,80.7,3.91,3.16,0.75,1.97,3.16,1300.9,1.31,1.88,213.9,-1.21
04.01.2009,988.94,-1.21,272.82,-2.36,91.8,5.59,5.13,0.46,3.23,5.19,1264.3,1.99,3.88,225.8,-3.65
05.01.2009,990.42,-3.65,270.26,-5.54,86.6,4.66,4.04,0.62,2.54,4.08,1278.21,5.49,8.38,17.44,-14.52


In [None]:
# Plot the temperature column over the whole date index so trends and
# seasonality are visible before modelling.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df["T (degC)"]))
fig.update_layout(
    # The title was previously commented out, leaving the figure untitled.
    title="Temperature Changes",
    xaxis_title="Date",
    yaxis_title="Temperature",
)
fig.show()

In [None]:
# Splitting the dataset
# Chronological split: the first 70% of rows train the model, the next 15%
# validate it and the remaining rows are held out for testing. Rows are never
# shuffled, so each subset is a contiguous stretch of the series.
n_rows = df.shape[0]
train_start_index = 0
validation_start_index = int(n_rows * 0.70)
test_start_index = int(n_rows * 0.85)

df_train = df.iloc[train_start_index:validation_start_index]
df_validation = df.iloc[validation_start_index:test_start_index]
df_test = df.iloc[test_start_index:]

# Separate the feature columns from the label column for every subset.
x_train, y_train = df_train.drop(columns="target"), df_train["target"]
x_validation, y_validation = df_validation.drop(columns="target"), df_validation["target"]
x_test, y_test = df_test.drop(columns="target"), df_test["target"]

# Report the resulting shapes as a quick sanity check.
for heading, features, labels in (
    ("Train size:", x_train, y_train),
    ("Validation size:", x_validation, y_validation),
    ("Test size:", x_test, y_test),
):
    print(heading, features.shape, labels.shape)

Train size: (2043, 14) (2043,)
Validation size: (439, 14) (439,)
Test size: (438, 14) (438,)


In [None]:
# Standardizing features
# The scaler's statistics are learned from the training rows only; the very
# same transformation is then applied to all three subsets.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_validation = scaler.transform(x_validation)
x_test = scaler.transform(x_test)

In [None]:
# Creating Sequences
# Settings shared by the train / validation / test window datasets.
BATCH_SIZE = 250      # passed as `batch_size` when the datasets are built
SEQUENCE_LENGTH = 10  # passed as `sequence_length`: rows per input window

In [None]:
# Keras pairs the window that starts at row k (rows k .. k + SEQUENCE_LENGTH - 1)
# with targets[k]. `target` holds the next row's temperature, so passing y_train
# unshifted labels every window with a temperature that is already among its
# inputs (the window's second row). Offsetting the targets by
# SEQUENCE_LENGTH - 1 pairs each window with the `target` of its LAST row,
# i.e. the reading that follows the window.
dataset_train = keras.preprocessing.timeseries_dataset_from_array(
    x_train,
    y_train.iloc[SEQUENCE_LENGTH - 1:],
    sequence_length=SEQUENCE_LENGTH,
    batch_size=BATCH_SIZE,
)

In [None]:
# Keras pairs the window that starts at row k (rows k .. k + SEQUENCE_LENGTH - 1)
# with targets[k]. `target` holds the next row's temperature, so passing
# y_validation unshifted labels every window with a temperature that is already
# among its inputs. Offsetting the targets by SEQUENCE_LENGTH - 1 pairs each
# window with the `target` of its LAST row, i.e. the reading after the window.
dataset_validation = keras.preprocessing.timeseries_dataset_from_array(
    x_validation,
    y_validation.iloc[SEQUENCE_LENGTH - 1:],
    sequence_length=SEQUENCE_LENGTH,
    batch_size=BATCH_SIZE,
)

In [None]:
# Keras pairs the window that starts at row k (rows k .. k + SEQUENCE_LENGTH - 1)
# with targets[k]. `target` holds the next row's temperature, so passing y_test
# unshifted labels every window with a temperature that is already among its
# inputs, which inflates the test score. Offsetting the targets by
# SEQUENCE_LENGTH - 1 pairs each window with the `target` of its LAST row,
# i.e. the reading after the window.
dataset_test = keras.preprocessing.timeseries_dataset_from_array(
    x_test,
    y_test.iloc[SEQUENCE_LENGTH - 1:],
    sequence_length=SEQUENCE_LENGTH,
    batch_size=BATCH_SIZE,
)

In [None]:
# Building the Model
# Pull one batch from the training dataset to discover the per-sample input
# shape (everything after the leading batch dimension).
batch = next(iter(dataset_train.take(1)))
inputs, targets = batch
input_shape = inputs.shape[1:]

In [None]:
# A 32-unit LSTM reads each input window; a single Dense unit maps its output
# to the predicted value. Trained with Adam on mean squared error.
model = keras.models.Sequential(
    [
        keras.layers.Input(shape=input_shape),
        keras.layers.LSTM(32),
        keras.layers.Dense(1),
    ]
)
adam = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss="mse")
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 32)                6016      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 6,049
Trainable params: 6,049
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Upper bound on the number of training epochs; early stopping normally ends
# training well before this limit.
EPOCHS = 500

# Stop once the validation loss has not improved for `patience` epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# validation loss; without it the model keeps the weights of the final epoch,
# i.e. `patience` epochs past the best one, and that is what would be tested.
es_callback = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=20,
    restore_best_weights=True,
)

# Train on the training windows and evaluate on the validation windows after
# every epoch.
model.fit(
    dataset_train,
    epochs=EPOCHS,
    validation_data=dataset_validation,
    shuffle=False,
    callbacks=[es_callback],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x7f27a8efbfd0>

In [None]:
# Testing Model
# Walk through the test batches, collecting the true targets and the model's
# predictions in the same order, then score them with R^2.
actual, predicted = [], []
for batch_inputs, batch_targets in dataset_test:
    actual.extend(batch_targets.numpy().tolist())
    batch_predictions = model.predict(batch_inputs)
    # Flatten the predictions to one value per sample before storing them.
    predicted.extend(batch_predictions.reshape(len(batch_predictions)).tolist())

sklearn.metrics.r2_score(actual, predicted)



0.990283076984508

In [None]:
# plotly is already imported in the notebook's import cell; the import is
# repeated here unchanged.
import plotly.graph_objects as go

# Overlay the predicted series on the actual one for the held-out period.
# NOTE(review): df_test.index has one entry per test row, while `actual` and
# `predicted` have one entry per window (fewer) - confirm the date alignment.
fig = go.Figure()
for series_name, series_values in (("Actual", actual), ("Predicted", predicted)):
    fig.add_trace(
        go.Scatter(x=df_test.index, y=series_values, mode="lines", name=series_name)
    )
fig.update_layout(
    title="Model Performance on the Test Set",
    xaxis_title="Date",
    yaxis_title="Temperature",
)
fig.show()