Our data is in the file data/jena_data/jena_climate_2009_2016.csv

In [77]:
import pandas as pd
import numpy as np
# Relative path to the Jena climate CSV file.
datafile = "../data/jena_data/jena_climate_2009_2016.csv"

In [7]:
# Read the whole CSV into a pandas DataFrame.
df = pd.read_csv(datafile)

In [12]:
# Report the frame's dimensions, then preview its first five rows.
print(f"data shape: {df.shape}")
df.head(5)

data shape: (420551, 15)


Unnamed: 0,Date Time,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
0,01.01.2009 00:10:00,996.52,-8.02,265.4,-8.9,93.3,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3
1,01.01.2009 00:20:00,996.57,-8.41,265.01,-9.28,93.4,3.23,3.02,0.21,1.89,3.03,1309.8,0.72,1.5,136.1
2,01.01.2009 00:30:00,996.53,-8.51,264.91,-9.31,93.9,3.21,3.01,0.2,1.88,3.02,1310.24,0.19,0.63,171.6
3,01.01.2009 00:40:00,996.51,-8.31,265.12,-9.07,94.2,3.26,3.07,0.19,1.92,3.08,1309.19,0.34,0.5,198.0
4,01.01.2009 00:50:00,996.51,-8.27,265.15,-9.04,94.1,3.27,3.08,0.19,1.92,3.09,1309.0,0.32,0.63,214.3


In [None]:
# The timestamp column is text, not a numeric feature; remove it in place.
df.drop(columns="Date Time", inplace=True)

In [26]:
# The temperature column, as a NumPy array, is the prediction target.
targets = df['T (degC)'].to_numpy()
# The whole DataFrame as a 2-D NumPy matrix called `data`.
data = df.to_numpy()

The data contains measurements taken every 10 minutes, so there are 144 measurements per day. We can define the following parameters of the model:
- lookback = 1440 - we'll use the data from the last 10 days as direct input to the model
- steps = 6 - we won't use all data points, since many features don't change much in less than an hour. We'll sample the data in steps of 1 hour
- delay = 144 - how far into the future we'd like to predict (24 hours)

In [57]:
# One measurement every 10 minutes gives 6 per hour and 144 per day.
samples_per_day = 24 * 6
lookback = 10 * samples_per_day  # input window: the previous 10 days
steps = 6                        # keep one measurement per hour
delay = samples_per_day          # predict 24 hours ahead

In [31]:
# Row counts of the training and validation splits.
train_samples, val_samples = 200000, 100000
# The test split takes the rest of the data (120551 rows).

We need to normalize (standardize) the data - subtract the mean and divide by the standard deviation of the data. Here, the mean and standard deviation are calculated only on the training set, but applied to the whole dataset.

In [50]:
# Standardize every column with statistics taken from the training rows only,
# then apply them to all rows.
#
# New arrays are bound to `data` instead of using `-=` / `/=`: `data` comes
# from `df.values`, which may share memory with the DataFrame (and may be
# read-only when pandas Copy-on-Write is active), so in-place arithmetic could
# silently modify `df` or fail outright. The arithmetic and its order are
# unchanged, so the resulting values are the same.
mean = data[:train_samples].mean(axis=0)
data = data - mean
std = data[:train_samples].std(axis=0)
data = data / std

# print 2 lines of the processed data
print(data[:2, :])

[[ 0.90014748 -1.93135845 -1.98211036 -1.86280029  1.07285236 -1.30742164
  -1.47375773 -0.79868641 -1.4762674  -1.47815522  2.12375056 -0.72950452
  -0.78067973 -0.27613603]
 [ 0.9060434  -1.97541381 -2.02567    -1.91582958  1.07883061 -1.32042698
  -1.4951961  -0.80075238 -1.49502455 -1.49932141  2.17199852 -0.93124017
  -0.88794488 -0.46317443]]


In [66]:
# Show the dimensions of the processed array.
data.shape

(420551, 14)

Now we can define a Python generator function that takes our data as input and yields batches in a format suitable for feeding into the model: (batch_size, timesteps, num_features).

In [84]:
def generator(data, lookback, delay, min_index, max_index, shuffle=False, batch_size=128, step=6, target_col=1):
    """Yield ``(samples, targets)`` batches of sliding windows, forever.

    Every sample is anchored at a row ``r``: its window holds the rows
    ``r - lookback, r - lookback + step, ...`` up to (not including) ``r``,
    and its target is ``data[r + delay, target_col]``.

    Args:
        data: 2-D NumPy array indexed as ``(row, feature)``.
        lookback: How many rows before the anchor a window spans.
        delay: Offset in rows from the anchor to the target row.
        min_index: Lowest row a window may read; anchors start at
            ``min_index + lookback``.
        max_index: Exclusive upper bound for anchor rows. ``None`` selects
            ``len(data) - delay - 1`` so every target lies inside ``data``.
            When given explicitly, targets are read up to ``delay`` rows
            past it, so ``max_index + delay`` must not exceed ``len(data)``.
        shuffle: If true, anchors are drawn at random (with replacement);
            otherwise they are visited in order, wrapping around at the end.
        batch_size: Number of anchors per batch.
        step: Stride in rows used to sub-sample each window.
        target_col: Column of ``data`` read as the target. Defaults to 1,
            the column that used to be hard-coded.

    Yields:
        Tuple ``(samples, targets)`` with shapes
        ``(batch, timesteps, num_features)`` and ``(batch,)``. In sequential
        mode the last batch before wrapping may hold fewer than
        ``batch_size`` rows.

    Raises:
        ValueError: On first iteration, if the index range contains no
            anchor row (``min_index + lookback >= max_index``).
    """
    if max_index is None:
        max_index = len(data) - delay - 1
    first_row = min_index + lookback
    if first_row >= max_index:
        # Without this check the sequential branch would yield empty batches
        # forever instead of signalling the misconfiguration.
        raise ValueError(
            "no rows available: min_index + lookback ({}) must be smaller "
            "than max_index ({})".format(first_row, max_index)
        )

    # Offsets of a window's rows relative to its anchor; identical for every
    # sample, so compute them once.
    window_offsets = np.arange(-lookback, 0, step)
    i = first_row

    while True:
        if shuffle:
            rows = np.random.randint(first_row, max_index, size=batch_size)
        else:
            # Wrap only once the range is exhausted, so the trailing partial
            # batch is yielded rather than skipped.
            if i >= max_index:
                i = first_row
            rows = np.arange(i, min(i + batch_size, max_index))
            i += len(rows)

        # output format: (batch_size, timesteps, num_features)
        samples = data[rows[:, np.newaxis] + window_offsets].astype(np.float64, copy=False)
        targets = data[rows + delay, target_col].astype(np.float64, copy=False)
        yield samples, targets

The above generator function can now be used to generate the train, validation and test dataset generators.

In [85]:
batch_size = 128

# Training batches: random anchors from the first `train_samples` rows.
train_gen = generator(data,
                      lookback=lookback,
                      delay=delay,
                      min_index=0,
                      max_index=train_samples,
                      shuffle=True,
                      step=steps,
                      batch_size=batch_size)

# Validation batches: the next `val_samples` rows, visited in order.
val_gen = generator(data,
                    lookback=lookback,
                    delay=delay,
                    min_index=train_samples + 1,
                    max_index=train_samples + val_samples,
                    shuffle=False,
                    step=steps,
                    batch_size=batch_size)

# Test batches: everything after the validation rows, visited in order.
test_gen = generator(data,
                     lookback=lookback,
                     delay=delay,
                     min_index=train_samples + val_samples + 1,
                     max_index=None,
                     shuffle=False,
                     step=steps,
                     batch_size=batch_size)

# Step counts are measured in BATCHES (one generator draw each), not rows, so
# the number of usable anchor rows is divided by batch_size. Floor division
# keeps one pass inside the range: the generator never wraps around and
# repeats rows within a single evaluation.

# Number of steps to see the entire validation set
val_steps = (val_samples - lookback - 1) // batch_size
# Number of steps to see the entire test set. With max_index=None the
# generator stops at len(data) - delay - 1, so that margin is subtracted too.
test_steps = (len(data) - delay - 1 - (train_samples + val_samples + 1) - lookback) // batch_size

In [87]:
def evaluate2():
    """Print the validation MAE of a naive "no change" baseline.

    The baseline predicts, for each sample, the value found in feature
    column 1 at the last timestep of its input window, and compares it with
    the batch targets. The mean absolute error is computed per batch and the
    mean over `val_steps` batches drawn from `val_gen` is printed.
    """
    batch_maes = []
    # Draw a fixed number of batches with next(): looping directly over
    # val_gen would never finish if the generator yields without end.
    for _ in range(val_steps):
        samples, targets = next(val_gen)
        preds = samples[:, -1, 1]
        mae = np.mean(np.abs(preds - targets))
        batch_maes.append(mae)
    print(np.mean(batch_maes))


# Call the function under the name it is defined with.
evaluate2()

0.28973058236861327


In [None]:
from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop

# Densely connected baseline: flatten each (timesteps, features) window,
# pass it through one hidden ReLU layer and regress a single value.
window_shape = (lookback // steps, data.shape[-1])
model = Sequential([
    layers.Flatten(input_shape=window_shape),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),
])

# Train with RMSprop on mean absolute error.
model.compile(optimizer=RMSprop(), loss='mae')

history = model.fit_generator(
    train_gen,
    steps_per_epoch=500,
    epochs=20,
    validation_data=val_gen,
    validation_steps=val_steps,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20

In [None]:
def plot_history(history):
    """
    Plots the training and validation loss recorded during model training.

    `history` must expose a `history` dict holding the per-epoch lists
    'loss' and 'val_loss'. Training loss is drawn as blue dots and
    validation loss as a blue line, against epoch numbers starting at 1.
    """

    import matplotlib.pyplot as plt
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    # One x-axis point per recorded epoch. The epoch count comes from the loss
    # list; the old code read it from an undefined `acc` variable.
    epochs = range(1, len(loss) + 1)
    plt.figure()
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()