### Project Notebook
This is the complete notebook: it takes in the data from NOAA, processes it into frames for the PredNet architecture, and produces a resulting prediction.

In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [None]:
# List the CSV files in the processed-data folder.
# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which the files are concatenated (and therefore the positional row order
# that later cells rely on) reproducible from run to run.
filenames = sorted(os.listdir('D:/Nico/Desktop/processed_data'))

In [None]:
# Hourly measurement columns to load from each data file.
# NOTE(review): this list has 13 names, while create_frames() slices 12 columns
# (iloc[..., 1:13]) and the model is built with n_channels = 12 -- confirm which
# columns are actually meant to become frame channels.
header_wanted = [
 'HOURLYVISIBILITY',
 'HOURLYDRYBULBTEMPC',
 'HOURLYWETBULBTEMPC',
 'HOURLYDewPointTempC',
 'HOURLYRelativeHumidity',
 'HOURLYWindSpeed',
 'HOURLYWindGustSpeed',
 'HOURLYStationPressure',
 'HOURLYPressureTendency',
 'HOURLYPressureChange',
 'HOURLYSeaLevelPressure',
 'HOURLYPrecip',
 'HOURLYAltimeterSetting']

In [None]:
# Columns handed to pd.read_csv: the DATE and STATION columns plus the measurements.
usecols = ['DATE','STATION'] + header_wanted

In [None]:
#Loading all files into a pandas Dataframe
# Only the `usecols` columns of each file are read; the per-file frames are
# stacked row-wise in the order of `filenames`.
# NOTE(review): pd.concat is called without ignore_index, so index labels repeat
# across files; the later cells select rows with boolean masks and iloc.
# NOTE(review): tqdm.pandas() is called but no progress_apply appears in the
# visible cells -- confirm whether it is still needed.
tqdm.pandas()
df = pd.concat([pd.read_csv('D:/Nico/Desktop/processed_data/{}'.format(x), usecols=usecols, low_memory=False) for x in tqdm(filenames)])

At this point all the data has been loaded into a single dataframe and any data changes have been made. The next step is to break the data up by station (WBAN) and place each station's data in a 2D grid at the appropriate cell.

In [None]:
# Station lookup table: one row per station id with its LON_SCALED / LAT_SCALED
# values, which later cells use directly as grid column / row indices.
# NOTE(review): those values must therefore be integers inside the grid -- confirm.
stations = pd.read_csv("../Playground/stations_unique.csv", usecols = ['STATION_ID', 'LON_SCALED', 'LAT_SCALED'])

In [None]:
# Grid size in cells: `height` rows by `width` columns.
# These match im_height / im_width in the model-parameter cell further down.
height = 20
width = 40

In [None]:
# Station mask: `height` rows of `width` zeros. A later cell sets an entry to 1
# where a station maps onto that grid cell.
mask = [[0 for _ in range(width)] for _ in range(height)]

# Lookup from station id to its (LON_SCALED, LAT_SCALED) pair.
wban_loc = {
    station_id: (lon, lat)
    for station_id, lon, lat in zip(stations.STATION_ID, stations.LON_SCALED, stations.LAT_SCALED)
}

In [None]:
# Grid of per-cell station tables; an empty DataFrame marks "no station here".
# Each cell gets its own DataFrame instance: `[pd.DataFrame()] * width` would put
# the very same object in every column of a row, so any in-place change to one
# placeholder would show up in all of them.
grid = [[pd.DataFrame() for _ in range(width)] for _ in range(height)]

In [None]:
# Put every station's rows into its grid cell and flag that cell in the mask.
# wban_loc values are (LON_SCALED, LAT_SCALED), i.e. (column, row).
for station_id, (col, row) in tqdm(wban_loc.items()):
    station_rows = df.loc[df.STATION == station_id]
    # A station that has no rows in df must not be flagged: a masked cell is
    # skipped by average_grid_fill, so it would stay all-zero in every frame.
    if station_rows.empty:
        continue
    mask[row][col] = 1
    grid[row][col] = station_rows

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Visual check of the station mask (cells set to 1 versus cells left at 0).
plt.imshow(mask)

In [None]:
def create_frames(data, height, width, depth, n_channels=12, frames_per_day=24):
    """Turn the per-cell station tables into day-long stacks of grid frames.

    Time step ``i`` of a cell is row position ``i`` of that cell's DataFrame;
    the channel values are the columns at positions ``1 .. n_channels``.

    NOTE(review): the loaded frame has DATE, STATION and 13 measurement
    columns, so positions 1..12 may include a key column -- confirm the slice.

    Args:
        data: height x width nested sequence of DataFrames; an empty DataFrame
            marks a cell without station data.
        height: number of grid rows.
        width: number of grid columns.
        depth: number of time steps to read.
        n_channels: number of columns copied into each cell (default 12).
        frames_per_day: number of consecutive frames grouped into one day
            (default 24).

    Returns:
        A list of days, each a list of ``frames_per_day`` arrays of shape
        (height, width, n_channels). Frames left over when ``depth`` is not a
        multiple of ``frames_per_day`` are dropped.
    """
    days = []
    frames = []
    for i in tqdm(range(depth)):
        frame = np.zeros((height, width, n_channels))
        for y in range(height):
            for x in range(width):
                cell = data[y][x]
                # Stations with fewer than `depth` rows keep zeros for the
                # missing time steps instead of raising IndexError.
                if not cell.empty and i < len(cell):
                    frame[y][x] = cell.iloc[[i], 1:1 + n_channels].values.flatten()
        frames.append(frame)
        # Close the current day once it holds frames_per_day frames.
        if (i + 1) % frames_per_day == 0:
            days.append(frames)
            frames = []
    return days

In [None]:
def average_grid_fill(mask, data, height, width):
    """Fill every cell without a station with the mean of its neighbours.

    Cells are updated in place, row by row, so a value written earlier in the
    sweep feeds into the averages computed after it, and neighbours that have
    not been filled yet contribute whatever value they currently hold.

    Args:
        mask: height x width grid; a cell equal to 1 holds station data and is
            left untouched.
        data: height x width grid of values (scalars or per-channel vectors),
            modified in place.
        height: number of grid rows.
        width: number of grid columns.

    Returns:
        The same ``data`` object, after filling.
    """
    for i in range(height):
        for j in range(width):
            if mask[i][j] != 1:
                neighbors = get_neighbors(j, i, data)
                # A 1x1 grid has no neighbours; leave the cell as it is rather
                # than writing the NaN that np.mean([]) produces.
                if neighbors:
                    # axis=0 averages across neighbours while keeping each
                    # channel separate; without it every channel of every
                    # neighbour is collapsed into one scalar.
                    data[i][j] = np.mean(neighbors, axis=0)

    return data

In [None]:
def get_neighbors(x, y, g):
    """Collect the values of the cells surrounding (x, y) in grid ``g``.

    Args:
        x: column index of the centre cell.
        y: row index of the centre cell.
        g: grid indexable as ``g[row][col]`` (nested lists or an array).

    Returns:
        List of the neighbouring values, scanned row by row from the top-left
        neighbour. The centre cell and positions outside the grid are left out.
    """
    neighbors = []
    n_rows = len(g)
    for i in (y - 1, y, y + 1):
        # Explicit bounds checks replace a bare `except: pass`, which also hid
        # unrelated errors raised while reading the grid.
        if not 0 <= i < n_rows:
            continue
        n_cols = len(g[i])
        for j in (x - 1, x, x + 1):
            if not 0 <= j < n_cols:
                continue
            if i == y and j == x:
                continue
            neighbors.append(g[i][j])
    return neighbors

In [None]:
def store_sequence(frames):
    """Dump the frames and a matching per-frame source list as hickle files.

    Args:
        frames: sequence of days, each a sequence of frames.

    Writes './data/train/x_train.hkl' (the frames as given) and
    './data/train/x_sources.hkl' (one label per frame: the index of its day,
    as a string).

    NOTE(review): the frames are dumped in their nested day/frame layout while
    the source list is flat (one entry per frame) -- confirm this is the layout
    SequenceGenerator expects.
    """
    import hickle as hkl

    source_list = []
    for day_idx in range(len(frames)):
        # One label per frame of this day. `source_list += '<n>'` would add one
        # entry per *character*, so day 10 onwards produced several bogus
        # labels per frame.
        source_list.extend([str(day_idx)] * len(frames[day_idx]))

    hkl.dump(frames, './data/train/x_train.hkl')
    hkl.dump(source_list, './data/train/x_sources.hkl')
            

Splits is a dictionary holding the train, test, and val splits.
The values for train, test, and val are lists of tuples holding the category and folder name.
In the end, each image gets a source associated with it.
There is only one data hickle dump and one source hickle dump for each of train, test, and val.

In [None]:
# 504 time steps = 21 groups of 24 frames (create_frames closes a day every 24 frames).
frames = create_frames(grid, height, width,504)

In [None]:
# Neighbour-fill the station-less cells of every frame, one day at a time.
for day_frames in tqdm(frames):
    for hour in range(len(frames[0])):
        day_frames[hour] = average_grid_fill(mask, day_frames[hour], height, width)

In [None]:
# NOTE(review): store_sequence always writes to the same two paths, so the files
# written here are overwritten by the later store_sequence(np_frames) call.
store_sequence(frames)

In [None]:
# Stack the nested day/frame lists into a single array and display its shape.
np_frames = np.array(frames)
np_frames.shape

In [None]:
# Persist the array form of the frames (same output paths as the earlier
# store_sequence(frames) call, which this replaces on disk).
store_sequence(np_frames)

At this point the data has been processed into discrete frames, and it is time to run it through the PredNet architecture for training.

In [3]:
np.random.seed(123)
from six.moves import cPickle

from keras import backend as K
from keras.models import Model
from keras.layers import Input, Dense, Flatten
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
from keras.optimizers import Adam

from prednet import PredNet
from data_utils import SequenceGenerator

Using TensorFlow backend.


In [4]:
# Base folders, relative to the notebook's working directory.
WEIGHTS_DIR = './weights/'
DATA_DIR = './data/'

In [5]:
save_model = True  # if weights will be saved
weights_file = os.path.join(WEIGHTS_DIR, 'prednet_weather_weights.hdf5')  # where weights will be saved
# NOTE(review): presumably where the model architecture JSON would go; not used in the visible cells.
json_file = os.path.join(WEIGHTS_DIR, 'prednet_weather_model.json')

In [6]:
# Data files
# These are the two paths store_sequence() writes to (under ./data/train/).
#TODO: Use the files from NOAA and process them into proper frames
# NOTE(review): the TODO above looks stale -- the earlier cells already build the frames.
train_file = os.path.join(DATA_DIR,'train/', 'x_train.hkl')
train_sources = os.path.join(DATA_DIR, 'train/', 'x_sources.hkl')
# Validation files are not produced by the visible cells, hence commented out.
#val_file = os.path.join(DATA_DIR, 'X_val.hkl')
#val_sources = os.path.join(DATA_DIR, 'sources_val.hkl')

In [7]:
# Training parameters
nb_epoch = 1
batch_size = 4
# Divided by batch_size in the fit_generator call to get the number of steps per epoch.
samples_per_epoch = 500
# NOTE(review): not referenced in the visible cells (no validation generator is built).
N_seq_val = 100  # number of sequences to use for validation

In [8]:
# Model parameters
# stack_sizes has three entries, matching the three per-layer error outputs
# shown in the model summary; frame size and channel count match the grid
# (20 x 40) and the 12 values stored per cell by create_frames.
n_channels, im_height, im_width = (12, 20, 40)
input_shape = (n_channels, im_height, im_width) if K.image_data_format() == 'channels_first' else (im_height, im_width, n_channels)
stack_sizes = (n_channels, 48, 96)
R_stack_sizes = stack_sizes
A_filt_sizes = (3, 3)
Ahat_filt_sizes = (3, 3, 3)
R_filt_sizes = (3, 3, 3)
layer_loss_weights = np.array([1., 0., 0.])  # weighting for each layer in final loss; with the three layers used here, "L_0" model: [1, 0, 0], "L_all": [1, 0.1, 0.1]
layer_loss_weights = np.expand_dims(layer_loss_weights, 1)
nt = 24  # number of timesteps used for sequences in training
time_loss_weights = 1./ (nt - 1) * np.ones((nt,1))  # equally weight all timesteps except the first
time_loss_weights[0] = 0

In [9]:
# Build the PredNet layer in 'error' output mode with return_sequences=True;
# its output is used below as per-time-step, per-layer errors.
prednet = PredNet(stack_sizes, R_stack_sizes,
                  A_filt_sizes, Ahat_filt_sizes, R_filt_sizes,
                  output_mode='error', return_sequences=True)

In [10]:
# Wrap PredNet in a model whose single output is the weighted prediction error:
# the errors are first weighted per layer, then per time step. Both Dense layers
# are frozen (trainable=False) and only apply the fixed weights defined above.
inputs = Input(shape=(nt,) + input_shape)
errors = prednet(inputs)  # errors will be (batch_size, nt, nb_layers)
errors_by_time = TimeDistributed(Dense(1, trainable=False), weights=[layer_loss_weights, np.zeros(1)], trainable=False)(errors)  # calculate weighted error by layer
errors_by_time = Flatten()(errors_by_time)  # will be (batch_size, nt)
final_errors = Dense(1, weights=[time_loss_weights, np.zeros(1)], trainable=False)(errors_by_time)  # weight errors by time
model = Model(inputs=inputs, outputs=final_errors)
# NOTE(review): the optimizer is given by name here, so the Adam class imported above is unused.
model.compile(loss='mean_absolute_error', optimizer='adam')

In [11]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 24, 20, 40, 12)    0         
_________________________________________________________________
pred_net_1 (PredNet)         (None, 24, 3)             1645548   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 24, 1)             4         
_________________________________________________________________
flatten_1 (Flatten)          (None, 24)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 25        
Total params: 1,645,577
Trainable params: 1,645,548
Non-trainable params: 29
_________________________________________________________________


In [12]:
# Placeholder targets: 20 single-element arrays of random integers in [0, 255).
# `output` is only referenced by the commented-out model.fit call further down.
truth = [np.random.randint(255, size=(1)) for _ in range(20)]
output = np.array(truth)

In [13]:
# Training-batch generator built from the two hickle files written by store_sequence().
# NOTE(review): SequenceGenerator comes from data_utils (not shown) -- presumably it
# yields shuffled batches of nt-step sequences; confirm the data layout it expects.
train_generator = SequenceGenerator(train_file, train_sources, nt, batch_size=batch_size, shuffle=True)

In [None]:
def lr_schedule(epoch):
    """Learning rate for a given epoch: 0.001 for the first 75 epochs, 0.0001 afterwards."""
    if epoch < 75:
        return 0.001
    return 0.0001


callbacks = [LearningRateScheduler(lr_schedule)]
#history = model.fit(np_frames, output ,batch_size, nb_epoch, callbacks=callbacks)

In [None]:
# Train from the generator. The step count must be a whole number of batches:
# `//` keeps it an integer, whereas `/` yields the float 125.0 under Python 3.
history = model.fit_generator(train_generator, samples_per_epoch // batch_size, nb_epoch, callbacks=callbacks)

Epoch 1/1
