In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import keras
from keras.models import Sequential
from keras.layers import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Step sizes used when walking the hourly series to build lag features.
# NOTE(review): HOURS_PER_DAY is 20 rather than 24 -- presumably the input
# files hold 20 hourly samples per day; confirm against the data.
DAYS_PER_YEAR = 365
DAYS_PER_WEEK = 7
HOURS_PER_DAY = 20

def create_dataset_years(signal_data, hours=1, days=1, weeks=1, years=1):
    """Build lagged feature windows and next-step targets from a 1-D series.

    The first target sits ``years`` full years (HOURS_PER_DAY * DAYS_PER_YEAR
    samples each) into the series; every later sample is a target as well.
    For a target at position ``t`` the feature matrix holds four rows, each
    ordered oldest first and zero-padded on the right:

    * row 0 -- the ``hours`` samples immediately before ``t``;
    * row 1 -- the samples 1..``days`` day-steps (HOURS_PER_DAY) before ``t``;
    * row 2 -- the samples 1..``weeks`` week-steps before ``t``;
    * row 3 -- the samples 1..``years`` year-steps before ``t``.

    Args:
        signal_data: 1-D sequence or array of samples.
        hours, days, weeks, years: number of lags to collect at each step
            size (non-negative integers).

    Returns:
        ``(x_arr, y_arr)`` where ``x_arr`` has shape
        ``(num_data, 4, max(hours, days, weeks, years))`` and ``y_arr`` has
        shape ``(num_data,)``, with
        ``num_data = len(signal_data) - HOURS_PER_DAY * DAYS_PER_YEAR * years``.

    Raises:
        ValueError: if a lag count is negative, if the series is shorter than
            ``years`` years, or if the hour/day/week lookback reaches back
            before the start of the series.
    """
    if min(hours, days, weeks, years) < 0:
        raise ValueError("hours, days, weeks and years must be non-negative")

    signal = np.asarray(signal_data)
    day_span = HOURS_PER_DAY
    week_span = HOURS_PER_DAY * DAYS_PER_WEEK
    year_span = HOURS_PER_DAY * DAYS_PER_YEAR

    first_target = year_span * years
    num_data = len(signal) - first_target
    if num_data < 0:
        raise ValueError(
            "signal_data has %d samples but %d are needed for %d year(s) of history"
            % (len(signal), first_target, years)
        )

    # A lookback deeper than the first target would produce negative indices,
    # which numpy silently wraps to the END of the series (future leakage).
    deepest_lookback = max(hours, day_span * days, week_span * weeks)
    if deepest_lookback > first_target:
        raise ValueError(
            "lookback of %d samples reaches before the start of the series; "
            "only %d samples precede the first target"
            % (deepest_lookback, first_target)
        )

    x_arr = np.zeros((num_data, 4, max(hours, days, weeks, years)))
    y_arr = np.zeros((num_data,))

    # Position of the target sample for every output row.
    targets = first_target + np.arange(num_data)

    def lagged(step, count):
        # (num_data, count) block of samples spaced `step` apart, oldest
        # first, ending one step before each target.
        lags = step * np.arange(count, 0, -1)
        return signal[targets[:, None] - lags]

    x_arr[:, 0, :hours] = lagged(1, hours)
    x_arr[:, 1, :days] = lagged(day_span, days)
    x_arr[:, 2, :weeks] = lagged(week_span, weeks)
    x_arr[:, 3, :years] = lagged(year_span, years)
    y_arr[:] = signal[targets]

    return x_arr, y_arr

def create_model():
    """Assemble the (uncompiled) stacked-LSTM regression network.

    The stack is four LSTM layers of 64, 128, 128 and 64 units, each followed
    by ``Dropout(0.2)``, and a final ``Dense(1)`` output. The first three LSTM
    layers return full sequences so the next LSTM can consume them; the last
    one returns only its final state. No input shape is declared here.

    Returns:
        The ``Sequential`` model, not yet compiled.
    """
    # (units, return_sequences) for each recurrent layer, in order.
    lstm_specs = ((64, True), (128, True), (128, True), (64, False))

    network = Sequential()
    for units, emit_sequence in lstm_specs:
        network.add(LSTM(units, return_sequences=emit_sequence))
        network.add(Dropout(0.2))
    network.add(Dense(1))
    return network

def run_model(data, hours=3, days=4, weeks=5, years=5, batch_size=256, epochs=200):
    """Train the LSTM forecaster on one series and score it on a held-out tail.

    The series is turned into lag features with ``create_dataset_years`` and
    split without shuffling: the last 20% of samples form the test set, and
    the last 20% of the remainder form the validation set.

    Args:
        data: 1-D series passed on to ``create_dataset_years``.
        hours, days, weeks, years: lag counts passed on to
            ``create_dataset_years``.
        batch_size: batch size used for fitting, evaluation and prediction.
        epochs: number of training epochs.

    Returns:
        ``(SMAPE, RMSE)`` computed on the test set, where SMAPE is the mean of
        ``|pred - y| / (|pred| + |y|)`` and RMSE is the root mean squared
        error. Both are in the units of ``data`` as passed in.
    """
    # create model
    model = create_model()
    # NOTE(review): newer Keras releases name this argument `learning_rate`;
    # `lr` is kept to match the API this notebook was written against.
    adam = keras.optimizers.Adam(lr=0.001)
    model.compile(optimizer=adam, loss='mse')

    # prepare data (shuffle=False keeps the chronological order of the splits)
    x_data, y_data = create_dataset_years(data, hours, days, weeks, years)
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, shuffle=False)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, shuffle=False)

    # run model; the return values are not needed, the calls are kept for
    # their training / evaluation progress output
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_val, y_val))
    model.evaluate(x_test, y_test, batch_size=batch_size)

    # predict future values; flatten the (n, 1) output to match y_test
    predictions = model.predict(x_test, batch_size=batch_size).reshape(-1)

    # evaluate model
    abs_error = np.abs(predictions - y_test)
    scale = np.abs(predictions) + np.abs(y_test)
    # scale is 0 only when prediction and target are both exactly 0, i.e. a
    # perfect prediction: count it as zero error instead of 0/0 = NaN. A NaN
    # score would be written to result.csv and, being != 0.0, would make the
    # station look "already processed" on every later run.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(scale), where=scale != 0)
    SMAPE = np.mean(ratio)
    RMSE = np.sqrt(np.mean((predictions - y_test) ** 2))

    return SMAPE, RMSE

In [None]:
# Driver: train and score one model per station, persisting the scores to
# result.csv after every station so an interrupted run can be resumed.
result = pd.read_csv('./data/result.csv', encoding='utf-8')
station_numbers = result['station_number'].tolist()

for i, station_number in enumerate(station_numbers):

    # Reload on every pass so the skip check reflects what is on disk now.
    result = pd.read_csv('./data/result.csv', encoding='utf-8')
    is_station = result['station_number'] == station_number

    # skip stations already processed (a SMAPE of 0.0 marks "not done yet")
    if result[is_station].at[i, 'SMAPE'] != 0.0:
        continue

    print("Now processing station number %d" % station_number)

    # NOTE(review): the scaler is fitted on the whole series, including the
    # part run_model later holds out for testing.
    scaler = MinMaxScaler(feature_range=(0, 1))
    df = pd.read_csv('./data/departure/%d_2008_to_2017.csv' % station_number, encoding='utf-8', dtype='float64')
    single_column = df.values.reshape(-1, 1)
    data = np.concatenate(scaler.fit_transform(single_column))

    SMAPE, RMSE = run_model(data)

    result.loc[is_station, 'SMAPE'] = SMAPE
    result.loc[is_station, 'RMSE'] = RMSE

    result.to_csv('./data/result.csv', index=False)