In [1]:
import pandas as pd
import numpy as np
import math
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error


spot = pd.read_csv('./data/spot/clarkson_data.csv', delimiter=';', parse_dates=['Date'], dayfirst=True)
pmx_forw = pd.read_csv('./data/ffa/PMAX_FFA.csv', delimiter=';', parse_dates=['Date'], dayfirst=True)
csz_forw = pd.read_csv('./data/ffa/CSZ_FFA.csv', delimiter=';', parse_dates=['Date'], dayfirst=True)
smx_forw = pd.read_csv('./data/ffa/SMX_FFA.csv', delimiter=';', parse_dates=['Date'], dayfirst=True)
oecd_ip_dev = pd.read_csv('./data/other/oecd_daily.csv', parse_dates=['Date'], dayfirst=True)
fleet_dev = pd.read_csv('./data/other/fleet_dev_daily.csv', parse_dates=['Date'], dayfirst=True)
eur_usd = pd.read_csv('./data/other/EUR_USD_historical.csv', parse_dates=['Date'], delimiter=";", dayfirst=True)
# Convert 'Last' column to numeric, replacing comma with dot for decimal point
eur_usd['Last'] = pd.to_numeric(eur_usd['Last'].str.replace(',', '.'), errors='coerce')


def pick_forw(key):
    if key == "PMX":
        return pmx_forw
    elif key == "CSZ":
        return csz_forw
    elif key == "SMX":
        return smx_forw

In [40]:
# Number of rounds based on the test set size and forecast horizon
num_rounds =  2  # Adjusted to ensure we don't exceed the test set
look_back = 10  # Adjust based on your temporal structure
hor = 5
scaler = MinMaxScaler()
s_col = "CSZ"
f_col = "1MON"
fleet_col = "CSZ fleet"
forw = pick_forw(s_col)


# Ensure 'Date' columns are in datetime format for all datasets
oecd_ip_dev['Date'] = pd.to_datetime(oecd_ip_dev['Date'])
fleet_dev['Date'] = pd.to_datetime(fleet_dev['Date'])
eur_usd['Date'] = pd.to_datetime(eur_usd['Date'])
spot['Date'] = pd.to_datetime(spot['Date'])
pmx_forw['Date'] = pd.to_datetime(pmx_forw['Date'])
csz_forw['Date'] = pd.to_datetime(csz_forw['Date'])
smx_forw['Date'] = pd.to_datetime(smx_forw['Date'])


prod_col = 'Ind Prod Excl Const VOLA'
eur_col = 'Last'

# Merge data frames on the Date column
data_combined = pd.merge(spot, forw, on='Date')
data_combined = pd.merge(data_combined, oecd_ip_dev[['Date', prod_col]], on='Date', how='inner')
data_combined = pd.merge(data_combined, fleet_dev[['Date', fleet_col]], on='Date', how='inner')
data_combined = pd.merge(data_combined, eur_usd[['Date', eur_col]], on='Date', how='inner')


# Filter out rows where the specified columns contain zeros or NA values
cols_to_check = [s_col, f_col, fleet_col, prod_col, eur_col]
data_combined = data_combined.dropna(subset=cols_to_check)  # Drop rows where NA values are present in the specified columns
data_combined = data_combined[(data_combined[cols_to_check] != 0).all(axis=1)]  # Drop rows where 0 values are present in the specified columns


# Remove rows with NA or 0 in specific columns (assuming 'SMX' and '1Q' are column names in 'data_combined')
#data_combined = data_combined[(data_combined[s_col].notna() & data_combined[s_col] != 0) & (data_combined[f_col].notna() & data_combined[f_col] != 0)]

# Transform data to log levels
data_log_levels = pd.DataFrame()
data_log_levels["spot"] = np.log(data_combined[s_col])
data_log_levels["forwp"] = np.log(data_combined[f_col])
data_log_levels[fleet_col] = np.log(data_combined[fleet_col])
data_log_levels[prod_col] = np.log(data_combined[prod_col])
data_log_levels[eur_col] = np.log(data_combined[eur_col])

data_log_levels.index = data_combined["Date"]

split_index = math.floor(len(data_log_levels) * 0.8)
print(len(data_log_levels))

3550


In [37]:



# Initialize dictionary to store MSE results for each model
mse_results = {
    'MLP_spot': [],
    'MLP_forwp': [],
    'LSTM_spot': [],
    'LSTM_forwp': [],
    'RW_spot': [],
    'RW_forwp': [],
}

def random_walk_predictions(training_data, testing_data):
    """
    Generates Random Walk predictions where the next value is assumed to be the last observed value.
    
    Parameters:
    - training_data: DataFrame containing the training data.
    - testing_data: DataFrame containing the test data.
    
    Returns:
    - predictions: Numpy array containing Random Walk predictions for the test set.
    """
    # Last observed values from the training set
    last_observed_spot = training_data['spot'].iloc[-1]
    last_observed_forwp = training_data['forwp'].iloc[-1]
    
    # Create an array of predictions, each one equal to the last observed values
    predictions = np.tile([last_observed_spot, last_observed_forwp], (len(testing_data), 1))
    
    return predictions


def create_even_odd_array(arr):
    """
    Returns an array where the first column contains values from even positions
    and the second column contains values from odd positions of the original array.
    """
    return arr.reshape(-1, 2)


# Convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=10, is_test=False):
    X, Y = [], []
    if is_test:  # for test data, we just need the last entry for 1-step ahead forecast
        X = dataset[-1:,:,]
        return X, None
    else:
        for i in range(look_back, len(dataset)- hor + 1):
            X.append(dataset[i-look_back:i])
            Y.append(dataset[i:i+hor])
    return np.array(X), np.array(Y)


# Adjust train and test sets for each forecast round
for round in range(1, num_rounds + 1):
    print("Round", round)
    # Define new split point for each round
    split_index = split_index +  hor
    print(split_index)
    
    # Update train and test sets
    train = data_log_levels.iloc[:split_index]
    test = data_log_levels.iloc[split_index:split_index+hor]

    #Scale train set
    train_scal = scaler.fit_transform(train)

    trainX, trainY = create_dataset(train_scal, look_back)
    # Create and fit the MLP model

    trainX_flat = trainX.reshape(trainX.shape[0], -1)
    trainY_flat = trainY.reshape(trainY.shape[0], -1)

    model_mlp = Sequential()
    model_mlp.add(Dense(32, input_dim=trainX_flat.shape[1], activation='relu'))
    model_mlp.add(Dense(trainY_flat.shape[1], activation="linear"))
    model_mlp.compile(loss='mean_squared_error', optimizer='adam')
    model_mlp.fit(trainX_flat, trainY_flat, epochs=30, batch_size=1, verbose=0)


    # Make predictions
    trainPredict_scal_flat = model_mlp.predict(trainX_flat)
    testX = train_scal[-look_back:].reshape(1, look_back, train_scal.shape[1])
    #testX, _ = create_dataset(trainX, look_back=look_back, is_test=True)
    testX_flat = testX.reshape(testX.shape[0], -1)
    testPredict_scal_flat = model_mlp.predict(testX_flat)
    testPredict_scal = create_even_odd_array(testPredict_scal_flat)

    # Invert predictions
    testPredict_mlp = scaler.inverse_transform(testPredict_scal)

    # Calculate mean squared error
    testScore = mean_squared_error(test["spot"], testPredict_mlp[:,0])
    testScoreForw = mean_squared_error(test["forwp"], testPredict_mlp[:,1])
    print('Test Score spot MLP: %.5f MSE' % (testScore))
    print('Test Score forw MLP: %.5f MSE' % (testScoreForw))



    # Define the LSTM model
    model_lstm = Sequential()
    model_lstm.add(LSTM(units=50, return_sequences=True, input_shape=(trainX.shape[1], trainX.shape[2])))
    model_lstm.add(LSTM(units=50))
    model_lstm.add(Dense(trainY_flat.shape[1], activation='linear'))  # Assuming multi-step forecasting

    model_lstm.compile(loss='mean_squared_error', optimizer='adam')
    model_lstm.fit(trainX, trainY_flat, epochs=30, batch_size=1, verbose=0)

    # Prepare the last sequence from the training set as the input for the first prediction
    #testX_last_sequence = train_scal[-look_back:].reshape(1, look_back, train_scal.shape[1])

    # Make predictions
    testPredict_scal_flat = model_lstm.predict(testX)

    # Since you're predicting `hor` steps ahead, you might need to adjust the code to generate
    # multiple steps if your LSTM model is set up for single-step predictions.
    # For simplicity, this example directly uses the LSTM output for multi-step predictions.

    # Invert scaling
    testPredict_scal = create_even_odd_array(testPredict_scal_flat)
    testPredict_lstm = scaler.inverse_transform(testPredict_scal)


    # Calculate and print MSE for each target
    testScore_spot_lstm = mean_squared_error(test["spot"].iloc[:hor].values, testPredict_lstm[:,0])
    testScore_forw_lstm = mean_squared_error(test["forwp"].iloc[:hor].values, testPredict_lstm[:,1])
    print('LSTM Test Score spot: %.5f MSE' % (testScore_spot_lstm))
    print('LSTM Test Score forw: %.5f MSE' % (testScore_forw_lstm))


    # Random Walk Predictions for comparison
    rw_predictions = random_walk_predictions(train, test)
    
    # Calculate and append MSE for each model for this round
    mse_results['MLP_spot'].append(mean_squared_error(test["spot"], testPredict_mlp[:, 0]))
    mse_results['MLP_forwp'].append(mean_squared_error(test["forwp"], testPredict_mlp[:, 1]))

    mse_results['LSTM_spot'].append(mean_squared_error(test["spot"], testPredict_lstm[:, 0]))
    mse_results['LSTM_forwp'].append(mean_squared_error(test["forwp"], testPredict_lstm[:, 1]))

    mse_results['RW_spot'].append(mean_squared_error(test["spot"], rw_predictions[:, 0]))
    mse_results['RW_forwp'].append(mean_squared_error(test["forwp"], rw_predictions[:, 1]))



Round 1
3567


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [4]:
for key, values in mse_results.items():
    mean = sum(values) / len(values) * 100
    print(f"Mean for {key}: {mean}")

Mean for MLP_spot: 2.2771149372540367
Mean for MLP_forwp: 0.5995018554328824
Mean for LSTM_spot: 1.3724467462538703
Mean for LSTM_forwp: 0.4940264509691707
Mean for RW_spot: 2.104689725191761
Mean for RW_forwp: 0.32988766264063507
