In [None]:
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from datapipeline import split_data, create_sequences_token
from model import build_transformer_model_regression, build_lstm_model_regression, build_ann_model_regression

In [None]:
# CSV to analyse; the path is relative to the notebook's working directory.
data_path = '../results/data_row_locks_start_time_pageid-dedupe.csv'
# Load the data
data = pd.read_csv(data_path)
# Preview the first rows to sanity-check the parsed columns.
data.head()

In [None]:
# Fraction of rows, counted from the start of the file, to keep.
percent_data = 0.1

# `.copy()` detaches the subset from the loaded frame, so the column assignments
# in the following cells write to an independent DataFrame rather than to a
# slice (which makes pandas emit SettingWithCopyWarning).
# Note: re-running this cell shrinks `data` again; reload the CSV first.
data = data.iloc[:int(len(data) * percent_data)].copy()

In [None]:
# Strip the embedded spaces from the id strings and convert them to integers.
# The vectorised `.str` accessor replaces the per-row Python lambda, and the
# single `astype(int)` replaces the former int() + astype(int) double conversion.
# Assumes PAGEID / ROWID were read as strings (the original code called
# str.replace on every value).
data['PAGEID_int'] = data['PAGEID'].str.replace(' ', '', regex=False).astype(int)
data['ROWID_int'] = data['ROWID'].str.replace(' ', '', regex=False).astype(int)

In [None]:
# Summary statistics of the parsed page ids: (min, max, mean, median).
data['PAGEID_int'].min(), data['PAGEID_int'].max(), data['PAGEID_int'].mean(), data['PAGEID_int'].median()

In [None]:
# Histogram of the parsed page ids, using 20 bins.
data['PAGEID_int'].hist(bins=20)

In [None]:
# Min-max scale both id columns. The extrema are taken over the whole
# (sub-sampled) frame, i.e. before the train/test split performed further down.
for raw_col, scaled_col in (
    ('PAGEID_int', 'PAGEID_int_normalized'),
    ('ROWID_int', 'ROWID_int_normalized'),
):
    lowest, highest = data[raw_col].min(), data[raw_col].max()
    data[scaled_col] = (data[raw_col] - lowest) / (highest - lowest)

In [None]:
def inverse_min_max_scale(scaled_lst, original_min, original_max):
    """Map min-max scaled values back to their original range.

    Inverts ``(x - original_min) / (original_max - original_min)``.

    Parameters
    ----------
    scaled_lst : array-like
        Scaled values of any shape (list, nested list or ndarray).
    original_min, original_max : number
        Extrema that were used for the forward scaling.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``scaled_lst`` holding the unscaled values.
    """
    # One vectorised expression instead of a Python loop over the first axis;
    # this also keeps the shape of empty multi-dimensional input intact.
    scaled = np.asarray(scaled_lst)
    span = original_max - original_min
    return np.asarray(scaled * span + original_min)

In [None]:
# Inspect the two parsed integer columns side by side.
data[['PAGEID_int', 'ROWID_int']]

In [None]:
def create_sequences(df, col, window_size, horizon, target_col='PAGEID_int_normalized'):
    """Slice a frame into sliding-window samples for sequence regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows, in the order the windows should follow.
    col : str or list of str
        Feature column(s) used as model input. A list yields ``X`` of shape
        ``(samples, window_size, len(col))``; a single column name yields
        ``(samples, window_size)``.
    window_size : int
        Number of consecutive rows in each input window.
    horizon : int
        Number of rows following the window that form the target. Must not
        exceed ``window_size`` because the naive baseline repeats the last
        ``horizon`` rows of the window.
    target_col : str, optional
        Column the targets and the naive baseline are read from. Defaults to
        ``'PAGEID_int_normalized'``, the column that used to be hard-coded.

    Returns
    -------
    X, y, y_naive : numpy.ndarray
        Input windows, targets of shape ``(samples, horizon)`` and the naive
        baseline of the same shape as ``y``. All three are empty arrays when
        ``df`` has fewer than ``window_size + horizon`` rows.

    Raises
    ------
    ValueError
        If ``window_size`` or ``horizon`` is not positive, or if ``horizon``
        is larger than ``window_size``.
    """
    if window_size < 1 or horizon < 1:
        raise ValueError("window_size and horizon must be positive integers")
    if horizon > window_size:
        # A negative slice start would silently wrap around to the end of the data.
        raise ValueError("horizon must not exceed window_size")

    # Convert once up front; slicing NumPy arrays inside the loop is far
    # cheaper than repeated `.iloc` calls.
    features = df[col].to_numpy()
    # Read the target from `df` itself (not from the feature subset) so it
    # does not have to be one of the feature columns.
    target = df[target_col].to_numpy()

    X_lst, y_lst, y_naive_lst = [], [], []
    for start in range(len(df) - window_size - horizon + 1):
        end = start + window_size
        X_lst.append(features[start:end])
        y_lst.append(target[end:end + horizon])
        # The naive prediction repeats the last `horizon` values of the window.
        y_naive_lst.append(target[end - horizon:end])

    return np.array(X_lst), np.array(y_lst), np.array(y_naive_lst)


In [None]:
# Peek at the first two rows of the normalised columns as a plain NumPy array.
data[["PAGEID_int_normalized", "ROWID_int_normalized"]].iloc[0:2].to_numpy()

In [None]:
# Build sliding windows over the normalised page id: 50 past steps as input,
# the single next step as target.
X, y, y_naive = create_sequences(data, ["PAGEID_int_normalized"], window_size=50, horizon=1)
# No reshape is needed afterwards: because the column is passed as a list,
# X comes back as (samples, window_size, 1) and y as (samples, horizon).

In [None]:
# Spot-check sample 0: last five inputs, the target, and the naive baseline
# (with horizon=1 the naive value is the last input of the window).
X[0][-5:], y[0], y_naive[0]

In [None]:
# Same spot-check for sample 1, whose window starts one row later.
X[1][-5:], y[1], y_naive[1]

In [None]:
# Input windows and their shape.
X, X.shape

In [None]:
# Targets and their shape.
y, y.shape

In [None]:
# Naive baseline values and their shape.
y_naive, y_naive.shape

In [None]:
# Passed to split_data below as its `test_size` argument.
test_size = 0.3

In [None]:
# Split the windows, targets and naive baselines into train and test parts.
# shuffle=False is passed, presumably to keep the split chronological — confirm
# against split_data in datapipeline.
# NOTE(review): the unpacking order below (x_train, x_test, y_train,
# y_train_naive, y_test, y_test_naive) must match split_data's return order,
# which is not visible from this notebook — verify.
(
    x_train,
    x_test,
    y_train,
    y_train_naive,
    y_test,
    y_test_naive
) = split_data(X, y, y_naive, test_size, shuffle=False)

In [None]:
# Hold out the trailing `val_split` share of the training windows for validation.
# Note: re-running this cell alone splits the already-reduced x_train again;
# re-run from the split_data cell instead.
val_split = 0.2
split_idx = int(len(x_train) * (1 - val_split))
x_train, x_val = x_train[:split_idx], x_train[split_idx:]
y_train, y_val = y_train[:split_idx], y_train[split_idx:]
# Split the naive baseline with the same index so that, at this point, it lines
# up row-for-row with x_train / y_train (it used to keep its pre-split length).
# Assumes split_data returned y_train_naive with one row per training window.
y_train_naive, y_val_naive = y_train_naive[:split_idx], y_train_naive[split_idx:]

In [None]:
# Shapes of the training inputs, targets and naive baseline.
x_train.shape, y_train.shape, y_train_naive.shape

In [None]:
# Shapes of the validation inputs and targets.
x_val.shape, y_val.shape

In [None]:
# Shapes of the test inputs, targets and naive baseline.
x_test.shape, y_test.shape, y_test_naive.shape

# Transformer Regression

In [None]:
# Size the model from the data: the last axis of x_train is the feature count,
# axis 1 is the window length, and axis 1 of y_train is the forecast horizon.
model = build_transformer_model_regression(
    feature_dim=x_train.shape[-1],
    max_length=x_train.shape[1],
    horizon=y_train.shape[1],
)


In [None]:
# Print the model summary of the transformer model.
model.summary()

In [None]:
# Shuffle the training windows with a fixed seed so the permutation is the same
# on every run. The same index array is applied to x_train and y_train, which
# keeps inputs and targets paired; x_val and x_test are left in their order.
# Note: y_train_naive is not permuted here, so it no longer corresponds
# row-for-row to x_train after this cell.
np.random.seed(42)
idx = np.random.permutation(len(x_train))
x_train = x_train[idx]
y_train = y_train[idx]

In [None]:
# Number of training epochs; reused by the fit calls of all three models below.
epochs = 10

In [None]:
# Optimise mean squared error with Adam and track MAE as an additional metric.
# The targets are the min-max normalised page ids, so both numbers are in
# normalised units, not raw page ids.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)
# Validation runs on the x_val / y_val arrays carved off the training set above.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=32,
)

In [None]:
# Per-epoch validation loss and validation MAE recorded during training.
history.history['val_loss'], history.history['val_mean_absolute_error']

In [None]:
# Training vs. validation loss per epoch for the model fitted above.
# `plt` comes from the notebook's imports cell rather than a mid-notebook import.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set(title='model loss', xlabel='epoch', ylabel='loss')
ax.legend()
plt.show()

In [None]:
# Predict the test set and compare the prediction shape with the target shape;
# the two must agree for the element-wise error computed below.
y_pred = model.predict(x_test)
y_pred.shape, y_test.shape

In [None]:
# Inspect the test inputs.
x_test

In [None]:
# Inspect the raw predictions (still in normalised units).
y_pred

In [None]:
# Min/max of the raw page ids — the same statistics the normalisation cell used,
# so the inverse transform maps back to page-id values.
original_min = data['PAGEID_int'].min()
original_max = data['PAGEID_int'].max()
# Undo the scaling for targets, predictions and the naive baseline.
y_test_unscaled = inverse_min_max_scale(y_test, original_min, original_max)
y_pred_unscaled = inverse_min_max_scale(y_pred, original_min, original_max)
y_test_naive_unscaled = inverse_min_max_scale(y_test_naive, original_min, original_max)
# Show the first five of each for a quick visual comparison.
y_test_unscaled[:5], y_pred_unscaled[:5], y_test_naive_unscaled[:5]

In [None]:
# Calculate the model's MAE on the test set, in original page-id units.
mae = np.abs(y_test_unscaled - y_pred_unscaled).mean()
mae

In [None]:
# MAE of the naive baseline on the same test targets, for comparison.
mae_naive = np.abs(y_test_unscaled - y_test_naive_unscaled).mean()
mae_naive

# LSTM

In [None]:
# Build the LSTM model with the same data-derived dimensions as the transformer.
# `model` is rebound here, so the transformer model is no longer referenced.
model = build_lstm_model_regression(
    feature_dim=x_train.shape[-1],
    max_length=x_train.shape[1],
    horizon=y_train.shape[1],
)
model.summary()

In [None]:
# Same training setup as the transformer: Adam, MSE loss, MAE as extra metric.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)
# x_train / y_train are the arrays already shuffled in the transformer section.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=32,
)
# Per-epoch validation loss and validation MAE.
history.history['val_loss'], history.history['val_mean_absolute_error']

In [None]:
# Training vs. validation loss per epoch for the LSTM fitted above.
# `plt` comes from the notebook's imports cell rather than a repeated import.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set(title='model loss', xlabel='epoch', ylabel='loss')
ax.legend()
plt.show()

In [None]:
# Predict the test set with the LSTM and compare prediction and target shapes.
y_pred = model.predict(x_test)
y_pred.shape, y_test.shape

In [None]:
# Undo the scaling; original_min / original_max were set in the transformer
# evaluation cell above.
y_test_unscaled = inverse_min_max_scale(y_test, original_min, original_max)
y_pred_unscaled = inverse_min_max_scale(y_pred, original_min, original_max)
y_test_naive_unscaled = inverse_min_max_scale(y_test_naive, original_min, original_max)
# First five targets, predictions and naive values in original units.
y_test_unscaled[:5], y_pred_unscaled[:5], y_test_naive_unscaled[:5]

In [None]:
# Calculate the LSTM's MAE on the test set, in original page-id units.
mae = np.abs(y_test_unscaled - y_pred_unscaled).mean()
mae

In [None]:
# MAE of the naive baseline on the same test targets, for comparison.
mae_naive = np.abs(y_test_unscaled - y_test_naive_unscaled).mean()
mae_naive

# MLP

In [None]:
# Build the ANN (MLP) model with the same data-derived dimensions as the other
# two models. `model` is rebound here, replacing the LSTM.
model = build_ann_model_regression(
    feature_dim=x_train.shape[-1],
    max_length=x_train.shape[1],
    horizon=y_train.shape[1],
)
model.summary()

In [None]:
# Same training setup as the other two models: Adam, MSE loss, MAE as extra metric.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)
# x_train / y_train are the arrays already shuffled in the transformer section.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=32,
)
# Per-epoch validation loss and validation MAE.
history.history['val_loss'], history.history['val_mean_absolute_error']

In [None]:
# Training vs. validation loss per epoch for the MLP fitted above.
# `plt` comes from the notebook's imports cell rather than a repeated import.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set(title='model loss', xlabel='epoch', ylabel='loss')
ax.legend()
plt.show()

In [None]:
# Predict the test set with the MLP and compare prediction and target shapes.
y_pred = model.predict(x_test)
y_pred.shape, y_test.shape

In [None]:
# Undo the scaling; original_min / original_max were set in the transformer
# evaluation cell above.
y_test_unscaled = inverse_min_max_scale(y_test, original_min, original_max)
y_pred_unscaled = inverse_min_max_scale(y_pred, original_min, original_max)
y_test_naive_unscaled = inverse_min_max_scale(y_test_naive, original_min, original_max)
# First five targets, predictions and naive values in original units.
y_test_unscaled[:5], y_pred_unscaled[:5], y_test_naive_unscaled[:5]

In [None]:
# Calculate the MLP's MAE on the test set, in original page-id units.
mae = np.abs(y_test_unscaled - y_pred_unscaled).mean()
mae

In [None]:
# MAE of the naive baseline on the same test targets, for comparison.
mae_naive = np.abs(y_test_unscaled - y_test_naive_unscaled).mean()
mae_naive