### Connect to Drive

In [None]:
# Mount Google Drive in the Colab runtime, then change into the dataset folder
# so that the relative paths passed to np.load further down resolve.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/ANNChallenge/training_dataset

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/ANNChallenge/training_dataset


### Import libraries

In [None]:
# Fix randomness and hide warnings
# The same seed is reused for Python's `random`, NumPy, TensorFlow and the data splits.
seed = 42

import os
# Environment variables are set before TensorFlow / matplotlib are imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes, not this kernel - confirm if hash
# determinism matters.
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
# `Warning` is the base class of `FutureWarning`, so the second filter already
# covers the first one.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
np.random.seed(seed)

# Used in the TensorFlow cell to set the logger level.
import logging

import random
random.seed(seed)

In [None]:
# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Reduce TensorFlow's log output: autograph verbosity 0, loggers at ERROR level
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Seed TensorFlow with the same seed used for `random` and NumPy
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)

2.15.0


In [None]:
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
plt.rc('font', size=16)
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


### Load and process data

In [None]:
# Load the three input arrays; paths are relative to the working directory
# selected with %cd in the first cell.
# `dataset[i]` is sliced as one time series by the next cell.
dataset = np.load('training_data.npy')
# `label[i]` is used as the category of series i.
label = np.load('categories.npy')
# Each row is read as (start, end) = (v[0], v[1]) by the next cell.
valid = np.load('valid_periods.npy')

In [None]:
# Keep only the valid portion of every series.
# The slices have different lengths, so they are stored in a 1-D object array
# (one ndarray per element). Building it with np.array() on the ragged list
# raises ValueError on NumPy >= 1.24, hence the explicit allocation + fill.
valid_dataset = np.empty(len(valid), dtype=object)

for i,v in enumerate(valid):
    start = v[0]
    end = v[1]
    valid_dataset[i] = dataset[i][start:end+1]  # +1 to include the 'end' index

valid_dataset.shape

(48000,)

### Drop series shorter than the length limit

In [None]:
# Minimum number of valid samples a series must have to be kept
limit = 200

In [None]:
len1 = len(valid_dataset)

# Boolean mask of the series that have at least `limit` valid samples
keep_mask = np.array([len(v) >= limit for v in valid_dataset], dtype=bool)

# Select with the mask instead of rebuilding the arrays with np.array():
# the kept series have different lengths, and np.array() on a ragged list
# raises ValueError on NumPy >= 1.24. Series and labels stay aligned because
# both are indexed with the same mask.
valid_dataset = valid_dataset[keep_mask]
label = label[keep_mask]

len2 = len(valid_dataset)

# New shapes and portion of the initial dataset that was kept
valid_dataset.shape, label.shape, (len2/len1)*100

((23057,), (23057,), 48.03541666666667)

### Build sequences

In [None]:
# Hold out 10% of the series as the test set, stratified by category and
# seeded so the split is reproducible.
dataset_train_val, dataset_test, label_train_val, label_test = train_test_split(
    valid_dataset,
    label,
    test_size = 0.1,
    stratify = label,
    random_state = seed
)

In [None]:
# Number of series / labels in the train+validation and test splits
dataset_train_val.shape, dataset_test.shape, label_train_val.shape, label_test.shape

((20751,), (2306,), (20751,), (2306,))

In [None]:
# Split off 10% of the remaining series for validation, again stratified by
# category and with the same seed.
dataset_train, dataset_val, label_train, label_val = train_test_split(
    dataset_train_val,
    label_train_val,
    test_size = 0.1,
    stratify = label_train_val,
    random_state = seed
)

In [None]:
# Number of series / labels in the training and validation splits
dataset_train.shape, dataset_val.shape, label_train.shape, label_val.shape

((18675,), (2076,), (18675,), (2076,))

In [None]:
# Number of consecutive samples in each input sequence
window = 200
# Step between the starts of two consecutive windows (must divide `window`)
stride = 20
# Number of samples right after each window that are used as the target
telescope = 9

In [None]:
def build_sequences(valid_dataset, categories, window, stride, telescope):
    """Cut every series into (input window, future target) pairs.

    For each series the last `telescope` samples are reserved as targets.
    The remaining length is then aligned to a multiple of `stride`:
    series that are too short are left-padded with zeros up to `window`;
    otherwise the first samples are dropped when at most half a stride
    would be lost, and zeros are prepended when more would be lost.
    Windows of `window` samples are extracted every `stride` samples, each
    paired with the `telescope` samples that follow it.

    Args:
        valid_dataset: iterable of 1-D series (possibly of different lengths).
        categories: sequence with one category per series, indexed like
            `valid_dataset`.
        window: number of samples in each input sequence.
        stride: step between consecutive windows; must divide `window`.
        telescope: number of future samples in each target.

    Returns:
        Tuple `(dataset, unseen, new_categories)` of NumPy arrays holding the
        input windows, the matching targets and the category of the series
        each window came from.

    Raises:
        AssertionError: if `window` is not a multiple of `stride`.
    """
    # Sanity check to avoid runtime errors
    assert window % stride == 0
    dataset = []         # Input windows
    unseen = []          # Targets following each window
    new_categories = []  # Category repeated once per window

    for i, v in enumerate(valid_dataset):

        temp_dataset = np.array(v)

        # Do not consider the final telescope samples
        length = len(temp_dataset) - telescope

        # If the usable length is below `window`, left-pad with zeros up to `window`
        if length < window:
            padding_len = window - length
            padding = np.zeros(padding_len, dtype='float32')
            temp_dataset = np.concatenate((padding, temp_dataset))
            length = len(temp_dataset) - telescope
            # Compare against `window` (not a hard-coded 200) so the function
            # works for any window size.
            assert length == window

        # Now every series has at least window + telescope samples.
        # Number of samples in excess of a whole number of strides
        to_remove = length % stride
        # If we lose at most half a stride, we cut the series
        if to_remove <= stride/2:
            # Remove the first to_remove samples of the series
            if to_remove != 0:
                temp_dataset = temp_dataset[to_remove:]
                length = len(temp_dataset) - telescope
            assert length % stride == 0
        else:
            # Otherwise left-pad with zeros up to the next multiple of stride
            padding_len = stride - to_remove
            padding = np.zeros(padding_len, dtype='float32')
            temp_dataset = np.concatenate((padding, temp_dataset))
            length = len(temp_dataset) - telescope
            assert length % stride == 0

        # Slide the window; the last window ends `telescope` samples before the end
        for idx in range(0, len(temp_dataset) - window - telescope + 1, stride):
            dataset.append(temp_dataset[idx:idx+window])
            unseen.append(temp_dataset[idx+window:idx+window+telescope])
            new_categories.append(categories[i])

    dataset = np.array(dataset)
    unseen = np.array(unseen)
    new_categories = np.array(new_categories)
    return dataset, unseen, new_categories

In [None]:
# Cut each split into input windows (X_*), targets (y_*) and per-window categories (cat_*)
X_train, y_train, cat_train = build_sequences(dataset_train, label_train, window, stride, telescope)
X_val, y_val, cat_val = build_sequences(dataset_val, label_val, window, stride, telescope)
X_test, y_test, cat_test = build_sequences(dataset_test, label_test, window, stride, telescope)

In [None]:
# Expanding 1 dimension
X_train = np.expand_dims(X_train, axis=-1)
X_val = np.expand_dims(X_val, axis = -1)
X_test = np.expand_dims(X_test, axis=-1)

y_train = np.expand_dims(y_train, axis=-1)
y_val = np.expand_dims(y_val, axis = -1)
y_test = np.expand_dims(y_test, axis=-1)

In [None]:
# Shapes of the input/target arrays after the trailing axis was added
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((111751, 200, 1),
 (111751, 9, 1),
 (12550, 200, 1),
 (12550, 9, 1),
 (13731, 200, 1),
 (13731, 9, 1))

### Train the model

In [None]:
# Per-sample input and output shapes (first axis dropped)
input_shape = X_train.shape[1:]
output_shape = y_train.shape[1:]
# NOTE(review): batch_size and epochs are not used by any cell visible in this
# notebook - presumably they belong to a training cell that is missing here.
batch_size = 64
epochs = 200

In [None]:
# NOTE(review): `model` is not defined by any earlier cell visible in this
# notebook (it is only loaded in the "Autoregressive prediction" section), so
# this cell fails on Restart & Run All unless a model is built or loaded first.
# The saved output of this cell should be regenerated once that is fixed.
# Predict the test set using the model
predictions = model.predict(X_test, verbose=0)

# Print the shape of the predictions
print(f"Predictions shape: {predictions.shape}")

# Calculate and print Mean Squared Error (MSE)
# Targets and predictions are flattened, so the error is averaged over every
# predicted sample of every window.
mean_squared_error = tfk.metrics.mean_squared_error(y_test.flatten(), predictions.flatten()).numpy()
print(f"Mean Squared Error: {mean_squared_error}")

# Calculate and print Mean Absolute Error (MAE)
mean_absolute_error = tfk.metrics.mean_absolute_error(y_test.flatten(), predictions.flatten()).numpy()
print(f"Mean Absolute Error: {mean_absolute_error}")

Predictions shape: (13559, 12, 1)
Mean Squared Error: 0.006042276509106159
Mean Absolute Error: 0.049263499677181244


### Autoregressive prediction

In [None]:
# Load the saved Keras model (path relative to the current working directory)
model = tfk.models.load_model('Dense_nosotto200_senzaautoregressive')

In [None]:
# Autoregresssive telescope depends on the shape of the output
autoregressive_telescope = 18

In [None]:
# Rebuild the test sequences so that every window is followed by
# `autoregressive_telescope` target samples instead of `telescope`.
X_test_reg, y_test_reg , cat_test_reg = build_sequences(dataset_test, label_test, window, stride, autoregressive_telescope)
X_test_reg.shape, y_test_reg.shape, cat_test_reg

((12552, 200),
 (12552, 18),
 array(['B', 'B', 'B', ..., 'A', 'A', 'A'], dtype='<U1'))

In [None]:
# Append a trailing axis of size 1, as done for the other input/target arrays
X_test_reg = np.expand_dims(X_test_reg, axis=-1)
y_test_reg = np.expand_dims(y_test_reg, axis=-1)

In [None]:
# Shapes of the autoregressive test inputs and targets
X_test_reg.shape, y_test_reg.shape

((12552, 200, 1), (12552, 18, 1))

In [None]:
# From here on `telescope` is the total forecast horizon (the larger value) and
# `autoregressive_telescope` is the step of the autoregressive loop (the
# smaller value, presumably the model's output length - TODO confirm).
# max/min is used instead of a plain swap so the cell is idempotent:
# re-running it no longer swaps the two values back.
# NOTE: `telescope` is overwritten, so earlier cells that read it must not be
# re-run after this one without resetting it.
telescope, autoregressive_telescope = (
    max(telescope, autoregressive_telescope),
    min(telescope, autoregressive_telescope),
)
# Kept so that code relying on the old temporary still finds the same value
temp_telescope = autoregressive_telescope

In [None]:
# Autoregressive Forecasting
reg_predictions = np.array([])
X_temp = X_test_reg
for reg in range(0,telescope,autoregressive_telescope):
    pred_temp = model.predict(X_temp,verbose=0)
    if(len(reg_predictions)==0):
        reg_predictions = pred_temp
    else:
        reg_predictions = np.concatenate((reg_predictions,pred_temp),axis=1)
    X_temp = np.concatenate((X_temp[:,autoregressive_telescope:,:],pred_temp), axis=1)


In [None]:
# Print the shape of the predictions
print(f"Predictions shape: {reg_predictions.shape}")

# Calculate and print Mean Squared Error (MSE)
mean_squared_error = tfk.metrics.mean_squared_error(y_test_reg.flatten(), reg_predictions.flatten()).numpy()
print(f"Mean Squared Error: {mean_squared_error}")

# Calculate and print Mean Absolute Error (MAE)
mean_absolute_error = tfk.metrics.mean_absolute_error(y_test_reg.flatten(), reg_predictions.flatten()).numpy()
print(f"Mean Absolute Error: {mean_absolute_error}")

Predictions shape: (12552, 18, 1)
Mean Squared Error: 0.008783170953392982
Mean Absolute Error: 0.06190216913819313
