## Imports

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.callbacks import ReduceLROnPlateau

# Data Loader

Loads one data file at a time, so we can train on a big dataset without holding all of it in memory.

In [2]:
class DataSequence(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches from a list of ``.npy`` files.

    Only one file is kept in memory at a time. A file's rows are shuffled
    when it is loaded, and batches are cut from the concatenation of all
    files in the order given. A batch that straddles a file boundary is
    stitched together from the end of one file and the start of the next, and
    the final batch may be shorter than ``batch_size``.

    The last column of each row is returned as the target ``y``; all other
    columns are returned as the features ``x``.

    Batches may be requested in any order, but every switch to a different
    file reloads and reshuffles that file. Requesting batches in increasing
    index order (``model.fit(..., shuffle=False)``) loads each file once per
    epoch and visits every row exactly once.

    Args:
        files: Paths of the ``.npy`` files. Each file is wrapped in a
            ``DataFrame``, so it is expected to hold one sample per row.
        batch_size: Maximum number of rows per batch; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    def __init__(self, files, batch_size):
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1, got {!r}".format(batch_size))
        self.files = files
        self.batch_size = batch_size
        file_lengths = [len(np.load(f)) for f in files]
        self.total_length = sum(file_lengths)
        # cumulative_lengths[i] is the global row index at which file i starts;
        # the final entry equals total_length.
        self.cumulative_lengths = np.cumsum([0] + file_lengths)
        self.file_num = 0
        self.file_cache = pd.DataFrame()
        if len(self.files) > 0:
            self._load_file(0)

    def _load_file(self, file_num):
        """Load ``files[file_num]`` into the cache with its rows shuffled."""
        # Release the previous frame before loading so two files are not
        # held in memory at the same time.
        self.file_cache = pd.DataFrame()
        self.file_cache = pd.DataFrame(np.load(self.files[file_num])).sample(frac=1)
        self.file_num = file_num

    def __len__(self):
        """Return the number of batches per epoch, counting a final partial batch."""
        # Integer ceiling division: exact for any size, no float round-trip.
        return int((self.total_length + self.batch_size - 1) // self.batch_size)

    def __getitem__(self, idx):
        """Return batch ``idx`` as a tuple ``(x, y)``.

        Args:
            idx: Batch index in ``range(len(self))``.

        Returns:
            ``(x, y)`` where ``x`` holds every column but the last cast to
            ``int8`` and ``y`` holds the last column cast to ``int16``.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError("batch index out of range")

        # Global row span of this batch, clipped to the end of the data.
        global_start = idx * self.batch_size
        global_end = min(global_start + self.batch_size, self.total_length)

        x_parts, y_parts = [], []
        position = global_start
        while position < global_end:
            # Index of the file containing `position`. side="right" makes a
            # position that sits exactly on a boundary belong to the file that
            # starts there, which also steps over zero-length files.
            file_num = int(np.searchsorted(self.cumulative_lengths, position, side="right")) - 1
            if file_num != self.file_num:
                self._load_file(file_num)

            lower = int(self.cumulative_lengths[file_num])
            upper = int(self.cumulative_lengths[file_num + 1])
            stop = min(global_end, upper)
            rows = self.file_cache.iloc[position - lower:stop - lower]

            # astype() copies, so these pieces do not keep the cached frame
            # alive once the next file replaces it.
            # NOTE(review): assumes the stored values fit in int8 / int16.
            x_parts.append(rows.iloc[:, :-1].to_numpy().astype(np.int8))
            y_parts.append(rows.iloc[:, -1].to_numpy().astype(np.int16))
            position = stop

        if len(x_parts) == 1:
            return (x_parts[0], y_parts[0])
        return (np.concatenate(x_parts), np.concatenate(y_parts))

### Fitting Model

The processedDataset needs to be rerun so that the evaluations are correct.

PARAMETERS BELOW

In [3]:
def hidden_layer(units):
    """Build one ReLU Dense layer with He-normal initialisation."""
    return tf.keras.layers.Dense(units, activation='relu', kernel_initializer='he_normal')


# Fully connected regressor: three 1024-unit layers, dropout, then a
# 1024 -> 512 -> 128 -> 64 funnel into a single linear output.
#
# Disabled convolutional front end, kept for reference:
#     tf.keras.layers.Reshape((8, 8, 13), input_shape=(832,)),
#     tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal'),
#     tf.keras.layers.MaxPooling2D((2, 2)),
#     tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal'),
#     tf.keras.layers.MaxPooling2D((2, 2)),
#     tf.keras.layers.Flatten(),
model = tf.keras.Sequential(
    [hidden_layer(width) for width in (1024, 1024, 1024)]
    + [tf.keras.layers.Dropout(0.5)]
    + [hidden_layer(width) for width in (1024, 512, 128, 64)]
    + [tf.keras.layers.Dense(1, activation='linear', kernel_initializer='he_normal')]
)

# SGD with momentum; gradients are clipped to unit norm.
# Alternative tried: tf.keras.optimizers.Adam(learning_rate=0.01, clipnorm = 1.0)
optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.01,
    momentum=0.9,
    clipnorm=1.0,
)

# Train on mean squared error and report mean absolute error alongside it.
model.compile(
    loss='MeanSquaredError',
    optimizer=optimizer,
    metrics=['MeanAbsoluteError'],
)

In [4]:
# Rows per batch handed to the model.
batch_size = 8192
# Paths of the 13 data chunks: mixedDataChunk0.npy ... mixedDataChunk12.npy.
files = ['./../data/npyV1/mixedDataChunk{}.npy'.format(num) for num in range(13)]

# Split the chunk list: the first five chunks feed training and the sixth is
# held out for validation. The remaining chunks are not used in this cell.
train_files, valid_files = files[:5], files[5:6]

# Batch providers handed to model.fit.
train = DataSequence(train_files, batch_size=batch_size)
valid = DataSequence(valid_files, batch_size=batch_size)

In [5]:
# The callbacks are taken from the public tf.keras namespace so they belong to
# the same Keras implementation as the tf.keras model they are attached to;
# `tensorflow.python.keras` is a private module path.

# Stop when val_loss has not improved for 5 epochs and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Multiply the learning rate by 0.1 after 3 epochs without val_loss
# improvement, never going below 1e-5.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=0.00001, verbose=1)

# shuffle=False keeps batch indices in increasing order, which is the access
# pattern DataSequence is built around. The returned History is kept so the
# per-epoch loss/metric values can be inspected after training.
history = model.fit(train, epochs=60, validation_data=valid, callbacks=[early_stopping, reduce_lr], shuffle=False)

model.save("../saved_models/lastlastlastlastmodel")

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 00025: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 00055: ReduceLROnPlateau reducing learning rate to 9.999999310821295e-05.
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
INFO:tensorflow:Assets written to: ../saved_models/lastlastlastlastmodel\assets
