## Imports

In [5]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.python.keras.callbacks import EarlyStopping

# training.py

This section contains the code taken from `training.py`.

In [6]:
# Build the convolutional regression network one layer at a time.
model = tf.keras.Sequential()

# The flat 832-value input is viewed as an 8x8 grid with 13 channels (8 * 8 * 13 = 832).
model.add(tf.keras.layers.Reshape(target_shape=(8, 8, 13), input_shape=(832,)))

# Convolution stage 1: 32 filters -> batch norm -> 2x2 max pooling
model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))

# Convolution stage 2: 64 filters -> batch norm -> 2x2 max pooling
model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))

# Fully connected head with dropout, ending in one linear output unit
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=128, activation='relu'))
model.add(tf.keras.layers.Dropout(rate=0.5))
model.add(tf.keras.layers.Dense(units=64, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='linear'))

# Compile: Adam optimizer, mean squared error loss, mean absolute error reported as a metric.
# The learning rate is a tuning knob; try other values here.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    loss='MeanSquaredError',
    metrics=['MeanAbsoluteError'],
    optimizer=optimizer,
)

# Data Loader

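Loads the dataset from disk one chunk file at a time, so that a large dataset can be used without holding all of it in memory.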

In [7]:
class DataSequence(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches from a list of ``.npy`` chunk files.

    Every file is loaded as a 2-D table: all columns but the last are returned
    as features, the last column is returned as the target. Only one file is
    held in memory at a time (``file_cache``); its rows are shuffled each time
    it is loaded. Rows are addressed by a global index that runs across all
    files in the order given, so a batch may span two (or more) files.
    """

    def __init__(self, files, batch_size):
        """
        Args:
            files: Paths of the ``.npy`` chunk files, in the order they are read.
            batch_size: Number of rows per batch (the final batch may be smaller).
        """
        self.files = files
        self.batch_size = batch_size
        # Memory-map each file so only its header is read to obtain the row count,
        # instead of loading every chunk fully just to measure it.
        file_lengths = [len(np.load(f, mmap_mode='r')) for f in files]
        self.total_length = sum(file_lengths)
        # cumulative_lengths[i] is the global index of the first row of file i;
        # cumulative_lengths[i + 1] is one past its last row.
        self.cumulative_lengths = np.cumsum([0] + file_lengths)
        self.file_num = 0
        self.file_cache = self._load_file(self.file_num)

    def _load_file(self, file_num):
        """Read chunk ``file_num`` from disk and return its rows as a shuffled DataFrame."""
        return pd.DataFrame(np.load(self.files[file_num])).sample(frac=1)

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        return int(np.ceil(self.total_length / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(x, y)``.

        Args:
            idx: Batch index in ``range(len(self))``.

        Returns:
            Tuple ``(x, y)``: ``x`` is an ``int8`` array of all columns except the
            last, ``y`` is an ``int16`` array of the last column.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"batch index {idx} out of range for {len(self)} batches")

        # Global row range of this batch, clipped to the end of the data
        global_start = idx * self.batch_size
        global_end = min(global_start + self.batch_size, self.total_length)

        # Walk the range file by file so batches that cross a file boundary keep
        # the tail of one file and continue with the head of the next.
        pieces = []
        position = global_start
        while position < global_end:
            # File whose [lower, upper) global range contains `position`;
            # side='right' skips over any zero-length files.
            file_num = int(np.searchsorted(self.cumulative_lengths, position, side='right')) - 1
            if file_num != self.file_num:
                self.file_num = file_num
                self.file_cache = self._load_file(file_num)

            file_lower = int(self.cumulative_lengths[file_num])
            file_upper = int(self.cumulative_lengths[file_num + 1])
            stop = min(global_end, file_upper)
            pieces.append(self.file_cache.iloc[position - file_lower:stop - file_lower])
            position = stop

        data = pieces[0] if len(pieces) == 1 else pd.concat(pieces)

        x = data.iloc[:, :-1].to_numpy().astype(np.int8)
        y = data.iloc[:, -1].to_numpy().astype(np.int16)

        return (x, y)

### Fitting Model

The processedDataset step needs to be rerun so that the evaluations are correct.

PARAMETERS BELOW

In [8]:
# Number of rows per batch
batch_size = 64

# Paths of the 16 dataset chunks: mixedDataChunk0.npy ... mixedDataChunk15.npy
files = ['./../data/KaggleNPY/mixedDataChunk' + str(num) + '.npy' for num in range(16)]

# The first 14 chunks are used for training, the last 2 for validation
trainFiles, validFiles = files[:14], files[14:]

# Wrap each split in a DataSequence so the model can pull batches from it
train = DataSequence(files=trainFiles, batch_size=batch_size)
valid = DataSequence(files=validFiles, batch_size=batch_size)

In [9]:
# Stop when val_loss has not improved for 5 epochs and restore the best weights seen.
# The callback is taken from the public tf.keras API so it matches the tf.keras model,
# rather than from the private tensorflow.python.keras package.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# shuffle=False keeps batches in index order: DataSequence moves through its chunk
# files as the batch index advances and shuffles rows itself when a file is loaded.
history = model.fit(
    train,
    epochs=20,
    validation_data=valid,
    callbacks=[early_stopping],
    shuffle=False,
)

Epoch 1/20

In [None]:
# Write the trained model to the saved_models directory.
# NOTE(review): the name mentions 3 epochs, batch 1028 and learning rate 0.01, while this
# notebook sets epochs=20, batch_size=64 and learning_rate=0.001 — confirm the intended name.
model.save(filepath="../saved_models/newPreprocess_6mil_3epoch_1028batch_0.01learn")

# Continue Training

For overnight training 5/3/23

In [None]:
# Resume from a previously saved model instead of the one built earlier in this notebook.
model = tf.keras.models.load_model("../saved_models/corrected_12mil_3epoch_64batch_0.0001learnRate")

# Stop when val_loss has not improved for 5 epochs and restore the best weights seen.
# The callback is taken from the public tf.keras API so it matches the tf.keras model.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# shuffle=False, as in the first training run: without it Keras shuffles the batch
# order of a Sequence, but DataSequence moves through its chunk files as the batch
# index advances and must be read in index order.
history = model.fit(
    train,
    epochs=22,
    validation_data=valid,
    callbacks=[early_stopping],
    shuffle=False,
)

model.save("../saved_models/corrected_12mil_25epoch_64batch_0.0001learnRate")