In [None]:
import sys


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# Print arrays in full instead of numpy's abbreviated "..." summary form.
np.set_printoptions(threshold=sys.maxsize)
# Number of consecutive past readings used as inputs in each sliding window
# built further down in this cell (each window also carries one target value).
lags = 12
# Path of the CSV loaded with pd.read_csv below.
# NOTE(review): absolute, machine-specific path - will not resolve on other machines.
train = "/Users/marleywetini/repos/intelligentSystems/data/Scats Data October 2006.csv"
# NOTE(review): not referenced anywhere in the visible part of the notebook - confirm it is still needed.
test = "/Users/marleywetini/repos/intelligentSystems/data/SCAT Test Data.xlsx"

def custom_train_test_split(data, size=-504, target_col=14):
    """Split a time-ordered 2-D array into train/test features and targets.

    The split is positional rather than shuffled: the test rows must form one
    continuous timeline so that predictions can be plotted against them.

    Args:
        data: 2-D array-like with one sample per row, in chronological order.
        size: Row index at which to cut. ``data[:size]`` becomes the training
            set and ``data[size:]`` the test set, so the default of -504 holds
            out the last 504 rows for testing.
        target_col: Index of the column to predict. It is removed from the
            feature matrices and returned separately as the target vector.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)`` where the ``x_*`` arrays
        are ``data`` without ``target_col`` and the ``y_*`` arrays are the
        values of ``target_col`` for the corresponding rows.
    """
    data = np.asarray(data)
    train_rows, test_rows = data[:size], data[size:]

    # Features: every column except the target.
    x_train = np.delete(train_rows, target_col, axis=1)
    x_test = np.delete(test_rows, target_col, axis=1)

    # Targets: the single column being predicted.
    y_train = train_rows[:, target_col]
    y_test = test_rows[:, target_col]

    return x_train, x_test, y_train, y_test
# Helper: flatten each entry and pad/truncate it so all entries stack into one 2-D array
def process_flow_data(flow_data, max_len=None):
    """
    Flatten every entry of ``flow_data`` to 1-D and stack the results into one array.

    Entries shorter than ``max_len`` are right-padded with zeros and longer ones
    are cut to ``max_len``. When ``max_len`` is not given, the length of the
    longest flattened entry is used, so nothing is truncated.
    """
    flat_entries = [np.array(entry).flatten() for entry in flow_data]

    if max_len is None:
        # Default to the longest entry so every row is only ever padded
        max_len = max(map(len, flat_entries))

    def _fit_to_length(row):
        shortfall = max_len - len(row)
        if shortfall > 0:
            return np.pad(row, (0, shortfall), 'constant')
        return row[:max_len]

    return np.array([_fit_to_length(row) for row in flat_entries])

# Step 1: Load and clean the data
# header=1 takes the column names from the second row of the file; missing
# readings are replaced with 0.
df = pd.read_csv(train, header=1).fillna(0)

# Step 2: Define the flow columns (V00 to V95 for 96 time intervals)
flow_columns = [f"V{str(i).zfill(2)}" for i in range(96)]
# One entry per (latitude, longitude) pair: a list of rows of the 96 flow columns.
grouped_data = df.groupby(['NB_LATITUDE', 'NB_LONGITUDE'])[flow_columns].apply(lambda x: x.values.tolist())
flow_data = grouped_data.values  # 1-D object array, one list-of-rows per site
flow_data = np.array(flow_data)

# Step 3: Fit the scalers on the whole dataset *before* looping over sites so
# that every site is scaled with the same global min/max.
# new_flow_data is only used to fit the flow scaler; process_flow_data pads
# shorter sites with zeros, which therefore also take part in the fit.
new_flow_data = process_flow_data(flow_data)
data_scaler = MinMaxScaler(feature_range=(0, 1))
data_scaler = data_scaler.fit(new_flow_data.reshape(-1, 1))

latlong_data = grouped_data.reset_index()[['NB_LATITUDE', 'NB_LONGITUDE']]
latlong_scaler = MinMaxScaler(feature_range=(0, 1)).fit(latlong_data)
latlong_scaled = latlong_scaler.transform(latlong_data)

# Step 4: Build sliding windows. Each row is
#   [scaled_lat, scaled_long, flow[j], ..., flow[j + lags]]
# i.e. `lags` input readings followed by the reading to predict.
train_data = []
for i, flow in enumerate(flow_data):
    flow = np.array(flow).flatten()
    flow = data_scaler.transform(flow.reshape(-1, 1)).reshape(1, -1)
    # Lat/long of the current site; grouped_data and latlong_scaled share the same order.
    latlong = latlong_scaled[i]
    # A window needs lags + 1 values, so the last valid start is len - (lags + 1).
    # (Stopping one step later would produce a short final window that gets
    # zero-padded below, i.e. a fabricated target of 0.)
    for j in range(0, len(flow[0]) - lags):
        lagged_flow = flow[0][j:j + lags + 1]
        # Combine latlong and flow data
        combined_arr = np.hstack((latlong, lagged_flow))
        train_data.append(combined_arr)

train_data = process_flow_data(train_data)  # 2-D numpy.ndarray, one window per row

# Step 5: Split every site chronologically into train/test parts.
X_train = []
X_test = []
Y_train = []
Y_test = []
for i, site_latlong in enumerate(latlong_scaled):
    # Boolean mask selecting the rows whose scaled latitude AND longitude match this site
    mask = (train_data[:, 0] == site_latlong[0]) & (train_data[:, 1] == site_latlong[1])

    # NOTE: this rebinds `df` from the loaded DataFrame to the site's window array.
    df = train_data[mask]
    x_train, x_test, y_train, y_test = custom_train_test_split(df)

    X_train.append(x_train)
    X_test.append(x_test)
    Y_train.append(y_train)
    Y_test.append(y_test)  # was Y_test.append(Y_test), which nested the list inside itself

# NOTE(review): process_flow_data flattens each site's 2-D feature matrix into a
# single padded row, so X_train ends up with one row per site - confirm intended.
X_train = process_flow_data(X_train)
# NOTE(review): these use x_test / y_test from the LAST loop iteration only and
# overwrite the per-site X_test list built above - confirm that a single-site
# test set is what is wanted here.
X_test = np.array(x_test)
y_test = np.array(y_test).reshape(-1, 1)

print(X_train)
print(X_test.shape)
print(y_test)


In [19]:
from keras.layers import Dense, Dropout, Activation, LSTM, GRU
from keras.models import Sequential

def _get_sae(inputs, hidden, output):
    """SAE(Auto-Encoder)
    Build one auto-encoder: Dense(hidden) -> ReLU -> Dropout(0.2) ->
    Dense(output) with a sigmoid activation.

    # Arguments
        inputs: Integer, number of input units.
        hidden: Integer, number of hidden units.
        output: Integer, number of output units.
    # Returns
        model: Model, the (uncompiled) Sequential network.
    """
    # Layers are created in the same order the network applies them.
    stack = [
        Dense(hidden, input_dim=inputs, name='encoder'),
        Activation('relu'),
        Dropout(0.2),
        Dense(output, name='Decoder', activation='sigmoid'),
    ]
    return Sequential(stack)


def get_saes(layers):
    """SAEs(Stacked Auto-Encoders)
    Build three single auto-encoders plus the stacked network.

    # Arguments
        layers: List(int) of five unit counts:
            [input, hidden1, hidden2, hidden3, output].
    # Returns
        models: List(Model), the three SAEs followed by the stacked SAEs model.
    """
    # Auto-encoder k maps layers[k] -> layers[k + 1]; each one decodes to
    # layers[0] units.
    models = [_get_sae(layers[k], layers[k + 1], layers[0]) for k in range(3)]

    # Stacked network: three ReLU hidden layers, dropout, sigmoid output.
    stacked = Sequential([
        Dense(layers[1], input_dim=layers[0], name='hidden1'),
        Activation('relu'),
        Dense(layers[2], name='hidden2'),
        Activation('relu'),
        Dense(layers[3], name='hidden3'),
        Activation('relu'),
        Dropout(0.2),
        Dense(layers[4], activation='sigmoid'),
    ])
    models.append(stacked)

    return models

# Unit counts handed to get_saes: [input, hidden1, hidden2, hidden3, output].
# NOTE(review): the preprocessing cell builds rows of 2 lat/long values plus
# lags + 1 flow values, i.e. 14 features once the target column is dropped -
# confirm that an input width of 12 is what is intended here.
layers= [12, 10, 8, 4, 1]
models = get_saes(layers)

# Print the architecture of each model, separated by a dashed rule.
for idx, model in enumerate(models):
    print(f"Model {idx + 1} Summary:")
    model.summary()
    print("\n" + "-" * 50 + "\n")

Model 1 Summary:
Model: "sequential_48"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Dense)             (None, 10)                130       
                                                                 
 activation_72 (Activation)  (None, 10)                0         
                                                                 
 dropout_48 (Dropout)        (None, 10)                0         
                                                                 
 Decoder (Dense)             (None, 12)                132       
                                                                 
Total params: 262 (1.02 KB)
Trainable params: 262 (1.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

--------------------------------------------------

Model 2 Summary:
Model: "sequential_49"
_____________________________________________