In [None]:
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# Print arrays in full instead of NumPy's abbreviated "..." summary.
np.set_printoptions(threshold=sys.maxsize)

# Time-lag length used for the sliding windows built further down.
lags = 12

# Absolute locations of the two SCATS data files.
train = (
    "/Users/marleywetini/repos/intelligentSystems/data/"
    "Scats Data October 2006.csv"
)
test = (
    "/Users/marleywetini/repos/intelligentSystems/data/"
    "SCAT Test Data.xlsx"
)

# Step 1: Load the workbook (header=1: column names come from the second row)
# and replace every missing cell with 0.
df = pd.read_excel(test, header=1)
df = df.fillna(0)

# Step 2: Define the flow columns (V00 to V95 for 96 time intervals)
flow_columns = [f"V{i:02d}" for i in range(96)]

# One entry per distinct (latitude, longitude) pair; each entry holds all of
# that location's rows of flow readings as a list of lists.
grouped_data = df.groupby(['NB_LATITUDE', 'NB_LONGITUDE'])[flow_columns].apply(lambda x: x.values.tolist())
flow_data = grouped_data.values
df_flow_data = np.array(flow_data)

data_scaler = MinMaxScaler()
latlong_scaler = MinMaxScaler()

# Take the coordinates from the group index rather than from the raw rows of
# ``df``. The loop below looks coordinates up by group position, so row ``i``
# here has to describe the same location as ``df_flow_data[i]``; using the raw
# rows would pair group ``i`` with whatever location sits on spreadsheet row
# ``i``. The fitted min/max are unchanged because the set of distinct
# coordinate values is the same.
latlong_data = grouped_data.index.to_frame(index=False)
latlong_scaled = latlong_scaler.fit_transform(latlong_data)
train_data = []

# Positions inside one window: ``lags`` consecutive readings followed by the
# reading that comes right after them (lags + 1 values in total).
window_offsets = np.arange(lags + 1)

for i, flow in enumerate(df_flow_data):
    # Join every row of this location into one series and scale it to [0, 1].
    # The scaler is re-fitted per location, so scaling is relative to each
    # location's own range.
    series = np.array(flow).flatten()
    series = data_scaler.fit_transform(series.reshape(-1, 1)).ravel()

    # Only start positions that leave room for a complete window. Allowing one
    # more start (as ``len - lags + 1`` did) yields a final window that is one
    # value short, which makes the stacked result ragged.
    starts = np.arange(len(series) - lags)
    windows = series[starts[:, np.newaxis] + window_offsets]

    # Prefix every window with the scaled lat/long of the current location.
    latlong = np.tile(latlong_scaled[i], (len(windows), 1))
    train_data.extend(np.hstack((latlong, windows)))

train_data = np.array(train_data)
print(train_data.shape)



# X and y were never defined in this cell, so the split either failed with a
# NameError or silently reused values left over from another cell.
# The last column of each sample is the value to predict; everything before it
# (scaled coordinates followed by the lagged flows) is the model input.
X = train_data[:, :-1]
y = train_data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=.75)
print(X_train.shape)
print(X_test.shape)




In [None]:
"""
Process Data
Reshape and split data into train and test data.

Parameters:
    df (pd.DataFrame): dataframe of the data
    lags (int): time lag

Returns:
    X_train (np.ndarray)
    y_train (np.ndarray)
    X_test (np.ndarray)
    y_test (np.ndarray)
    flow_scaler (func)
"""
def scaler(min, max):
    """Return a function that linearly maps ``min`` to 0 and ``max`` to 1.

    Parameters:
        min: value that should map to 0
        max: value that should map to 1

    Returns:
        A one-argument callable computing ``(x - min) / (max - min)``.
    """
    span = max - min

    def _to_unit(x):
        shifted = x - min
        return shifted / span

    return _to_unit

def rescaler(min, max):
    """Return a function that linearly maps 0 to ``min`` and 1 to ``max``.

    Parameters:
        min: value that 0 should map back to
        max: value that 1 should map back to

    Returns:
        A one-argument callable computing ``x * (max - min) + min``.
    """
    span = max - min

    def _from_unit(x):
        stretched = x * span
        return stretched + min

    return _from_unit
# Names of the 96 flow columns, "V00" through "V95".
flow_group = np.char.mod("V%02d", np.arange(0, 96))

# One entry per distinct (latitude, longitude) pair; each entry is that
# location's rows of flow readings as a list of lists.
grouped = df.groupby(['NB_LATITUDE', 'NB_LONGITUDE'])[flow_group].apply(lambda x: x.values.tolist())
flow_data = grouped.values

# ``flow_data`` is an object array of nested lists, so ``flow_data.max()`` /
# ``.min()`` would select one whole group by list comparison and the result
# would be that single group's extreme, not the global one. Reduce every
# group numerically instead so the scaler covers the full range of readings.
flow_max = max(np.max(rows) for rows in flow_data)
flow_min = min(np.min(rows) for rows in flow_data)
flow_scaler = scaler(flow_min, flow_max)
flow_rescaler = rescaler(flow_min, flow_max)

# Coordinates come from the group index, so row i matches grouped.values[i].
latlong_data = np.array(grouped.index.to_list())
# NOTE(review): reshape(-1, 1) fits ONE min/max over latitudes and longitudes
# together rather than one per column. Left unchanged because other code may
# call latlong_scaler with single-column input - confirm whether per-column
# scaling was intended.
latlong_scaler = MinMaxScaler(feature_range=(0, 1)).fit(latlong_data.reshape(-1, 1))
latlong_data = latlong_scaler.transform(latlong_data.reshape(-1, 1)).reshape(-1, 2)

train = []

# Number of leading location groups to turn into samples. The previous
# ``while i != 1`` nested in the for-loop also stopped after one group, but
# raising its limit would have re-processed the same (already windowed) group
# instead of advancing to the next one. Use ``None`` to process every group.
max_groups = 1

# Relative positions of one window: ``lags`` past readings plus the target.
offset = np.arange(-lags, 1)

# Iterate over the flow data
for i, flow in enumerate(grouped.values[:max_groups]):
    # Flatten the group's rows into a single series
    series = np.array(flow, dtype=float).flatten()

    # Scale the flow data; the scaler is plain arithmetic, so it can be
    # applied to the whole array at once.
    series = flow_scaler(series)
    print(f'FLOW {i}')  # Debug print
    print(offset)  # Debug print of the window offsets

    # Each row of ``windows`` ends at position ``indices[k]`` and reaches
    # ``lags`` steps back, giving lags + 1 values per row.
    indices = np.arange(lags, len(series))
    windows = series[indices[:, np.newaxis] + offset]

    # Repeat this location's scaled lat/long once per window
    latlong = np.tile(latlong_data[i], (len(windows), 1))

    # Combine latlong and flow data
    combined_arr = np.hstack((latlong, windows))
    print(combined_arr)

    # Add the combined rows to the training samples
    train.extend(combined_arr)

train = np.array(train)
# Last column is the value to predict; the rest (lat/long + lagged flows) is
# the model input.
X = train[:, :-1]
y = train[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=.75)