<a href="https://colab.research.google.com/github/lorenafc/MscThesis_EyeTrackingIVR/blob/main/OVERLAP_autoencoder_with_rf_sequence_samples_thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from torch.utils.data import TensorDataset


In [None]:
# Device: use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Device:', device)

# Read the CSV file
# NOTE(review): the path is absolute ('/content/...', presumably the Colab working
# directory), so this cell only runs where the file has been placed at that location.
file_name = '/content/LLA2020_labeled.csv'
# file_name = '/content/eye_tracking_data_small_2019.csv'
eye_tracking_data = pd.read_csv(file_name)

Device: cpu


In [None]:
eye_tracking_data.head()

Unnamed: 0,time,L_x,L_y,L_z,C_x,C_y,C_z,observer,GT1,GT2,GT3,GT4,GT5,GT6,GT7
0,9.314,-2.969,1.6232,-1.2434,-0.4009,1.6289,-1.2939,1,0,0,0,0,0,0,0
1,9.337,-2.969,1.6255,-1.2432,-0.4007,1.629,-1.294,1,0,0,0,0,0,0,0
2,9.36,-2.969,1.626,-1.2447,-0.4006,1.629,-1.294,1,0,0,0,0,0,0,0
3,9.381,-2.969,1.6232,-1.243,-0.4004,1.6291,-1.2941,1,0,0,0,0,0,0,0
4,9.403,-2.969,1.6242,-1.241,-0.4002,1.6291,-1.2941,1,0,0,0,0,0,0,0


In [None]:
eye_tracking_data_rf = eye_tracking_data.copy()

In [None]:
# Data cleaning
eye_tracking_data = eye_tracking_data.drop(columns=['GT2', 'GT3', 'GT4', 'GT5', 'GT6', 'GT7']) # removing
eye_tracking_data = eye_tracking_data[['time', 'L_x', 'L_y', 'L_z', 'C_x', 'C_y', 'C_z', 'GT1','observer']]

In [None]:
print(eye_tracking_data.head(3))

    time    L_x     L_y     L_z     C_x     C_y     C_z  GT1  observer
0  9.314 -2.969  1.6232 -1.2434 -0.4009  1.6289 -1.2939    0         1
1  9.337 -2.969  1.6255 -1.2432 -0.4007  1.6290 -1.2940    0         1
2  9.360 -2.969  1.6260 -1.2447 -0.4006  1.6290 -1.2940    0         1


In [None]:
eye_tracking_data_without_GT1 = eye_tracking_data.drop(columns=['GT1'])

In [None]:
print(eye_tracking_data_without_GT1.head(3))

    time    L_x     L_y     L_z     C_x     C_y     C_z  observer
0  9.314 -2.969  1.6232 -1.2434 -0.4009  1.6289 -1.2939         1
1  9.337 -2.969  1.6255 -1.2432 -0.4007  1.6290 -1.2940         1
2  9.360 -2.969  1.6260 -1.2447 -0.4006  1.6290 -1.2940         1


In [None]:
# Convert all columns to float32 for compatibility with PyTorch
eye_tracking_data_without_GT1 = eye_tracking_data_without_GT1.astype('float32')

# Positional (non-shuffled) split: the first 75% of the rows become the training
# part and the remaining rows the test part.
train_split = 0.75
# Creating data indices for training and test splits: LSTM autoencoder time series https://github.com/fabiozappo/LSTM-Autoencoder-Time-Series/blob/main/code/main.py
dataset_size = len(eye_tracking_data_without_GT1)
# NOTE(review): `indices` is not read anywhere in the visible notebook.
indices = list(range(dataset_size))
split = int(np.floor(train_split * dataset_size))

# NOTE(review): both frames below are assigned again in the
# "New sequence - 2 dimensions - overlapping" section, which replaces these values.
et_train_without_GT1 = eye_tracking_data_without_GT1.iloc[:split, :]
et_test_without_GT1 = eye_tracking_data_without_GT1.iloc[split:, :]

# Scaling the data (currently disabled, so the features stay in their raw units)
# scaler = MinMaxScaler()
# et_train_without_GT1 = pd.DataFrame(scaler.fit_transform(et_train_without_GT1), columns=et_train_without_GT1.columns)
# et_test_without_GT1 = pd.DataFrame(scaler.transform(et_test_without_GT1), columns=et_test_without_GT1.columns)

# Define sequence length (the sequences have overlapping data)
# This value is also read later by create_sequences_rf's callers.
sequence_length = 460  # 10 seconds of data - sampled at ~45 Hz

# # Function to create sequences
# def create_sequences(data, sequence_length):
#     sequences = []
#     sequences_again = []
#     for i in range(len(data) - sequence_length):
#         seq = data.iloc[i:i+sequence_length, :].values  # Convert to numpy array with .values to use it as a tensor.
#         seq_again = data.iloc[i:i+sequence_length, :].values  # Repeated X in Y position
#         sequences.append(seq)
#         sequences_again.append(seq_again)
#     return np.array(sequences), np.array(sequences_again)



# # Generate sequences for training and testing
# X_train_seq, y_train_seq = create_sequences(et_train_without_GT1, sequence_length)
# X_test_seq, y_test_seq = create_sequences(et_test_without_GT1, sequence_length)

# ## now remove GT1


In [None]:
# from re import X
# print(X_train_seq.shape)
# print(y_train_seq.shape)
# print(X_test_seq.shape)
# print(y_test_seq.shape)

(79229, 460, 8)
(79229, 460, 8)
(26103, 460, 8)
(26103, 460, 8)


In [None]:
eye_tracking_data_GT1 = eye_tracking_data[['GT1']]
print(eye_tracking_data_GT1.head(3))
print(eye_tracking_data.shape)



   GT1
0    0
1    0
2    0
(106252, 9)


In [None]:
# Convert all columns to float32 for compatibility with PyTorch
eye_tracking_data_GT1 = eye_tracking_data_GT1.astype('float32')

# Same positional 75% / 25% split as the feature frame, applied to the GT1 label column.
train_split = 0.75
dataset_size = len(eye_tracking_data_GT1)
# NOTE(review): `indices` is not read anywhere in the visible notebook.
indices = list(range(dataset_size))
split = int(np.floor(train_split * dataset_size))

# NOTE(review): these two frames are assigned again in the overlapping section below.
et_train_GT1 = eye_tracking_data_GT1.iloc[:split, :]
et_test_GT1 = eye_tracking_data_GT1.iloc[split:, :]

# X_train_seq_GT1, y_train_seq_GT1 = create_sequences(et_train_GT1, sequence_length)
# X_test_seq_GT1, y_test_seq_GT1 = create_sequences(et_test_GT1, sequence_length)

In [None]:
# print(X_train_seq_GT1.shape)
# print(y_train_seq_GT1.shape)
# print(X_test_seq_GT1.shape)
# print(y_test_seq_GT1.shape)

(79229, 460, 1)
(79229, 460, 1)
(26103, 460, 1)
(26103, 460, 1)


# New sequence - 2 dimensions - overlapping

In [None]:
# #new sequence - overlapping:

def subset_training_data_overlap_by_rows(
    training_data_overlap: pd.DataFrame, rows_interval: int = 460, rows_overlap: int = 135
) -> pd.DataFrame:
    """
    Split a DataFrame into overlapping, row-based windows for each observer.

    The rows are sorted by ("observer", "time"); then, separately for every
    observer, windows of ``rows_interval`` rows are taken every
    ``rows_interval - rows_overlap`` rows. Each window is copied into the result
    and tagged with an id in a new "subset" column. Ids start at 1 and keep
    increasing across observers, so they are unique in the whole result.
    Rows that fall into several windows appear several times in the output.

    The windows at the end of an observer's data are shorter than
    ``rows_interval`` (they contain whatever rows are left), and the last one
    may consist only of rows already present in the previous window.

    Parameters:
        training_data_overlap (pd.DataFrame): Input data; must contain the
            columns "observer" and "time".
        rows_interval (int): Number of rows in each full window.
        rows_overlap (int): Number of rows shared by two consecutive windows.
            Must be smaller than ``rows_interval``.

    Returns:
        pd.DataFrame: The concatenated windows with a fresh 0..n-1 index and an
        additional integer "subset" column. If no window can be built (for
        example, the input has no rows) an empty frame with the input columns
        plus "subset" is returned.

    Raises:
        ValueError: If ``rows_overlap >= rows_interval`` (the window would never
            advance).
        KeyError: If "observer" or "time" is missing from the input.
    """
    # The step is the same for every observer, so compute and validate it once.
    subset_step = rows_interval - rows_overlap
    if subset_step <= 0:
        raise ValueError(
            "rows_overlap must be smaller than rows_interval "
            f"(got rows_interval={rows_interval}, rows_overlap={rows_overlap})"
        )

    # Ensure the data is sorted by observer and time
    training_data_overlap = training_data_overlap.sort_values(by=["observer", "time"]).reset_index(drop=True)

    all_subsets = []
    global_subset_id = 1

    for _observer_id, observer_data in training_data_overlap.groupby("observer"):
        observer_data = observer_data.reset_index(drop=True)
        n_rows = len(observer_data)

        for start_idx in range(0, n_rows, subset_step):
            current_subset = observer_data.iloc[start_idx:start_idx + rows_interval]
            if current_subset.empty:
                continue
            # assign() returns a new frame, so the observer's data is not modified.
            all_subsets.append(current_subset.assign(subset=global_subset_id))
            global_subset_id += 1

    if not all_subsets:
        # pd.concat raises on an empty list; return a well-formed empty result instead.
        empty = training_data_overlap.iloc[:0].copy()
        empty["subset"] = pd.Series([], dtype="int64", index=empty.index)
        return empty

    return pd.concat(all_subsets, ignore_index=True)





In [None]:
# Build the overlapped frame with the function's defaults (460-row windows, 135 rows of overlap).
# The result keeps GT1 and gains a "subset" column identifying each window.
eye_tracking_complete = subset_training_data_overlap_by_rows(eye_tracking_data)

In [None]:
# Autoencoder inputs: every column of the overlapped frame except the GT1 label.
# NOTE(review): "observer" and "subset" stay in the frame and are therefore fed
# to the autoencoder as features - confirm this is intended.
features_before_cast = eye_tracking_complete.drop(columns=['GT1'])
print(features_before_cast.head(3))
print(features_before_cast.shape)

# PyTorch tensors are built as float32 later on, so cast every column once here.
eye_tracking_data_without_GT1 = features_before_cast.astype('float32')

# Positional split (first 75% train, rest test), following
# https://github.com/fabiozappo/LSTM-Autoencoder-Time-Series/blob/main/code/main.py
# NOTE(review): the cut is made by row position on the overlapped frame, so it can
# fall inside a subset, and rows duplicated by the window overlap can end up on
# both sides of the split.
train_split = 0.75
dataset_size = eye_tracking_data_without_GT1.shape[0]
indices = list(range(dataset_size))
split = int(np.floor(train_split * dataset_size))

et_train_without_GT1 = eye_tracking_data_without_GT1.iloc[:split]
et_test_without_GT1 = eye_tracking_data_without_GT1.iloc[split:]


    time    L_x     L_y     L_z     C_x     C_y     C_z  observer  subset
0  9.314 -2.969  1.6232 -1.2434 -0.4009  1.6289 -1.2939         1       1
1  9.337 -2.969  1.6255 -1.2432 -0.4007  1.6290 -1.2940         1       1
2  9.360 -2.969  1.6260 -1.2447 -0.4006  1.6290 -1.2940         1       1
(145037, 9)
   GT1
0    0
1    0
2    0


NameError: name 'eye_tracking_GT1' is not defined

In [None]:
print(eye_tracking_data_without_GT1.head(3))
print(eye_tracking_data_without_GT1.shape)

    time    L_x     L_y     L_z     C_x     C_y     C_z  observer  subset
0  9.314 -2.969  1.6232 -1.2434 -0.4009  1.6289 -1.2939       1.0     1.0
1  9.337 -2.969  1.6255 -1.2432 -0.4007  1.6290 -1.2940       1.0     1.0
2  9.360 -2.969  1.6260 -1.2447 -0.4006  1.6290 -1.2940       1.0     1.0
(145037, 9)


In [None]:
# GT1 labels of the overlapped frame, kept as a one-column DataFrame so that the
# rows stay aligned with the feature frame built from the same source.
labels_before_cast = eye_tracking_complete.loc[:, ['GT1']]
print(labels_before_cast.head(3))
print(labels_before_cast.shape)

# Cast to float32 for compatibility with PyTorch.
eye_tracking_data_GT1 = labels_before_cast.astype('float32')

# Same positional 75% / 25% cut as the feature frame (same row count, same ratio).
train_split = 0.75
dataset_size = eye_tracking_data_GT1.shape[0]
indices = list(range(dataset_size))
split = int(np.floor(train_split * dataset_size))

et_train_GT1 = eye_tracking_data_GT1.iloc[:split]
et_test_GT1 = eye_tracking_data_GT1.iloc[split:]

   GT1
0    0
1    0
2    0
(145037, 1)


In [None]:
eye_tracking_complete.shape

(145037, 10)

In [None]:
eye_tracking_complete.head(3)

Unnamed: 0,time,L_x,L_y,L_z,C_x,C_y,C_z,GT1,observer,subset
0,9.314,-2.969,1.6232,-1.2434,-0.4009,1.6289,-1.2939,0,1,1
1,9.337,-2.969,1.6255,-1.2432,-0.4007,1.629,-1.294,0,1,1
2,9.36,-2.969,1.626,-1.2447,-0.4006,1.629,-1.294,0,1,1


In [None]:
# Convert the split frames to numpy arrays so they can be turned into tensors.
# Features: the y_* arrays are independent copies of the inputs (autoencoder targets).
X_train_seq = et_train_without_GT1.to_numpy()
X_test_seq = et_test_without_GT1.to_numpy()
y_train_seq = X_train_seq.copy()
y_test_seq = X_test_seq.copy()

# GT1 labels, shape (n_rows, 1); despite the "X_" prefix these hold labels, not features.
X_train_seq_GT1 = et_train_GT1.to_numpy()
X_test_seq_GT1 = et_test_GT1.to_numpy()
y_train_seq_GT1 = X_train_seq_GT1.copy()
y_test_seq_GT1 = X_test_seq_GT1.copy()

In [None]:
print(f"X_train_seq shape: {X_train_seq.shape}")
print(f"y_train_seq shape: {y_train_seq.shape}")
print(f"X_test_seq shape: {X_test_seq.shape}")
print(f"y_test_seq shape: {y_test_seq.shape}")

print(f"X_train_seq_GT1 shape: {X_train_seq_GT1.shape}")
print(f"y_train_seq_GT1 shape: {y_train_seq_GT1.shape}")
print(f"X_test_seq_GT1 shape: {X_test_seq_GT1.shape}")
print(f"y_test_seq_GT1 shape: {y_test_seq_GT1.shape}")

X_train_seq shape: (108777, 9)
y_train_seq shape: (108777, 9)
X_test_seq shape: (36260, 9)
y_test_seq shape: (36260, 9)
X_train_seq_GT1 shape: (108777, 1)
y_train_seq_GT1 shape: (108777, 1)
X_test_seq_GT1 shape: (36260, 1)
y_test_seq_GT1 shape: (36260, 1)


In [None]:
batch_size = 256

# Inputs as float32 tensors; the targets are clones because the autoencoder
# is trained to reproduce its own input.
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
y_train_tensor = X_train_tensor.clone()
y_test_tensor = X_test_tensor.clone()

# (input, target) pairs for the train and test sets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Time-series data, so the row order is kept (shuffle=False).
# drop_last=True means a final batch with fewer than `batch_size` rows is never yielded.
loader_options = dict(batch_size=batch_size, shuffle=False, drop_last=True)
train_loader = DataLoader(train_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)


In [None]:
print(X_train_tensor.shape)
print(y_train_tensor.shape)
print(X_test_tensor.shape)
print(y_test_tensor.shape)

torch.Size([108777, 9])
torch.Size([108777, 9])
torch.Size([36260, 9])
torch.Size([36260, 9])


In [None]:
# Hyperparameters for the autoencoder model
random_seed = 123
learning_rate = 0.005
num_epochs = 5

# Model architecture settings
input_size = 9      # one row per sample: the 9 columns of the feature frame (time, L_*, C_*, observer, subset)
num_hidden_1 = 500  # First layer in encoder
num_hidden_2 = 50   # Compressed representation layer

# Define the Autoencoder model
class Autoencoder(nn.Module):
    """Fully connected autoencoder: inputs -> hidden -> latent -> hidden -> inputs.

    Parameters:
        n_inputs (int, optional): Number of input features per row.
            Defaults to the notebook-level ``input_size``.
        n_hidden (int, optional): Width of the first encoder / first decoder
            layer. Defaults to the notebook-level ``num_hidden_1``.
        n_latent (int, optional): Width of the compressed representation.
            Defaults to the notebook-level ``num_hidden_2``.

    Calling ``Autoencoder()`` without arguments builds the same layers as before.

    The decoder output is linear. It used to pass through a sigmoid, which
    limits every reconstructed value to (0, 1); the training targets in this
    notebook are the raw, unscaled feature values, so the reconstruction loss
    could not go down (the recorded log shows the same loss values in
    consecutive epochs). A linear output works for scaled and unscaled inputs.
    """

    def __init__(self, n_inputs=None, n_hidden=None, n_latent=None):
        super().__init__()

        # Fall back to the notebook-level settings, looked up when the model is built.
        n_inputs = input_size if n_inputs is None else n_inputs
        n_hidden = num_hidden_1 if n_hidden is None else n_hidden
        n_latent = num_hidden_2 if n_latent is None else n_latent

        ### ENCODER
        self.encoder_layer1 = nn.Linear(n_inputs, n_hidden)
        self.encoder_layer2 = nn.Linear(n_hidden, n_latent)

        ### DECODER
        self.decoder_layer1 = nn.Linear(n_latent, n_hidden)
        self.decoder_layer2 = nn.Linear(n_hidden, n_inputs)

    def encoder(self, x):
        """Map a (batch, n_inputs) tensor to its (batch, n_latent) code; values lie in (0, 1)."""
        x = torch.sigmoid(self.encoder_layer1(x))
        return torch.sigmoid(self.encoder_layer2(x))

    def decoder(self, encoded_x):
        """Map a (batch, n_latent) code back to a (batch, n_inputs) reconstruction."""
        x = F.leaky_relu(self.decoder_layer1(encoded_x))
        # No output activation: reconstructions must be able to reach any input value.
        return self.decoder_layer2(x)

    def forward(self, x):
        """Encode and decode ``x``; any trailing dimensions are flattened per row first."""
        x = x.reshape(x.size(0), -1)
        return self.decoder(self.encoder(x))

# Build the model and its optimizer; seeding first makes the weight initialisation repeatable.
torch.manual_seed(random_seed)
model = Autoencoder()
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train with the reconstruction error (MSE between the output and the flattened input).
start_time = time.time()
n_batches = len(train_loader)
for epoch in range(num_epochs):
    for batch_idx, (sequences, _unused_targets) in enumerate(train_loader):
        sequences = sequences.to(device)
        flat_targets = sequences.view(sequences.size(0), -1)

        # Clear old gradients, run the forward pass, then back-propagate and update.
        optimizer.zero_grad()
        decoded = model(sequences)
        loss = F.mse_loss(decoded, flat_targets)
        loss.backward()
        optimizer.step()

        # Log every 50th batch
        if batch_idx % 50 == 0:
            print('Epoch: %03d/%03d | Batch %03d/%03d | Loss: %.4f'
                  % (epoch + 1, num_epochs, batch_idx, n_batches, loss))

    elapsed_minutes = (time.time() - start_time) / 60
    print('Time elapsed: %.2f min' % elapsed_minutes)

total_minutes = (time.time() - start_time) / 60
print('Total Training Time: %.2f min' % total_minutes)



Epoch: 001/005 | Batch 000/424 | Loss: 18.0179
Epoch: 001/005 | Batch 050/424 | Loss: 259.0796
Epoch: 001/005 | Batch 100/424 | Loss: 519.0620
Epoch: 001/005 | Batch 150/424 | Loss: 1274.3918
Epoch: 001/005 | Batch 200/424 | Loss: 2013.5337
Epoch: 001/005 | Batch 250/424 | Loss: 3017.6548
Epoch: 001/005 | Batch 300/424 | Loss: 4542.4722
Epoch: 001/005 | Batch 350/424 | Loss: 5956.7300
Epoch: 001/005 | Batch 400/424 | Loss: 7772.0312
Time elapsed: 0.20 min
Epoch: 002/005 | Batch 000/424 | Loss: 15.6600
Epoch: 002/005 | Batch 050/424 | Loss: 259.0796
Epoch: 002/005 | Batch 100/424 | Loss: 519.0620
Epoch: 002/005 | Batch 150/424 | Loss: 1274.3918
Epoch: 002/005 | Batch 200/424 | Loss: 2013.5337
Epoch: 002/005 | Batch 250/424 | Loss: 3017.6548
Epoch: 002/005 | Batch 300/424 | Loss: 4542.4722
Epoch: 002/005 | Batch 350/424 | Loss: 5956.7300
Epoch: 002/005 | Batch 400/424 | Loss: 7772.0312
Time elapsed: 0.38 min
Epoch: 003/005 | Batch 000/424 | Loss: 15.6600
Epoch: 003/005 | Batch 050/424 | 

In [None]:
print("input size:", input_size)
print("num_hidden_1:", num_hidden_1)
print("num_hidden_2:", num_hidden_2)

input size: 9
num_hidden_1: 500
num_hidden_2: 50


# Using encoder


In [None]:
#### USING ENCODER

def _encode_rows(dataset, rows_per_batch=256):
    """Return the encoder output for every row of `dataset` as a (len(dataset), num_hidden_2) float64 array.

    A dedicated loader with drop_last=False is used. The training loaders are
    built with drop_last=True, so iterating them here skipped the final partial
    batch and left those rows filled with the np.ones placeholder.
    """
    loader = DataLoader(dataset, batch_size=rows_per_batch, shuffle=False, drop_last=False)
    encoded_batches = []
    with torch.no_grad():  # inference only: no autograd graph is needed
        for sequences, _targets in loader:
            sequences = sequences.to(device)
            encoded = model.encoder(sequences.view(sequences.size(0), -1))
            encoded_batches.append(encoded.cpu().numpy())
    if not encoded_batches:
        return np.empty((0, num_hidden_2))
    # float64 matches the dtype of the arrays this cell produced before.
    return np.concatenate(encoded_batches).astype(np.float64)


# Extract features from the autoencoder for Random Forest.
# The y_* arrays hold a copy of the same encodings (not GT1 labels); they are
# kept because later cells print their shapes.
X_train_ae_seq = _encode_rows(train_dataset)
y_train_ae_seq = X_train_ae_seq.copy()

X_test_ae_seq = _encode_rows(test_dataset)
y_test_ae_seq = X_test_ae_seq.copy()


# RANDOM FOREST

In [None]:
eye_tracking_data.columns

Index(['time', 'L_x', 'L_y', 'L_z', 'C_x', 'C_y', 'C_z', 'GT1', 'observer'], dtype='object')

In [None]:
# Positional 75% / 25% split of the original (non-overlapped) frame, following
# https://github.com/fabiozappo/LSTM-Autoencoder-Time-Series/blob/main/code/main.py
train_split = 0.75
dataset_size = eye_tracking_data.shape[0]
indices = list(range(dataset_size))
split = int(np.floor(train_split * dataset_size))

et_train_original_rf = eye_tracking_data.iloc[:split]
et_test_original_rf = eye_tracking_data.iloc[split:]

print("et_train_rf shape is %s\n" % (et_train_original_rf.shape,))

# Every column except the GT1 label is a feature.
feature_cols = ['time', 'L_x', 'L_y', 'L_z', 'C_x', 'C_y', 'C_z', 'observer']

# Features
X_train_original_rf = et_train_original_rf.loc[:, feature_cols]
X_test_original_rf = et_test_original_rf.loc[:, feature_cols]

# Target variable
y_train_original_rf = et_train_original_rf['GT1']
y_test_original_rf = et_test_original_rf['GT1']

print(f"X_train_original_rf shape is {X_train_original_rf.shape} \nY train original rf shape is {y_train_original_rf.shape} ")
print(f"X_test_original_rf shape is {X_test_original_rf.shape}\n")
print(f"y_test_original_rf shape is {y_test_original_rf.shape}")


et_train_rf shape is (79689, 9)

X_train_original_rf shape is (79689, 8) 
Y train original rf shape is (79689,) 
X_test_original_rf shape is (26563, 8)

y_test_original_rf shape is (26563,)


In [None]:
#### USING ENCODER - SO WE CAN INSERT X TEST WITH NO SEQUENCES IN THE RF MODEL
# NOTE(review): train_dataset_orig, test_dataset_orig, train_loader_orig and
# test_loader_orig are only created in a later cell ("Inserting X_test in the
# autoencoder without sequence"), so this cell fails on a fresh top-to-bottom run.
# NOTE(review): those datasets are built from the 8 columns in feature_cols while
# the encoder's first layer is created with input_size = 9 inputs.
# NOTE(review): the loaders use drop_last=True, so rows of a final partial batch
# are never written and keep the np.ones placeholder.

# Extract features from the autoencoder for Random Forest
X_train_ae_orig = np.ones((len(train_dataset_orig), num_hidden_2))
# y_train_ae = y_train_seq
y_train_ae_orig = np.ones((len(train_dataset_orig),num_hidden_2))

X_test_ae_orig = np.ones((len(test_dataset_orig), num_hidden_2))
# y_test_ae = y_test_seq
y_test_ae_orig = np.ones((len(test_dataset_orig),num_hidden_2))

# Encode the training rows batch by batch; start_idx tracks where the next batch is written.
start_idx = 0
for idx, (sequences, labels) in enumerate(train_loader_orig):
    sequences = sequences.to(device)
    encoded = model.encoder(sequences.view(sequences.size(0), -1))
    # decoded = model.encoder(sequences.view(sequences.size(0), -1))

    batch_size = encoded.shape[0]
    # batch_size = decoded.shape[0]
    # X_train_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder
    # y_train_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder #labels.cpu().detach().numpy()
    X_train_ae_orig[start_idx:start_idx+batch_size] = encoded.cpu().detach().numpy() #using encoder
    y_train_ae_orig[start_idx:start_idx+batch_size] = encoded.cpu().detach().numpy() #using encoder #labels.cpu().detach().numpy()
    start_idx += batch_size

# Same for the test rows. The variable is called `decoded` but holds the encoder output.
start_idx = 0
for idx, (sequences, labels) in enumerate(test_loader_orig):
    sequences = sequences.to(device)
    # encoded = model.encoder(sequences.view(sequences.size(0), -1))
    decoded = model.encoder(sequences.view(sequences.size(0), -1))
    # batch_size = encoded.shape[0]
    batch_size = decoded.shape[0]
    X_test_ae_orig[start_idx:start_idx + batch_size] = decoded.cpu().detach().numpy() #using encoder
    y_test_ae_orig[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using encoder #labels.cpu().detach().numpy()
    # X_test_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder
    # y_test_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder #labels.cpu().detach().numpy()

    start_idx += batch_size

    ### USE X_TEST_AE_ORIG IN THE RF

In [None]:
# Shape report. Each line prints the shape of the array named in its message
# (two "y" lines used to print the shape of the matching X array instead).
print(f"The X_train_ae_seq shape is:  {X_train_ae_seq.shape}")
print(f"The y_train_ae_seq shape is:  {y_train_ae_seq.shape}")

print(f"The X_test_ae_seq shape is:  {X_test_ae_seq.shape}\n")
print(f"The Y_test_ae_seq shape is:  {y_test_ae_seq.shape}\n")

print(f"The X_train_seq_GT1 shape is:  {X_train_seq_GT1.shape}")
print(f"The y_train_seq_GT1 shape is:  {y_train_seq_GT1.shape}\n")
print(f"The X_test_seq_GT1 shape is:  {X_test_seq_GT1.shape}")
print(f"The y_test_seq_GT1 shape is:  {y_test_seq_GT1.shape}\n")

print(f"The y_train_seq shape is:  {y_train_seq.shape}\n")

print(f"The y_train_original_rf shape is: {y_train_original_rf.shape}")
print(f"The y_test_original_rf shape is: {y_test_original_rf.shape}")
# The encoded arrays come from the overlapped frame, so their row counts differ
# from the original (non-overlapped) labels used for the RF.

The X_train_ae_seq shape is:  (108777, 50)
The y_train_ae_seq shape is:  (108777, 50)
The X_test_ae_seq shape is:  (36260, 50)

The Y_test_ae_seq shape is:  (36260, 50)

The X_train_seq_GT1 shape is:  (108777, 1)
The y_train_seq_GT1 shape is:  (108777, 1)

The X_test_seq_GT1 shape is:  (36260, 1)
The y_test_seq_GT1 shape is:  (36260, 1)

The y_train_seq shape is:  (108777, 9)

The y_train_original_rf shape is: (79689,)
The y_test_original_rf shape is: (26563,)


In [None]:
print(X_train_ae_orig2.shape)
print(y_train_ae_orig2.shape)
print(X_test_ae_orig2.shape)
print(y_test_ae_orig2.shape)



(79689, 50)
(79689, 50)
(26563, 50)
(26563, 50)


In [None]:
print(X_test_ae_orig.shape)
print(y_test_original_rf.shape)

(36260, 50)
(26563,)


In [None]:
# Random Forest Classifier - USING X FROM ENCODER AND ORIGINAL Y - LABELS FOR TRAINING AND TEST
# y_train_seq_GT1 has shape (n_rows, 1); scikit-learn expects a 1-D label vector
# and otherwise emits a DataConversionWarning, so flatten it once.
# source: https://stackoverflow.com/questions/34165731/a-column-vector-y-was-passed-when-a-1d-array-was-expected
y_train_labels_1d = y_train_seq_GT1.ravel()

# random_state makes the forest (and therefore the reported accuracies) repeatable.
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=random_seed)
rf.fit(X_train_ae_seq, y_train_labels_1d)

print(f'Train Accuracy: {rf.score(X_train_ae_seq, y_train_labels_1d) * 100:.2f}%')
# NOTE(review): X_test_ae_orig2 is only created in a later cell of this notebook,
# so this line fails on a fresh top-to-bottom run.
print(f'Test Accuracy: {rf.score(X_test_ae_orig2, y_test_original_rf) * 100:.2f}%')

  return fit_method(estimator, *args, **kwargs)


Train Accuracy: 99.84%
Test Accuracy: 55.01%


## Using the original GT1 column as y for train and test, dropping the last rows to match the autoencoder output size

In [None]:
# Drop the last `sequence_length` rows of y_train_original_rf and y_test_original_rf
# so that their lengths match the number of rows of X_train_ae / X_test_ae.
# Using `sequence_length` instead of a literal 460 keeps the trim in step with the
# window size; .iloc makes the positional slicing explicit.

y_train_original_rf_drop_last_rows = y_train_original_rf.iloc[:-sequence_length]
y_test_original_rf_drop_last_rows = y_test_original_rf.iloc[:-sequence_length]

In [None]:
print(f"The X_train_ae shape is:  {X_train_ae.shape}")
print(f"The X_test_ae shape is:  {X_test_ae.shape}\n")

print(f"The y_train_ae shape is:  {y_train_ae.shape}")
print(f"The y_test_ae shape is:  {y_test_ae.shape}\n")

print(f"The train_original_rf_drop_last_rows shape is:  {y_train_original_rf_drop_last_rows.shape}")
print(f"The test_original_rf_drop_last_rows shape is:  {y_test_original_rf_drop_last_rows.shape}")

The X_train_ae shape is:  (79229, 50)
The X_test_ae shape is:  (26103, 50)

The y_train_ae shape is:  (79229, 50)
The y_test_ae shape is:  (26103, 50)

The train_original_rf_drop_last_rows shape is:  (79229,)
The test_original_rf_drop_last_rows shape is:  (26103,)


In [None]:
# Random Forest Classifier - USING ENCODER AND ORIGINAL X TEST AND Y TEST
# NOTE(review): X_train_ae and X_test_ae are not assigned anywhere in the visible notebook.
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf.fit(X_train_ae, y_train_original_rf_drop_last_rows)

print(f'Train Accuracy: {rf.score(X_train_ae, y_train_original_rf_drop_last_rows) * 100:.2f}%')
# The test features must be scored against the *test* labels; the training labels
# were passed here before.
print(f'Test Accuracy: {rf.score(X_test_ae, y_test_original_rf_drop_last_rows) * 100:.2f}%')


Train Accuracy: 54.33%




ValueError: X has 8 features, but RandomForestClassifier is expecting 50 features as input.

## Creating sequences
## Use y_train_seq_rf, X_test_seq_rf and y_test_seq_rf together with X_train_ae in the RF model

In [None]:
# Function to create sequences
def create_sequences_rf(data, sequence_length):
    """Cut `data` into consecutive, fully overlapping windows of `sequence_length` rows.

    Window i covers rows i .. i + sequence_length - 1, for
    i = 0 .. len(data) - sequence_length - 1.

    Parameters:
        data (pd.DataFrame): Source rows; must contain a 'GT1' column.
        sequence_length (int): Number of rows per window.

    Returns:
        tuple[np.ndarray, np.ndarray]:
            - windows over all columns, shape (n_windows, sequence_length, n_columns)
            - the 'GT1' values of the same windows, shape (n_windows, sequence_length)
        Both arrays have shape (0,) when no window fits.

    Raises:
        KeyError: If `data` has no 'GT1' column.

    NOTE(review): the first array includes every column of `data`, so when 'GT1'
    is part of `data` the label is also present in the feature windows.
    NOTE(review): the loop bound leaves out the last full window (the one starting
    at len(data) - sequence_length); kept as is so the row counts used elsewhere
    in the notebook do not change.
    """
    # Convert to numpy once; slicing the DataFrame for every window repeated the
    # same conversion len(data) times.
    all_values = data.values
    gt1_values = data['GT1'].values

    n_windows = max(len(data) - sequence_length, 0)
    sequences = [all_values[i:i + sequence_length] for i in range(n_windows)]
    labels_GT1 = [gt1_values[i:i + sequence_length] for i in range(n_windows)]
    return np.array(sequences), np.array(labels_GT1)


# Generate sequences for training and testing
X_train_seq_rf, y_train_seq_rf = create_sequences_rf(et_train_original_rf, sequence_length)
X_test_seq_rf, y_test_seq_rf = create_sequences_rf(et_test_original_rf, sequence_length)

In [None]:
print(f"The X_train_seq_rf shape is:  {X_train_seq_rf.shape}")
print(f"The y_train_seq_rf shape is:  {y_train_seq_rf.shape}\n")

print(f"The X_test_seq_rf shape is:  {X_test_seq_rf.shape}")
print(f"The y_test_seq_rf shape is:  {y_test_seq_rf.shape}\n")

In [None]:
print(f"The X_train_ae shape is:  {X_train_ae.shape}")
print(f"The X_test_ae shape is:  {X_test_ae.shape}\n")

print(f"The y_train_ae shape is:  {y_train_ae.shape}")
print(f"The y_test_ae shape is:  {y_test_ae.shape}\n")



The X_train_ae shape is:  (79229, 50)
The X_test_ae shape is:  (26103, 50)

The y_train_ae shape is:  (79229, 50)
The y_test_ae shape is:  (26103, 50)

The X_test_seq_rf shape is:  (26103, 460, 9)
The y_test_seq_rf shape is:  (26103, 460)



In [None]:
# Random Forest Classifier - USING ENCODER AND X_TRAIN_AE and y_train_seq_rf, X_test_seq_rf and y_train_seq_rf. Labels GT1 in train and test with sequences
# NOTE(review): the heading names y_train_seq_rf, but the code fits on y_train_ae.
# The recorded run of this cell ended with
# "ValueError: Unknown label type: continuous-multioutput", i.e. y_train_ae held
# continuous multi-column values rather than class labels.
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf.fit(X_train_ae, y_train_ae)

print(f'Train Accuracy: {rf.score(X_train_ae, y_train_ae) * 100:.2f}%')
print(f'Test Accuracy: {rf.score(X_test_ae, y_test_ae) * 100:.2f}%')

ValueError: Unknown label type: continuous-multioutput. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
# Random Forest Classifier - USING ENCODER (X_TRAIN AND X_TEST DF)
# NOTE(review): X_train_ae_df and y_train_ae_df are not assigned anywhere in the
# visible notebook, and the recorded run failed with
# "NameError: name 'y_train_seq_rf' is not defined".
# NOTE(review): the model is fitted on y_train_seq_rf but the train score is
# computed against y_train_ae_df - confirm which labels are intended.
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf.fit(X_train_ae_df, y_train_seq_rf)

print(f'Train Accuracy: {rf.score(X_train_ae_df, y_train_ae_df) * 100:.2f}%')
print(f'Test Accuracy: {rf.score(X_test_ae, y_test_original_rf_drop_last_rows) * 100:.2f}%')

NameError: name 'y_train_seq_rf' is not defined

In [None]:
# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf.fit(X_train_ae, y_train_original_rf_drop_last_rows)

print(f'Train Accuracy: {rf.score(X_train_ae, y_train_original_rf_drop_last_rows) * 100:.2f}%')
print(f'Test Accuracy: {rf.score(X_test_ae, y_test_original_rf_drop_last_rows) * 100:.2f}%')


Train Accuracy: 54.33%
Test Accuracy: 55.16%


In [None]:
print("Training label distribution:", np.unique(y_train_seq, return_counts=True))
print("Testing label distribution:", np.unique(y_test_ae, return_counts=True))


Training label distribution: (array([0., 1.], dtype=float32), array([16527326, 19348994]))
Testing label distribution: (array([0., 1.], dtype=float32), array([5569882, 7006518]))


In [None]:
print("X_train_ae shape:", X_train_ae.shape)
print("X_test_ae shape:", X_test_ae.shape)
print("y_train_ae shape:", y_train_ae.shape)
print("y_test_ae shape:", y_test_ae.shape)


X_train_ae shape: (77992, 50)
X_test_ae shape: (27340, 50)
y_train_ae shape: (77992, 460)
y_test_ae shape: (27340, 460)


## Feeding X_test to the autoencoder without sequences (to simulate real data)

In [None]:
batch_size = 256

# Original (non-overlapped) features as float32 tensors; targets are clones of the inputs.
# NOTE(review): these frames have the 8 columns of feature_cols, while the
# autoencoder is built with input_size = 9.
X_train_tensor_orig = torch.tensor(X_train_original_rf.to_numpy(), dtype=torch.float32)
X_test_tensor_orig = torch.tensor(X_test_original_rf.to_numpy(), dtype=torch.float32)
y_train_tensor_orig = X_train_tensor_orig.clone()
y_test_tensor_orig = X_test_tensor_orig.clone()

# (input, target) pairs for the train and test sets
train_dataset_orig = TensorDataset(X_train_tensor_orig, y_train_tensor_orig)
test_dataset_orig = TensorDataset(X_test_tensor_orig, y_test_tensor_orig)

# Time-series data, so the row order is kept; drop_last=True skips a final partial batch.
loader_options_orig = dict(batch_size=batch_size, shuffle=False, drop_last=True)
train_loader_orig = DataLoader(train_dataset_orig, **loader_options_orig)
test_loader_orig = DataLoader(test_dataset_orig, **loader_options_orig)

In [None]:
print(f"x train original shape: {X_train_original_rf.shape}")
print(f"y train original shape: {y_train_original_rf.shape}")
print(X_train_tensor_orig.shape)
print(y_train_tensor_orig.shape)
print(X_test_tensor_orig.shape)
print(y_test_tensor_orig.shape)

x train original shape: (79689, 8)
y train original shape: (79689,)
torch.Size([79689, 8])
torch.Size([79689, 8])
torch.Size([26563, 8])
torch.Size([26563, 8])


In [None]:
#### USING ENCODER - SO WE CAN INSERT X TEST WITH NO SEQUENCES IN THE RF MODEL
# NOTE(review): the recorded run of this cell failed with
# "RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x8 and 9x500)":
# the batches have 8 features but the encoder expects 9. The next cell retries
# the same extraction with a zero-padded 9th column.
# NOTE(review): the *_orig2 arrays are allocated here, but the loops below write
# into X_train_ae_orig / y_train_ae_orig / X_test_ae_orig / y_test_ae_orig.

# Extract features from the autoencoder for Random Forest
X_train_ae_orig2 = np.ones((len(train_dataset_orig), num_hidden_2))
# y_train_ae = y_train_seq
y_train_ae_orig2 = np.ones((len(train_dataset_orig),num_hidden_2))

X_test_ae_orig2 = np.ones((len(test_dataset_orig), num_hidden_2))
# y_test_ae = y_test_seq
y_test_ae_orig2 = np.ones((len(test_dataset_orig),num_hidden_2))

start_idx = 0
for idx, (sequences, labels) in enumerate(train_loader_orig):
    sequences = sequences.to(device)
    encoded = model.encoder(sequences.view(sequences.size(0), -1))
    # encoded = model.encoder(sequences.view(sequences.size(0), -1)[:, :model.encoder_layer1.in_features])  #added code
    # decoded = model.encoder(sequences.view(sequences.size(0), -1))

    batch_size = encoded.shape[0]
    # batch_size = decoded.shape[0]
    # X_train_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder
    # y_train_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder #labels.cpu().detach().numpy()
    X_train_ae_orig[start_idx:start_idx+batch_size] = encoded.cpu().detach().numpy() #using encoder
    y_train_ae_orig[start_idx:start_idx+batch_size] = encoded.cpu().detach().numpy() #using encoder #labels.cpu().detach().numpy()
    start_idx += batch_size

start_idx = 0
for idx, (sequences, labels) in enumerate(test_loader_orig):
    sequences = sequences.to(device)
    # The encoder is called three times per batch; only the last result is kept.
    encoded = model.encoder(sequences.view(sequences.size(0), -1))
    decoded = model.encoder(sequences.view(sequences.size(0), -1))
    # Slicing to in_features cannot add a missing column: an 8-column batch stays 8 columns wide.
    decoded = model.encoder(sequences.view(sequences.size(0), -1)[:, :model.encoder_layer1.in_features])  #added code
    # batch_size = encoded.shape[0]
    batch_size = decoded.shape[0]
    X_test_ae_orig[start_idx:start_idx + batch_size] = decoded.cpu().detach().numpy() #using encoder
    y_test_ae_orig[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using encoder #labels.cpu().detach().numpy()
    # X_test_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder
    # y_test_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder #labels.cpu().detach().numpy()

    start_idx += batch_size

    ### USE X_TEST_AE_ORIG IN THE RF

RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x8 and 9x500)

In [None]:
#### USING ENCODER - SO WE CAN INSERT X TEST WITH NO SEQUENCES IN THE RF MODEL

def _encode_padded_rows(dataset, rows_per_batch=256):
    """Encode every row of `dataset`, zero-padding the features to the encoder's input width.

    The original (non-overlapped) data has 8 features, while the encoder's first
    layer expects `model.encoder_layer1.in_features` inputs (9). Missing trailing
    columns are filled with zeros; extra columns, if any, are cut off.

    Returns a (len(dataset), num_hidden_2) float64 array. A loader with
    drop_last=False is used so the final partial batch is encoded as well.

    NOTE(review): the padded column takes the place of a feature the model was
    trained with; a constant zero there is a workaround, not an equivalent input.
    """
    n_expected = model.encoder_layer1.in_features
    loader = DataLoader(dataset, batch_size=rows_per_batch, shuffle=False, drop_last=False)
    encoded_batches = []
    with torch.no_grad():  # inference only: no autograd graph is needed
        for sequences, _targets in loader:
            sequences = sequences.to(device)
            sequences = sequences.view(sequences.size(0), -1)
            n_missing = n_expected - sequences.size(1)
            if n_missing > 0:
                zeros = sequences.new_zeros(sequences.size(0), n_missing)
                sequences = torch.cat([sequences, zeros], dim=1)
            else:
                sequences = sequences[:, :n_expected]
            encoded_batches.append(model.encoder(sequences).cpu().numpy())
    if not encoded_batches:
        return np.empty((0, num_hidden_2))
    return np.concatenate(encoded_batches).astype(np.float64)


# Extract features from the autoencoder for Random Forest.
# The results are stored in the *_orig2 arrays that this cell is meant to produce
# (the loops previously wrote into the *_orig arrays of an earlier cell, and the
# test loop never advanced its write position).
# The y_* arrays hold a copy of the same encodings, not GT1 labels.
X_train_ae_orig2 = _encode_padded_rows(train_dataset_orig)
y_train_ae_orig2 = X_train_ae_orig2.copy()

X_test_ae_orig2 = _encode_padded_rows(test_dataset_orig)
y_test_ae_orig2 = X_test_ae_orig2.copy()

In [None]:
y_test_ae_orig.shape

print(X_train_ae_orig2.shape)
print(y_train_ae_orig2.shape)
print(X_test_ae_orig2.shape)
print(y_test_ae_orig2.shape)

(79689, 50)
(79689, 50)
(26563, 50)
(26563, 50)


In [None]:
# X_train_original_rf = et_train_original_rf[feature_cols] # Features
y_train_original_rf = et_train_original_rf.GT1 # Target variable

# X_test_original_rf = et_test_original_rf[feature_cols] # Features
y_test_original_rf = et_test_original_rf.GT1 # Target variable

In [None]:
print(f"The X_train_ae_seq shape is:  {X_train_ae_seq.shape}")
print(f"The y_train_original_rf shape is:  {y_train_original_rf.shape}")
print(f"The X_test_ae_orig shape is:  {X_test_ae_orig.shape}")
print(f"The y_test_original_rf shape is:  {y_test_original_rf.shape}")

The X_train_ae_seq shape is:  (79229, 50)
The y_train_original_rf shape is:  (79689,)
The X_test_ae_orig shape is:  (26103, 50)
The y_test_original_rf shape is:  (26563,)


In [None]:
# Random Forest Classifier - USING X FROM ENCODER AND ORIGINAL Y - LABELS FOR TRAINING AND TEST
# NOTE(review): the recorded run failed with "ValueError: Found input variables
# with inconsistent numbers of samples: [79229, 79689]" - X_train_ae_seq and
# y_train_original_rf do not have the same number of rows, so they cannot be
# paired as features and labels.
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf.fit(X_train_ae_seq, y_train_original_rf)

print(f'Train Accuracy: {rf.score(X_train_ae_seq, y_train_original_rf) * 100:.2f}%')
print(f'Test Accuracy: {rf.score(X_test_ae_orig, y_test_original_rf) * 100:.2f}%')

ValueError: Found input variables with inconsistent numbers of samples: [79229, 79689]