<a href="https://colab.research.google.com/github/lorenafc/MscThesis_EyeTrackingIVR/blob/main/OVERLAP_autoencoder_with_rf_sequence_samples_thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from torch.utils.data import TensorDataset


In [2]:
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Device:', device)

# Read the CSV file
file_name = '/content/LLA2020_labeled.csv'
# file_name = '/content/eye_tracking_data_small_2019.csv'
eye_tracking_data = pd.read_csv(file_name)

Device: cpu


In [3]:
eye_tracking_data.head()

Unnamed: 0,time,L_x,L_y,L_z,C_x,C_y,C_z,observer,GT1,GT2,GT3,GT4,GT5,GT6,GT7
0,9.314,-2.969,1.6232,-1.2434,-0.4009,1.6289,-1.2939,1,0,0,0,0,0,0,0
1,9.337,-2.969,1.6255,-1.2432,-0.4007,1.629,-1.294,1,0,0,0,0,0,0,0
2,9.36,-2.969,1.626,-1.2447,-0.4006,1.629,-1.294,1,0,0,0,0,0,0,0
3,9.381,-2.969,1.6232,-1.243,-0.4004,1.6291,-1.2941,1,0,0,0,0,0,0,0
4,9.403,-2.969,1.6242,-1.241,-0.4002,1.6291,-1.2941,1,0,0,0,0,0,0,0


In [4]:
eye_tracking_data_rf = eye_tracking_data.copy()

In [5]:
# Data cleaning
eye_tracking_data = eye_tracking_data.drop(columns=['GT2', 'GT3', 'GT4', 'GT5', 'GT6', 'GT7']) # removing
eye_tracking_data = eye_tracking_data[['time', 'L_x', 'L_y', 'L_z', 'C_x', 'C_y', 'C_z', 'GT1','observer']]

In [6]:
print(eye_tracking_data.head(3))

    time    L_x     L_y     L_z     C_x     C_y     C_z  GT1  observer
0  9.314 -2.969  1.6232 -1.2434 -0.4009  1.6289 -1.2939    0         1
1  9.337 -2.969  1.6255 -1.2432 -0.4007  1.6290 -1.2940    0         1
2  9.360 -2.969  1.6260 -1.2447 -0.4006  1.6290 -1.2940    0         1


### FEATURES

In [7]:
eye_tracking_data_without_GT1 = eye_tracking_data.drop(columns=['GT1'])

In [8]:
print(eye_tracking_data_without_GT1.head(3))

    time    L_x     L_y     L_z     C_x     C_y     C_z  observer
0  9.314 -2.969  1.6232 -1.2434 -0.4009  1.6289 -1.2939         1
1  9.337 -2.969  1.6255 -1.2432 -0.4007  1.6290 -1.2940         1
2  9.360 -2.969  1.6260 -1.2447 -0.4006  1.6290 -1.2940         1


In [9]:
# X train and X test without GT1

# Convert all columns to float32 for compatibility with PyTorch
eye_tracking_data_without_GT1 = eye_tracking_data_without_GT1.astype('float32')

train_split = 0.75
# Creating data indices for training and test splits: LSTM autoencoder time series https://github.com/fabiozappo/LSTM-Autoencoder-Time-Series/blob/main/code/main.py
dataset_size = len(eye_tracking_data_without_GT1)
indices = list(range(dataset_size))
split = int(np.floor(train_split * dataset_size))

et_train_without_GT1 = eye_tracking_data_without_GT1.iloc[:split, :]
et_test_without_GT1 = eye_tracking_data_without_GT1.iloc[split:, :]

# Define sequence length (the sequences have overlapping data)
sequence_length = 460  # 10 seconds of data - sampled at ~45 Hz



### LABEL - GT1

In [10]:
eye_tracking_data_GT1 = eye_tracking_data[['GT1']]
print(eye_tracking_data_GT1.head(3))
print(eye_tracking_data.shape)



   GT1
0    0
1    0
2    0
(106252, 9)


In [11]:
# Convert all columns to float32 for compatibility with PyTorch
eye_tracking_data_GT1 = eye_tracking_data_GT1.astype('float32')

train_split = 0.75
dataset_size = len(eye_tracking_data_GT1)
indices = list(range(dataset_size))
split = int(np.floor(train_split * dataset_size))

et_train_GT1 = eye_tracking_data_GT1.iloc[:split, :]
et_test_GT1 = eye_tracking_data_GT1.iloc[split:, :]



# New sequence - 2 dimensions - overlapping

In [13]:
# #new sequence - overlapping:

def subset_training_data_overlap_by_rows(
    training_data_overlap: pd.DataFrame, rows_interval: int = 460, rows_overlap: int = 135
) -> pd.DataFrame:
    """
    Splits the training_data DataFrame into overlapping subsets for each observer, using row-based intervals.
    Parameters: training_data_overlap (pd.DataFrame): The input DataFrame with an 'observer' column.
                rows_interval (int): The number of rows for each subset.
                rows_overlap (int): The number of overlapping rows for the next subset.
    Returns: pd.DataFrame: A new DataFrame with repeated overlapping rows for each observer, using global subset IDs.
    """
    # Ensure the data is sorted by observer and time
    training_data_overlap = training_data_overlap.sort_values(by=["observer", "time"]).reset_index(drop=True)

    all_subsets = []
    global_subset_id = 1

    # Iterate over each observer
    for observer_id, observer_data in training_data_overlap.groupby("observer"):
        observer_data = observer_data.reset_index(drop=True)

        # Calculate the step for each subset
        subset_step = rows_interval - rows_overlap
        n_rows = len(observer_data)

        # Create subsets using slicing
        for start_idx in range(0, n_rows, subset_step):
            end_idx = start_idx + rows_interval
            current_subset = observer_data.iloc[start_idx:end_idx].copy()

            if not current_subset.empty:
                # Label this subset with a unique global subset ID
                current_subset["subset"] = global_subset_id
                all_subsets.append(current_subset)
                global_subset_id += 1

    # Concatenate all subsets into one DataFrame
    df = pd.concat(all_subsets, ignore_index=True)
    return df





### FEATURES WITH OVERLAPPING - TEST DATA WILL NOT BE OVERLAPPED

In [14]:
### REMOVE GT1, ## SPLIT TRAIN/TEST, than overlap: et_train_without_GT1

### OVERLAP TRAINING DATA
et_train_without_GT1_overlap = subset_training_data_overlap_by_rows(et_train_without_GT1)
print(f"et_train_without_GT1_overlap shape is: {et_train_without_GT1_overlap.shape}")

### OVERLAP TEST DATA
et_test_without_GT1_overlap = subset_training_data_overlap_by_rows(et_test_without_GT1)
print(f"et_train_without_GT1_overlap shape is: {et_train_without_GT1_overlap.shape}")

# Convert all columns to float32 for compatibility with PyTorch
# eye_tracking_data_without_GT1_overlap = et_train_without_GT1_overlap.astype('float32')
et_train_without_GT1_overlap = et_train_without_GT1_overlap.astype('float32')


et_train_without_GT1_overlap shape is: (108821, 9)
et_train_without_GT1_overlap shape is: (108821, 9)


## FEATURES OVERLAPPED IN THE **ENCODER**

In [18]:
#converting in numpy array for the tensors
X_train_seq = et_train_without_GT1_overlap.values
X_2_train_seq = X_train_seq.copy() # it was x_train_seq
X_test_seq = et_test_without_GT1_overlap.values                        ### NO OVERLAP TEST DATA
X_2_test_seq = X_test_seq.copy() # it was y_test_seq

# y_train_seq_GT1 = et_train_GT1_overlap.values # it was x_train_seq   #### NO NEED LABELS FOR THE AUTOENCODEDRS!!
# y_2_train_seq_GT1 = y_train_seq_GT1.copy() # it was y_train_seq
# y_test_seq_GT1 = et_test_GT1_overlap.values # it was x_test_seq
# y_2_test_seq_GT1 = y_test_seq_GT1.copy() # it was y_train_seq

print(f"X_train_seq shape: {X_train_seq.shape}")
print(f"X_2_train_seq shape: {X_2_train_seq.shape}\n")
print(f"X_test_seq shape: {X_test_seq.shape}")                       ### NO OVERLAP TEST DATA
print(f"X_2_test_seq shape: {X_2_test_seq.shape}\n")

# print(f"y_train_seq_GT1 shape: {y_train_seq_GT1.shape}")     #### NO NEED LABELS FOR THE AUTOENCODEDRS!!
# print(f"y_2_train_seq_GT1 shape: {y_train_seq_GT1.shape}\n")
# print(f"y_test_seq_GT1 shape: {y_test_seq_GT1.shape}")
# print(f"y_2_test_seq_GT1 shape: {y_2_test_seq_GT1.shape}")

X_train_seq shape: (108821, 9)
X_2_train_seq shape: (108821, 9)

X_test_seq shape: (36144, 9)
X_2_test_seq shape: (36144, 9)



In [19]:
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
X_2_train_tensor = X_train_tensor.clone()  # Target for the autoencoder is the input itself
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
X_2_test_tensor = X_test_tensor.clone()  # Target for the autoencoder is the input itself

# Create TensorDataset for train and test sets
train_dataset = TensorDataset(X_train_tensor, X_2_train_tensor) # IT WAS Y_TRAIN_TENSOR AND Y_TEST_TENSOR
test_dataset = TensorDataset(X_test_tensor, X_2_test_tensor)

batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last=True) #timeseries data so shuffle = False
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True) #timeseries data so shuffle = False

print(f"X_train_tensor shape: {X_train_tensor.shape}")
print(f"X_2_train_tensor shape: {X_2_train_tensor.shape}\n")
print(f"X_test_tensor shape: {X_test_tensor.shape}")
print(f"X_2_test_tensor shape: {X_2_test_tensor.shape}")

X_train_tensor shape: torch.Size([108821, 9])
X_2_train_tensor shape: torch.Size([108821, 9])

X_test_tensor shape: torch.Size([36144, 9])
X_2_test_tensor shape: torch.Size([36144, 9])


# AUTOENCODER MODEL

In [20]:
# Hyperparameters for the autoencoder model
random_seed = 123
learning_rate = 0.005
num_epochs = 5

# Model architecture settings
input_size = 9                                                                             # 460 samples * 8 features per sample (I am including the column "observer") = 3220
num_hidden_1 = 500  # First layer in encoder
num_hidden_2 = 50   # Compressed representation layer

# Define the Autoencoder model
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()

        ### ENCODER
        self.encoder_layer1 = nn.Linear(input_size, num_hidden_1)
        self.encoder_layer2 = nn.Linear(num_hidden_1, num_hidden_2)

        ### DECODER
        self.decoder_layer1 = nn.Linear(num_hidden_2, num_hidden_1)
        self.decoder_layer2 = nn.Linear(num_hidden_1, input_size)
        # self.decoder_layer2.weight.detach().normal_(0.0, 0.1)
        # self.decoder_layer2.bias.detach().zero_()

    def encoder(self, x):
        x = F.sigmoid(self.encoder_layer1(x))
        encoded = F.sigmoid(self.encoder_layer2(x))
        return encoded

    def decoder(self, encoded_x):
        x = F.leaky_relu(self.decoder_layer1(encoded_x))
        decoded = torch.sigmoid(self.decoder_layer2(x))  # Sigmoid to get values between 0 and 1
        return decoded

    def forward(self, x):
        # Flatten input from (batch, sequence_length, features) to (batch, input_size)
        x = x.view(x.size(0), -1)
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Instantiate the model
torch.manual_seed(random_seed)
model = Autoencoder().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training Loop
start_time = time.time()
for epoch in range(num_epochs):
    for batch_idx, (sequences, _) in enumerate(train_loader):
        sequences = sequences.to(device)

        # Forward pass
        decoded = model(sequences)

        # loss = F.binary_cross_entropy(decoded, sequences.view(sequences.size(0), -1), reduction='mean')
         # reconstruction error
        loss = F.mse_loss(decoded, sequences.view(sequences.size(0), -1)) # changes binary_cross_entropy loss to mse_loss
        optimizer.zero_grad()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Logging
        if not batch_idx % 50:
            print('Epoch: %03d/%03d | Batch %03d/%03d | Loss: %.4f'
                  % (epoch + 1, num_epochs, batch_idx, len(train_loader), loss))

    print('Time elapsed: %.2f min' % ((time.time() - start_time) / 60))

print('Total Training Time: %.2f min' % ((time.time() - start_time) / 60))



Epoch: 001/005 | Batch 000/425 | Loss: 18.0179
Epoch: 001/005 | Batch 050/425 | Loss: 259.0796
Epoch: 001/005 | Batch 100/425 | Loss: 519.0620
Epoch: 001/005 | Batch 150/425 | Loss: 1274.3918
Epoch: 001/005 | Batch 200/425 | Loss: 2013.5337
Epoch: 001/005 | Batch 250/425 | Loss: 3017.6548
Epoch: 001/005 | Batch 300/425 | Loss: 4542.4722
Epoch: 001/005 | Batch 350/425 | Loss: 5956.7300
Epoch: 001/005 | Batch 400/425 | Loss: 7772.0312
Time elapsed: 0.20 min
Epoch: 002/005 | Batch 000/425 | Loss: 15.6600
Epoch: 002/005 | Batch 050/425 | Loss: 259.0796
Epoch: 002/005 | Batch 100/425 | Loss: 519.0620
Epoch: 002/005 | Batch 150/425 | Loss: 1274.3918
Epoch: 002/005 | Batch 200/425 | Loss: 2013.5337
Epoch: 002/005 | Batch 250/425 | Loss: 3017.6548
Epoch: 002/005 | Batch 300/425 | Loss: 4542.4722
Epoch: 002/005 | Batch 350/425 | Loss: 5956.7300
Epoch: 002/005 | Batch 400/425 | Loss: 7772.0312
Time elapsed: 0.38 min
Epoch: 003/005 | Batch 000/425 | Loss: 15.6600
Epoch: 003/005 | Batch 050/425 | 

# Using encoder with the overlapped features


In [21]:
#### USING ENCODER

# Extract features from the autoencoder for Random Forest
X_train_ae_overlap = np.ones((len(train_dataset), num_hidden_2))
# y_train_ae = y_train_seq
X_2_train_ae_overlap = np.ones((len(train_dataset),num_hidden_2)) # it was y_train_ae_seq

X_test_ae_overlap = np.ones((len(test_dataset), num_hidden_2))
# y_test_ae = y_test_seq
X_2_test_ae_overlap = np.ones((len(test_dataset),num_hidden_2)) # it was y_test_ae_seq

start_idx = 0
for idx, (sequences, labels) in enumerate(train_loader):
    sequences = sequences.to(device)
    encoded = model.encoder(sequences.view(sequences.size(0), -1))
    # decoded = model.encoder(sequences.view(sequences.size(0), -1))

    batch_size = encoded.shape[0]
    # batch_size = decoded.shape[0]
    # X_train_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder
    # y_train_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder #labels.cpu().detach().numpy()
    X_train_ae_overlap[start_idx:start_idx+batch_size] = encoded.cpu().detach().numpy() #using encoder
    X_2_train_ae_overlap[start_idx:start_idx+batch_size] = encoded.cpu().detach().numpy() #using encoder #labels.cpu().detach().numpy()
    start_idx += batch_size

start_idx = 0
for idx, (sequences, labels) in enumerate(test_loader):
    sequences = sequences.to(device)
    # encoded = model.encoder(sequences.view(sequences.size(0), -1))
    decoded = model.encoder(sequences.view(sequences.size(0), -1))
    # batch_size = encoded.shape[0]
    batch_size = decoded.shape[0]
    X_test_ae_overlap[start_idx:start_idx + batch_size] = decoded.cpu().detach().numpy() #using encoder
    X_2_test_ae_overlap[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using encoder #labels.cpu().detach().numpy()
    # X_test_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder
    # y_test_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder #labels.cpu().detach().numpy()

    start_idx += batch_size


## OVERLAPPING THE LABEL GT1 - Y TRAIN -
###(NEED TO OVERLAP THE DATASET BEFORE SEPARATE THE FEATURES AND LABELS BECAUSE OF THE COLUMN OBSERVER IN THE FUNCTION subset_training_data_overlap_by_rows)

In [22]:
## SPLIT TRAIN/TEST, than overlap: et_train_GT1
eye_tracking_data_complete = eye_tracking_data.astype('float32')

train_split = 0.75
dataset_size = len(eye_tracking_data_complete)
indices = list(range(dataset_size))
split = int(np.floor(train_split * dataset_size))

et_train_complete = eye_tracking_data_complete.iloc[:split, :]
# et_test_GT1 = eye_tracking_data_GT1.iloc[split:, :]
### OVERLAP COMPLETE DATA - NOT POSSIBLE ONLY GT1 BECAUSE THERE IS NO OBSERVER COLUMN

et_train_complete_overlap = subset_training_data_overlap_by_rows(et_train_complete)
print(f"et_train_complete_overlap shape is: {et_train_complete_overlap.shape}\n")
print(et_train_complete_overlap.head(3))

### then REMOVE FEATUES

et_train_complete_overlap shape is: (108821, 10)

    time    L_x     L_y     L_z     C_x     C_y     C_z  GT1  observer  subset
0  9.314 -2.969  1.6232 -1.2434 -0.4009  1.6289 -1.2939  0.0       1.0       1
1  9.337 -2.969  1.6255 -1.2432 -0.4007  1.6290 -1.2940  0.0       1.0       1
2  9.360 -2.969  1.6260 -1.2447 -0.4006  1.6290 -1.2940  0.0       1.0       1


In [23]:
# Convert all columns to float32 for compatibility with PyTorch

et_train_GT1_overlap = et_train_complete_overlap[['GT1']]

print(f"et_train_GT1_overlap shape is: {et_train_GT1_overlap.shape}\n")

et_train_GT1_overlap shape is: (108821, 1)



# RANDOM FOREST

In [26]:
eye_tracking_data.columns

Index(['time', 'L_x', 'L_y', 'L_z', 'C_x', 'C_y', 'C_z', 'GT1', 'observer'], dtype='object')

### USING ENCODER IN X TEST WITH NO OVERLAPPING

In [28]:
#converting in numpy array for the tensors
X_train_orig = et_train_without_GT1.values
X_2_train_orig = X_train_orig.copy() # it was x_train_seq
X_test_orig = et_test_without_GT1.values
X_2_test_orig = X_test_orig.copy() # it was y_test_seq

# y_train_seq_GT1 = et_train_GT1_overlap.values # it was x_train_seq #### NO NEED LABELS FOR THE AUTOENCODEDRS!!
# y_2_train_seq_GT1 = y_train_seq_GT1.copy() # it was y_train_seq
# y_test_seq_GT1 = et_test_GT1_overlap.values # it was x_test_seq
# y_2_test_seq_GT1 = y_test_seq_GT1.copy() # it was y_train_seq

print(f"X_train_orig shape: {X_train_orig.shape}")
print(f"X_2_train_orig shape: {X_2_train_orig.shape}\n")
print(f"X_test_orig shape: {X_test_orig.shape}")
print(f"X_2_test_orig shape: {X_2_test_orig.shape}\n")

# print(f"y_train_seq_GT1 shape: {y_train_seq_GT1.shape}") #### NO NEED LABELS FOR THE AUTOENCODEDRS!!
# print(f"y_2_train_seq_GT1 shape: {y_train_seq_GT1.shape}\n")
# print(f"y_test_seq_GT1 shape: {y_test_seq_GT1.shape}")
# print(f"y_2_test_seq_GT1 shape: {y_2_test_seq_GT1.shape}")

X_train_orig shape: (79689, 8)
X_2_train_orig shape: (79689, 8)

X_test_orig shape: (26563, 8)
X_2_test_orig shape: (26563, 8)



In [32]:
X_train_tensor_orig = torch.tensor(X_train_orig, dtype=torch.float32)
X_2_train_tensor_orig = X_train_tensor_orig.clone()  # Target for the autoencoder is the input itself
X_test_tensor_orig = torch.tensor(X_test_orig, dtype=torch.float32)
X_2_test_tensor_orig = X_test_tensor_orig.clone()  # Target for the autoencoder is the input itself

# Create TensorDataset for train and test sets
train_dataset_orig = TensorDataset(X_train_tensor_orig, X_2_train_tensor_orig) # IT WAS Y_TRAIN_TENSOR AND Y_TEST_TENSOR
test_dataset_orig = TensorDataset(X_test_tensor_orig, X_2_test_tensor_orig)

batch_size = 256
train_loader_orig = DataLoader(train_dataset_orig, batch_size=batch_size, shuffle=False, drop_last=True) #timeseries data so shuffle = False
test_loader_orig = DataLoader(test_dataset_orig, batch_size=batch_size, shuffle=False, drop_last=True) #timeseries data so shuffle = False

print(f"X_train_tensor_orig shape: {X_train_tensor_orig.shape}")
print(f"X_2_train_tensor_orig shape: {X_2_train_tensor_orig.shape}\n")
print(f"X_test_tensor_orig shape: {X_test_tensor_orig.shape}")
print(f"X_2_test_tensor_orig shape: {X_2_test_tensor_orig.shape}")



X_train_tensor_orig shape: torch.Size([79689, 8])
X_2_train_tensor_orig shape: torch.Size([79689, 8])

X_test_tensor_orig shape: torch.Size([26563, 8])
X_2_test_tensor_orig shape: torch.Size([26563, 8])


In [34]:
#### USING ENCODER - SO WE CAN INSERT X TEST WITH NO SEQUENCES IN THE RF MODEL

# Extract features from the autoencoder for Random Forest
X_train_ae_orig = np.ones((len(train_dataset_orig), num_hidden_2))
# y_train_ae = y_train_seq
y_train_ae_orig = np.ones((len(train_dataset_orig),num_hidden_2))

X_test_ae_orig = np.ones((len(test_dataset_orig), num_hidden_2))
# y_test_ae = y_test_seq
y_test_ae_orig = np.ones((len(test_dataset_orig),num_hidden_2))

start_idx = 0
for idx, (sequences, labels) in enumerate(train_loader_orig):
    sequences = sequences.to(device)
    encoded = model.encoder(sequences.view(sequences.size(0), -1))
    # decoded = model.encoder(sequences.view(sequences.size(0), -1))

    batch_size = encoded.shape[0]
    # batch_size = decoded.shape[0]
    # X_train_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder
    # y_train_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder #labels.cpu().detach().numpy()
    X_train_ae_orig[start_idx:start_idx+batch_size] = encoded.cpu().detach().numpy() #using encoder
    y_train_ae_orig[start_idx:start_idx+batch_size] = encoded.cpu().detach().numpy() #using encoder #labels.cpu().detach().numpy()
    start_idx += batch_size

start_idx = 0
for idx, (sequences, labels) in enumerate(test_loader_orig):
    sequences = sequences.to(device)
    # encoded = model.encoder(sequences.view(sequences.size(0), -1))
    decoded = model.encoder(sequences.view(sequences.size(0), -1))
    # batch_size = encoded.shape[0]
    batch_size = decoded.shape[0]
    X_test_ae_orig[start_idx:start_idx + batch_size] = decoded.cpu().detach().numpy() #using encoder
    y_test_ae_orig[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using encoder #labels.cpu().detach().numpy()
    # X_test_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder
    # y_test_ae[start_idx:start_idx+batch_size] = decoded.cpu().detach().numpy() #using decoder #labels.cpu().detach().numpy()

    start_idx += batch_size

    ### USE X_TEST_AE_ORIG IN THE RF

### TESTING THE MODEL:

In [35]:
print(f"The X_train_ae_overlap shape is:  {X_train_ae_overlap.shape}")
print(f"The et_train_GT1_overlap shape is:  {et_train_GT1_overlap.shape}\n")
print(f"The X_test_ae_orig shape is:  {X_test_ae_orig.shape}")
print(f"The et_test_GT1 shape is:  {et_test_GT1.shape}")

The X_train_ae_overlap shape is:  (108821, 50)
The et_train_GT1_overlap shape is:  (108821, 1)

The X_test_ae_orig shape is:  (26563, 50)
The et_test_GT1 shape is:  (26563, 1)


In [38]:
# Random Forest Classifier - USING X FROM ENCODER AND ORIGINAL Y - LABELS FOR TRAINING AND TEST
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf.fit(X_train_ae_overlap, et_train_GT1_overlap.values.ravel()) # #source: https://stackoverflow.com/questions/34165731/a-column-vector-y-was-passed-when-a-1d-array-was-expected

print(f'Train Accuracy: {rf.score(X_train_ae_overlap, et_train_GT1_overlap ) * 100:.2f}%')
print(f'Test Accuracy: {rf.score(X_test_ae_orig, et_test_GT1) * 100:.2f}%')

Train Accuracy: 99.96%
Test Accuracy: 44.99%
