## **This code aims to loop over combinations of GRU models and data and store results**

In [1]:
import pandas as pd
import numpy as np
import os
import time
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use GPU 0 in first notebook

In [3]:
project_dir = "/home/jupyter-tfg2425paula/prediction_project_v3"
os.chdir(project_dir)

clean_data_dir = os.path.join(project_dir, "00_data/clean")
horizontal_data_dir = os.path.join(project_dir, "00_data/horizontal_structure")
results_dir = os.path.join(project_dir, "02_results")
pca_data_dir = os.path.join(project_dir, "00_data/pca")

### **GRU Model**

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class GRU3DClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout):
        super(GRU3DClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):

        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :]) 
        # return self.sigmoid(out)
        return out

### **LSTM Model**

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class StockPriceLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, dropout=0.0):
        super(StockPriceLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, 
                            batch_first=True, dropout=dropout)
    
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        batch_size = x.size(0)  # Get the batch size dynamically

        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)  # (num_layers, batch_size, hidden_dim)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)  # (num_layers, batch_size, hidden_dim)
        
        out, _ = self.lstm(x, (h0, c0))

        out = self.fc(out[:, -1, :]) 
        # out = self.sigmoid(out)
        return out
    

### **Set folders**

Macro Folders

In [6]:
model_type = "lstm"

Processing

In [7]:
processing = ["clean", "pca"]
processing = ["clean"]

Folders

In [8]:
stocks = ['AAPL', 'MSFT', 'AMZN', 'NVDA', 'SPX']
stocks = ['AAPL']
types_securities = ["single_name", "options", "technical"]
types_securities = ["options"]

Different files

In [9]:
years = ["15y", "10y", "5y", "2y"]
years = ["15y"]
window_sizes = [5, 10, 50, 100]
window_sizes = [5]
train_sizes = [80, 90, 95]
train_sizes = [80]

Same file

In [10]:
thresholds = [0.3, 0.35, 0.4, 0.45, 0.5]
thresholds = [0.5]
learning_rates = [0.005, 0.008, 0.009, 0.01]
learning_rates = [0.01]
num_epochs_list = [100, 200]
num_epochs_list = [100]
batch_sizes = [16, 32]
batch_sizes = [16]
prediction_thresholds = [0.35, 0.4, 0.45, 0.5]
prediction_thresholds = [0.5]

#### **Hyperparameters**

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

hidden_size = 64  
output_size = 2  
num_layers = 2
dropout = 0.2

criterion = nn.CrossEntropyLoss()

#### **Last data modifications**

In [12]:
def reshape_remove_characters(df):

    X = np.array([np.stack(row) for row in df.drop(columns=['Target']).values])
    y = df['Target'].values

    smote = SMOTE(random_state=42)
    n_samples, timesteps, n_features = X.shape
    X_flat = X.reshape((n_samples, timesteps * n_features))
    X_flat = np.where(X_flat == 'ç', 0, X_flat)

    X_resampled = X_flat.reshape((-1, timesteps, n_features))
    
    return X_resampled, y

### **Evaluation function**

In [13]:
def evaluate_rolling_unchanged_model_threshold(
    model, 
    X, 
    y, 
    criterion, 
    optimizer, 
    device, 
    train_size, 
    batch_size, 
    num_epochs, 
    lower_threshold
):
    """
    Evaluate a PyTorch model using a rolling prediction approach for time series,
    training the model only once on the initial training set. For each time step
    after train_size, the model makes a prediction without further parameter updates.
    Only predicts +1 or -1 if the probability of class 1 is above/below given thresholds;
    otherwise, predicts 0. Accuracy is computed only on nonzero predictions.

    Args:
        model:          PyTorch model to evaluate.
        X:              Feature data (numpy array).
        y:              Target data (numpy array).
        criterion:      Loss function (e.g., CrossEntropyLoss).
        optimizer:      Optimizer (e.g., Adam).
        device:         Device for computation (CPU or GPU).
        train_size:     Initial size of the training data (int or float).
                        If < 1, treated as fraction of total length.
        batch_size:     Batch size for training.
        num_epochs:     Number of epochs for initial training only.
        lower_threshold: Probability threshold below which model predicts -1.
        upper_threshold: Probability threshold above which model predicts +1.

    Returns:
        dict: Dictionary with the following keys:
            - "rolling_predictions": All predictions (-1, 0, +1) across the test period.
            - "rolling_targets": Corresponding true targets in [-1, +1].
            - "filtered_predictions": Nonzero predictions only.
            - "filtered_targets": Targets corresponding to nonzero predictions.
            - "accuracy_nonzero": Accuracy computed only on nonzero predictions.
    """

    # Convert X, y to tensors
    X = torch.tensor(X, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.long)

    # Determine initial training set size
    if train_size < 1.0:
        lower_bound = int(train_size * len(X))
    else:
        lower_bound = train_size

    # -------------------------
    # 1) SINGLE TRAINING PHASE
    # -------------------------
    model.to(device)
    model.train()
    
    X_train = X[:lower_bound].to(device)
    y_train = y[:lower_bound].to(device)

    train_dataset = TensorDataset(X_train, y_train)
    trainloader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=False,         # Keep False if order matters; True for better generalization
        # num_workers=4,         # Adjust based on your CPU cores
        # pin_memory=True,       # Speeds up transfer if using GPUs
        drop_last=False        # Ensure the last batch is included
    )

    epoch_train_losses = []
    for epoch in range(num_epochs):
        # torch.cuda.empty_cache()
        epoch_loss = 0.0
        for X_batch, y_batch in trainloader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            pred_y = model(X_batch)   # [batch_size, num_classes]
            loss = criterion(pred_y, y_batch)
            loss.backward()

            # Gradient clipping (optional)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            epoch_loss += loss.item()
               
        if (epoch + 1) % 5 == 0 or epoch == num_epochs - 1:
            print(f"[Train] Epoch {epoch+1}/{num_epochs}, Loss={epoch_loss/len(trainloader):.4f}")

        epoch_train_losses.append(epoch_loss/len(trainloader))
        
    loss_decrease_percentage = ((epoch_train_losses[-1] - epoch_train_losses[0]) / epoch_train_losses[0]) * 100
    # ---------------------------------
    # 2) ROLLING PREDICTIONS, NO UPDATE
    # ---------------------------------
    model.eval()

    rolling_predictions = []
    rolling_targets     = []

    for i in range(lower_bound, len(X)):
        # Single-step "test" sample
        X_test = X[i:i+1].to(device)  # shape: (1, num_features)
        y_test = y[i:i+1].to(device)  # shape: (1, )

        with torch.no_grad():
            # Forward pass
            pred_y = model(X_test)  # [1, num_classes]
            probabilities = torch.softmax(pred_y, dim=1).cpu().numpy()  # shape: (1, 2)
            prob_class_1  = probabilities[:, 1]  # shape: (1,)

            # Threshold-based logic
            # Initialize all predictions to 0
            pred_classes = np.zeros_like(prob_class_1)
            # Predict -1 if prob < lower_threshold
            pred_classes[prob_class_1 < lower_threshold] = -1
            # Predict +1 if prob > upper_threshold
            pred_classes[prob_class_1 > 1-lower_threshold] = 1

        rolling_predictions.append(pred_classes[0])  # scalar
        rolling_targets.append(y_test.item())

    rolling_predictions = np.array(rolling_predictions)
    rolling_targets = np.array(rolling_targets).astype(int)

    # Convert any 0-labeled targets to -1 if your original data is in [-1, +1]
    # (Sometimes y might be {0,1} or {-1, +1}; adapt as needed.)
    rolling_targets[rolling_targets == 0] = -1

    # Filter out zero predictions
    nonzero_mask = rolling_predictions != 0
    filtered_preds = rolling_predictions[nonzero_mask]
    filtered_targets = rolling_targets[nonzero_mask]

    if len(filtered_preds) == 0:
        accuracy_nonzero = None
        print("No nonzero predictions, cannot compute thresholded accuracy.")
    else:
        accuracy_nonzero = accuracy_score(filtered_targets, filtered_preds)
        print(f"Accuracy on Nonzero Predictions: {accuracy_nonzero:.4f}")

    return {
        "rolling_predictions": rolling_predictions,
        "rolling_targets": rolling_targets,
        "filtered_predictions": filtered_preds,
        "filtered_targets": filtered_targets,
        "accuracy_nonzero": accuracy_nonzero,
        "loss_decrease_percentage": loss_decrease_percentage
    }

### **Run the loop**

In [14]:
# folder
results_list = []
for stock in stocks:
    for security_type in types_securities:
        output_folder = os.path.join(results_dir, f"{model_type}/{stock}/{security_type}") 
        os.makedirs(output_folder, exist_ok=True)
        # files
        for period in years:
            # load original data as well (for info purposes)
            filename = f"{security_type}/{stock}/{period}_data.csv"
            original_input_filepath = os.path.join(clean_data_dir, filename)
            original_data = pd.read_csv(original_input_filepath)
            start_date = original_data.loc[0, "Date"]
            end_date = original_data.iloc[-1]["Date"]
            
            for possible_train_size in train_sizes:
                
                results_csv_path = os.path.join(output_folder, f"{period}_{possible_train_size}.csv")
                
                # columns, same file
                for window_size in window_sizes:
                    print(f"{stock}, {security_type}, {period}, {possible_train_size}, {window_size}")
                    
                    # load data
                    pkl_filename = f"clean/{security_type}/{stock}/{period}_{window_size}_data.pkl"
                    input_filepath = os.path.join(horizontal_data_dir, pkl_filename)
                    print(input_filepath)
                    input_df = pd.read_pickle(input_filepath)
                    
                    X_resampled, y_resampled = reshape_remove_characters(input_df)

                    input_size = X_resampled.shape[2]
                    train_size = int(X_resampled.shape[0]*possible_train_size/100)
                    test_size = X_resampled.shape[0] - train_size

                    # generate model
                    model = StockPriceLSTM(input_size, hidden_size, output_size)
                    # model = GRU3DClassifier(input_size, hidden_size, output_size, num_layers, dropout)
                    # model = torch.nn.DataParallel(model)
                    model = model.to(device)
                    
                    for learning_rate in learning_rates:
                        
                        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
                        
                        for num_epochs in num_epochs_list:
                            for prediction_threshold in prediction_thresholds:
                                for batch_size in batch_sizes:

                                    print(f"Training {stock} | LR: {learning_rate} | Epochs: {num_epochs} | Batch: {batch_size} | Prediction Threshold: {prediction_threshold}")
                                    
                                    start_time = time.time()

                                    result = evaluate_rolling_unchanged_model_threshold(
                                        model, X_resampled, y_resampled, criterion, 
                                                                   optimizer, device, train_size, batch_size, num_epochs, lower_threshold=prediction_threshold)     

                                    rolling_predictions = result["rolling_predictions"]
                                    rolling_targets = result["rolling_targets"]
                                    test_accuracy = result["accuracy_nonzero"]
                                    loss_decrease_percentage = result["loss_decrease_percentage"]
                                    nonzero_preds = np.count_nonzero(result["rolling_predictions"])


                                    end_time = time.time()    
                                    execution_time = end_time - start_time
                                    
                                    # --------------------------------------------
                                    # 1) Create a record (dictionary) for this run
                                    # --------------------------------------------
                                    run_record = {
                                        "start_date": start_date,
                                        "end_date": end_date,
                                        "execution_time": execution_time,
                                        "test_size": test_size,
                                        "nonzero_preds": nonzero_preds,
                                        "accuracy": test_accuracy,
                                        "prediction_threshold": prediction_threshold,
                                        
                                        "window_size": window_size,
                                        "learning_rate": learning_rate,
                                        "num_epochs": num_epochs,
                                        "train_loss_change_pctg": loss_decrease_percentage,
                                        
                                        "batch_size": batch_size,
                                        
                                        "output_size": output_size,
                                        "hidden_size": hidden_size,
                                        "num_layers": num_layers,
                                        "dropout_rate": dropout,
                                        "optimizer": optimizer.__class__.__name__,
                                        "criterion": criterion
                                        
                                    }
                                    
                                    # --------------------------------------------
                                    # 2) Append the dictionary to the results list
                                    # --------------------------------------------
                                    results_list.append(run_record)
                
                # ----------------------------------------------------------------
                # 3) Write to CSV *once* after all window_sizes for this setup
                # ----------------------------------------------------------------
                if len(results_list) > 0:
                    df = pd.DataFrame(results_list)
                    
                    # If CSV already exists, append without header
                    if os.path.exists(results_csv_path):
                        df.to_csv(results_csv_path, mode='a', header=False, index=False)
                    else:
                        df.to_csv(results_csv_path, index=False)
                    
                    # Clear results_list to avoid duplication
                    results_list = []
                                    

AAPL, options, 15y, 80, 5
/home/jupyter-tfg2425paula/prediction_project_v3/00_data/horizontal_structure/clean/options/AAPL/15y_5_data.pkl
Training AAPL | LR: 0.01 | Epochs: 100 | Batch: 16 | Prediction Threshold: 0.5
[Train] Epoch 5/100, Loss=0.6928
[Train] Epoch 10/100, Loss=0.6947
[Train] Epoch 15/100, Loss=0.6918
[Train] Epoch 20/100, Loss=0.6916
[Train] Epoch 25/100, Loss=0.6914
[Train] Epoch 30/100, Loss=0.6917
[Train] Epoch 35/100, Loss=0.6909
[Train] Epoch 40/100, Loss=0.6894
[Train] Epoch 45/100, Loss=0.6878
[Train] Epoch 50/100, Loss=0.6855
[Train] Epoch 55/100, Loss=0.6843
[Train] Epoch 60/100, Loss=0.6780
[Train] Epoch 65/100, Loss=0.6711
[Train] Epoch 70/100, Loss=0.6623
[Train] Epoch 75/100, Loss=0.6582
[Train] Epoch 80/100, Loss=0.6486
[Train] Epoch 85/100, Loss=0.6472
[Train] Epoch 90/100, Loss=0.6414
[Train] Epoch 95/100, Loss=0.6317
[Train] Epoch 100/100, Loss=0.6192
Accuracy on Nonzero Predictions: 0.5057


In [14]:
# ONE EXAMPLE
security_type = "options"
stock = "AAPL"
period = "15y"
window_size = 5

pkl_filename = f"clean/{security_type}/{stock}/{period}_{window_size}_data.pkl"
input_filepath = os.path.join(horizontal_data_dir, pkl_filename)
input_df = pd.read_pickle(input_filepath)
                    
input_df               

Unnamed: 0,AAPL_CALL_OM,AAPL_CALL_O1,AAPL_CALL_OY,AAPL_CALL_OI,AAPL_CALL_VM,AAPL_PUT_OM,AAPL_PUT_O1,AAPL_PUT_OY,AAPL_PUT_OI,AAPL_PUT_VM,Return,Target
0,0 4.31 1 8.36 2 8.20 3 8.20 4 6...,0 0.2885 1 0.2807 2 0.2864 3 0.286...,0 0.3876 1 0.3926 2 0.3932 3 0.393...,0 800752 1 808079 2 815843 3 81584...,0 35209.0 1 20662.0 2 16655.0 3 18...,0 8.50 1 3.95 2 3.90 3 3.90 4 5...,0 0.2906 1 0.2823 2 0.2860 3 0.286...,0 0.4068 1 0.4021 2 0.4021 3 0.402...,0 629515 1 639877 2 645084 3 64508...,0 15917.0 1 13130.0 2 10824.0 3 11...,0 2.981192 1 -0.699437 2 -0.122285 3 ...,0.0
1,0 8.36 1 8.20 2 8.20 3 6.30 4 5...,0 0.2807 1 0.2864 2 0.2864 3 0.313...,0 0.3926 1 0.3932 2 0.3932 3 0.411...,0 808079 1 815843 2 815843 3 83088...,0 20662.0 1 16655.0 2 18762.0 3 20...,0 3.95 1 3.90 2 3.90 3 5.40 4 5...,0 0.2823 1 0.2860 2 0.2860 3 0.299...,0 0.4021 1 0.4021 2 0.4021 3 0.410...,0 639877 1 645084 2 645084 3 65102...,0 13130.0 1 10824.0 2 11682.5 3 12...,0 -0.699437 1 -0.122285 2 0.000000 3 ...,0.0
2,0 8.20 1 8.20 2 6.30 3 5.40 4 ...,0 0.2864 1 0.2864 2 0.3134 3 0.310...,0 0.3932 1 0.3932 2 0.4111 3 0.452...,0 815843 1 815843 2 830883 3 84159...,0 16655.0 1 18762.0 2 20869.0 3 32...,0 3.90 1 3.90 2 5.40 3 5.45 4 8...,0 0.2860 1 0.2860 2 0.2990 3 0.308...,0 0.4021 1 0.4021 2 0.4106 3 0.402...,0 645084 1 645084 2 651020 3 66150...,0 10824.0 1 11682.5 2 12541.0 3 17...,0 -0.122285 1 0.000000 2 -1.763064 3 ...,0.0
3,0 8.20 1 6.30 2 5.40 3 10.25 4 ...,0 0.2864 1 0.3134 2 0.3105 3 0.321...,0 0.3932 1 0.4111 2 0.4527 3 0.411...,0 815843 1 830883 2 841595 3 85171...,0 18762.0 1 20869.0 2 32046.0 3 26...,0 3.90 1 5.40 2 5.45 3 8.15 4 8...,0 0.2860 1 0.2990 2 0.3088 3 0.317...,0 0.4021 1 0.4106 2 0.4026 3 0.412...,0 645084 1 651020 2 661504 3 67066...,0 11682.5 1 12541.0 2 17148.0 3 16...,0 0.000000 1 -1.763064 2 -0.339000 3 ...,1.0
4,0 6.30 1 5.40 2 10.25 3 9.92 4 ...,0 0.3134 1 0.3105 2 0.3213 3 0.327...,0 0.4111 1 0.4527 2 0.4110 3 0.418...,0 830883 1 841595 2 851711 3 87371...,0 20869.0 1 32046.0 2 26186.0 3 46...,0 5.40 1 5.45 2 8.15 3 8.62 4 8...,0 0.2990 1 0.3088 2 0.3176 3 0.328...,0 0.4106 1 0.4026 2 0.4124 3 0.417...,0 651020 1 661504 2 670661 3 69133...,0 12541.0 1 17148.0 2 16459.0 3 26...,0 -1.763064 1 -0.339000 2 -1.470662 3 ...,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3906,0 5.90 1 6.18 2 5.05 3 6.25 4 6...,0 0.2011 1 0.1920 2 0.1863 3 0.203...,0 0.2452 1 0.2462 2 0.2445 3 0.249...,0 3191862 1 2751164 2 2787242 3 28...,0 239759.0 1 105702.0 2 94774.0 3 ...,0 5.45 1 4.78 2 5.60 3 5.07 4 4...,0 0.1928 1 0.1840 2 0.1792 3 0.186...,0 0.2287 1 0.2282 2 0.2269 3 0.234...,0 2535703 1 2239246 2 2254014 3 22...,0 153261.0 1 75314.0 2 60763.0 3 ...,0 0.000000 1 0.396914 2 1.377043 3 ...,1.0
3907,0 6.18 1 5.05 2 6.25 3 6.25 4 6...,0 0.1920 1 0.1863 2 0.2033 3 0.197...,0 0.2462 1 0.2445 2 0.2495 3 0.250...,0 2751164 1 2787242 2 2808853 3 28...,0 105702.0 1 94774.0 2 86593.0 3 ...,0 4.78 1 5.60 2 5.07 3 4.56 4 4...,0 0.1840 1 0.1792 2 0.1860 3 0.188...,0 0.2282 1 0.2269 2 0.2346 3 0.224...,0 2239246 1 2254014 2 2266403 3 23...,0 75314.0 1 60763.0 2 67136.0 3 ...,0 0.396914 1 1.377043 2 -1.410919 3 ...,1.0
3908,0 5.05 1 6.25 2 6.25 3 6.15 4 5...,0 0.1863 1 0.2033 2 0.1977 3 0.206...,0 0.2445 1 0.2495 2 0.2508 3 0.247...,0 2787242 1 2808853 2 2841711 3 28...,0 94774.0 1 86593.0 2 122734.0 3 ...,0 5.60 1 5.07 2 4.56 3 4.35 4 5...,0 0.1792 1 0.1860 2 0.1889 3 0.189...,0 0.2269 1 0.2346 2 0.2248 3 0.230...,0 2254014 1 2266403 2 2308051 3 23...,0 60763.0 1 67136.0 2 113178.0 3 ...,0 1.377043 1 -1.410919 2 1.342222 3 ...,0.0
3909,0 6.25 1 6.25 2 6.15 3 5.20 4 6...,0 0.2033 1 0.1977 2 0.2063 3 0.199...,0 0.2495 1 0.2508 2 0.2470 3 0.252...,0 2808853 1 2841711 2 2857385 3 28...,0 86593.0 1 122734.0 2 75973.0 3 ...,0 5.07 1 4.56 2 4.35 3 5.17 4 4...,0 0.1860 1 0.1889 2 0.1891 3 0.191...,0 0.2346 1 0.2248 2 0.2305 3 0.232...,0 2266403 1 2308051 2 2330907 3 23...,0 67136.0 1 113178.0 2 77180.0 3 ...,0 -1.410919 1 1.342222 2 0.114025 3 ...,1.0


In [26]:
security_type = "technical"
original_input_df = pd.read_csv(os.path.join(clean_data_dir, f"{security_type}/{stock}/{period}_data.csv"))
original_input_df = original_input_df.drop(columns = ["Date"])
original_input_df

Unnamed: 0,Adj Close,High,Low,Open,Volume,volume_adi,volume_obv,volume_cmf,volume_fi,volume_em,...,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr,Return,Target
0,6.202484,7.357143,7.248214,7.250000,474896800,4.207929e+09,12844955200,0.012165,1.019379e+07,0.004424,...,-0.053481,-6.925966,-6.713142,-0.212823,7.276828,2.981194,2.937620,51.516044,2.981194,0.0
1,6.159100,7.352857,7.246429,7.333214,318438400,4.218615e+09,12526516800,0.070116,6.398002e+06,-0.000101,...,-0.076835,-8.860430,-7.142600,-1.717830,7.277106,-0.699433,-0.701891,50.456290,-0.699433,0.0
2,6.151568,7.344643,7.277143,7.335714,286454000,4.062504e+09,12240062800,0.113572,5.118618e+06,0.000265,...,-0.112468,-10.992004,-7.912481,-3.079523,7.277235,-0.122288,-0.122363,50.272300,-0.122288,0.0
3,6.043114,7.248571,7.084643,7.115000,295257200,4.052857e+09,11944805600,0.063691,-1.035688e+06,-0.008011,...,-0.259167,-12.507668,-8.831518,-3.676150,7.276087,-1.763058,-1.778785,47.622912,-1.763058,0.0
4,6.022626,7.202857,7.098929,7.182500,424858000,3.960879e+09,11519947600,0.133392,-2.361734e+06,-0.000384,...,-0.374975,-11.059022,-9.277019,-1.782003,7.271093,-0.339001,-0.339577,47.122469,-0.339001,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3770,225.000000,226.919998,224.270004,226.399994,47923700,3.083075e+10,30180183400,0.057249,-1.684433e+07,-7.409658,...,-0.108301,-0.335164,-1.590438,1.255274,225.846789,-1.410920,-1.420968,4536.443950,-1.410920,1.0
3771,228.020004,229.740005,225.169998,225.250000,44686000,3.084180e+10,30224869400,0.040871,4.840847e+06,19.022102,...,-0.011480,-0.324963,-1.337343,1.012380,225.973880,1.342224,1.333296,4598.675419,1.342224,1.0
3772,228.279999,230.160004,226.660004,226.979996,36211800,3.083911e+10,30261081200,0.002712,5.494279e+06,9.230434,...,0.059727,-1.845162,-1.438907,-0.406255,226.081234,0.114023,0.113958,4604.032974,0.114023,1.0
3773,229.000000,229.929993,225.889999,228.059998,35169600,3.085809e+10,30296250800,0.035966,8.326832e+06,-5.743675,...,0.123926,-3.248737,-1.800873,-1.447864,226.280467,0.315403,0.314906,4618.869620,0.315403,0.0


In [19]:
def scale_data(df, selected_scale_cols, scaling_method):
    """
    Scales specified columns in a DataFrame using the specified scaling method.
    
    Args:
        df (pd.DataFrame): The input DataFrame.
        selected_cols (list): A list of column names to scale.
        scaling_method (str): The scaling method to use ("standard" or "minmax"). Default is "standard".
    
    Returns:
        pd.DataFrame: The DataFrame with specified columns scaled.
    """
    
    if scaling_method is not None:
        if scaling_method == "standard":
            scaler = StandardScaler()
        elif scaling_method == "minmax":
            scaler = MinMaxScaler()
        else:
            raise ValueError("Invalid scaling method. Choose 'standard' or 'minmax'.")

        df[selected_scale_cols] = df[selected_scale_cols].apply(pd.to_numeric, errors="coerce")
        
        # Scale only the selected columns
        df_scaled = df.copy()
        # Replace infinite values with NaN
        df[selected_scale_cols].replace([np.inf, -np.inf], np.nan, inplace=True)

        # Optionally fill NaN with column mean or median
        df[selected_scale_cols].fillna(df[selected_scale_cols].mean(), inplace=True)

        df_scaled[selected_scale_cols] = scaler.fit_transform(df[selected_scale_cols])
    
    else:
        df_scaled = df
        
    return df_scaled

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

scaling_method = "standard"
selected_scale_cols = list(original_input_df.columns)
scaled_df = scale_data(original_input_df, selected_scale_cols, scaling_method)
scaled_df

Unnamed: 0,AAPL_CALL_OM,AAPL_CALL_O1,AAPL_CALL_OY,AAPL_CALL_OI,AAPL_CALL_VM,AAPL_PUT_OM,AAPL_PUT_O1,AAPL_PUT_OY,AAPL_PUT_OI,AAPL_PUT_VM,Return,Target
0,-0.659948,0.023948,1.232190,-1.135737,-0.867662,0.066764,0.076497,1.069240,-1.141176,-1.135861,1.939250,-1.014149
1,0.032441,-0.019636,1.298093,-1.130776,-0.940435,-0.689031,0.020175,1.024588,-1.132042,-1.164318,-0.534837,-1.014149
2,0.005087,0.012214,1.306002,-1.125519,-0.960481,-0.697336,0.045282,1.024588,-1.127452,-1.187863,-0.146881,-1.014149
3,0.005087,0.012214,1.306002,-1.125519,-0.949940,-0.697336,0.045282,1.024588,-1.127452,-1.179098,-0.064682,-1.014149
4,-0.319737,0.163080,1.541936,-1.115336,-0.939400,-0.448173,0.133498,1.105342,-1.122220,-1.170332,-1.249798,-1.014149
...,...,...,...,...,...,...,...,...,...,...,...,...
3910,-0.328285,-0.483409,-0.570931,0.246195,-0.429808,-0.587705,-0.613622,-0.659835,0.338419,-0.142768,0.837548,0.986049
3911,-0.345381,-0.435356,-0.621018,0.256808,-0.663735,-0.622587,-0.612265,-0.605682,0.358566,-0.510329,0.011964,0.986049
3912,-0.507794,-0.471116,-0.555114,0.264063,-0.674956,-0.486378,-0.595979,-0.583831,0.374753,-0.631355,0.147328,-1.014149
3913,-0.319737,-0.456589,-0.552478,0.266275,-0.655556,-0.644182,-0.578336,-0.591432,0.392888,-0.387046,-0.205578,0.986049
