In [5]:
import pandas as pd
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
project_dir = "/home/jupyter-tfg2425paula/prediction_project_v3"
os.chdir(project_dir)

clean_data_dir = os.path.join(project_dir, "00_data/clean")
horizontal_data_dir = os.path.join(project_dir, "00_data/horizontal_structure")
results_dir = os.path.join(project_dir, "02_results")
plots_dir = os.path.join(project_dir, "03_plots")
pca_data_dir = os.path.join(project_dir, "00_data/pca")

In [23]:
stock = "AAPL"
period = "10y"
data_type = "single_name"


df = pd.read_csv(os.path.join(clean_data_dir, f"{data_type}/{stock}/{period}_data.csv"), parse_dates=["Date"])

# Sort by date if not already sorted
df = df.sort_values("Date").reset_index(drop=True)

# Extract raw arrays
returns = df["Return"].values  # shape: (N,)
targets = df["Target"].values  # shape: (N,)

# Optional: if your "Target" is sometimes -1/+1, you might map it to 0/1:
# targets = np.where(targets == -1, 0, 1)

# Check shapes
print("Returns shape:", returns.shape)
print("Targets shape:", targets.shape)


Returns shape: (2610,)
Targets shape: (2610,)


**Create sequences**

In [24]:
def create_sequences(feature_array, target_array, seq_length=5):
    """
    Transforms a 1D feature array and target array into
    LSTM-friendly sequences of shape [batch, seq_length, 1].
    The label is the value at the (i+seq_length)-th position in target_array.
    """
    X, y = [], []
    for i in range(len(feature_array) - seq_length):
        X.append(feature_array[i : i + seq_length])        # chunk of length seq_length
        y.append(target_array[i + seq_length])             # next point’s label
    X = np.array(X)
    y = np.array(y)
    
    # Reshape X to [batch, seq_length, 1] for an LSTM with single feature
    X = X.reshape(X.shape[0], X.shape[1], 1)
    return X, y


In [25]:
class SimpleLSTM(nn.Module):
    def __init__(self, input_dim=1, hidden_dim=16, num_layers=1):
        super(SimpleLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, 1)  # For binary classification -> single output node
        
    def forward(self, x):
        # x shape: [batch, seq_length, input_dim]
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))    # out shape: [batch, seq_length, hidden_dim]
        out = out[:, -1, :]               # take the last time step
        out = self.fc(out)                # shape: [batch, 1]
        return torch.sigmoid(out).squeeze()  # shape: [batch], squashed to (0,1)


In [26]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score

# -------------------------------------------------------------------
# 1) Your rolling evaluation function (unchanged from your snippet)
#    Just make sure you have it defined in your code.
# -------------------------------------------------------------------
def evaluate_rolling_unchanged_model_threshold(
    model, 
    X, 
    y, 
    criterion, 
    optimizer, 
    device, 
    train_size, 
    batch_size, 
    num_epochs, 
    lower_threshold,
    upper_threshold
):
    """
    Evaluate a PyTorch model using a rolling prediction approach for time series,
    training the model only once on the initial training set. For each time step
    after train_size, the model makes a prediction without further parameter updates.
    Only predicts +1 or -1 if the probability of class 1 is above/below given thresholds;
    otherwise, predicts 0. Accuracy is computed only on nonzero predictions.
    """
    import numpy as np
    from sklearn.metrics import accuracy_score
    import torch

    # Convert X, y to tensors
    X = torch.tensor(X, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.long)

    # Determine initial training set size
    if isinstance(train_size, float) and train_size < 1.0:
        lower_bound = int(train_size * len(X))
    else:
        lower_bound = train_size

    # -------------------------
    # 1) SINGLE TRAINING PHASE
    # -------------------------
    model.to(device)
    model.train()
    
    X_train = X[:lower_bound].to(device)
    y_train = y[:lower_bound].to(device)

    train_dataset = TensorDataset(X_train, y_train)
    trainloader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=False,
        drop_last=False
    )

    epoch_train_losses = []
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for X_batch, y_batch in trainloader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            pred_y = model(X_batch)   # [batch_size, num_classes]
            loss = criterion(pred_y, y_batch)
            loss.backward()

            # Gradient clipping (optional)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            epoch_loss += loss.item()
               
        epoch_train_losses.append(epoch_loss/len(trainloader))
        
        if (epoch + 1) % 5 == 0 or epoch == num_epochs - 1:
            print(f"[Train] Epoch {epoch+1}/{num_epochs}, Loss={epoch_train_losses[-1]:.4f}")

    # Just an extra diagnostic: how the loss changed from first to last epoch
    loss_decrease_percentage = ((epoch_train_losses[-1] - epoch_train_losses[0]) 
                                / epoch_train_losses[0]) * 100

    # ---------------------------------
    # 2) ROLLING PREDICTIONS, NO UPDATE
    # ---------------------------------
    model.eval()
    rolling_predictions = []
    rolling_targets     = []

    for i in range(lower_bound, len(X)):
        # Single-step test sample
        X_test = X[i:i+1].to(device)  # shape: (1, num_features)
        y_test = y[i:i+1].to(device)  # shape: (1, )

        with torch.no_grad():
            # Forward pass
            pred_y = model(X_test)  # [1, num_classes]
            probabilities = torch.softmax(pred_y, dim=1).cpu().numpy()  # shape: (1, 2)
            prob_class_1  = probabilities[:, 1]  # shape: (1,)

            # Threshold-based logic
            # Initialize prediction to 0
            pred_class = 0
            if prob_class_1 < lower_threshold:
                pred_class = -1
            elif prob_class_1 > upper_threshold:
                pred_class = 1

        rolling_predictions.append(pred_class)
        rolling_targets.append(y_test.item())

    rolling_predictions = np.array(rolling_predictions)
    rolling_targets = np.array(rolling_targets).astype(int)

    # Example: if your target data is in {0, 1}, you might not need to do this.
    # If your target data is in {0, 1} but you want to interpret 0 as -1:
    rolling_targets[rolling_targets == 0] = -1

    # Filter out zero predictions
    nonzero_mask = rolling_predictions != 0
    filtered_preds = rolling_predictions[nonzero_mask]
    filtered_targets = rolling_targets[nonzero_mask]

    if len(filtered_preds) == 0:
        accuracy_nonzero = None
        print("No nonzero predictions, cannot compute thresholded accuracy.")
    else:
        accuracy_nonzero = accuracy_score(filtered_targets, filtered_preds)
        print(f"Accuracy on Nonzero Predictions: {accuracy_nonzero:.4f}")

    return {
        "rolling_predictions": rolling_predictions,
        "rolling_targets": rolling_targets,
        "filtered_predictions": filtered_preds,
        "filtered_targets": filtered_targets,
        "accuracy_nonzero": accuracy_nonzero,
        "loss_decrease_percentage": loss_decrease_percentage
    }

# -------------------------------------------------------------------
# 2) Define a model that outputs 2 classes (for CrossEntropyLoss)
# -------------------------------------------------------------------
class SimpleLSTMClassifier(nn.Module):
    """
    A simple single-feature LSTM that outputs 2 logits for classification.
    """
    def __init__(self, input_dim=1, hidden_dim=16, num_layers=1):
        super(SimpleLSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        # We'll treat the input as (batch_size, 1 feature) => reshape as (batch_size, seq_len=1, input_dim=1)
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        # For 2-class classification, output dimension = 2
        self.fc = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        # x shape: [batch_size, input_dim] or [batch_size, num_features]
        # We treat each sample as a "sequence" of length 1.
        # If you truly have multiple timesteps, adapt accordingly.
        x = x.unsqueeze(1)  # => [batch_size, seq_len=1, input_dim]
        
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, device=x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, device=x.device)
        
        out, _ = self.lstm(x, (h0, c0))       # out shape: [batch_size, 1, hidden_dim]
        out = out[:, -1, :]                   # => [batch_size, hidden_dim]
        out = self.fc(out)                    # => [batch_size, 2] (logits for 2 classes)
        return out


In [30]:
# -------------------------------------------------------------------
# 3) Example usage with single-feature "returns" data
# -------------------------------------------------------------------
if __name__ == "__main__":

    X_cont, y_cont = create_sequences(returns, targets, seq_length=seq_length)

    # -- 4B) Train/validation split --
    X_train_c, X_val_c, y_train_c, y_val_c = train_test_split(
        X_cont, y_cont, test_size=0.2, shuffle=False
    )

    ################################
    # B) Instantiate model, loss, optimizer
    ################################
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SimpleLSTMClassifier(input_dim=1, hidden_dim=16, num_layers=1)
    criterion = nn.CrossEntropyLoss()  # because we have 2 logits
    optimizer = optim.Adam(model.parameters(), lr=0.007)

    ################################
    # C) Rolling evaluation
    ################################
    # Example: Use first 70% to train, then roll.
    # We'll let the function do all the training in one block:
    train_size = 0.7  
    batch_size = 32
    num_epochs = 50

    # Probability thresholds:
    #   If prob_class_1 < lower_threshold => predict -1
    #   If prob_class_1 > upper_threshold => predict +1
    #   Else => predict 0
    lower_threshold = 0.5

    results = evaluate_rolling_unchanged_model_threshold(
        model=model,
        X=X_cont,
        y=y_cont,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
        train_size=train_size,
        batch_size=batch_size,
        num_epochs=num_epochs,
        lower_threshold=lower_threshold,
        upper_threshold=0.5
    )

    print("Rolling Evaluation Results:")
    print(" Nonzero Accuracy:", results["accuracy_nonzero"])
    print(" Loss Decrease %: ", results["loss_decrease_percentage"])
    print(" Rolling Predictions:", results["rolling_predictions"])
    print(" Rolling Targets:    ", results["rolling_targets"])

ValueError: LSTM: Expected input to be 2D or 3D, got 4D instead

In [15]:
# -- 5A) Convert "Return" to 0/1 based on sign --
binary_returns = np.where(returns < 0, 0.0, 1.0)

# -- 5B) Prepare sequences with the new binary returns data --
X_bin, y_bin = create_sequences(binary_returns, targets, seq_length=seq_length)

# -- 5C) Train/validation split --
X_train_b, X_val_b, y_train_b, y_val_b = train_test_split(
    X_bin, y_bin, test_size=0.2, shuffle=False
)

# -- 5D) Convert to torch tensors --
X_train_b = torch.tensor(X_train_b, dtype=torch.float32)
y_train_b = torch.tensor(y_train_b, dtype=torch.float32)
X_val_b   = torch.tensor(X_val_b,   dtype=torch.float32)
y_val_b   = torch.tensor(y_val_b,   dtype=torch.float32)

# -- 5E) Instantiate another LSTM model for the binary input --
model_bin = SimpleLSTM(input_dim=1, hidden_dim=16, num_layers=1)
criterion = nn.BCELoss()
optimizer = optim.Adam(model_bin.parameters(), lr=1e-3)

# -- 5F) Training loop (similar to above) --
num_epochs = 100
batch_size = 32

for epoch in range(num_epochs):
    model_bin.train()
    batch_losses = []
    
    # (Optionally) shuffle data
    # idx = torch.randperm(X_train_b.size(0))
    # X_train_b = X_train_b[idx]
    # y_train_b = y_train_b[idx]
    
    for i in range(0, X_train_b.size(0), batch_size):
        x_batch = X_train_b[i : i + batch_size]
        y_batch = y_train_b[i : i + batch_size]
        
        optimizer.zero_grad()
        outputs = model_bin(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        batch_losses.append(loss.item())
    
    # Validation
    model_bin.eval()
    with torch.no_grad():
        val_outputs = model_bin(X_val_b)
        val_loss = criterion(val_outputs, y_val_b).item()
        
        val_preds = (val_outputs >= 0.5).int().numpy()
        val_acc   = accuracy_score(y_val_b.numpy(), val_preds)
    
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {np.mean(batch_losses):.4f} | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")


Epoch 1/100 | Train Loss: 0.6933 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 2/100 | Train Loss: 0.6933 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 3/100 | Train Loss: 0.6933 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 4/100 | Train Loss: 0.6933 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 5/100 | Train Loss: 0.6933 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 6/100 | Train Loss: 0.6932 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 7/100 | Train Loss: 0.6932 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 8/100 | Train Loss: 0.6932 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 9/100 | Train Loss: 0.6932 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 10/100 | Train Loss: 0.6932 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 11/100 | Train Loss: 0.6932 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 12/100 | Train Loss: 0.6932 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 13/100 | Train Loss: 0.6932 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 14/100 | Train Loss: 0.6932 | Val Loss: 0.6927 | Val Acc: 0.5202
Epoch 15/100 | 