1. Train your LSTM model on historical data up to a certain point in time. This LSTM model is responsible for extracting meaningful features from your time series data.

2. Use the trained LSTM model to extract features from historical data.

3. Define a forecasting horizon, which is the number of time steps into the future you want to predict.

4. Prepare a dataset for prediction by selecting the most recent data (equal to the forecasting horizon) from your dataset. This will serve as the input to your model for making predictions.

5. Use the trained LSTM model to extract features from this recent data.

6. Feed the extracted features into your DecisionTreeRegressor, RandomForestRegressor, or any regression model.

7. Make predictions using the regression model for the future time steps.

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn import metrics
import numpy as npw
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import MinMaxScaler


In [9]:
# Read the CSV file
data = pd.read_csv("../data/data/aapl_raw_data.csv")
data.shape
#data.tail(30)


(10777, 9)

In [10]:
data = data.iloc[:10747]
data.tail(1)


Unnamed: 0,date,open,high,low,close,volume,adjusted_close,change_percent,avg_vol_20d
10746,2023-07-31,196.06,196.49,195.26,196.45,38824100,196.1851,0.32,49803320.0


In [11]:
data.isnull().sum()
data=data.fillna(0)  # Filling null values with zero
#data.isnull().sum()


In [12]:


#data["date"] = data["date"].astype(float)
data["open"] = data["open"].astype(float)
data["high"] = data["high"].astype(float)
data["low"] = data["low"].astype(float)
data["volume"] = data["volume"].astype(float)
data["adjusted_close"] = data["adjusted_close"].astype(float)
data["change_percent"] = data["change_percent"].astype(float)
data["avg_vol_20d"] = data["avg_vol_20d"].astype(float)

data["close"] = data["close"].astype(float)


In [13]:
# Specify the columns you want to standardize
columns_to_standardize = ["open", "high", "low", "volume", "adjusted_close", "change_percent", "avg_vol_20d"]

# Create a StandardScaler object
scaler = StandardScaler()

# Loop through the columns and standardize each one
for column in columns_to_standardize:
    data[column] = scaler.fit_transform(data[[column]])


In [14]:
import torch
import torch.nn as nn
import pandas as pd

# Sample data with a 'date' column containing dates from 1980-12-12 to 2023-07-31
data2 = pd.DataFrame({'date': pd.date_range(start='1980-12-12', end='2023-07-31')})

# Convert 'date' column to datetime
data2['date'] = pd.to_datetime(data['date'])

# Extract day, month, and year from the date column
data['day'] = data2['date'].dt.day
data['month'] = data2['date'].dt.month
data['year'] = data2['date'].dt.year

# Define the embedding dimensions
embedding_dim = 1  # You can adjust this dimension as needed

# Create embedding layers for day, month, and year
day_embedding = nn.Embedding(32, embedding_dim)  # 0-31 days
month_embedding = nn.Embedding(13, embedding_dim)  # 1-12 months
year_embedding = nn.Embedding(44, embedding_dim)  # Embedding for years from 1980 to 2023

# Convert day, month, and year to tensors with Long data type
day_tensor = torch.LongTensor(data['day'].values)
month_tensor = torch.LongTensor(data['month'].values)
year_tensor = torch.LongTensor(data['year'].values - 1980)  # Convert years to an index from 0 to 43

# Pass tensors through embedding layers to get embeddings
day_embeddings = day_embedding(day_tensor)
month_embeddings = month_embedding(month_tensor)
year_embeddings = year_embedding(year_tensor)

# Concatenate the embeddings
date_embeddings = torch.cat((day_embeddings, month_embeddings, year_embeddings), dim=1)

# Print the resulting embeddings
print(date_embeddings)


tensor([[-0.5278,  1.1206, -0.0680],
        [-1.7133,  1.1206, -0.0680],
        [-1.9575,  1.1206, -0.0680],
        ...,
        [-0.2539,  0.0297,  0.2391],
        [-1.6886,  0.0297,  0.2391],
        [ 1.3227,  0.0297,  0.2391]], grad_fn=<CatBackward0>)


In [15]:
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the x_train and y_train data
x_train = data[['open', 'high', 'low', 'volume','adjusted_close', 'change_percent', 'avg_vol_20d']].to_numpy()
y_train = data["close"].to_numpy()

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.33, random_state=45)

# Initialize the scaler
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform both training and testing data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Convert the data to PyTorch tensors
x_train_tensor = torch.tensor(x_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
x_test_tensor = torch.tensor(x_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


# Concatenate date embeddings with your testing data
x_test_feature_tensors = torch.tensor(x_test_scaled, dtype=torch.float32)
x_test_date_embeddings = date_embeddings[len(x_train_scaled):]  # Use the remaining embeddings for testing data
x_test_combined = torch.cat((x_test_feature_tensors, x_test_date_embeddings), dim=1)

# Convert the combined testing data to PyTorch tensors
x_test_tensor = torch.tensor(x_test_combined, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)






# Inside your LSTM model class
class LSTMModel(nn.Module):
    def __init__(self, input_size, date_embedding_dim, hidden_dim, n_layers, output_size, sequence_length):
        super(LSTMModel, self).__init__()

        self.input_size = input_size + date_embedding_dim  # Updated input size to include date embeddings
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.sequence_length = sequence_length

        # Define the LSTM layer as a class attribute
        self.lstm = nn.LSTM(self.input_size, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        batch_size = x.size(0)

        h0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(x.device)

        # Ensure input has the shape [batch_size, sequence_length, input_size]
        x = x.view(batch_size, 1, self.input_size)

        out, (hidden, cell) = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out




# Set random seeds for reproducibility
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# Define hyperparameters
input_size = 7  # Adjust based on the number of input features (excluding change_percent)
date_embedding_dim = 3  # Adjust based on the dimension of your date embeddings
output_size = 1
hidden_dim = 128
n_layers = 4
sequence_length = 75  # Keep this as 1 for your input data
batch_size = 64

# Concatenate date embeddings with your feature vectors
x_train_feature_tensors = torch.tensor(x_train_scaled, dtype=torch.float32)
x_train_date_embeddings = date_embeddings[:len(x_train_scaled)]  # Use the same length as your training data
x_train_combined = torch.cat((x_train_feature_tensors, x_train_date_embeddings), dim=1)




# Convert the combined data to PyTorch tensors
#x_train_tensor = torch.tensor(x_train_combined, dtype=torch.float32)
#y_train_tensor = torch.tensor(y_train, dtype=torch.float32)



# Copy x_train_combined to create x_train_tensor
x_train_tensor = x_train_combined.clone().detach()
x_train_tensor = x_train_tensor.to(torch.float32)

# Copy x_test_combined to create x_test_tensor
x_test_tensor = x_test_combined.clone().detach()
x_test_tensor = x_test_tensor.to(torch.float32)


# Create y_train_tensor directly from y_train
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)





# Create a DataLoader for batch training
train_data = torch.utils.data.TensorDataset(x_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)




# Create an instance of the LSTM model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size, date_embedding_dim, hidden_dim, n_layers, output_size, sequence_length).to(device)

# Define loss function and optimizer
loss_function = nn.MSELoss()
learning_rate = 0.00015
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print(f'Hyperparameters: Learning Rate={learning_rate}, Sequence Length={sequence_length}, Batch Size={batch_size}, Input Size={input_size}, Date Embedding Dim={date_embedding_dim}, Hidden Dim={hidden_dim},'
              f'Layers={n_layers}')


# Training loop
num_epochs = 3000

for epoch in range(num_epochs):
    model.train()
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_x)

        # Calculate loss
        loss = loss_function(outputs, batch_y.view(-1, 1))  # Ensure batch_y has the right shape

        # Backpropagation
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 100 == 0:
        # Extract features on the testing set
        model.eval()
        with torch.no_grad():
            # Extract features from the hidden states for x_test_tensor
            hidden_states_x, _ = model.lstm(x_test_tensor.view(x_test_tensor.size(0), 1, -1))
            # Extract features from the hidden states for y_test_tensor

            hidden_states_y, _ = model.lstm(y_test_tensor.view(y_test_tensor.size(0), 1, 1))
            #y_test_tensor = y_test_tensor.view(y_test_tensor.size(0), 1, 1)


            val_outputs = model(x_test_tensor)
            val_loss = loss_function(val_outputs, y_test_tensor.view(-1, 1))  # Ensure y_test_tensor has the right shape
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')
        # You can now use 'hidden_states' as the feature representations of your sequences
        # The shape of 'hidden_states' will be (batch_size, sequence_length, hidden_dim)

# Print hidden_states after training
print(hidden_states_x)
print(hidden_states_y)











  x_test_tensor = torch.tensor(x_test_combined, dtype=torch.float32)


Hyperparameters: Learning Rate=0.00015, Sequence Length=75, Batch Size=64, Input Size=7, Date Embedding Dim=3, Hidden Dim=128,Layers=4


In [None]:
# 1. Feature extraction
model.eval()
with torch.no_grad():
    hidden_states, _ = model.lstm(x_test_tensor.view(x_test_tensor.size(0), 1, -1))
    val_outputs = model(x_test_tensor)
    y_test_predictions = val_outputs


#2. generate date embeddings for the future date


import torch

# Define the target date: 12th August
target_date = "11th August"

# Define the embedding dimensions (same as in your previous code)
embedding_dim = 1

# Define the maximum values for day, month, and year based on your previous code
max_day = 31  # Maximum day
max_month = 12  # Maximum month
max_year = 43  # Maximum year (from 1980 to 2023)

# Create one-hot encodings for day, month, and year
day_encoding = torch.zeros(max_day)
month_encoding = torch.zeros(max_month)
year_encoding = torch.zeros(max_year + 1)  # +1 to account for the inclusive range

# Map the target date to one-hot encoding
# Extract the day and month from the target date
day_index = int(target_date.split(" ")[0].replace("th", "")) - 1  # Extract the day from the target date
month_index = 8  # August is the 9th month (0-based index)

# Set the corresponding elements to 1
day_encoding[day_index] = 1
month_encoding[month_index] = 1
year_encoding[43] = 1  # 43 corresponds to the year 2023 in your previous code

# Concatenate the day, month, and year encodings to get the date embedding
date_embedding_11_august = torch.cat((day_encoding, month_encoding, year_encoding), dim=0)



#3. Combine LSTM Features with Date Embeddings



import torch

# Assuming 'hidden_states' has shape [3547, 1, 32]
# 'date_embedding_11_august' has shape [76]

# Broadcast 'date_embedding_11_august' to match the shape of 'hidden_states'
# This will repeat 'date_embedding_11_august' along the second dimension
date_embedding_11_august_broadcasted = date_embedding_11_august.reshape(1, 1, -1).expand(3547, 1, -1)

# Combine 'hidden_states' and 'date_embedding_11_august_broadcasted'
combined_states = torch.cat((hidden_states, date_embedding_11_august_broadcasted), dim=2)

# The resulting 'combined_states' will have a shape of [3547, 1, 32 + 76]


# 4. Train a Random Forest Model:


import torch
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Assuming 'y_test_predictions' contains the predicted 'close' values
# You can extract them as a numpy array using .numpy() method
close_values = y_test_predictions.numpy()

# Reshape the 'combined_states' to remove the extra dimension
combined_states_2d = combined_states.reshape(-1, combined_states.shape[-1])

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust hyperparameters as needed

# Fit the model on your data
rf_model.fit(combined_states_2d, close_values)

# Now the Random Forest model is trained and ready for predictions



# 5. Evaluate the model


import torch
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Assuming 'y_test_predictions' contains the predicted 'close' values
# You can extract them as a numpy array using .numpy() method
close_values = y_test_predictions.numpy()

# Reshape the 'combined_states' to remove the extra dimension
combined_states_2d = combined_states.reshape(-1, combined_states.shape[-1])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_states_2d, close_values, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust hyperparameters as needed

# Fit the model on your training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print or use the evaluation metrics
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R^2): {r2}')


# 6. Make Predictions


import torch
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Assuming 'hidden_states' contains the LSTM features
# 'y_test_predictions' contains the predicted 'close' values

# Define the target date: 11th August
target_date = "11th August"

# Define the embedding dimensions (same as in your previous code)
embedding_dim = 1

# Define the maximum values for day, month, and year based on your previous code
max_day = 31  # Maximum day
max_month = 12  # Maximum month
max_year = 43  # Maximum year (from 1980 to 2023)

# Create one-hot encodings for day, month, and year
day_encoding = torch.zeros(max_day)
month_encoding = torch.zeros(max_month)
year_encoding = torch.zeros(max_year + 1)  # +1 to account for the inclusive range

# Map the target date to one-hot encoding
# Extract the day and month from the target date
day_index = int(target_date.split(" ")[0].replace("th", "")) - 1  # Extract the day from the target date
month_index = 8  # August is the 9th month (0-based index)

# Set the corresponding elements to 1
day_encoding[day_index] = 1
month_encoding[month_index] = 1
year_encoding[43] = 1  # 43 corresponds to the year 2023 in your previous code

# Concatenate the day, month, and year encodings to get the date embedding
date_embedding_11_august = torch.cat((day_encoding, month_encoding, year_encoding), dim=0)

# Broadcast 'date_embedding_11_august' to match the shape of 'hidden_states'
date_embedding_11_august_broadcasted = date_embedding_11_august.reshape(1, 1, -1).expand(hidden_states.shape[0], 1, -1)

# Combine 'hidden_states' and 'date_embedding_11_august_broadcasted'
combined_states = torch.cat((hidden_states, date_embedding_11_august_broadcasted), dim=2)

# Extract 'y_test_predictions' as a numpy array
close_values = y_test_predictions.numpy()

# Reshape the 'combined_states' to remove the extra dimension
combined_states_2d = combined_states.reshape(-1, combined_states.shape[-1])

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on your data
rf_model.fit(combined_states_2d, close_values)

# Now, the Random Forest model is trained and ready for predictions

# Make a prediction for 11th August 2023
predicted_close_11_august = rf_model.predict(combined_states_2d)

# 'predicted_close_11_august' contains the predicted "close" value for 11th August 2023
# Print the predicted "close" value for 11th August 2023
print("Predicted Close Value for 11th August 2023:", predicted_close_11_august)




torch.Size([3547, 1])
