In [51]:
import numpy as np
import pandas as pd

## Setup GPU

In [None]:
# Use GPU if available
device_name = "GPU" if tf.config.experimental.list_physical_devices('GPU') else "cpu"
print("Using device:", device_name)
# Check type of GPU available on machine
physical_devices = tf.config.list_physical_devices()
print("Available devices:", physical_devices

## Transformation of the dataframe

In [52]:
data_yf = pd.read_csv('data/all_stock_data.csv')
data_yf.shape

In [56]:
#create a list of dataframes
symbols = list(data_yf['Ticker'].value_counts().index)

df_list = []

# Loop over each unique ticker
for ticker in range(len(symbols)):
    df = data_yf[data_yf['Ticker'] == symbols[ticker]]
    
    # Rename all columns in df to "original_column_nameTicker"
    df.columns = [str(col) + str(symbols[ticker]) for col in df.columns]
    
    df = df.reset_index().drop(columns=['index'])#[0:990]
    
    # Append modified df to the list
    df_list.append(df)

In [57]:
data = pd.concat(df_list, axis=1)
data.set_index('DateFITB', inplace=True) #set Date as index
data.index = pd.to_datetime(data.index) #set index to datetime

In [58]:
date_values = [col for col in data.columns if 'Date' in col] #drop other 'Date' values
data.drop(columns=date_values, inplace=True)

In [59]:
data.shape

(228, 19850)

## Predict then Optimize -- Shared Learning

In [60]:
#keep only float columns for model
float_columns = data.select_dtypes(include=['float64']).columns
data = data[float_columns] #only floats
data.dropna(inplace=True) #drop Nas
data.shape

In [63]:
#save data to use elsewhere
data.to_csv('data/all_stock_data_transformed_horizontally.csv')

In [64]:
import tensorflow as tf
from scipy.optimize import minimize

## Helper Functions

In [67]:
# Prepare features and targets for model training
def prepare_features_targets(data_2):
#     indicators = calculate_technical_indicators(data)
#     returns = np.log(data / data.shift(1)).add_suffix('_returns')
#     full_data = pd.concat([indicators, returns], axis=1).dropna()
    features = data_2[[col for col in data_2.columns if 'gain_loss_' not in col]]
    targets = data_2[[col for col in data_2.columns if 'gain_loss_' in col]]
    return features, targets

# Build and compile a neural network
def build_model(input_shape, output_shape):
    with tf.device(device_name):
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='relu', kernel_initializer='he_normal',input_shape=(input_shape,)),
            tf.keras.layers.Dense(32, activation='relu',kernel_initializer='he_normal'),
            tf.keras.layers.Dense(output_shape)
        ])
    model.compile(optimizer='adam', loss='mse')
    return model

# Portfolio optimization function with variance consideration and risk aversion
def optimize_portfolio(weights, model, features, historical_returns, risk_aversion=0.8):
    weights = weights / np.sum(weights)  # Normalize weights
    pred_returns = model.predict(np.array([features.iloc[-1]]))[0]
    expected_return = np.dot(weights, pred_returns)
    covariance_matrix = historical_returns.cov()
    portfolio_variance = np.dot(weights.T, np.dot(covariance_matrix, weights))
    # Objective function with risk aversion parameter
    return -expected_return + risk_aversion * portfolio_variance  # Maximize returns and penalize variance


## Run Models

In [69]:
# Main script
#features, targets = prepare_features_targets(data)
with tf.device(device_name):
    print(device_name)
    # Your model code here
    split_date = pd.Timestamp('2023-01-01')
    # Prepare your features and targets first
    features, targets = prepare_features_targets(data)
    # Split the data based on the threshold date
    train_features = features[features.index < split_date]
    test_features = features[features.index >= split_date]
    train_targets = targets[targets.index < split_date]
    test_targets = targets[targets.index >= split_date]
    model = build_model(train_features.shape[1], train_targets.shape[1])
    model.fit(train_features, train_targets, epochs=5, batch_size=5)

constraints = {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1}
initial_weights = np.ones(len(symbols)) / len(symbols) # Equal weights
bounds = tuple((0, 1) for _ in symbols)


with tf.device(device_name):
    # Optimizing the portfolio on training data
    train_optimized_result = minimize(optimize_portfolio, initial_weights, args=(model, train_features, train_targets, 1), bounds=bounds, constraints=constraints)
    optimized_weights_train = train_optimized_result.x

    # Optimizing the portfolio on testing data
    test_optimized_result = minimize(optimize_portfolio, initial_weights, args=(model, test_features, test_targets, 1), bounds=bounds, constraints=constraints)
    optimized_weights_test = test_optimized_result.x

    # Calculating ROI
    train_predicted_returns = model.predict(train_features) 
    train_roi = np.dot(optimized_weights_train, train_predicted_returns.mean(axis=0)) * 10000  # Assuming $10,000 initial investment

    test_predicted_returns = model.predict(test_features)
    test_roi = np.dot(optimized_weights_test, test_predicted_returns.mean(axis=0)) * 10000  # Assuming $10,000 initial investment


GPU
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - loss: 155.3329
Epoch 2/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.1649
Epoch 3/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0076
Epoch 4/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.0080
Epoch 5/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.0077
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[

In [72]:
print("Optimized Portfolio Weights (Train):", optimized_weights_train)
print("Training ROI:", train_roi)
print("Optimized Portfolio Weights (Test):", optimized_weights_test)
print("Testing ROI:", test_roi)

Optimized Portfolio Weights (Train): [2.62299324e-16 0.00000000e+00 1.87785592e-01 9.61045020e-17
 0.00000000e+00 0.00000000e+00 0.00000000e+00 8.44254635e-17
 1.00668512e-16 0.00000000e+00 1.28437071e-17 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 2.49255079e-16
 0.00000000e+00 3.55537786e-16 9.16702651e-17 3.83571960e-17
 2.95143250e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.37799245e-01
 0.00000000e+00 5.74011979e-17 0.00000000e+00 3.97599613e-17
 2.22077065e-01 7.28280253e-18 0.00000000e+00 0.00000000e+00
 5.33959110e-17 8.44073498e-17 0.00000000e+00 5.64162356e-17
 0.00000000e+00 1.96395406e-17 0.00000000e+00 2.91343179e-17
 1.58720547e-16 0.00000000e+00 1.68975157e-17 0.00000000e+00
 0.00000000e+00 6.36692103e-17 4.99184440e-16 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 3.47697309e-17 0.00000000e+00
 0.00000000e+00 0.00000000e+00 5.08642966e-17 0.

In [73]:
tickers = [col.split('_')[0] for col in targets.columns]


def print_non_zero_weights(weights, title, tickers):
    # Using a threshold to consider weights effectively zero
    threshold = 1e-10
    non_zero_weights = {tickers[index]: weight for index, weight in enumerate(weights) if weight > threshold}
    print(f"{title} (non-zero weights):")
    for ticker, weight in non_zero_weights.items():
        print(f"{ticker}: Weight {weight}")

# Assuming 'symbols' is a list of ticker symbols corresponding to the indices of the weights! 
# Need to confirm this... otherwise the assumed tickers below will be wrong
print_non_zero_weights(optimized_weights_train, "Optimized Portfolio Weights (Train)", symbols)
print("Training ROI:", train_roi)
print_non_zero_weights(optimized_weights_test, "Optimized Portfolio Weights (Test)", symbols)
print("Testing ROI:", test_roi)



Optimized Portfolio Weights (Train) (non-zero weights):
PPL: Weight 0.18778559220162006
GPC: Weight 0.29514324999032787
CMS: Weight 0.1377992454786503
COO: Weight 0.22207706541903938
AKAM: Weight 0.0028229933253976305
JPM: Weight 0.1543718535849644
Training ROI: 249.02216741025518
Optimized Portfolio Weights (Test) (non-zero weights):
GRMN: Weight 0.048755759073509874
GPC: Weight 0.17226912316186876
COO: Weight 0.1323116561722748
BKR: Weight 0.19271433712946415
AKAM: Weight 0.1878874790356767
RTX: Weight 0.10158398937715363
JPM: Weight 0.16447765605004797
Testing ROI: 248.4176863229602


Non Shared Learning


In [None]:
#Need a train loader and a test loader
#Need to convert to optDataset, then wrap with DataLoader

Create a LSTM-based model for stock prediction tasks

In [None]:
import torch.nn.functional as F
from torch import nn
import torch 
class StockLSTM(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super(StockLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        all_h, (h, c) = self.lstm(x)
        out = self.fc(all_h) # Apply Linear layer to outputs from all the hidden state.
        return out

In [None]:
import gurobipy as gp
from gurobipy import GRB
import numpy as np
import pyepo
from pyepo.model.grb import optGrbModel
import torch
from torch import nn
from torch.utils.data import DataLoader

m = 50 # change based on number of assets
cov = np.cov(np.random.randn(10, m), rowvar=False) # covariance matrix
optmodel = pyepo.model.grb.portfolioModel(m, cov) # build model

Auto-Sklearn cannot be imported.
Restricted license - for non-production use only - expires 2025-11-24


In [None]:
import time

# train model
def trainModel(reg, loss_func, method_name, num_epochs=20, lr=1e-2):
    # set adam optimizer
    optimizer = torch.optim.Adam(reg.parameters(), lr=lr)
    # train mode
    reg.train()
    # init log
    train_loss_log = []
    loss_log_regret = [pyepo.metric.regret(reg, optmodel, loader_test)]
    # init elpased time
    elapsed = 0
    for epoch in range(num_epochs):
        # start timing
        tick = time.time()
        # load data
        train_loss = 0
        for i, data in enumerate(loader_train):
            x, c, w, z = data
            # cuda
            if torch.cuda.is_available():
                x, c, w, z = x.cuda(), c.cuda(), w.cuda(), z.cuda()
            # forward pass
            cp = reg(x)
            if method_name == "spo+":
                loss = loss_func(cp, c, w, z)
            elif method_name == "mse":
                loss = loss_func(cp, c)
            else:
                raise ValueError("Method name {} not supported".format(method_name))
            # backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # record time
            tock = time.time()
            elapsed += tock - tick
            train_loss += loss.item()
        train_loss /= len(loader_train)
        train_loss_log.append(train_loss)
        regret = pyepo.metric.regret(reg, optmodel, loader_test)
        loss_log_regret.append(regret)
        print("Epoch {:2},  Loss: {:9.4f},  Regret: {:7.4f}%".format(epoch+1, train_loss, regret*100))
    print("Total Elapsed Time: {:.2f} Sec.".format(elapsed))
    return train_loss_log, loss_log_regret

Create a Predict-then-Optimize Model

In [None]:

spop = pyepo.func.SPOPlus(optmodel, processes=1)

In [None]:
# Hyperparameters
VOCAB_SIZE = #fill in once data is imported
EMBEDDING_DIM = #param we can optimize
HIDDEN_DIM = #also a param to optimize
learning_rate = #another optimizable param
epoch = #optimize
num_layers = #optimize
epochs = 20
learning_rate = 2e-3
method_name = "spo+"

# Instantiate the model
lstm = StockLSTM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM,num_layers=num_layers)
loss_log_lstm_spo, loss_log_regret_lstm_spo = trainModel(lstm, loss_func=spop, method_name=method_name)
