### **Imports**

In [116]:
import numpy as np
import pandas as pd

In [117]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [118]:
import importlib
import utils
importlib.reload(utils)

<module 'utils' from '/Users/pawelgrzeszczyk/Documents/02_studies/master/portfolio-optimization-dl/main/modeling/utils.py'>

### **Reading the dataset**

In [119]:
# Reading data, renaming columns and reordering
data = pd.read_csv('../data/data_files/w20_stock.csv').filter(regex='^Close_')
data.columns = [x.split('_')[1].split('.')[0] for x in data.columns]
data['Date'] = pd.read_csv('../data/data_files/w20_stock.csv')['Date']
data.insert(0, 'Date', data.pop('Date'))

# Date as index
data = data.set_index('Date')

# Getting the first row with no NaN values
first_full_row = data.dropna().first_valid_index()
print(f'First row with no NaN values is from: ({first_full_row})')

# Removing rows with NaN values
data = data.loc[first_full_row:]
print(f'Number of rows with NaN values removed: {first_full_row}')
print(f'Number of rows with no NaN values: {data.shape[0]}')
data.head()

First row with no NaN values is from: (2021-05-27)
Number of rows with NaN values removed: 2021-05-27
Number of rows with no NaN values: 792


Unnamed: 0_level_0,ALE,ALR,BDX,CDR,CPS,DNP,JSW,KGH,KRU,KTY,LPP,MBK,OPL,PCO,PEO,PGE,PKN,PKO,PZU,SPL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2021-05-27,58.279999,32.900002,313.5,175.5,30.700001,283.899994,36.299999,200.0,247.600006,643.0,11040.0,297.799988,6.695,44.0,93.760002,10.025,78.279999,39.290001,36.169998,265.700012
2021-05-28,58.5,33.990002,306.0,178.580002,30.700001,285.0,36.310001,202.899994,258.0,599.0,11470.0,300.0,6.71,44.645,95.5,10.165,79.940002,39.799999,36.900002,271.299988
2021-05-31,57.66,33.740002,298.5,168.740005,30.5,284.600006,36.490002,202.0,258.0,624.0,11200.0,302.399994,6.65,44.0,95.959999,9.986,81.800003,40.669998,37.060001,267.0
2021-06-01,59.209999,34.48,299.0,153.380005,30.76,283.799988,34.799999,208.800003,262.0,610.0,11320.0,307.0,6.68,44.0,96.699997,9.936,81.68,39.950001,37.290001,269.0
2021-06-02,59.18,34.41,295.0,161.580002,31.620001,286.100006,34.849998,209.5,278.600006,592.0,11360.0,300.0,6.7,43.735001,97.260002,10.24,81.18,40.09,37.900002,271.299988


### **Prepare data for the model**

**Calculating percentage changes**

In [120]:
# Calculate daily returns
data_returns = data.pct_change().dropna()

# Add new row equal to 0 - don't invest in anything
data_returns['SAVE'] = 0

print(f'Data shape: {data_returns.shape}')
data_returns.head()

Data shape: (791, 21)


Unnamed: 0_level_0,ALE,ALR,BDX,CDR,CPS,DNP,JSW,KGH,KRU,KTY,...,MBK,OPL,PCO,PEO,PGE,PKN,PKO,PZU,SPL,SAVE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-05-28,0.003775,0.033131,-0.023923,0.01755,0.0,0.003875,0.000276,0.0145,0.042003,-0.068429,...,0.007388,0.00224,0.014659,0.018558,0.013965,0.021206,0.01298,0.020183,0.021076,0
2021-05-31,-0.014359,-0.007355,-0.02451,-0.055101,-0.006515,-0.001403,0.004957,-0.004436,0.0,0.041736,...,0.008,-0.008942,-0.014447,0.004817,-0.017609,0.023267,0.021859,0.004336,-0.01585,0
2021-06-01,0.026882,0.021932,0.001675,-0.091028,0.008525,-0.002811,-0.046314,0.033663,0.015504,-0.022436,...,0.015212,0.004511,0.0,0.007712,-0.005007,-0.001467,-0.017703,0.006206,0.007491,0
2021-06-02,-0.000507,-0.00203,-0.013378,0.053462,0.027958,0.008104,0.001437,0.003352,0.063359,-0.029508,...,-0.022801,0.002994,-0.006023,0.005791,0.030596,-0.006121,0.003504,0.016358,0.00855,0
2021-06-04,-0.006928,0.009881,0.011864,0.059537,-0.028463,0.001748,0.021521,-0.024821,-0.003589,-0.016892,...,0.005333,0.018657,0.002515,0.001439,0.042481,0.004435,0.007733,0.008179,0.008846,0


**Convert to target shape**

In [121]:
# Parameters
seq_len = 30
batch_size = len(data_returns) - seq_len
input_size = len(data_returns.columns)

# Convert DataFrame to NumPy for easier slicing
data_returns_np = data_returns.values

# Create sequences
X = []
Y = []
for i in range(batch_size):
    X.append(data_returns_np[i:i+seq_len])

    # Get the index of the highest return for the next day
    next_day_returns = data_returns_np[i+seq_len]
    Y.append(next_day_returns)

# Convert lists to NumPy arrays
X = np.array(X)  # Shape: (batch_size, seq_len, input_size)
Y = np.array(Y)  # Shape: (batch_size, input_size)

# Convert to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)  # Shape: (batch_size, seq_len, input_size)
Y = torch.tensor(Y, dtype=torch.float32)  # Shape: (batch_size, input_size)

In [122]:
print(f'X shape: \n\t{X.shape}')
print(f'X sample: \n\t{X[0][:2]}')

X shape: 
	torch.Size([761, 30, 21])
X sample: 
	tensor([[ 0.0038,  0.0331, -0.0239,  0.0175,  0.0000,  0.0039,  0.0003,  0.0145,
          0.0420, -0.0684,  0.0389,  0.0074,  0.0022,  0.0147,  0.0186,  0.0140,
          0.0212,  0.0130,  0.0202,  0.0211,  0.0000],
        [-0.0144, -0.0074, -0.0245, -0.0551, -0.0065, -0.0014,  0.0050, -0.0044,
          0.0000,  0.0417, -0.0235,  0.0080, -0.0089, -0.0144,  0.0048, -0.0176,
          0.0233,  0.0219,  0.0043, -0.0158,  0.0000]])


In [123]:
print(f'Y shape: \n\t{Y.shape}')
print(f'Y sample: \n\t{Y[0]}')

Y shape: 
	torch.Size([761, 21])
Y sample: 
	tensor([ 0.0299, -0.0476, -0.0037,  0.0737,  0.0127,  0.0284, -0.0304, -0.0122,
         0.0303,  0.0043, -0.0095, -0.0571,  0.0236,  0.0085, -0.0178, -0.0111,
         0.0129, -0.0214, -0.0086, -0.0407,  0.0000])


In [124]:
Y_max_id = Y.argmax(axis=1)
Y_max = torch.zeros_like(Y)
Y_max[torch.arange(Y_max_id.shape[0]), Y_max_id] = 1

print(f'Y_max shape: \n\t{Y_max.shape}')
print(f'Y_max sample: \n\t{Y_max[0]}')

Y_max shape: 
	torch.Size([761, 21])
Y_max sample: 
	tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.])


### **Model**

In [125]:
class LSTMModel(nn.Module): 
    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_rate=0.5): 
        super(LSTMModel, self).__init__() 
        self.input_size = input_size #input size 
        self.hidden_size = hidden_size #hidden state 
        self.output_size = output_size #number of classes 
        self.num_layers = num_layers #number of layers 

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout_rate) #lstm 
        self.fc =  nn.Linear(hidden_size, hidden_size) #fully connected 1 
        self.fc_out = nn.Linear(hidden_size, output_size) #fully connected 2 
        self.relu = nn.ReLU() 
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x): 
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #hidden state 
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #internal state 

        # LSTM layer 
        lstm_out, (hn, cn) = self.lstm(x, (h_0, c_0)) #lstm with input, hidden, and internal state 
        x = lstm_out[:, -1, :]  # (batch_size, hidden_size)
        x = self.relu(x) 
        x = self.fc(x) 
        x = self.relu(x) 
        x = self.fc_out(x) 

        # Pass the output through the softmax function (to get sum equal to 1) 
        out = self.softmax(x)
        
        return out

### **Training**

In [126]:
input_size = X.shape[2]
hidden_size = 128
output_size = Y.shape[1]

# Create the model
model = LSTMModel(input_size=input_size,
                  hidden_size=hidden_size,
                  output_size=output_size,
                  num_layers=2)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1, weight_decay=1e-4)

# Training
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = model(X)

    # Calculate loss
    loss = criterion(outputs, Y_max)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    if (epoch+1) % 1 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 0.0454
Epoch [2/10], Loss: 0.0449
Epoch [3/10], Loss: 0.0486
Epoch [4/10], Loss: 0.0450
Epoch [5/10], Loss: 0.0451
Epoch [6/10], Loss: 0.0450
Epoch [7/10], Loss: 0.0448
Epoch [8/10], Loss: 0.0454
Epoch [9/10], Loss: 0.0450
Epoch [10/10], Loss: 0.0450


In [128]:
with torch.no_grad():
    output_weights = model(X)

In [129]:
output_weights.shape

torch.Size([761, 21])

In [130]:
torch.sum(Y_max, dim=0)

tensor([49., 50., 64., 53., 35., 38., 88., 47., 32., 29., 49., 42., 20., 34.,
        15., 48., 18., 12.,  9., 19., 10.])

In [131]:
output_weights[0]

tensor([0.0658, 0.0681, 0.0882, 0.0696, 0.0506, 0.0503, 0.0367, 0.0648, 0.0434,
        0.0417, 0.0664, 0.0595, 0.0313, 0.0484, 0.0246, 0.0669, 0.0275, 0.0240,
        0.0203, 0.0290, 0.0230])

In [132]:
torch.max(output_weights[0] * 100)

tensor(8.8172)

In [133]:
output_weights[65]

tensor([0.0659, 0.0683, 0.0887, 0.0699, 0.0505, 0.0501, 0.0369, 0.0649, 0.0432,
        0.0414, 0.0665, 0.0597, 0.0312, 0.0484, 0.0244, 0.0671, 0.0274, 0.0238,
        0.0201, 0.0289, 0.0228])