### **Imports**

In [1]:
import numpy as np
import pandas as pd

In [None]:
import torch

### **Reading the dataset**

In [5]:
# Reading data, renaming columns and reordering
data = pd.read_csv('../data/data_files/w20_stock.csv').filter(regex='^Close_')
data.columns = [x.split('_')[1].split('.')[0] for x in data.columns]
data['Date'] = pd.read_csv('../data/data_files/w20_stock.csv')['Date']
data.insert(0, 'Date', data.pop('Date'))

# Date as index
data = data.set_index('Date')

# Getting the first row with no NaN values
first_full_row = data.dropna().first_valid_index()
print(f'First row with no NaN values is from: ({first_full_row})')

# Removing rows with NaN values
data = data.loc[first_full_row:]
print(f'Number of rows with NaN values removed: {first_full_row}')
print(f'Number of rows with no NaN values: {data.shape[0]}')
data.head()

First row with no NaN values is from: (2021-05-27)
Number of rows with NaN values removed: 2021-05-27
Number of rows with no NaN values: 792


Unnamed: 0_level_0,ALE,ALR,BDX,CDR,CPS,DNP,JSW,KGH,KRU,KTY,LPP,MBK,OPL,PCO,PEO,PGE,PKN,PKO,PZU,SPL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2021-05-27,58.279999,32.900002,313.5,175.5,30.700001,283.899994,36.299999,200.0,247.600006,643.0,11040.0,297.799988,6.695,44.0,93.760002,10.025,78.279999,39.290001,36.169998,265.700012
2021-05-28,58.5,33.990002,306.0,178.580002,30.700001,285.0,36.310001,202.899994,258.0,599.0,11470.0,300.0,6.71,44.645,95.5,10.165,79.940002,39.799999,36.900002,271.299988
2021-05-31,57.66,33.740002,298.5,168.740005,30.5,284.600006,36.490002,202.0,258.0,624.0,11200.0,302.399994,6.65,44.0,95.959999,9.986,81.800003,40.669998,37.060001,267.0
2021-06-01,59.209999,34.48,299.0,153.380005,30.76,283.799988,34.799999,208.800003,262.0,610.0,11320.0,307.0,6.68,44.0,96.699997,9.936,81.68,39.950001,37.290001,269.0
2021-06-02,59.18,34.41,295.0,161.580002,31.620001,286.100006,34.849998,209.5,278.600006,592.0,11360.0,300.0,6.7,43.735001,97.260002,10.24,81.18,40.09,37.900002,271.299988


### **Prepare data for the model**

**Calculating percentage changes**

In [6]:
# Calculate daily returns
data_returns = data.pct_change().dropna()

# Add new row equal to 0 - don't invest in anything
data_returns['SAVE'] = 0

print(f'Data shape: {data_returns.shape}')
data_returns.head()

Data shape: (791, 21)


Unnamed: 0_level_0,ALE,ALR,BDX,CDR,CPS,DNP,JSW,KGH,KRU,KTY,...,MBK,OPL,PCO,PEO,PGE,PKN,PKO,PZU,SPL,SAVE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-05-28,0.003775,0.033131,-0.023923,0.01755,0.0,0.003875,0.000276,0.0145,0.042003,-0.068429,...,0.007388,0.00224,0.014659,0.018558,0.013965,0.021206,0.01298,0.020183,0.021076,0
2021-05-31,-0.014359,-0.007355,-0.02451,-0.055101,-0.006515,-0.001403,0.004957,-0.004436,0.0,0.041736,...,0.008,-0.008942,-0.014447,0.004817,-0.017609,0.023267,0.021859,0.004336,-0.01585,0
2021-06-01,0.026882,0.021932,0.001675,-0.091028,0.008525,-0.002811,-0.046314,0.033663,0.015504,-0.022436,...,0.015212,0.004511,0.0,0.007712,-0.005007,-0.001467,-0.017703,0.006206,0.007491,0
2021-06-02,-0.000507,-0.00203,-0.013378,0.053462,0.027958,0.008104,0.001437,0.003352,0.063359,-0.029508,...,-0.022801,0.002994,-0.006023,0.005791,0.030596,-0.006121,0.003504,0.016358,0.00855,0
2021-06-04,-0.006928,0.009881,0.011864,0.059537,-0.028463,0.001748,0.021521,-0.024821,-0.003589,-0.016892,...,0.005333,0.018657,0.002515,0.001439,0.042481,0.004435,0.007733,0.008179,0.008846,0


**Convert to target shape**

In [7]:
# Parameters
seq_len = 30
batch_size = len(data_returns) - seq_len
input_size = len(data_returns.columns)

# Convert DataFrame to NumPy for easier slicing
data_returns_np = data_returns.values

# Create sequences
X = []
Y = []
for i in range(batch_size):
    X.append(data_returns_np[i:i+seq_len])

    # Get the index of the highest return for the next day
    next_day_returns = data_returns_np[i+seq_len]
    max_return_index = np.argmax(next_day_returns)
    y = np.zeros(input_size)
    y[max_return_index] = 1
    Y.append(y)

# Convert to NumPy array for PyTorch
X = np.array(X)

# Reshape X to (batch_size, seq_len, input_size)
X = X.reshape(batch_size, seq_len, input_size)
X.shape

(761, 30, 21)

In [8]:
X[0][-1]

array([-0.00375935,  0.00876136,  0.00931099,  0.00277365, -0.02049689,
       -0.02763899,  0.02240894,  0.01623377,  0.06676442,  0.        ,
        0.03571429,  0.03201024, -0.00826452, -0.00403844,  0.0127333 ,
       -0.00066323,  0.01608506,  0.01297998,  0.00678979,  0.03136455,
        0.        ])

In [9]:
Y[0]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.])