In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
# Assuming `features` and `target` are your inputs and outputs prepared as PyTorch tensors

class LabelLSTM(nn.Module):
    """Sequence classifier: one LSTM layer followed by a linear read-out.

    The hidden state produced after the last time step is projected to
    ``num_classes`` unnormalised scores (logits).

    Args:
        input_size: Number of features at each time step.
        hidden_size: Width of the LSTM hidden state.
        num_classes: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return class logits computed from the LSTM's final hidden state."""
        _outputs, (final_hidden, _final_cell) = self.lstm(x)
        # The LSTM is built with the default single, unidirectional layer, so
        # the leading axis of the final hidden state has size 1; drop it.
        logits = self.fc(final_hidden.squeeze(0))
        return logits

# Pick the dataset folder, derive the CSV path from it, and load the file.
data_type = "S&P"
csv_location = f"data/{data_type}/data.csv"
df = pd.read_csv(filepath_or_buffer=csv_location)

In [4]:
# Hyperparameters for the LSTM label model.

# Model shape
input_size = 1        # features per time step
hidden_size = 128     # width of the LSTM hidden state
num_classes = 5       # number of label categories (assumed to be 5)

# Optimisation
batch_size = 64
learning_rate = 1e-3
num_epochs = 10


In [6]:
# Encode the five price-movement categories as class indices 0..4.
price_change_map = {
    "MAJOR_INCREASE": 0,
    "MINOR_INCREASE": 1,
    "NO_CHANGE": 2,
    "MINOR_DECREASE": 3,
    "MAJOR_DECREASE": 4,
}
# Series.map yields NaN for any label that is not a key of the mapping.
df['label_mapped'] = df['label'].map(price_change_map)

# Tabular view of the neighbouring labels. shift(-k) moves values up by k
# rows, so row i receives the label of row i + k; the trailing rows that have
# no such neighbour are forward-filled.
# `.ffill()` replaces `fillna(method='ffill')`, which pandas deprecated in 2.1.
# NOTE: `features` and `target` are reassigned by the prepare_sequences(df)
# call later in this cell, so these values are not what the LSTM trains on.
features = df['label_mapped'].shift(-1).ffill().to_frame()
features = features.rename(columns={'label_mapped': 'shift_1'})
features['shift_2'] = df['label_mapped'].shift(-2).ffill()
features['shift_3'] = df['label_mapped'].shift(-3).ffill()
target = df['label_mapped']


def prepare_sequences(df, window=3):
    """Build sliding-window training pairs from ``df['label_mapped']``.

    Each sample is ``window`` consecutive values of the column (taken in row
    order) and its target is the value on the row immediately after them.

    Args:
        df: DataFrame with a numeric ``label_mapped`` column. Rows are used by
            position, so any index (not only the default 0..n-1) is accepted.
        window: Number of consecutive rows per input sequence. Defaults to 3.

    Returns:
        A ``(sequences, targets)`` tuple where ``sequences`` is a float tensor
        of shape ``(n_samples, window, 1)`` and ``targets`` is a long tensor of
        shape ``(n_samples,)``, with ``n_samples = max(len(df) - window, 0)``.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be a positive integer, got {window}")

    # Use the raw values so every lookup is positional; indexing the Series
    # with [i + window] is label-based and fails on a non-default index.
    values = df['label_mapped'].to_numpy()
    n_samples = len(values) - window
    if n_samples <= 0:
        # Too few rows for a single (sequence, target) pair: return empty
        # tensors that still have the expected rank and dtypes.
        return (
            torch.empty((0, window, 1), dtype=torch.float),
            torch.empty((0,), dtype=torch.long),
        )

    # torch.tensor copies its input, so the results never alias the DataFrame.
    series = torch.tensor(values, dtype=torch.float)
    # unfold produces every length-`window` slice; the last one has no
    # following row to predict, so only the first n_samples are kept.
    sequences = series.unfold(0, window, 1)[:n_samples].unsqueeze(-1).clone()
    targets = torch.tensor(values[window:], dtype=torch.long)
    return sequences, targets

# Convert the label column into (input window, next label) tensors.
features, target = prepare_sequences(df)

# Pair the two tensors sample-by-sample and serve them as shuffled mini-batches.
train_dataset = TensorDataset(features, target)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


In [7]:


# Mini-batch source: paired (sequence, label) tensors, shuffled by the loader.
train_dataset = TensorDataset(features, target)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Network, loss and optimiser.
model = LabelLSTM(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one optimiser step per mini-batch, num_epochs passes in total.
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Clear gradients from the previous step before computing new ones.
        optimizer.zero_grad()

        # Forward pass: logits for the batch, then cross-entropy against labels.
        scores = model(data)
        loss = criterion(scores, targets)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()


In [5]:

# Train a softmax regression using only the last 3 predictions, nothing else.

# Train a random baseline based just on the distribution of values in the training data.

# Could consider other options, like using the raw data... training something more complex?

# Or could try with labels... also more complex.

# Maybe also train with an LSTM...

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



import pandas as pd

# Reload the raw CSV so this cell always starts from the unmapped labels.
csv_location = "data/S&P/data.csv"

df = pd.read_csv(csv_location)
# Signed ordinal encoding: the sign is the direction of the price move and the
# magnitude is its size.
price_change_map = {
    "MAJOR_INCREASE": 2,
    "MINOR_INCREASE": 1,
    "NO_CHANGE": 0,
    "MINOR_DECREASE": -1,
    "MAJOR_DECREASE": -2
}

# Overwrites the string labels in place; labels that are not keys of the
# mapping become NaN.
df['label'] = df['label'].map(price_change_map)

# Three feature columns built from neighbouring labels. shift(-k) moves values
# up by k rows, so row i receives the label of row i + k; the trailing rows
# that have no such neighbour are forward-filled.
# `.ffill()` replaces `fillna(method='ffill')`, which pandas deprecated in 2.1.
# NOTE(review): because the shifts are negative, each target is described by
# the labels of the rows *after* it. If rows are in chronological order this is
# look-ahead rather than "the last 3" values -- confirm the intended direction.
features = df['label'].shift(-1).ffill().to_frame()  # Mocking sequential feature
features['shift_1'] = df['label'].shift(-2).ffill()
features['shift_2'] = df['label'].shift(-3).ffill()
target = df['label']
features

Unnamed: 0,label,shift_1,shift_2
0,1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0
3,-1.0,-1.0,-1.0
4,-1.0,-1.0,-1.0
...,...,...,...
2402,0.0,1.0,-1.0
2403,1.0,-1.0,-2.0
2404,-1.0,-2.0,-2.0
2405,-2.0,-2.0,-2.0


In [3]:
# Show the loaded DataFrame as this cell's output.
# NOTE(review): this cell is numbered In [3] but sits after higher-numbered
# cells, so the stored output may reflect an earlier kernel state -- confirm
# with Restart Kernel & Run All.
df

Unnamed: 0,sentence,label,price_change
0,"At the Federal Reserve, we are strongly commit...",2,-0.006
1,Today my colleagues on the FOMC and I kept int...,3,0.057
2,"These measures, along with our strong guidance...",2,-0.206
3,"Since the beginning of the year, indicators of...",2,-0.155
4,"As with overall economic activity, conditions ...",2,-0.694
...,...,...,...
2402,"I’d also say that, you know, a strong U.S. eco...",3,0.300
2403,"Now, we do understand, though, that when we—wh...",4,0.000
2404,"It’s a relatively small number, and those part...",3,0.201
2405,"So, again, we’ll continue to conduct U.S. mone...",2,-0.499


In [7]:
# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=42,
)

In [8]:
# Softmax (multinomial) logistic regression on the shifted-label features.
# `multi_class` is intentionally not passed: with the lbfgs solver and more
# than two classes scikit-learn already fits a multinomial model, and the
# argument is deprecated since scikit-learn 1.5.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Evaluating the model
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those entries as 0.0 (the same numbers as before)
# without emitting an UndefinedMetricWarning for each averaged metric.
class_report = classification_report(y_val, y_pred, zero_division=0)

print(accuracy)
print(conf_matrix)
print(class_report)


0.37344398340248963
[[ 0 14  0  7  0]
 [ 0 61  0 30  0]
 [ 0 14  0  9  0]
 [ 0 54  0 29  0]
 [ 0 17  0  6  0]]
              precision    recall  f1-score   support

          -2       0.00      0.00      0.00        21
          -1       0.38      0.67      0.49        91
           0       0.00      0.00      0.00        23
           1       0.36      0.35      0.35        83
           2       0.00      0.00      0.00        23

    accuracy                           0.37       241
   macro avg       0.15      0.20      0.17       241
weighted avg       0.27      0.37      0.31       241



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
