In [9]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import pywt
import numpy as np
from scipy.signal import find_peaks
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas_ta as ta
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from plotly.subplots import make_subplots
from features import * 
from plots import * 

# Read the Ethereum CSV with the parsed 'Date' column as index, keep only the
# OHLCV columns, and lower-case their names.
df = (
    pd.read_csv(r"data/coin_Ethereum.csv", parse_dates=['Date'], index_col=['Date'])
    .loc[:, ["Open", "High", "Low", "Close", "Volume"]]
    .rename(columns=str.lower)
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report


def preprocess_lstm(df, label_column, split_date, sequence_length):
    """Encode labels, split by date, scale features and build LSTM windows.

    The input frame is copied first, so the caller's DataFrame keeps its
    original (un-encoded) label column and the function can be re-run on the
    same frame without the encoder being fitted on already-encoded integers.

    Args:
        df: DataFrame whose index can be compared with ``split_date``. Every
            column other than ``label_column`` is used as a feature.
        label_column: Name of the column holding the class labels.
        split_date: Rows with ``index < split_date`` form the training set;
            the remaining rows form the test set.
        sequence_length: Number of consecutive rows in each input window.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, le, test_df)``. The X arrays
        hold windows of scaled features, the y arrays hold the encoded label of
        the row right after each window, ``le`` is the fitted ``LabelEncoder``
        and ``test_df`` is the label-encoded test slice before windowing.

    Raises:
        ValueError: If the train or test split produces no sequences.
    """
    # Work on a copy: encoding in place would overwrite the caller's labels.
    df = df.copy()

    le = LabelEncoder()
    df[label_column] = le.fit_transform(df[label_column])
    train_df = df[df.index < split_date]
    test_df = df[df.index >= split_date]

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_df.drop(label_column, axis=1))
    X_test = scaler.transform(test_df.drop(label_column, axis=1))
    y_train = train_df[label_column].values
    y_test = test_df[label_column].values

    X_train, y_train = create_sequences(X_train, y_train, sequence_length)
    X_test, y_test = create_sequences(X_test, y_test, sequence_length)

    print("After sequence creation:")
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")

    # A split with no more rows than sequence_length yields no windows at all.
    if X_train.shape[0] == 0 or X_test.shape[0] == 0:
        raise ValueError("Empty train or test dataset after sequence creation.")

    return X_train, X_test, y_train, y_test, le, test_df

def create_sequences(X, y, length):
    """Build overlapping windows from ``X`` and pair each with the next ``y``.

    Window ``k`` covers ``X[k:k + length]`` and its target is ``y[k + length]``,
    so ``len(X) - length`` windows are produced (none if that is not positive).

    Returns:
        Tuple ``(windows, targets)`` of numpy arrays.
    """
    window_starts = range(len(X) - length)
    windows = [X[start:start + length] for start in window_starts]
    targets = [y[start + length] for start in window_starts]
    return np.array(windows), np.array(targets)


class LSTMClassifier(nn.Module):
    """Single-layer LSTM followed by a linear classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return class logits computed from the LSTM's final hidden state."""
        _outputs, (final_hidden, _final_cell) = self.lstm(x)
        # The LSTM has one unidirectional layer, so the leading axis of the
        # final hidden state has size 1; drop it before the linear head.
        last_state = final_hidden.squeeze(0)
        return self.fc(last_state)

class Attention(nn.Module):
    """Softmax-weighted pooling over dim 1 of a 3-D input.

    A linear layer scores every position along dim 1; the scores are
    normalised with softmax and used to form a weighted sum of the positions.

    Args:
        hidden_size: The input's last axis must have ``2 * hidden_size``
            features.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attention = nn.Linear(in_features=2 * hidden_size, out_features=1)

    def forward(self, x):
        """Return the attention-weighted sum of ``x`` along dim 1."""
        raw_scores = self.attention(x)
        weights = F.softmax(raw_scores.squeeze(2), dim=1)
        weighted = x * weights.unsqueeze(2)
        return weighted.sum(dim=1)

class AdvancedLSTMClassifier(nn.Module):
    """Bidirectional stacked LSTM with layer norm, attention pooling and a linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden size per LSTM direction; downstream layers operate
            on ``2 * hidden_size`` features because the LSTM is bidirectional.
        num_classes: Number of output logits.
        num_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout probability used between stacked LSTM layers
            (only when ``num_layers > 1``) and before the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_classes, num_layers=2, dropout_rate=0.5):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero rate is combined with a single layer, so disable it there.
        inter_layer_dropout = dropout_rate if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers,
                            batch_first=True, dropout=inter_layer_dropout, bidirectional=True)
        self.layer_norm = nn.LayerNorm(hidden_size * 2)
        self.attention = Attention(hidden_size)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return class logits for a batch-first input sequence ``x``."""
        lstm_out, _ = self.lstm(x)
        lstm_out = self.layer_norm(lstm_out)
        # Pool the per-step outputs into one vector per sequence.
        context = self.attention(lstm_out)
        out = self.fc(self.dropout(context))
        return out



def train_lstm(df, label_column, split_date, sequence_length=30, hidden_size=50, epochs=100, batch_size=64, seed=None):
    """Train an AdvancedLSTMClassifier and report train/test classification metrics.

    Args:
        df: Feature frame including ``label_column``; passed to ``preprocess_lstm``.
        label_column: Name of the column holding the class labels.
        split_date: Boundary between the training and test rows.
        sequence_length: Number of rows per input window.
        hidden_size: Hidden size per LSTM direction.
        epochs: Number of full passes over the training windows.
        batch_size: Mini-batch size for training.
        seed: Optional integer passed to ``torch.manual_seed`` before any data
            or model is created, to make weight initialisation and batch
            shuffling repeatable. ``None`` (the default) leaves the RNG as is.

    Returns:
        Tuple ``(test_df, y_test, y_pred_test, y_pred_probabilities_test, classes)``
        where ``classes`` is ``le.classes_`` from the fitted label encoder.
    """
    if seed is not None:
        torch.manual_seed(seed)

    X_train, X_test, y_train, y_test, le, test_df = preprocess_lstm(df, label_column, split_date, sequence_length)

    train_data = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    # LSTMClassifier(X_train.shape[2], hidden_size, len(le.classes_)) is the simpler alternative.
    model = AdvancedLSTMClassifier(X_train.shape[2], hidden_size, len(le.classes_))
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()

    model.eval()
    # All evaluation runs without gradient tracking, including the pass over
    # the full training set, so no autograd graph is built for it.
    with torch.no_grad():
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        y_pred_train = model(X_train_tensor).argmax(dim=1).numpy()

        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
        logits_test = model(X_test_tensor)
        y_pred_probabilities_test = F.softmax(logits_test, dim=1).numpy()
        y_pred_test = torch.argmax(logits_test, dim=1).numpy()

    print("TRAIN:")
    print(classification_report(y_train, y_pred_train, target_names=le.classes_))

    print("TEST:")
    print(classification_report(y_test, y_pred_test, target_names=le.classes_))

    return test_df, y_test, y_pred_test, y_pred_probabilities_test, le.classes_



LABELING_COLUMN = 'market_mode'
SPLIT_DATE = '2021-01-01'
def train_lstm_models(df:pd.DataFrame, coef=1, pair_name=None, save_path=None):
    """Label the data, build features, train the LSTM and plot the test results.

    The input frame is copied first, so the caller's DataFrame is not given
    extra columns or had columns dropped; re-running the cell starts from the
    same input every time.

    Args:
        df: Price frame containing at least a ``close`` column.
        coef: Currently unused; kept for call compatibility.
        pair_name: Currently unused; kept for call compatibility.
        save_path: Currently unused; kept for call compatibility.

    Returns:
        The labelled feature frame that was handed to ``train_lstm``.
    """
    # Work on a copy so the column added below never reaches the caller's frame.
    df = df.copy()

    df['close_wavelet_smoothed'] = wavelet_smooth(df['close'], wavelet='db4', level=5)
    df = label_market_regimes(df, 'close_wavelet_smoothed', min_peak_distance=20, slope_std_multiplier=0.2)
    df = add_features(df)

    # Remove the helper columns so they are not used as model features.
    df = df.drop(['close_wavelet_smoothed', 'rolling_slope'], axis=1)
    # NOTE(review): axis=1 drops every COLUMN that contains a NaN (not rows) — confirm this is intended.
    df = df.dropna(axis=1)
    test_df, y_test, y_pred_test, y_pred_proba_test, classes_ = train_lstm(df, LABELING_COLUMN, pd.to_datetime(SPLIT_DATE))
    # Keep only the trailing rows that have a prediction (the first window has none).
    test_df = test_df.iloc[-len(y_test):]
    plot_results(test_df, y_test, y_pred_test, y_pred_proba_test, classes_)
    return df

#     print(df.tail())
#     V = train_lstm(df, LABELING_COLUMN, SPLIT_DATE)

#     #plot_results(hmm_train(df.copy(), 'market_mode', pd.to_datetime(SPLIT_DATE)))
#     print('=' * 70)

# train_lstm_models reads the label column and split date from the module-level
# LABELING_COLUMN / SPLIT_DATE constants; passing them positionally would bind
# them to the unused `coef` and `pair_name` parameters instead.
new_df = train_lstm_models(df)
new_df


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

After sequence creation:
X_train shape: (1943, 30, 12), X_test shape: (157, 30, 12)
