In [1]:
import lightning as L


In [2]:
import torch.nn as nn


from torch import Tensor
from torch.utils.data import get_worker_info


from typing import Dict, List, Tuple, TypeVar

# Generic placeholders used in the signatures of the small helpers below.
T, D = TypeVar('T'), TypeVar('D')

# Alias for a list with one tuple of tensors per entry.
Hidden = List[Tuple[Tensor, ...]]

def exists(var : T | None) -> bool:
    '''Tell whether `var` holds a value, i.e. is anything other than None.'''
    if var is None:
        return False
    return True

def default(var : T | None, val : D) -> T | D:
    '''Return `var` unless it is None, in which case fall back to `val`.'''
    if not exists(var):
        return val
    return var

def enlarge_as(src : Tensor, other : Tensor) -> Tensor:
    '''
        Append singleton dimensions **to the right** of `src` until it
        has as many dimensions as `other`, so the two tensors can be
        combined element-wise. NOTE that plain broadcasting works in
        the opposite direction (missing dimensions are added on the left).

        Args:
            src (Tensor): The tensor that receives the trailing axes.
            other (Tensor): The tensor whose number of dimensions is matched.

        Returns:
            Tensor: A contiguous tensor of shape `(*src.shape, 1, ..., 1)`.
                If `src` already has at least as many dimensions as
                `other`, its shape is left unchanged.
    '''
    # A non-positive difference produces an empty tuple, so no axis is added.
    missing = other.dim() - src.dim()

    # Indexing with `None` inserts a size-1 axis. Only torch is needed, so the
    # helper does not depend on a name that is imported in a later cell.
    return src[(..., *((None,) * missing))].contiguous()




class CausalConv1d(nn.Conv1d):
    '''1-D convolution whose output at position `t` only depends on
    inputs at positions `<= t`.

    The parent `nn.Conv1d` is built with `(kernel_size - 1) * dilation`
    padding (applied on both sides) and the trailing `padding` outputs
    are dropped in `forward`, which leaves only the left padding effective.

    NOTE(review): the trimming removes exactly `padding` output positions,
    which matches the input length for `stride=1`; behaviour for larger
    strides has not been verified.
    '''

    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            dilation=1,
            groups=1,
            bias=True
    ):
        padding = (kernel_size - 1) * dilation

        super().__init__(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)

        # Stored only after the parent constructor ran, so the attribute is
        # set on a fully initialised module.
        self._padding = padding

    def forward(self, inp : Tensor) -> Tensor:
        '''Apply the causal convolution.

        Args:
            inp (Tensor): Input of shape (batch, channels, length). A 2-D
                input is interpreted as (batch, length) and receives a
                singleton channel axis.

        Returns:
            Tensor: The convolved signal with the trailing (non-causal)
                positions removed.
        '''
        # Handle the case where input has only two dimensions by adding the
        # missing channel axis. `unsqueeze` is used instead of einops so the
        # class does not rely on an import made in a later cell.
        if inp.dim() == 2: inp = inp.unsqueeze(1)

        result = super().forward(inp)

        # With kernel_size == 1 there is no padding and `[:-0]` would be empty.
        if self._padding != 0: return result[..., :-self._padding]
        return result

class BlockLinear(nn.Module):
    def __init__(
            self,
            block_dims : List[int | List[int]],
            bias : bool = False,
    ):
        super(BlockLinear, self).__init__()

        self._blocks = nn.ParameterList([
            nn.Parameter(torch.randn(size, requires_grad=True))
            for size in block_dims
        ])

        self._bias = nn.Parameter(torch.zeros(sum(block_dims))) if bias else None

    def forward(self, inp : Tensor) -> Tensor:
        # Assemble the blocks into a block-diagonal matrix
        full = torch.block_diag(*self._blocks)

        out = torch.matmul(full, inp)

        if self._bias is not None:
            out = out + self._bias

        return out


In [3]:
import torch
import torch.nn as nn

from math import sqrt
from torch import exp
from torch import tanh
from torch import sigmoid
from einops import einsum, rearrange

from torch import Tensor
from typing import Tuple
from torch.nn.functional import silu
from torch.nn.functional import gelu

class sLSTM(nn.Module):
    '''The scalar-Long Short Term Memory (sLSTM) module as
    originally introduced in Beck et al. (2024), see:
    (https://arxiv.org/abs/2405.04517).

    This model is a variant of the standard LSTM model and
    offers two major improvements:
    - Exponential gating with appropriate state normalization
        to avoid overflows induced by the exponential function.
    - A new memory mixing within heads but not across heads.
    '''

    def __init__(
            self,
            inp_dim : int,
            head_dim : int,
            head_num : int,
            ker_size : int = 4,
            p_factor : float = 4/3,
    ) -> None:
        '''Build the sLSTM block.

        Args:
            inp_dim (int): Size of the input (and output) features.
            head_dim (int): Size of each head.
            head_num (int): Number of heads.
            ker_size (int, optional): Kernel size of the causal convolution.
                Defaults to 4.
            p_factor (float, optional): Expansion factor of the up-projection
                that follows the recurrent cell. Defaults to 4/3.
        '''
        super().__init__()

        self.inp_dim = inp_dim
        self.head_dim = head_dim
        self.head_num = head_num

        self.inp_norm = nn.LayerNorm(inp_dim)
        self.hid_norm = nn.GroupNorm(head_num, head_dim * head_num)

        self.causal_conv = CausalConv1d(1, 1, kernel_size=ker_size)

        # Input projections, one per gate.
        self.W_z = nn.Linear(inp_dim, head_num * head_dim)
        self.W_i = nn.Linear(inp_dim, head_num * head_dim)
        self.W_o = nn.Linear(inp_dim, head_num * head_dim)
        self.W_f = nn.Linear(inp_dim, head_num * head_dim)

        # Recurrent projections: block-diagonal, so the previous hidden state
        # is mixed within a head but never across heads.
        self.R_z = BlockLinear([(head_dim, head_dim)] * head_num)
        self.R_i = BlockLinear([(head_dim, head_dim)] * head_num)
        self.R_o = BlockLinear([(head_dim, head_dim)] * head_num)
        self.R_f = BlockLinear([(head_dim, head_dim)] * head_num)

        # NOTE: The factor of two in the output dimension of the up_proj
        # is due to the fact that the output needs to branch into two
        # separate outputs to account for the gated GeLU connection.
        # See Fig. 9 in the paper.
        proj_dim = int(p_factor * head_num * head_dim)
        self.up_proj   = nn.Linear(head_num * head_dim, 2 * proj_dim)
        self.down_proj = nn.Linear(proj_dim, inp_dim)

    @property
    def device(self) -> torch.device:
        '''Get the device of the model.

        Returns:
            torch.device: The device the first parameter lives on.
        '''
        return next(self.parameters()).device

    def init_hidden(self, bs : int | None = None) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        '''Initialize the hidden state of the sLSTM model.

        Args:
            bs (int, optional): The batch size of the input sequence. When
                omitted, un-batched states of shape (head_num * head_dim,)
                are returned and rely on broadcasting against the input.

        Returns:
            Tuple[Tensor, Tensor, Tensor, Tensor]: The hidden state tuple containing the cell state,
                normalizer state, hidden state, and stabilizer state.
        '''
        hid_dim = self.head_num * self.head_dim
        shape = (hid_dim,) if bs is None else (bs, hid_dim)

        # The normalizer starts at one so that the first division is well
        # defined; all other states start at zero.
        n_0 = torch.ones (shape, device=self.device)
        c_0 = torch.zeros(shape, device=self.device)
        h_0 = torch.zeros(shape, device=self.device)
        m_0 = torch.zeros(shape, device=self.device)

        return c_0, n_0, h_0, m_0

    def forward(
            self,
            seq: Tensor,
            hid: Tuple[Tensor, Tensor, Tensor, Tensor],
            use_conv : bool = False,
    ) -> Tuple[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]:
        '''Forward pass of the sLSTM model for a single time step.

        Args:
            seq (Tensor): The input tensor of shape (batch_size, input_dim).
            hid (Tuple[Tensor, Tensor, Tensor, Tensor]): The hidden state tuple containing the cell state,
                normalizer state, hidden state, and stabilizer state.
            use_conv (bool, optional): Whether to pass the normalized input
                through the causal convolution before the input and forget
                gates. Defaults to False.

        Returns:
            Tuple[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: The output tensor with the residual
                connection and the newly updated hidden state tuple.

        Raises:
            ValueError: If `seq` is not a 2-D tensor.
        '''

        if seq.dim() != 2:
            raise ValueError(
                f'Expected input of shape (batch, inp_dim), got {tuple(seq.shape)}'
            )

        # Separate the hidden (previous) state into the cell state,
        # the normalizer state, the hidden state, and the stabilizer state.
        c_tm1, n_tm1, h_tm1, m_tm1 = hid

        x_t : Tensor = self.inp_norm(seq)

        # Optional causal convolution block for the input
        # and forget gates. See Fig. 9 in the paper.
        if use_conv:
            # FIXME (from the original author): the causal conv branch is
            # reported as broken. Note that the convolution slides over the
            # feature axis of a single step, not over time.
            x_c = self.causal_conv(x_t)
            # Drop only the channel axis; a bare `squeeze()` would also
            # remove the batch axis when the batch size is one.
            x_c = silu(x_c).squeeze(1)
        else:
            x_c = x_t

        # Project the input to the different heads for all
        # the gates.
        # NOTE: For input (i) and forget (f) inputs we use
        # the output of the causal conv. See Fig. 9 in the paper.
        # NOTE(review): after the first step `h_tm1` has shape (batch, hid);
        # confirm that BlockLinear accepts a batched input.
        i_t: Tensor = self.W_i(x_c) + self.R_i(h_tm1)
        f_t: Tensor = self.W_f(x_c) + self.R_f(h_tm1)
        z_t: Tensor = self.W_z(x_t) + self.R_z(h_tm1)
        o_t: Tensor = self.W_o(x_t) + self.R_o(h_tm1)

        # Stabilizer state: subtracting it keeps the exponentials bounded.
        m_t = torch.max(f_t + m_tm1, i_t)

        i_t = exp(i_t - m_t)         # Eq. (16) in ref. paper | or Eq. (38) in supp. mat.
        f_t = exp(f_t - m_t + m_tm1) # Eq. (17) in ref. paper | or Eq. (39) in supp. mat.

        z_t = tanh(z_t)              # Eq. (11) in ref. paper
        o_t = sigmoid(o_t)           # Eq. (14) in ref. paper

        # Update the internal states of the model
        c_t = f_t * c_tm1 + i_t * z_t # Eq. (8) in ref. paper
        n_t = f_t * n_tm1 + i_t       # Eq. (9) in ref. paper
        h_t = o_t * (c_t / n_t)       # Eq. (10) in ref. paper

        # Compute the output of the LSTM block
        out = self.hid_norm(h_t)

        # Perform up-and-down projection of the output with
        # projection factor 4/3. See Fig. (9) in supp. mat.
        out1, out2 = self.up_proj(out).chunk(2, dim=-1)

        # NOTE(review): the two branches are summed here; a gated connection
        # would multiply them — confirm against Fig. 9 of the paper.
        out = out1 + gelu(out2)
        out = self.down_proj(out)

        # Return output with the residual connection and the
        # newly updated hidden state.
        return out + seq, (c_t, n_t, h_t, m_t)

class mLSTM(nn.Module):
    '''The matrix-Long Short Term Memory (mLSTM) module as
    originally introduced in Beck et al. (2024), see:
    (https://arxiv.org/abs/2405.04517).

    This model is a variant of the standard LSTM model and
    offers superior memory due to its storing values in a
    matrix instead of a scalar. It is fully parallelizable
    and updates internal memory with the covariance rule.
    '''

    def __init__(
            self,
            inp_dim : int,
            head_num : int,
            head_dim : int,
            p_factor : int = 2,
            ker_size : int = 4,
    ) -> None:
        '''Build the mLSTM block.

        Args:
            inp_dim (int): Size of the input (and output) features.
            head_num (int): Number of heads.
            head_dim (int): Size of each head.
            p_factor (int, optional): Expansion factor of the left
                up-projection. Defaults to 2.
            ker_size (int, optional): Kernel size of the causal convolution.
                Defaults to 4.
        '''
        super().__init__()

        self.inp_dim = inp_dim
        self.head_num = head_num
        self.head_dim = head_dim

        hid_dim = head_num * head_dim
        up_dim = int(p_factor * inp_dim)

        self.inp_norm = nn.LayerNorm(inp_dim)
        self.hid_norm = nn.GroupNorm(head_num, hid_dim)

        # The normalized input branches into two paths: the left projection
        # feeds the memory cell, the right one gates the block output.
        self.up_l_proj = nn.Linear(inp_dim, up_dim)
        self.up_r_proj = nn.Linear(inp_dim, hid_dim)
        self.down_proj = nn.Linear(hid_dim, inp_dim)

        self.causal_conv = CausalConv1d(1, 1, kernel_size=ker_size)

        # Kernel-size-1 convolution used as a learnable skip connection.
        self.skip = nn.Conv1d(up_dim, hid_dim, kernel_size=1, bias=False)

        # Input and forget gates are one scalar per head; the output gate
        # acts on every hidden unit.
        self.W_i = nn.Linear(up_dim, head_num)
        self.W_f = nn.Linear(up_dim, head_num)
        self.W_o = nn.Linear(up_dim, hid_dim)

        self.W_q = nn.Linear(up_dim, hid_dim)
        self.W_k = nn.Linear(up_dim, hid_dim)
        self.W_v = nn.Linear(up_dim, hid_dim)

    @property
    def device(self) -> torch.device:
        '''Get the device of the model.

        Returns:
            torch.device: The device the first parameter lives on.
        '''
        return next(self.parameters()).device

    def init_hidden(self, bs : int) -> Tuple[Tensor, Tensor, Tensor]:
        '''Initialize the hidden state of the mLSTM model.

        Args:
            bs (int): The batch size of the input sequence.

        Returns:
            Tuple[Tensor, Tensor, Tensor]: The hidden state tuple containing
                the cell state of shape (bs, head_num, head_dim, head_dim),
                the normalizer state of shape (bs, head_num, head_dim) and
                the stabilizer state of shape (bs, head_num).
        '''

        c_0 = torch.zeros(bs, self.head_num, self.head_dim, self.head_dim, device=self.device)
        n_0 = torch.ones (bs, self.head_num, self.head_dim               , device=self.device)
        m_0 = torch.zeros(bs, self.head_num                              , device=self.device)

        return c_0, n_0, m_0

    def forward(
            self,
            seq: Tensor,
            hid: Tuple[Tensor, Tensor, Tensor],
    ) -> Tuple[Tensor, Tuple[Tensor, Tensor, Tensor]]:
        '''Forward pass of the mLSTM model for a single time step.

        Args:
            seq (Tensor): The input tensor of shape (batch_size, input_dim).
            hid (Tuple[Tensor, Tensor, Tensor]): The hidden state tuple
                containing the cell state, the normalizer state and the
                stabilizer state, as returned by `init_hidden`.

        Returns:
            Tuple[Tensor, Tuple[Tensor, Tensor, Tensor]]: The output tensor
                with the residual connection and the updated hidden state.
        '''

        # Separate the hidden (previous) state into the cell state,
        # the normalizer state and the stabilizer state.
        c_tm1, n_tm1, m_tm1 = hid

        x_n : Tensor = self.inp_norm(seq) # shape: b i

        x_t = self.up_l_proj(x_n) # shape: b (i * p_factor)
        r_t = self.up_r_proj(x_n) # shape: b (h d)

        # Compute the causal convolutional input (to be
        # used for the query and key gates)
        x_c = self.causal_conv(x_t) # shape: b 1 (i * p_factor)
        # Drop only the channel axis; a bare `squeeze()` would also remove
        # the batch axis when the batch size is one.
        x_c = silu(x_c).squeeze(1)  # shape: b (i * p_factor)

        q_t = rearrange(self.W_q(x_c), 'b (h d) -> b h d', h=self.head_num)
        k_t = rearrange(self.W_k(x_c), 'b (h d) -> b h d', h=self.head_num) / sqrt(self.head_dim)
        v_t = rearrange(self.W_v(x_t), 'b (h d) -> b h d', h=self.head_num)

        i_t: Tensor = self.W_i(x_c) # shape: b h
        f_t: Tensor = self.W_f(x_c) # shape: b h
        o_t: Tensor = self.W_o(x_t) # shape: b (h d)

        # Stabilizer state: subtracting it keeps the exponentials bounded.
        m_t = torch.max(f_t + m_tm1, i_t)

        i_t = exp(i_t - m_t)         # Eq. (25) in ref. paper
        f_t = exp(f_t - m_t + m_tm1) # Eq. (26) in ref. paper
        o_t = sigmoid(o_t)           # Eq. (27) in ref. paper

        # Update the internal states of the model (covariance update rule).
        c_t = enlarge_as(f_t, c_tm1) * c_tm1 + enlarge_as(i_t, c_tm1) * einsum(v_t, k_t, 'b h d, b h p -> b h d p')
        n_t = enlarge_as(f_t, n_tm1) * n_tm1 + enlarge_as(i_t, k_t)   * k_t

        # Eq. (21) in ref. paper: the read-out is divided by
        # max(|n_t . q_t|, 1). The absolute value matters when the dot
        # product is strongly negative.
        num = einsum(c_t, q_t, 'b h d p, b h p -> b h d')
        den = einsum(n_t, q_t, 'b h d, b h d -> b h').abs().clamp(min=1).unsqueeze(-1)
        h_t = o_t * rearrange(num / den, 'b h d -> b (h d)')

        # The skip convolution expects (batch, channels, length); remove the
        # singleton length axis explicitly so a batch of one keeps its shape.
        skip = self.skip(rearrange(x_c, 'b i -> b i 1')).squeeze(-1)

        out = self.hid_norm(h_t) + skip # shape: b (h d)
        out = out * silu(r_t)           # shape: b (h d)
        out = self.down_proj(out)       # shape: b i

        # Return output with the residual connection and the
        # newly updated hidden state.
        return out + seq, (c_t, n_t, m_t)

In [4]:
# import torch
# import torch.nn as nn
# 
# from src.models.xlstm.m_lstm import mLSTM
import torch.nn.functional as F
from torchmetrics import Accuracy, F1Score
# 
# 
# class xLSTMBlock(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, dropout=0.0, bidirectional=False, lstm_type="slstm"):
#         super(xLSTMBlock, self).__init__()
#         self.input_size = input_size
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.dropout = dropout
#         self.bidirectional = bidirectional
#         self.lstm_type = lstm_type
# 
#         if lstm_type == "slstm":
#             self.lstm = sLSTM(input_size, hidden_size, num_layers, dropout)
#         elif lstm_type == "mlstm":
#             print("Warning: mLSTM is not working yet.")
#             self.lstm = mLSTM(input_size, hidden_size, num_layers, dropout)
#         else:
#             raise ValueError(f"Invalid LSTM type: {lstm_type}")
# 
#         self.norm = nn.LayerNorm(input_size)
#         self.activation = nn.GELU()
#         self.dropout_layer = nn.Dropout(dropout)
# 
#         if bidirectional:
#             self.proj = nn.Linear(2 * hidden_size, input_size)
#         else:
#             self.proj = nn.Linear(hidden_size, input_size)
# 
#         # print shapes
#         # print(f"input_size: {input_size}")
#         # print(f"hidden_size: {hidden_size}")
#         # print(f"num_layers: {num_layers}")
#         # print(f"dropout: {dropout}")
#         # print(f"proj: {self.proj}")
# 
#         self.reset_parameters()
# 
#     def reset_parameters(self):
#         nn.init.xavier_uniform_(self.proj.weight)
#         nn.init.zeros_(self.proj.bias)
# 
#     def forward(self, input_seq, hidden_state=None):
#         lstm_output, hidden_state = self.lstm(input_seq, hidden_state)
#         if self.lstm_type == "slstm":
#             hidden_state = [[hidden_state[i][0].detach(), hidden_state[i][1].detach()] for i in range(len(hidden_state))]
# 
#         if self.bidirectional:
#             lstm_output = torch.cat((lstm_output[:, :, :self.hidden_size], lstm_output[:, :, self.hidden_size:]), dim=-1)
# 
#         output = self.activation(self.proj(lstm_output))
#         output = self.norm(output + input_seq)
#         output = self.dropout_layer(output)
# 
#         return output, hidden_state
# 
# class xLSTM(L.LightningModule):
#     def __init__(self, optimizer, scheduler, input_size, hidden_size, output_size, num_layers, num_blocks,
#                  dropout=0.0, bidirectional=False, lstm_type="slstm"):
#         super().__init__()
#         self.save_hyperparameters()
# 
#         self.accuracy = Accuracy(task='multiclass', num_classes=output_size)
#         self.f1_score = F1Score(num_classes=output_size, average='weighted', task='multiclass')
#         self.num_blocks = num_blocks
#         self.lstm_type = lstm_type
# 
#         self.blocks = nn.ModuleList([
#             xLSTMBlock(input_size, hidden_size, num_layers,
#                        dropout, bidirectional, lstm_type)
#             for i in range(num_blocks)
#         ])
# 
#         self.output_layer = nn.Linear(input_size, output_size)
# 
#     def forward(self, input_seq, hidden_states=None):
#         if hidden_states is None:
#             hidden_states = [None] * self.num_blocks
# 
#         output_seq = input_seq
#         for i, block in enumerate(self.blocks):
#             output_seq, hidden_state = block(output_seq, hidden_states[i])
#             if self.lstm_type == "slstm":
#                 hidden_states[i] = [[hidden_state[j][0].detach(), hidden_state[j][1].detach()] for j in range(len(hidden_state))]
#             else:
#                 hidden_states[i] = hidden_state
# 
#         output_seq = output_seq[:, -1, :]
#         output_seq = self.output_layer(output_seq)
#         return output_seq
# 
#     def _shared_step(self, batch, batch_idx):
#         x, y = batch
#         logits = self(x)
#         preds = torch.argmax(logits, dim=1)
#         y = torch.argmax(y, dim=1)
#         print(logits.shape, y.shape)
#         loss = F.cross_entropy(logits, y)
#         acc = self.accuracy(preds, y)
#         f1 = self.f1_score(preds, y)
#         return loss, acc, f1
# 
#     def training_step(self, batch, batch_idx):
#         loss, acc, f1 = self._shared_step(batch, batch_idx)
#         self.log_dict({"train_loss": loss, "train_acc": acc, "train_f1": f1}, prog_bar=True)
#         return loss
# 
#     def validation_step(self, batch, batch_idx):
#         loss, acc, f1 = self._shared_step(batch, batch_idx)
#         self.log_dict({"val_loss": loss, "val_acc": acc, "val_f1": f1}, prog_bar=True)
#         return loss
# 
#     def configure_optimizers(self):
#         optimizer = self.hparams.optimizer(params=self.trainer.model.parameters())
#         return optimizer
#         # scheduler = self.hparams.scheduler(optimizer, T_max=10)
#         # 
#         # return {
#         #     "optimizer": optimizer,
#         #     "lr_scheduler": {
#         #         "scheduler": scheduler,
#         #         "interval": "epoch",
#         #         "frequency": 1,
#         #     }
#         # }

In [5]:
import torch
import torch.nn as nn
from warnings import warn
from lightning import LightningModule

from torch import Tensor
from torch.optim import AdamW
from torch.optim import Optimizer
from torch.nn .functional import softmax
from torch.nn.functional import cross_entropy

from typing import Any, Dict, Generator, List, Tuple, Callable, Iterable

from itertools import repeat
from einops import rearrange



OptimizerCallable = Callable[[Iterable], Optimizer]

class xLSTM(LightningModule):
    '''The extended Long Short Term Memory (xLSTM) module as
    originally introduced in Beck et al. (2024), see:
    (https://arxiv.org/abs/2405.04517).

    This model stacks sLSTM and mLSTM modules with residual
    connections and adds a linear classification head that is
    applied to the output of the last time step.
    '''

    def __init__(
            self,
            num_layers : int,
            signature : Tuple[int, int],
            inp_dim : int,
            head_dim : int,
            head_num : int,
            output_size : int,
            p_factor : Tuple[float, float] = (2, 4/3),
            ker_size : int = 4,
            optimizer : OptimizerCallable = AdamW,
            inference_kw: Dict[str, Any] = {}
    ) -> None:
        '''Initialize the xLSTM classifier.

        Args:
            num_layers (int): The number of stacked m|s-LSTM blocks.
            signature (Tuple[int, int]): The signature of the model,
                which represents the ratio of the mLSTM-to-sLSTM blocks.
            inp_dim (int): The dimension of the input features.
            head_dim (int): The dimension of each head.
            head_num (int): The number of heads.
            output_size (int): The number of target classes.
            p_factor (Tuple[float, float], optional): The expansion factor
                for the MLP projection in the m|s-LSTM blocks. Defaults to (2, 4/3).
            ker_size (int, optional): The kernel size for the causal convolutional layers.
                Defaults to 4.
            optimizer (OptimizerCallable, optional): Callable that receives the
                model parameters and returns the optimizer. Defaults to AdamW.
            inference_kw (Dict[str, Any], optional): Additional keyword
                arguments kept on the module as `self.inference_kw`.
        '''
        super().__init__()

        self.accuracy = Accuracy(task='multiclass', num_classes=output_size)
        self.f1_score = F1Score(num_classes=output_size, average='weighted', task='multiclass')
        self.optimizer = optimizer

        # Keep a private copy so that instances never share (or mutate)
        # the default dictionary of the signature.
        self.inference_kw = dict(inference_kw)

        m_factor, s_factor = p_factor

        shared_par = {
            'inp_dim' : inp_dim,
            'head_dim' : head_dim,
            'head_num' : head_num,
            'ker_size' : ker_size,
        }

        mlstm_par = {**shared_par, 'p_factor' : m_factor}
        slstm_par = {**shared_par, 'p_factor' : s_factor}

        m_num, s_num = signature
        which = [True] * m_num + [False] * s_num

        # NOTE(review): `repeat(which)` yields the *whole list* on every
        # iteration, so `w` is truthy for any non-empty signature and every
        # layer is built as an mLSTM. `itertools.cycle(which)` would give the
        # interleaving described by `signature`. The behaviour is kept as-is
        # on purpose: switch only after confirming that sLSTM blocks run
        # end-to-end inside this stack (batched `init_hidden`, recurrence).
        self.model : List[mLSTM | sLSTM] = nn.ModuleList([
            mLSTM(**mlstm_par) if w else sLSTM(**slstm_par)
            for w, _ in zip(repeat(which), range(num_layers))
        ])

        self.head = nn.Linear(inp_dim, output_size, bias=False)

        self.save_hyperparameters()

    def forward(
            self,
            seq: Tensor,
            hid: Hidden | None = None,
            batch_first : bool = True,
    ) -> Tuple[Tensor, Hidden]:
        '''Forward pass of the xLSTM model.

        Args:
            seq (Tensor): Input sequence of feature vectors.
                Expected shape: (batch, seq_len, inp_dim) if batch_first=True,
                else (seq_len, batch, inp_dim).
            hid (Hidden, optional): Hidden states of the m|s-LSTM blocks, one
                entry per block. If None, the hidden states are initialized by
                the blocks. The given list is not modified. Defaults to None.
            batch_first (bool, optional): Whether the batch axis comes first
                in `seq`. Defaults to True.

        Returns:
            Tuple[Tensor, Hidden]: The logits of shape (batch, output_size)
                computed from the output of the last time step, and the
                updated hidden states.

        Raises:
            ValueError: If the sequence contains no time step.
        '''

        # Iterate over time: bring the sequence axis to the front.
        if batch_first: seq = rearrange(seq, 'b s i -> s b i')

        if hid is None:
            hid = [l.init_hidden(seq.size(1)) for l in self.model]
        else:
            # Work on a copy so the caller's list is left untouched.
            hid = list(hid)

        # Pass the sequence through the mLSTM and sLSTM blocks
        last = None
        for inp in seq:
            # Compute model output and update the hidden states
            for i, lstm in enumerate(self.model):
                inp, hid[i] = lstm(inp, hid[i])

            last = inp

        if last is None:
            raise ValueError('Cannot run the xLSTM on an empty sequence.')

        # Only the last time step is classified, so the head is applied to
        # it directly. This selects the last *time step* for both layouts,
        # whereas slicing the stacked outputs with `[:, -1, :]` picked the
        # last batch element when batch_first=False.
        out = self.head(last)

        return out, hid

    def _shared_step(self, batch, batch_idx):
        '''Compute loss, accuracy and F1 score for one batch.

        Args:
            batch: Pair `(x, y)`. `y` is reduced with `argmax` over dim 1 for
                the metrics and passed as float to the cross-entropy loss —
                presumably one-hot or class-probability targets; confirm
                against the data module.
            batch_idx: Index of the batch (unused).

        Returns:
            Tuple of (loss, accuracy, f1).
        '''
        x, y = batch
        logits, hid = self(x)
        preds = torch.argmax(logits, dim=1).float()
        loss = F.cross_entropy(logits, y.float())
        y = torch.argmax(y, dim=1).float()
        acc = self.accuracy(preds, y)
        f1 = self.f1_score(preds, y)
        return loss, acc, f1

    def training_step(self, batch, batch_idx):
        '''Run one training step and log the training metrics.'''
        loss, acc, f1 = self._shared_step(batch, batch_idx)
        self.log_dict({"train_loss": loss, "train_acc": acc, "train_f1": f1}, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        '''Run one validation step and log the validation metrics.'''
        loss, acc, f1 = self._shared_step(batch, batch_idx)
        self.log_dict({"val_loss": loss, "val_acc": acc, "val_f1": f1}, prog_bar=True)
        return loss

    def configure_optimizers(self) -> Optimizer:
        '''Create the optimizer from the callable given at construction.'''
        optim = self.optimizer(
            self.parameters(),
        )

        return optim

In [1]:
from src.data.dataset import SensorDataModule

# Build the project's data module from the files under ../data/partitions.
# NOTE(review): the first positional argument is presumably the batch size —
# confirm against SensorDataModule. The saved output of this cell is a
# TypeError, while the later cell that omits `k_folds` ran; verify which
# call is intended.
dataset = SensorDataModule(32, "../data/partitions", k_folds=0)
dataset.setup()

# Training and validation loaders used by the cells that follow.
train_dataloader, val_dataloader = dataset.train_dataloader(), dataset.val_dataloader()

TypeError: list indices must be integers or slices, not NoneType

In [7]:
# Short 5-epoch run for the custom xLSTM. `accelerator='auto'` lets Lightning
# pick the available backend instead of hard-coding Apple's MPS backend, so
# the cell also runs on machines without it.
trainer = L.Trainer(max_epochs=5,
                    accelerator='auto',
                    log_every_n_steps=10)


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/dmnk/PycharmProjects/cdl1-sensor-based/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [8]:

# Pull a single batch from the training loader for quick manual checks.
batch = next(iter(train_dataloader))


In [9]:
# Display the shape of the first element of the batch (the model input).
batch[0].shape

torch.Size([32, 251, 16])

In [10]:
# Instantiate the custom xLSTM classifier with 16 input features per step.
# NOTE(review): `signature=(2, 1)` is documented as the mLSTM-to-sLSTM ratio,
# but xLSTM.__init__ pairs the layers with `repeat(which)`, which yields the
# whole list each time, so all three layers are built as mLSTM — confirm.
model = xLSTM(num_layers=3,
                         signature=(2, 1),
                         inp_dim=16,
                         head_dim=32,
                         head_num=4,
                         output_size=dataset.num_classes,
                         p_factor=(2, 4/3),
                         ker_size=4, optimizer=torch.optim.AdamW) 

In [11]:
# model = xLSTM(input_size=16,
#                          hidden_size=32,
#                          output_size=dataset.num_classes,
#                          num_layers=3,
#                          dropout=0.1,
#                          num_blocks=2, optimizer=torch.optim.AdamW, scheduler=torch.optim.lr_scheduler.CosineAnnealingLR,
#               lstm_type="slstm")


In [12]:
# Smoke test: run one training step by hand on the sampled batch. The model
# is not attached to a Trainer here, hence the `self.log()` warning below.
model.training_step(batch, 0)

/Users/dmnk/PycharmProjects/cdl1-sensor-based/.venv/lib/python3.11/site-packages/lightning/pytorch/core/module.py:436: You are trying to `self.log()` but the `self.trainer` reference is not registered on the model yet. This is most likely because the model hasn't been passed to the `Trainer`


tensor(1.8622, grad_fn=<DivBackward1>)

In [13]:
# Train the custom xLSTM. The saved output ends with the DataLoader worker
# being interrupted (signal 2), so no completed run is recorded here.
trainer.fit(model, train_dataloader, val_dataloader)




  | Name     | Type               | Params
------------------------------------------------
0 | accuracy | MulticlassAccuracy | 0     
1 | f1_score | MulticlassF1Score  | 0     
2 | model    | ModuleList         | 79.0 K
3 | head     | Linear             | 80    
------------------------------------------------
79.1 K    Trainable params
0         Non-trainable params
79.1 K    Total params
0.316     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

RuntimeError: DataLoader worker (pid 80189) is killed by signal: Interrupt: 2. 

In [6]:
import lightning as L
from src.data.dataset import SensorDataModule

# Rebuild the data module, this time without the `k_folds` argument.
# NOTE(review): the first positional argument is presumably the batch size —
# confirm against SensorDataModule.
dataset = SensorDataModule(32, "../data/partitions")
dataset.setup()

train_dataloader, val_dataloader = dataset.train_dataloader(), dataset.val_dataloader()
# Bare expression: displays the loader object as the cell output.
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x335e35b50>

In [1]:

import torch.nn.functional as F
from torchmetrics import Accuracy, F1Score


In [2]:
# Sample one training batch and display the shape of its first element.
# NOTE(review): the saved output is a NameError, i.e. this cell was last run
# before `train_dataloader` existed in the kernel; it has to run after the
# data-loading cell.
batch = next(iter(train_dataloader))

batch[0].shape

NameError: name 'train_dataloader' is not defined

In [3]:
# path/filename: /path/to/your_pytorch_lightning_wrapper.py
import torch
import torch.nn as nn
import lightning as pl
from xlstm import (
    xLSTMBlockStack,
    xLSTMBlockStackConfig,
    mLSTMBlockConfig,
    mLSTMLayerConfig,
    sLSTMBlockConfig,
    sLSTMLayerConfig,
    FeedForwardConfig,
)

class XLSTMPLModule(pl.LightningModule):
    '''Lightning wrapper that places a linear classification head on top
    of an `xLSTMBlockStack` and logs loss, accuracy and weighted F1.'''

    def __init__(self, context_length, num_blocks, embedding_dim, slstm_at, mlstm_config, slstm_config, num_classes):
        '''Build the block stack, the classifier head and the metrics.

        Args:
            context_length: Forwarded to `xLSTMBlockStackConfig`.
            num_blocks: Forwarded to `xLSTMBlockStackConfig`.
            embedding_dim: Forwarded to `xLSTMBlockStackConfig`; also the
                input width of the classifier head.
            slstm_at: Forwarded to `xLSTMBlockStackConfig`.
            mlstm_config: Passed as `mlstm_block` of the stack configuration.
            slstm_config: Passed as `slstm_block` of the stack configuration.
            num_classes: Output width of the classifier and number of
                classes of the metrics.
        '''
        super().__init__()
        # Record all constructor arguments under `self.hparams`.
        self.save_hyperparameters()

        stack_config = xLSTMBlockStackConfig(
            mlstm_block=mlstm_config,
            slstm_block=slstm_config,
            context_length=context_length,
            num_blocks=num_blocks,
            embedding_dim=embedding_dim,
            slstm_at=slstm_at,
        )
        self.model = xLSTMBlockStack(stack_config)

        # Classification head applied to the features of one time step.
        self.classifier = nn.Linear(embedding_dim, num_classes)

        metric_kw = dict(task='multiclass', num_classes=num_classes)
        self.accuracy = Accuracy(**metric_kw)
        self.f1_score = F1Score(average='weighted', **metric_kw)

    def forward(self, x):
        '''Return class logits computed from the last position along dim 1.'''
        last_step = self.model(x)[:, -1, :]
        return self.classifier(last_step)

    def _shared_step(self, batch, batch_idx):
        '''Compute loss, accuracy and F1 score for one batch.

        The targets are reduced with `argmax` over dim 1 for the metrics and
        passed as float to the cross-entropy loss — presumably one-hot
        targets; confirm against the data module.
        '''
        inputs, target = batch
        logits = self(inputs)

        loss = F.cross_entropy(logits, target.float())

        pred_idx = torch.argmax(logits, dim=1).float()
        true_idx = torch.argmax(target, dim=1).float()

        accuracy = self.accuracy(pred_idx, true_idx)
        f1_value = self.f1_score(pred_idx, true_idx)
        return loss, accuracy, f1_value

    def training_step(self, batch, batch_idx):
        '''Run one training step and log the training metrics.'''
        loss, acc, f1 = self._shared_step(batch, batch_idx)
        metrics = {"train_loss": loss, "train_acc": acc, "train_f1": f1}
        self.log_dict(metrics, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        '''Run one validation step and log the validation metrics.'''
        loss, acc, f1 = self._shared_step(batch, batch_idx)
        metrics = {"val_loss": loss, "val_acc": acc, "val_f1": f1}
        self.log_dict(metrics, prog_bar=True)
        return loss

    def configure_optimizers(self):
        '''Optimise all parameters with AdamW at a learning rate of 1e-3.'''
        return torch.optim.AdamW(self.parameters(), lr=1e-3)

# Configurations for the mLSTM and sLSTM blocks
mlstm_config = mLSTMBlockConfig(
    mlstm=mLSTMLayerConfig(
        conv1d_kernel_size=4, qkv_proj_blocksize=4, num_heads=4
    )
)

# The sLSTM block additionally gets a feed-forward part with GELU activation.
slstm_config = sLSTMBlockConfig(
    slstm=sLSTMLayerConfig(
        backend="vanilla",
        num_heads=4,
        conv1d_kernel_size=4,
    ),
    feedforward=FeedForwardConfig(proj_factor=1.3, act_fn="gelu"),
)

# Initialization with the number of classes for the classification task.
# `context_length=251` and `embedding_dim=16` match the batch shape
# (32, 251, 16) displayed earlier in the notebook.
# NOTE(review): `num_classes=5` is hard-coded here, whereas the custom model
# above reads `dataset.num_classes` — confirm that both agree.
pl_module = XLSTMPLModule(
    context_length=251,
    num_blocks=7,
    embedding_dim=16,
    slstm_at=[0],
    mlstm_config=mlstm_config,
    slstm_config=slstm_config,
    num_classes=5
)


In [4]:
# Smoke test: run one training step by hand on the sampled batch.
# NOTE(review): the saved output is a NameError, i.e. `batch` was not defined
# in the kernel when this cell was last run.
pl_module.training_step(batch, 0)


NameError: name 'batch' is not defined

In [7]:
# 20-epoch run for the wrapper around the `xlstm` package.
# `accelerator='auto'` lets Lightning pick the available backend instead of
# hard-coding Apple's MPS backend, so the cell also runs on other machines.
trainer = L.Trainer(max_epochs=20,
                    accelerator='auto',
                    log_every_n_steps=10)

trainer.fit(pl_module, train_dataloader, val_dataloader)


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name       | Type               | Params
--------------------------------------------------
0 | model      | xLSTMBlockStack    | 38.9 K
1 | classifier | Linear             | 85    
2 | accuracy   | MulticlassAccuracy | 0     
3 | f1_score   | MulticlassF1Score  | 0     
--------------------------------------------------
39.0 K    Trainable params
0         Non-trainable params
39.0 K    Total params
0.156     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.
