In [1]:
import matplotlib.pyplot as plt
import torch.nn as nn
import numpy as np

import itertools
import torch

from torch.utils.data import DataLoader, TensorDataset, Dataset
from copy import deepcopy

from wave_generator import WaveGen

from functools import partial

from ray import train, tune
# from ray.train import Checkpoint, session
from ray.tune.schedulers import ASHAScheduler

In [2]:
def torch_train_test_split(*tensors, split=.8):
    n_samples = tensors[0].shape[0]
    train_size = int(split * n_samples)
    test_size = n_samples - train_size
    rand_indices = torch.randperm(n_samples)
    train_indices, test_indices = torch.split(rand_indices, [train_size, test_size])
    assert train_indices.shape[0] == train_size, f'{train_indices.shape} != {train_size}'
    assert test_indices.shape[0] == n_samples - train_size, f'{test_indices.shape} != {n_samples - train_size}'

    data_tensors = list()
    for tensor in tensors:
        assert tensor.shape[0] == n_samples
        train_data = tensor[train_indices]
        test_data = tensor[test_indices]
        data_tensors.extend((train_data, test_data))
    return data_tensors
    
class TorchDataset(Dataset):
    def __init__(self, seq2seq_tensor, enc_window=120, dec_window=120):
        assert seq2seq_tensor.ndim == 2
        assert seq2seq_tensor.shape[1] > enc_window + dec_window, f'{seq2seq_tensor.shape} | {enc_window} | {dec_window}'
        self._data = seq2seq_tensor
        self.enc_window = enc_window
        self.dec_window = dec_window
        return
    
    @property
    def shape(self):
        return (self._data.shape[1] - (self.enc_window + self.dec_window), 1)

    def __len__(self):
        return self.shape[1]

    def __getitem__(self, index):
        enc_data = self._data[:,index:index + self.enc_window]
        dec_data = self._data[:,index + self.enc_window:self.enc_window + self.dec_window + index]
        return enc_data, dec_data

In [3]:
def get_load_data_fn(
    size=2048,
    slope_min=.0001,
    slope_max=.001,
    n_channels=64,
    n_periods=45,
    enc_window = 128,
    dec_window = 128,
    plot=True,
):
    arr_dict = dict()
    slopes = np.linspace(slope_min, slope_max, n_channels)

    for slope in slopes:
        arr = WaveGen(size=size).linear_phase(n_periods=n_periods).cos().amp(2).t_noise(std=.01, dof=2).sample(n_samples=1).samples
        trend = np.exp(np.cumsum(np.log(np.random.normal(slope, .005, size=size) + 1))) - 1
        arr_dict[slope] = np.squeeze(arr + trend)

    if plot:
        for slope in np.random.choice(slopes, size=3):
            plt.figure(figsize=(15,5))
            plt.plot(arr_dict[slope].squeeze())
            plt.show()
    
    data = torch.from_numpy(np.stack(list(arr_dict.values()), axis=1))
    print(data.shape)
    split_idx = int(data.shape[0]*.8)
    train_data = data[:split_idx].transpose(0, 1)
    test_data = data[split_idx:].transpose(0, 1)

    def load_data():
        train_dataset = TorchDataset(train_data, enc_window=enc_window, dec_window=dec_window)
        test_dataset = TorchDataset(test_data, enc_window=enc_window, dec_window=dec_window)
        return train_dataset, test_dataset

    return load_data

In [4]:
class SineCNNLSTM(nn.Module):
    def __init__(
        self, 
        input_size,
        output_size,
        in_channels=64,
        out_channels=128,
        kernel_size=2, 
        stride=1, 
        drop=.4, 
        hidden_size=32,
        num_layers=3,
        **extra,
    ):
        super(SineCNNLSTM, self).__init__()
        
#         pool_out_shape = (conv1_out_shape - pool_kernel) // pool_kernel + 1
        
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride)
        self.conv2 = nn.Conv1d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride)
        conv1_out_shape = (input_size - kernel_size) // stride + 1
        conv2_out_shape = (conv1_out_shape - kernel_size) // stride + 1
        
        self.lstm = nn.LSTM(conv2_out_shape, hidden_size, num_layers=num_layers, dropout=drop)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.output = nn.Sigmoid()
        return
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x, _ = self.lstm(x)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.output(x)
        return x

In [5]:
class SineCNNLSTM2(nn.Module):
    def __init__(
        self, 
        enc_window,
        dec_window,
        hidden_dim=64,
        output_dim=128,
        n_lstm_layers=3,
        kernel_size=3,
        stride=1,
        drop=.4,
    ):
        super(SineCNNLSTM, self).__init__()
        
        self.kernel_size = kernel_size
        self.stride = stride
        self.enc_window = enc_window
        self.dec_window = dec_window
        
        self.conv_in = self.single_conv(1, hidden_dim)
        self.conv_hidden = self.single_conv(hidden_dim, hidden_dim)
        self.conv_out = self.single_conv(hidden_dim, output_dim)
        self.gap = nn.AdaptiveAvgPool1d(1)
        
        self.lstm = nn.LSTM(output_dim, hidden_dim, num_layers=n_lstm_layers, dropout=drop)
        self.activation = nn.Sigmoid()

        self.criterion = nn.MSELoss()
        return
        
    def single_conv(self, in_channels, out_channels):
        layer = nn.Sequential(
            nn.Conv1d(
                in_channels, 
                out_channels, 
                kernel_size=self.kernel_size,
                stride=self.stride,
            ),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(inplace=True),
        )
        return layer

    def conv_forward(self, x):
        x = self.conv_in(x)
        x = self.conv_hidden(x)
        x = self.conv_out(x)
        print(x.shape)
        x = self.gap(x)
        print(x.shape)
        return x
    
    def forward(self, x):
        x = torch.cat([
            self.conv_forward(x[i:i+self.enc_window]) 
            for i 
            in range(x.shape[0] - self.enc_window)
        ])
        print('pre-lstm', x.shape)
        x = self.lstm(x)
        x = self.activation(x)
        print('post-lstm', x.shape)
        return x
    
    def loss(self, pred, label):
        label = label[i+self.enc_window:i+self.enc_window+self.dec_window]
        loss = self.criterion(pred, label)
        return loss

In [6]:
# https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html
def train_lstm(
    config,
    load_data_fn,
    load_model_fn,
    val_split=.8,
):
    model = load_model_fn(config)
    
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=config['lr'])
    checkpoint = session.get_checkpoint()
    
    if checkpoint:
        checkpoint_state = checkpoint.to_dict()
        start_epoch = checkpoint_state['epoch']
        net.load_state_dict(checkpoint_state['net_state_dict'])
        optimizer.load_state_dict(checkpoint_state['optimizer_state_dict'])
    else:
        start_epoch = 0
    
    train_dataset, test_dataset = load_data_fn()
    k = int(len(train_dataset) * val_split)
    train_dataset, val_dataset = train_dataset[:k], train_dataset[k:]
    
    train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False)

    for epoch in range(start_epoch, 10):
        running_train_loss = 0.
        running_test_loss = 0.
        for (train_data, train_labels), (test_data, test_labels) in zip(train_dataloader, test_dataloader):
            optimizer.zero_grad()
            train_output = model(train_data[i])
            train_loss = criterion(train_output, train_labels)
            train_loss.backward()
            optimizer.step()
            running_train_loss += train_loss.item()

            for i in range(test_data.shape[0]):
                with torch.no_grad():
                    test_output = model(test_data[i])
                    test_loss = criterion(test_output, test_labels[i])
                    running_test_loss += test_loss.item()
                
            checkpoint_data = {
                'epoch' : epoch,
#                 'net_state_dict' : model.state_dict(),
                'optimizer_state_dict' : optimizer.state_dict(),
            }
            train.report(checkpoint_data)
        if epoch == 9:
            torch.save(model.state_dict(), './model.pth')
#             checkpoint = Checkpoint.from_dict(checkpoint_data)
            
#             session.report(
#                 {'loss' : test_loss / test_data.shape[0]},
#                 checkpoint=checkpoint,
#             )

    return

In [None]:
def hyperparameter_tune(total_size=2048, input_size=128, output_size=128, in_channels=16, cpus=4, num_samples=10, max_t=100, grace_period=2, reduction_factor=2):
    model_load_fn = lambda x : SineCNNLSTM(input_size, output_size, in_channels=in_channels, **x)
    
    config = {
        'out_channels' : [16, 32, 64],
        'kernel_size' : [2,8,24,48],
        'stride' : [1,2,4,8,24],
        'drop' : [.2, .4, .6],
        'hidden_size' : [16, 32, 64],
        'num_layers' : [4,8],
    }

    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=max_t,
        grace_period=grace_period,
        reduction_factor=reduction_factor,
    )

    load_data_fn = get_load_data_fn(total_size, n_channels=in_channels, enc_window=input_size, dec_window=output_size)
    tuner = tune.Tuner(
        partial(train_lstm, load_data_fn, model_load_fn, val_split=.8),
        tune_config=tune.TuneConfig(
            num_samples=20,
            scheduler=scheduler,
        ),
        param_space=config,
    )
    result = tuner.fit()
    
    best_trial = result.get_best_result('loss', mode='min')
    print(best_trial)
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final test loss: {best_trial.last_result['loss']}")
    
    best_model = model_load_fn(
        out_channels=best_trial.config['out_channels'], 
        kernel_size=best_trial.config['kernel_size'], 
        stride=best_trial.config['stride'],
        drop=best_trial.config['drop'],
        hidden_size=best_trial.config['hidden_size'],
        num_layers=best_trial.config['num_layers'],
    )
#     best_checkpoint_data = best_trial.checkpoint.to_air_checkpoint().to_dict()

    state_dict = torch.load(os.path.join(best_model.path, 'model.pth'))
    best_model.load_state_dict(state_dict)
    return best_model

best_model = hyperparameter_tune()

0,1
Current time:,2023-12-05 21:40:52
Running for:,00:00:59.93
Memory:,10.9/15.9 GiB

Trial name,# failures,error file
train_lstm_1d2f6_00000,1,C:/Users/Nick/ray_results/train_lstm_2023-12-05_21-39-44/train_lstm_1d2f6_00000_0_2023-12-05_21-39-52\error.txt
train_lstm_1d2f6_00001,1,C:/Users/Nick/ray_results/train_lstm_2023-12-05_21-39-44/train_lstm_1d2f6_00001_1_2023-12-05_21-39-52\error.txt
train_lstm_1d2f6_00002,1,C:/Users/Nick/ray_results/train_lstm_2023-12-05_21-39-44/train_lstm_1d2f6_00002_2_2023-12-05_21-39-52\error.txt
train_lstm_1d2f6_00003,1,C:/Users/Nick/ray_results/train_lstm_2023-12-05_21-39-44/train_lstm_1d2f6_00003_3_2023-12-05_21-39-52\error.txt
train_lstm_1d2f6_00004,1,C:/Users/Nick/ray_results/train_lstm_2023-12-05_21-39-44/train_lstm_1d2f6_00004_4_2023-12-05_21-39-52\error.txt
train_lstm_1d2f6_00005,1,C:/Users/Nick/ray_results/train_lstm_2023-12-05_21-39-44/train_lstm_1d2f6_00005_5_2023-12-05_21-39-52\error.txt
train_lstm_1d2f6_00006,1,C:/Users/Nick/ray_results/train_lstm_2023-12-05_21-39-44/train_lstm_1d2f6_00006_6_2023-12-05_21-39-52\error.txt
train_lstm_1d2f6_00007,1,C:/Users/Nick/ray_results/train_lstm_2023-12-05_21-39-44/train_lstm_1d2f6_00007_7_2023-12-05_21-39-52\error.txt
train_lstm_1d2f6_00008,1,C:/Users/Nick/ray_results/train_lstm_2023-12-05_21-39-44/train_lstm_1d2f6_00008_8_2023-12-05_21-39-52\error.txt
train_lstm_1d2f6_00009,1,C:/Users/Nick/ray_results/train_lstm_2023-12-05_21-39-44/train_lstm_1d2f6_00009_9_2023-12-05_21-39-52\error.txt

Trial name,status,loc
train_lstm_1d2f6_00017,PENDING,
train_lstm_1d2f6_00018,PENDING,
train_lstm_1d2f6_00019,PENDING,
train_lstm_1d2f6_00000,ERROR,
train_lstm_1d2f6_00001,ERROR,
train_lstm_1d2f6_00002,ERROR,
train_lstm_1d2f6_00003,ERROR,
train_lstm_1d2f6_00004,ERROR,
train_lstm_1d2f6_00005,ERROR,
train_lstm_1d2f6_00006,ERROR,


[33m(raylet)[0m [2023-12-05 21:39:52,352 E 32244 38368] (raylet.exe) agent_manager.cc:70: The raylet exited immediately because one Ray agent failed, agent_name = dashboard_agent/15724.
[33m(raylet)[0m The raylet fate shares with the agent. This can happen because
[33m(raylet)[0m - The version of `grpcio` doesn't follow Ray's requirement. Agent can segfault with the incorrect `grpcio` version. Check the grpcio version `pip freeze | grep grpcio`.
[33m(raylet)[0m - The agent failed to start because of unexpected error or port conflict. Read the log `cat /tmp/ray/session_latest/logs/{dashboard_agent|runtime_env_agent}.log`. You can find the log file structure here https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure.
[33m(raylet)[0m - The agent is killed by the OS (e.g., out of memory).
[33m(raylet)[0m *** SIGTERM received at time=1701833992 ***
[33m(raylet)[0m     @   00007FF6276885C6  (unknown)  (unknown)
[33m(raylet)[0m     @   00

2023-12-05 21:40:06,561	ERROR tune_controller.py:1383 -- Trial task failed for trial train_lstm_1d2f6_00005
Traceback (most recent call last):
  File "C:\Users\Nick\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "C:\Users\Nick\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\_private\auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\Nick\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\Nick\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\_private\worker.py", line 2565, in get
    raise value
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.
	class_name: wrap_function.<locals>.ImplicitFunc
	actor_id: 47b941adcba9388a1354406c01000000
	namespace: 

2023-12-05 21:40:06,584	ERROR tune_controller.py:1383 -- Trial task failed for trial train_lstm_1d2f6_00000
Traceback (most recent call last):
  File "C:\Users\Nick\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "C:\Users\Nick\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\_private\auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\Nick\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\Nick\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\_private\worker.py", line 2565, in get
    raise value
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.
	class_name: wrap_function.<locals>.ImplicitFunc
	actor_id: 7b85a25c6cc74ee586acd17d01000000
	namespace: 

