From ee35a180a94ffebc8486165f738b97b0d2f7c36b Mon Sep 17 00:00:00 2001 From: DonaldKuck Date: Wed, 26 Nov 2025 16:34:07 +0800 Subject: [PATCH 1/2] Add MPS support and centralize device selection Add get_device() function in pytorch_utils.py supporting CUDA, MPS, and CPU Refactor 26 PyTorch model files to use centralized device selection Enable automatic MPS device selection on Apple Silicon devices The device selection priority is: CUDA > MPS > CPU --- qlib/contrib/model/pytorch_adarnn.py | 12 ++++--- qlib/contrib/model/pytorch_add.py | 4 +-- qlib/contrib/model/pytorch_alstm.py | 4 +-- qlib/contrib/model/pytorch_alstm_ts.py | 24 +++++++------- qlib/contrib/model/pytorch_gats.py | 4 +-- qlib/contrib/model/pytorch_gats_ts.py | 20 +++++------ qlib/contrib/model/pytorch_general_nn.py | 24 +++++++------- qlib/contrib/model/pytorch_gru.py | 4 +-- qlib/contrib/model/pytorch_gru_ts.py | 24 +++++++------- qlib/contrib/model/pytorch_hist.py | 4 +-- qlib/contrib/model/pytorch_igmtf.py | 4 +-- qlib/contrib/model/pytorch_krnn.py | 3 +- qlib/contrib/model/pytorch_localformer.py | 3 +- qlib/contrib/model/pytorch_localformer_ts.py | 19 ++++++----- qlib/contrib/model/pytorch_lstm.py | 3 +- qlib/contrib/model/pytorch_lstm_ts.py | 23 +++++++------ qlib/contrib/model/pytorch_nn.py | 7 ++-- qlib/contrib/model/pytorch_sandwich.py | 3 +- qlib/contrib/model/pytorch_sfm.py | 4 +-- qlib/contrib/model/pytorch_tabnet.py | 4 +-- qlib/contrib/model/pytorch_tcn.py | 4 +-- qlib/contrib/model/pytorch_tcn_ts.py | 20 +++++------ qlib/contrib/model/pytorch_tcts.py | 5 +-- qlib/contrib/model/pytorch_transformer.py | 3 +- qlib/contrib/model/pytorch_transformer_ts.py | 19 ++++++----- qlib/contrib/model/pytorch_utils.py | 35 ++++++++++++++++++++ 26 files changed, 163 insertions(+), 120 deletions(-) diff --git a/qlib/contrib/model/pytorch_adarnn.py b/qlib/contrib/model/pytorch_adarnn.py index c1585a6ac0..4cfa5feee7 100644 --- a/qlib/contrib/model/pytorch_adarnn.py +++ b/qlib/contrib/model/pytorch_adarnn.py @@ -12,7 +12,7 @@ import torch.nn.functional as F import torch.optim as optim from torch.autograd import Function -from qlib.contrib.model.pytorch_utils import count_parameters +from qlib.contrib.model.pytorch_utils import count_parameters, get_device from qlib.data.dataset import DatasetH from qlib.data.dataset.handler import DataHandlerLP from qlib.log import get_module_logger @@ -81,7 +81,7 @@ def __init__( self.optimizer = optimizer.lower() self.loss = loss self.n_splits = n_splits - self.device = torch.device("cuda:%d" % GPU if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( @@ -162,7 +162,9 @@ def train_AdaRNN(self, train_loader_list, epoch, dist_old=None, weight_mat=None) list_label = [] for data in data_all: # feature :[36, 24, 6] - feature, label_reg = data[0].to(self.device).float(), data[1].to(self.device).float() + feature, label_reg = data[0].to(self.device, dtype=torch.float32), data[1].to( + self.device, dtype=torch.float32 + ) list_feat.append(feature) list_label.append(label_reg) flag = False @@ -396,7 +398,7 @@ def __init__( self.model_type = model_type self.trans_loss = trans_loss self.len_seq = len_seq - self.device = torch.device("cuda:%d" % GPU if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) in_size = self.n_input features = nn.ModuleList() @@ -558,7 +560,7 @@ def __init__(self, loss_type="cosine", input_dim=512, GPU=0): """ self.loss_type = loss_type self.input_dim = input_dim - self.device = 
torch.device("cuda:%d" % GPU if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) def compute(self, X, Y): """Compute adaptation loss diff --git a/qlib/contrib/model/pytorch_add.py b/qlib/contrib/model/pytorch_add.py index c94a03ecc3..464caa6385 100644 --- a/qlib/contrib/model/pytorch_add.py +++ b/qlib/contrib/model/pytorch_add.py @@ -17,7 +17,7 @@ import torch.optim as optim from qlib.contrib.model.pytorch_gru import GRUModel from qlib.contrib.model.pytorch_lstm import LSTMModel -from qlib.contrib.model.pytorch_utils import count_parameters +from qlib.contrib.model.pytorch_utils import count_parameters, get_device from qlib.data.dataset import DatasetH from qlib.data.dataset.handler import DataHandlerLP from qlib.log import get_module_logger @@ -83,7 +83,7 @@ def __init__( self.optimizer = optimizer.lower() self.base_model = base_model self.model_path = model_path - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.gamma = gamma diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py index d1c619ebf4..499c1c814c 100644 --- a/qlib/contrib/model/pytorch_alstm.py +++ b/qlib/contrib/model/pytorch_alstm.py @@ -16,7 +16,7 @@ import torch.nn as nn import torch.optim as optim -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -70,7 +70,7 @@ def __init__( self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py index 95b5cf95d8..29d74f57ec 100644 --- a/qlib/contrib/model/pytorch_alstm_ts.py +++ b/qlib/contrib/model/pytorch_alstm_ts.py @@ -17,7 +17,7 @@ import torch.optim as optim from torch.utils.data import DataLoader -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -74,7 +74,7 @@ def __init__( self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.n_jobs = n_jobs self.seed = seed @@ -171,11 +171,11 @@ def train_epoch(self, data_loader): self.ALSTM_model.train() for data, weight in data_loader: - feature = data[:, :, 0:-1].to(self.device) - label = data[:, -1, -1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) - pred = self.ALSTM_model(feature.float()) - loss = self.loss_fn(pred, label, weight.to(self.device)) + pred = self.ALSTM_model(feature) + loss = self.loss_fn(pred, label, weight.to(self.device, dtype=torch.float32)) self.train_optimizer.zero_grad() loss.backward() @@ -189,13 +189,13 @@ def test_epoch(self, data_loader): losses = [] for data, weight in data_loader: - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) # feature[torch.isnan(feature)] = 0 - label = data[:, -1, 
-1].to(self.device) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.ALSTM_model(feature.float()) - loss = self.loss_fn(pred, label, weight.to(self.device)) + pred = self.ALSTM_model(feature) + loss = self.loss_fn(pred, label, weight.to(self.device, dtype=torch.float32)) losses.append(loss.item()) score = self.metric_fn(pred, label) @@ -295,10 +295,10 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.ALSTM_model(feature.float()).detach().cpu().numpy() + pred = self.ALSTM_model(feature).detach().cpu().numpy() preds.append(pred) diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index 16439b3783..fc2c92df3f 100644 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -15,7 +15,7 @@ import torch.nn as nn import torch.optim as optim -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -75,7 +75,7 @@ def __init__( self.loss = loss self.base_model = base_model self.model_path = model_path - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( diff --git a/qlib/contrib/model/pytorch_gats_ts.py b/qlib/contrib/model/pytorch_gats_ts.py index 09f0ac08b2..f8779f104d 100644 --- a/qlib/contrib/model/pytorch_gats_ts.py +++ b/qlib/contrib/model/pytorch_gats_ts.py @@ -16,7 +16,7 @@ from torch.utils.data import DataLoader from torch.utils.data import Sampler -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset.handler import DataHandlerLP from ...contrib.model.pytorch_lstm import LSTMModel @@ -94,7 +94,7 @@ def __init__( self.loss = loss self.base_model = base_model self.model_path = model_path - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.n_jobs = n_jobs self.seed = seed @@ -198,10 +198,10 @@ def train_epoch(self, data_loader): for data in data_loader: data = data.squeeze() - feature = data[:, :, 0:-1].to(self.device) - label = data[:, -1, -1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) - pred = self.GAT_model(feature.float()) + pred = self.GAT_model(feature) loss = self.loss_fn(pred, label) self.train_optimizer.zero_grad() @@ -217,11 +217,11 @@ def test_epoch(self, data_loader): for data in data_loader: data = data.squeeze() - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) # feature[torch.isnan(feature)] = 0 - label = data[:, -1, -1].to(self.device) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) - pred = self.GAT_model(feature.float()) + pred = self.GAT_model(feature) loss = self.loss_fn(pred, label) losses.append(loss.item()) @@ -325,10 +325,10 @@ def predict(self, dataset): for data in test_loader: data = data.squeeze() - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) with 
torch.no_grad(): - pred = self.GAT_model(feature.float()).detach().cpu().numpy() + pred = self.GAT_model(feature).detach().cpu().numpy() preds.append(pred) diff --git a/qlib/contrib/model/pytorch_general_nn.py b/qlib/contrib/model/pytorch_general_nn.py index 503c5a2a50..7980fe91e8 100644 --- a/qlib/contrib/model/pytorch_general_nn.py +++ b/qlib/contrib/model/pytorch_general_nn.py @@ -17,7 +17,7 @@ from qlib.data.dataset.weight import Reweighter -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset import DatasetH, TSDatasetH from ...data.dataset.handler import DataHandlerLP @@ -83,7 +83,7 @@ def __init__( self.optimizer = optimizer.lower() self.loss = loss self.weight_decay = weight_decay - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.n_jobs = n_jobs self.seed = seed @@ -189,12 +189,12 @@ def _get_fl(self, data: torch.Tensor): """ if data.dim() == 3: # it is a time series dataset - feature = data[:, :, 0:-1].to(self.device) - label = data[:, -1, -1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) elif data.dim() == 2: # it is a tabular dataset - feature = data[:, 0:-1].to(self.device) - label = data[:, -1].to(self.device) + feature = data[:, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1].to(self.device, dtype=torch.float32) else: raise ValueError("Unsupported data shape.") return feature, label @@ -205,8 +205,8 @@ def train_epoch(self, data_loader): for data, weight in data_loader: feature, label = self._get_fl(data) - pred = self.dnn_model(feature.float()) - loss = self.loss_fn(pred, label, weight.to(self.device)) + pred = self.dnn_model(feature) + loss = self.loss_fn(pred, label, weight.to(self.device, dtype=torch.float32)) self.train_optimizer.zero_grad() loss.backward() @@ -223,8 +223,8 @@ def test_epoch(self, data_loader): feature, label = self._get_fl(data) with torch.no_grad(): - pred = self.dnn_model(feature.float()) - loss = self.loss_fn(pred, label, weight.to(self.device)) + pred = self.dnn_model(feature) + loss = self.loss_fn(pred, label, weight.to(self.device, dtype=torch.float32)) losses.append(loss.item()) score = self.metric_fn(pred, label) @@ -357,10 +357,10 @@ def predict( for data in test_loader: feature, _ = self._get_fl(data) - feature = feature.to(self.device) + feature = feature.to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.dnn_model(feature.float()).detach().cpu().numpy() + pred = self.dnn_model(feature).detach().cpu().numpy() preds.append(pred) diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 06aa6810b8..01648f4879 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -19,7 +19,7 @@ from ...log import get_module_logger from ...model.base import Model from ...utils import get_or_create_path -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device class GRU(Model): @@ -70,7 +70,7 @@ def __init__( self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( diff --git a/qlib/contrib/model/pytorch_gru_ts.py 
b/qlib/contrib/model/pytorch_gru_ts.py index 65da5ac4b4..bddaaf77bc 100755 --- a/qlib/contrib/model/pytorch_gru_ts.py +++ b/qlib/contrib/model/pytorch_gru_ts.py @@ -16,7 +16,7 @@ import torch.optim as optim from torch.utils.data import DataLoader -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset.handler import DataHandlerLP from ...model.utils import ConcatDataset @@ -72,7 +72,7 @@ def __init__( self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.n_jobs = n_jobs self.seed = seed @@ -165,11 +165,11 @@ def train_epoch(self, data_loader): self.GRU_model.train() for data, weight in data_loader: - feature = data[:, :, 0:-1].to(self.device) - label = data[:, -1, -1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) - pred = self.GRU_model(feature.float()) - loss = self.loss_fn(pred, label, weight.to(self.device)) + pred = self.GRU_model(feature) + loss = self.loss_fn(pred, label, weight.to(self.device, dtype=torch.float32)) self.train_optimizer.zero_grad() loss.backward() @@ -183,13 +183,13 @@ def test_epoch(self, data_loader): losses = [] for data, weight in data_loader: - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) # feature[torch.isnan(feature)] = 0 - label = data[:, -1, -1].to(self.device) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.GRU_model(feature.float()) - loss = self.loss_fn(pred, label, weight.to(self.device)) + pred = self.GRU_model(feature) + loss = self.loss_fn(pred, label, weight.to(self.device, dtype=torch.float32)) losses.append(loss.item()) score = self.metric_fn(pred, label) @@ -289,10 +289,10 @@ def predict(self, dataset): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.GRU_model(feature.float()).detach().cpu().numpy() + pred = self.GRU_model(feature).detach().cpu().numpy() preds.append(pred) diff --git a/qlib/contrib/model/pytorch_hist.py b/qlib/contrib/model/pytorch_hist.py index 779cde9c85..73511539ef 100644 --- a/qlib/contrib/model/pytorch_hist.py +++ b/qlib/contrib/model/pytorch_hist.py @@ -16,7 +16,7 @@ import torch import torch.nn as nn import torch.optim as optim -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -80,7 +80,7 @@ def __init__( self.model_path = model_path self.stock2concept = stock2concept self.stock_index = stock_index - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( diff --git a/qlib/contrib/model/pytorch_igmtf.py b/qlib/contrib/model/pytorch_igmtf.py index 0bddc5a0f5..2bc0c58000 100644 --- a/qlib/contrib/model/pytorch_igmtf.py +++ b/qlib/contrib/model/pytorch_igmtf.py @@ -16,7 +16,7 @@ import torch.nn as nn import torch.optim as optim -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base 
import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -74,7 +74,7 @@ def __init__( self.loss = loss self.base_model = base_model self.model_path = model_path - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( diff --git a/qlib/contrib/model/pytorch_krnn.py b/qlib/contrib/model/pytorch_krnn.py index d97920b4dc..c81a272421 100644 --- a/qlib/contrib/model/pytorch_krnn.py +++ b/qlib/contrib/model/pytorch_krnn.py @@ -19,6 +19,7 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from .pytorch_utils import get_device ######################################################################## ######################################################################## @@ -276,7 +277,7 @@ def __init__( self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( diff --git a/qlib/contrib/model/pytorch_localformer.py b/qlib/contrib/model/pytorch_localformer.py index 42851dd6a2..b3df08350f 100644 --- a/qlib/contrib/model/pytorch_localformer.py +++ b/qlib/contrib/model/pytorch_localformer.py @@ -20,6 +20,7 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from .pytorch_utils import get_device from torch.nn.modules.container import ModuleList # qrun examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml ” @@ -58,7 +59,7 @@ def __init__( self.optimizer = optimizer.lower() self.loss = loss self.n_jobs = n_jobs - self.device = torch.device("cuda:%d" % GPU if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger = get_module_logger("TransformerModel") self.logger.info("Naive Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device)) diff --git a/qlib/contrib/model/pytorch_localformer_ts.py b/qlib/contrib/model/pytorch_localformer_ts.py index ae60a39968..cb659e2f0e 100644 --- a/qlib/contrib/model/pytorch_localformer_ts.py +++ b/qlib/contrib/model/pytorch_localformer_ts.py @@ -21,6 +21,7 @@ from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP from torch.nn.modules.container import ModuleList +from .pytorch_utils import get_device class LocalformerModel(Model): @@ -56,7 +57,7 @@ def __init__( self.optimizer = optimizer.lower() self.loss = loss self.n_jobs = n_jobs - self.device = torch.device("cuda:%d" % GPU if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger = get_module_logger("TransformerModel") self.logger.info( @@ -106,10 +107,10 @@ def train_epoch(self, data_loader): self.model.train() for data in data_loader: - feature = data[:, :, 0:-1].to(self.device) - label = data[:, -1, -1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) - pred = self.model(feature.float()) # .float() + pred = self.model(feature) loss = self.loss_fn(pred, label) self.train_optimizer.zero_grad() @@ -124,11 +125,11 @@ def test_epoch(self, data_loader): losses = [] for data in data_loader: - feature = data[:, :, 0:-1].to(self.device) - label = data[:, -1, 
-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.model(feature.float()) # .float() + pred = self.model(feature) loss = self.loss_fn(pred, label) losses.append(loss.item()) @@ -211,10 +212,10 @@ def predict(self, dataset): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.model(feature.float()).detach().cpu().numpy() + pred = self.model(feature).detach().cpu().numpy() preds.append(pred) diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index 3ba09097ac..78582f1f36 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -19,6 +19,7 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from .pytorch_utils import get_device class LSTM(Model): @@ -69,7 +70,7 @@ def __init__( self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( diff --git a/qlib/contrib/model/pytorch_lstm_ts.py b/qlib/contrib/model/pytorch_lstm_ts.py index a0fc34d583..825c253a62 100755 --- a/qlib/contrib/model/pytorch_lstm_ts.py +++ b/qlib/contrib/model/pytorch_lstm_ts.py @@ -20,6 +20,7 @@ from ...data.dataset.handler import DataHandlerLP from ...model.utils import ConcatDataset from ...data.dataset.weight import Reweighter +from .pytorch_utils import get_device class LSTM(Model): @@ -71,7 +72,7 @@ def __init__( self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.n_jobs = n_jobs self.seed = seed @@ -161,11 +162,11 @@ def train_epoch(self, data_loader): self.LSTM_model.train() for data, weight in data_loader: - feature = data[:, :, 0:-1].to(self.device) - label = data[:, -1, -1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) - pred = self.LSTM_model(feature.float()) - loss = self.loss_fn(pred, label, weight.to(self.device)) + pred = self.LSTM_model(feature) + loss = self.loss_fn(pred, label, weight.to(self.device, dtype=torch.float32)) self.train_optimizer.zero_grad() loss.backward() @@ -179,12 +180,12 @@ def test_epoch(self, data_loader): losses = [] for data, weight in data_loader: - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) # feature[torch.isnan(feature)] = 0 - label = data[:, -1, -1].to(self.device) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) - pred = self.LSTM_model(feature.float()) - loss = self.loss_fn(pred, label, weight.to(self.device)) + pred = self.LSTM_model(feature) + loss = self.loss_fn(pred, label, weight.to(self.device, dtype=torch.float32)) losses.append(loss.item()) score = self.metric_fn(pred, label) @@ -284,10 +285,10 @@ def predict(self, dataset): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = 
self.LSTM_model(feature.float()).detach().cpu().numpy() + pred = self.LSTM_model(feature).detach().cpu().numpy() preds.append(pred) diff --git a/qlib/contrib/model/pytorch_nn.py b/qlib/contrib/model/pytorch_nn.py index 190d1ba45a..3b325bfb12 100644 --- a/qlib/contrib/model/pytorch_nn.py +++ b/qlib/contrib/model/pytorch_nn.py @@ -17,7 +17,7 @@ import torch.nn as nn import torch.optim as optim -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -89,10 +89,7 @@ def __init__( self.eval_steps = eval_steps self.optimizer = optimizer.lower() self.loss_type = loss - if isinstance(GPU, str): - self.device = torch.device(GPU) - else: - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.weight_decay = weight_decay self.data_parall = data_parall diff --git a/qlib/contrib/model/pytorch_sandwich.py b/qlib/contrib/model/pytorch_sandwich.py index 344368143f..1a512b6344 100644 --- a/qlib/contrib/model/pytorch_sandwich.py +++ b/qlib/contrib/model/pytorch_sandwich.py @@ -20,6 +20,7 @@ from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP from .pytorch_krnn import CNNKRNNEncoder +from .pytorch_utils import get_device class SandwichModel(nn.Module): @@ -152,7 +153,7 @@ def __init__( self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( diff --git a/qlib/contrib/model/pytorch_sfm.py b/qlib/contrib/model/pytorch_sfm.py index c971f1a58c..9146af1096 100644 --- a/qlib/contrib/model/pytorch_sfm.py +++ b/qlib/contrib/model/pytorch_sfm.py @@ -16,7 +16,7 @@ import torch.nn.init as init import torch.optim as optim -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -233,7 +233,7 @@ def __init__( self.eval_steps = eval_steps self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( diff --git a/qlib/contrib/model/pytorch_tabnet.py b/qlib/contrib/model/pytorch_tabnet.py index 3c698edade..8395ee4aec 100644 --- a/qlib/contrib/model/pytorch_tabnet.py +++ b/qlib/contrib/model/pytorch_tabnet.py @@ -16,7 +16,7 @@ import torch.nn.functional as F from torch.autograd import Function -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -69,7 +69,7 @@ def __init__( self.n_epochs = n_epochs self.logger = get_module_logger("TabNet") self.pretrain_n_epochs = pretrain_n_epochs - self.device = "cuda:%s" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu" + self.device = get_device(GPU) self.loss = loss self.metric = metric self.early_stop = early_stop diff --git a/qlib/contrib/model/pytorch_tcn.py b/qlib/contrib/model/pytorch_tcn.py index f6e7e953a0..5a760402b9 100755 --- a/qlib/contrib/model/pytorch_tcn.py +++ 
b/qlib/contrib/model/pytorch_tcn.py @@ -16,7 +16,7 @@ import torch.nn as nn import torch.optim as optim -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -75,7 +75,7 @@ def __init__( self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger.info( diff --git a/qlib/contrib/model/pytorch_tcn_ts.py b/qlib/contrib/model/pytorch_tcn_ts.py index a6cc38885c..ac750f1001 100755 --- a/qlib/contrib/model/pytorch_tcn_ts.py +++ b/qlib/contrib/model/pytorch_tcn_ts.py @@ -16,7 +16,7 @@ import torch.optim as optim from torch.utils.data import DataLoader -from .pytorch_utils import count_parameters +from .pytorch_utils import count_parameters, get_device from ...model.base import Model from ...data.dataset.handler import DataHandlerLP from .tcn import TemporalConvNet @@ -73,7 +73,7 @@ def __init__( self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.n_jobs = n_jobs self.seed = seed @@ -167,10 +167,10 @@ def train_epoch(self, data_loader): for data in data_loader: data = torch.transpose(data, 1, 2) - feature = data[:, 0:-1, :].to(self.device) - label = data[:, -1, -1].to(self.device) + feature = data[:, 0:-1, :].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) - pred = self.TCN_model(feature.float()) + pred = self.TCN_model(feature) loss = self.loss_fn(pred, label) self.train_optimizer.zero_grad() @@ -186,12 +186,12 @@ def test_epoch(self, data_loader): for data in data_loader: data = torch.transpose(data, 1, 2) - feature = data[:, 0:-1, :].to(self.device) + feature = data[:, 0:-1, :].to(self.device, dtype=torch.float32) # feature[torch.isnan(feature)] = 0 - label = data[:, -1, -1].to(self.device) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.TCN_model(feature.float()) + pred = self.TCN_model(feature) loss = self.loss_fn(pred, label) losses.append(loss.item()) @@ -274,10 +274,10 @@ def predict(self, dataset): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.TCN_model(feature.float()).detach().cpu().numpy() + pred = self.TCN_model(feature).detach().cpu().numpy() preds.append(pred) diff --git a/qlib/contrib/model/pytorch_tcts.py b/qlib/contrib/model/pytorch_tcts.py index d8736627c2..bb059fd872 100644 --- a/qlib/contrib/model/pytorch_tcts.py +++ b/qlib/contrib/model/pytorch_tcts.py @@ -19,6 +19,7 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from .pytorch_utils import get_device class TCTS(Model): @@ -73,8 +74,8 @@ def __init__( self.batch_size = batch_size self.early_stop = early_stop self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu") - self.use_gpu = torch.cuda.is_available() + self.device = get_device(GPU) + self.use_gpu = self.device != torch.device("cpu") self.seed = seed self.input_dim = input_dim self.output_dim = output_dim 
diff --git a/qlib/contrib/model/pytorch_transformer.py b/qlib/contrib/model/pytorch_transformer.py index d05b9f4cad..eef57a8b80 100644 --- a/qlib/contrib/model/pytorch_transformer.py +++ b/qlib/contrib/model/pytorch_transformer.py @@ -20,6 +20,7 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from .pytorch_utils import get_device # qrun examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml ” @@ -57,7 +58,7 @@ def __init__( self.optimizer = optimizer.lower() self.loss = loss self.n_jobs = n_jobs - self.device = torch.device("cuda:%d" % GPU if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger = get_module_logger("TransformerModel") self.logger.info("Naive Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device)) diff --git a/qlib/contrib/model/pytorch_transformer_ts.py b/qlib/contrib/model/pytorch_transformer_ts.py index 70590e03e5..5364bc89d8 100644 --- a/qlib/contrib/model/pytorch_transformer_ts.py +++ b/qlib/contrib/model/pytorch_transformer_ts.py @@ -20,6 +20,7 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from .pytorch_utils import get_device class TransformerModel(Model): @@ -55,7 +56,7 @@ def __init__( self.optimizer = optimizer.lower() self.loss = loss self.n_jobs = n_jobs - self.device = torch.device("cuda:%d" % GPU if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.device = get_device(GPU) self.seed = seed self.logger = get_module_logger("TransformerModel") self.logger.info("Naive Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device)) @@ -103,10 +104,10 @@ def train_epoch(self, data_loader): self.model.train() for data in data_loader: - feature = data[:, :, 0:-1].to(self.device) - label = data[:, -1, -1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) - pred = self.model(feature.float()) # .float() + pred = self.model(feature) loss = self.loss_fn(pred, label) self.train_optimizer.zero_grad() @@ -121,11 +122,11 @@ def test_epoch(self, data_loader): losses = [] for data in data_loader: - feature = data[:, :, 0:-1].to(self.device) - label = data[:, -1, -1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.model(feature.float()) # .float() + pred = self.model(feature) loss = self.loss_fn(pred, label) losses.append(loss.item()) @@ -209,10 +210,10 @@ def predict(self, dataset): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) with torch.no_grad(): - pred = self.model(feature.float()).detach().cpu().numpy() + pred = self.model(feature).detach().cpu().numpy() preds.append(pred) diff --git a/qlib/contrib/model/pytorch_utils.py b/qlib/contrib/model/pytorch_utils.py index eb35c383b0..5d2409bc29 100644 --- a/qlib/contrib/model/pytorch_utils.py +++ b/qlib/contrib/model/pytorch_utils.py @@ -1,9 +1,44 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +import torch import torch.nn as nn +def get_device(GPU=0): + """ + Select the appropriate device for PyTorch operations. 
+ + Parameters + ---------- + GPU : int or str + If int: GPU device ID (>= 0 to use GPU if available, < 0 to force CPU) + If str: Device string (e.g., "cuda:0", "mps", "cpu") + + Returns + ------- + torch.device + The selected device object + + Examples + -------- + >>> device = get_device(0) # Uses CUDA:0 if available, else MPS if available, else CPU + >>> device = get_device("mps") # Explicitly use MPS + >>> device = get_device(-1) # Force CPU + """ + if isinstance(GPU, str): + return torch.device(GPU) + + # If GPU is an int + # GPU >= 0 means try to use GPU (CUDA or MPS), GPU < 0 means force CPU + if GPU >= 0 and torch.cuda.is_available(): + return torch.device(f"cuda:{GPU}") + elif GPU >= 0 and hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return torch.device("mps") + else: + return torch.device("cpu") + + def count_parameters(models_or_parameters, unit="m"): """ This function is to obtain the storage size unit of a (or multiple) models. From 1fe169101ca6454663005dd5c1b210a2111bc2ee Mon Sep 17 00:00:00 2001 From: DonaldKuck Date: Mon, 1 Dec 2025 14:55:15 +0800 Subject: [PATCH 2/2] refactor: clear MPS cache after PyTorch model destruction --- qlib/contrib/model/pytorch_adarnn.py | 2 + qlib/contrib/model/pytorch_add.py | 2 + qlib/contrib/model/pytorch_alstm.py | 2 + qlib/contrib/model/pytorch_alstm_ts.py | 2 + qlib/contrib/model/pytorch_gats.py | 2 + qlib/contrib/model/pytorch_gats_ts.py | 2 + qlib/contrib/model/pytorch_general_nn.py | 2 + qlib/contrib/model/pytorch_gru.py | 2 + qlib/contrib/model/pytorch_gru_ts.py | 2 + qlib/contrib/model/pytorch_igmtf.py | 2 + qlib/contrib/model/pytorch_krnn.py | 1026 +++++++++--------- qlib/contrib/model/pytorch_localformer.py | 648 +++++------ qlib/contrib/model/pytorch_localformer_ts.py | 608 +++++------ qlib/contrib/model/pytorch_lstm.py | 2 + qlib/contrib/model/pytorch_lstm_ts.py | 2 + qlib/contrib/model/pytorch_nn.py | 3 +- qlib/contrib/model/pytorch_sandwich.py | 2 + qlib/contrib/model/pytorch_sfm.py | 2 + qlib/contrib/model/pytorch_tabnet.py | 2 + qlib/contrib/model/pytorch_tcn.py | 2 + qlib/contrib/model/pytorch_tcn_ts.py | 2 + qlib/contrib/model/pytorch_tcts.py | 2 + qlib/contrib/model/pytorch_transformer.py | 574 +++++----- qlib/contrib/model/pytorch_transformer_ts.py | 532 ++++----- 24 files changed, 1737 insertions(+), 1690 deletions(-) diff --git a/qlib/contrib/model/pytorch_adarnn.py b/qlib/contrib/model/pytorch_adarnn.py index 4cfa5feee7..b3e16d5262 100644 --- a/qlib/contrib/model/pytorch_adarnn.py +++ b/qlib/contrib/model/pytorch_adarnn.py @@ -301,6 +301,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() return best_score def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): diff --git a/qlib/contrib/model/pytorch_add.py b/qlib/contrib/model/pytorch_add.py index 464caa6385..07ffc9fb6c 100644 --- a/qlib/contrib/model/pytorch_add.py +++ b/qlib/contrib/model/pytorch_add.py @@ -415,6 +415,8 @@ def fit( torch.save(best_param, save_path) if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py index 499c1c814c..f98c01ee15 100644 ---
a/qlib/contrib/model/pytorch_alstm.py +++ b/qlib/contrib/model/pytorch_alstm.py @@ -263,6 +263,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py index 29d74f57ec..a053f5d5b4 100644 --- a/qlib/contrib/model/pytorch_alstm_ts.py +++ b/qlib/contrib/model/pytorch_alstm_ts.py @@ -283,6 +283,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index fc2c92df3f..de5225d87a 100644 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -297,6 +297,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_gats_ts.py b/qlib/contrib/model/pytorch_gats_ts.py index f8779f104d..128d3cd070 100644 --- a/qlib/contrib/model/pytorch_gats_ts.py +++ b/qlib/contrib/model/pytorch_gats_ts.py @@ -311,6 +311,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_general_nn.py b/qlib/contrib/model/pytorch_general_nn.py index 7980fe91e8..7de90fe4f6 100644 --- a/qlib/contrib/model/pytorch_general_nn.py +++ b/qlib/contrib/model/pytorch_general_nn.py @@ -330,6 +330,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict( self, diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 01648f4879..310b361aad 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -288,6 +288,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_gru_ts.py b/qlib/contrib/model/pytorch_gru_ts.py index bddaaf77bc..a187f3722f 100755 --- a/qlib/contrib/model/pytorch_gru_ts.py +++ b/qlib/contrib/model/pytorch_gru_ts.py @@ -277,6 +277,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_igmtf.py b/qlib/contrib/model/pytorch_igmtf.py index 2bc0c58000..8e8129e62b 100644 --- a/qlib/contrib/model/pytorch_igmtf.py +++ b/qlib/contrib/model/pytorch_igmtf.py @@ -323,6 +323,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: diff --git 
a/qlib/contrib/model/pytorch_krnn.py b/qlib/contrib/model/pytorch_krnn.py index c81a272421..7262e3b134 100644 --- a/qlib/contrib/model/pytorch_krnn.py +++ b/qlib/contrib/model/pytorch_krnn.py @@ -1,512 +1,514 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - - -from __future__ import division -from __future__ import print_function - -import numpy as np -import pandas as pd -from typing import Text, Union -import copy -from ...utils import get_or_create_path -from ...log import get_module_logger - -import torch -import torch.nn as nn -import torch.optim as optim - -from ...model.base import Model -from ...data.dataset import DatasetH -from ...data.dataset.handler import DataHandlerLP -from .pytorch_utils import get_device - -######################################################################## -######################################################################## -######################################################################## - - -class CNNEncoderBase(nn.Module): - def __init__(self, input_dim, output_dim, kernel_size, device): - """Build a basic CNN encoder - - Parameters - ---------- - input_dim : int - The input dimension - output_dim : int - The output dimension - kernel_size : int - The size of convolutional kernels - """ - super().__init__() - - self.input_dim = input_dim - self.output_dim = output_dim - self.kernel_size = kernel_size - self.device = device - - # set padding to ensure the same length - # it is correct only when kernel_size is odd, dilation is 1, stride is 1 - self.conv = nn.Conv1d(input_dim, output_dim, kernel_size, padding=(kernel_size - 1) // 2) - - def forward(self, x): - """ - Parameters - ---------- - x : torch.Tensor - input data - - Returns - ------- - torch.Tensor - Updated representations - """ - - # input shape: [batch_size, seq_len*input_dim] - # output shape: [batch_size, seq_len, input_dim] - x = x.view(x.shape[0], -1, self.input_dim).permute(0, 2, 1).to(self.device) - y = self.conv(x) # [batch_size, output_dim, conved_seq_len] - y = y.permute(0, 2, 1) # [batch_size, conved_seq_len, output_dim] - - return y - - -class KRNNEncoderBase(nn.Module): - def __init__(self, input_dim, output_dim, dup_num, rnn_layers, dropout, device): - """Build K parallel RNNs - - Parameters - ---------- - input_dim : int - The input dimension - output_dim : int - The output dimension - dup_num : int - The number of parallel RNNs - rnn_layers: int - The number of RNN layers - """ - super().__init__() - - self.input_dim = input_dim - self.output_dim = output_dim - self.dup_num = dup_num - self.rnn_layers = rnn_layers - self.dropout = dropout - self.device = device - - self.rnn_modules = nn.ModuleList() - for _ in range(dup_num): - self.rnn_modules.append(nn.GRU(input_dim, output_dim, num_layers=self.rnn_layers, dropout=dropout)) - - def forward(self, x): - """ - Parameters - ---------- - x : torch.Tensor - Input data - n_id : torch.Tensor - Node indices - - Returns - ------- - torch.Tensor - Updated representations - """ - - # input shape: [batch_size, seq_len, input_dim] - # output shape: [batch_size, seq_len, output_dim] - # [seq_len, batch_size, input_dim] - batch_size, seq_len, input_dim = x.shape - x = x.permute(1, 0, 2).to(self.device) - - hids = [] - for rnn in self.rnn_modules: - h, _ = rnn(x) # [seq_len, batch_size, output_dim] - hids.append(h) - # [seq_len, batch_size, output_dim, num_dups] - hids = torch.stack(hids, dim=-1) - hids = hids.view(seq_len, batch_size, self.output_dim, self.dup_num) - hids = hids.mean(dim=3) - 
hids = hids.permute(1, 0, 2) - - return hids - - -class CNNKRNNEncoder(nn.Module): - def __init__( - self, cnn_input_dim, cnn_output_dim, cnn_kernel_size, rnn_output_dim, rnn_dup_num, rnn_layers, dropout, device - ): - """Build an encoder composed of CNN and KRNN - - Parameters - ---------- - cnn_input_dim : int - The input dimension of CNN - cnn_output_dim : int - The output dimension of CNN - cnn_kernel_size : int - The size of convolutional kernels - rnn_output_dim : int - The output dimension of KRNN - rnn_dup_num : int - The number of parallel duplicates for KRNN - rnn_layers : int - The number of RNN layers - """ - super().__init__() - - self.cnn_encoder = CNNEncoderBase(cnn_input_dim, cnn_output_dim, cnn_kernel_size, device) - self.krnn_encoder = KRNNEncoderBase(cnn_output_dim, rnn_output_dim, rnn_dup_num, rnn_layers, dropout, device) - - def forward(self, x): - """ - Parameters - ---------- - x : torch.Tensor - Input data - n_id : torch.Tensor - Node indices - - Returns - ------- - torch.Tensor - Updated representations - """ - cnn_out = self.cnn_encoder(x) - krnn_out = self.krnn_encoder(cnn_out) - - return krnn_out - - -class KRNNModel(nn.Module): - def __init__(self, fea_dim, cnn_dim, cnn_kernel_size, rnn_dim, rnn_dups, rnn_layers, dropout, device, **params): - """Build a KRNN model - - Parameters - ---------- - fea_dim : int - The feature dimension - cnn_dim : int - The hidden dimension of CNN - cnn_kernel_size : int - The size of convolutional kernels - rnn_dim : int - The hidden dimension of KRNN - rnn_dups : int - The number of parallel duplicates - rnn_layers: int - The number of RNN layers - """ - super().__init__() - - self.encoder = CNNKRNNEncoder( - cnn_input_dim=fea_dim, - cnn_output_dim=cnn_dim, - cnn_kernel_size=cnn_kernel_size, - rnn_output_dim=rnn_dim, - rnn_dup_num=rnn_dups, - rnn_layers=rnn_layers, - dropout=dropout, - device=device, - ) - - self.out_fc = nn.Linear(rnn_dim, 1) - self.device = device - - def forward(self, x): - # x: [batch_size, node_num, seq_len, input_dim] - encode = self.encoder(x) - out = self.out_fc(encode[:, -1, :]).squeeze().to(self.device) - - return out - - -class KRNN(Model): - """KRNN Model - - Parameters - ---------- - d_feat : int - input dimension for each time step - metric: str - the evaluation metric used in early stop - optimizer : str - optimizer name - GPU : str - the GPU ID(s) used for training - """ - - def __init__( - self, - fea_dim=6, - cnn_dim=64, - cnn_kernel_size=3, - rnn_dim=64, - rnn_dups=3, - rnn_layers=2, - dropout=0, - n_epochs=200, - lr=0.001, - metric="", - batch_size=2000, - early_stop=20, - loss="mse", - optimizer="adam", - GPU=0, - seed=None, - **kwargs, - ): - # Set logger. - self.logger = get_module_logger("KRNN") - self.logger.info("KRNN pytorch version...") - - # set hyper-parameters. 
- self.fea_dim = fea_dim - self.cnn_dim = cnn_dim - self.cnn_kernel_size = cnn_kernel_size - self.rnn_dim = rnn_dim - self.rnn_dups = rnn_dups - self.rnn_layers = rnn_layers - self.dropout = dropout - self.n_epochs = n_epochs - self.lr = lr - self.metric = metric - self.batch_size = batch_size - self.early_stop = early_stop - self.optimizer = optimizer.lower() - self.loss = loss - self.device = get_device(GPU) - self.seed = seed - - self.logger.info( - "KRNN parameters setting:" - "\nfea_dim : {}" - "\ncnn_dim : {}" - "\ncnn_kernel_size : {}" - "\nrnn_dim : {}" - "\nrnn_dups : {}" - "\nrnn_layers : {}" - "\ndropout : {}" - "\nn_epochs : {}" - "\nlr : {}" - "\nmetric : {}" - "\nbatch_size: {}" - "\nearly_stop : {}" - "\noptimizer : {}" - "\nloss_type : {}" - "\nvisible_GPU : {}" - "\nuse_GPU : {}" - "\nseed : {}".format( - fea_dim, - cnn_dim, - cnn_kernel_size, - rnn_dim, - rnn_dups, - rnn_layers, - dropout, - n_epochs, - lr, - metric, - batch_size, - early_stop, - optimizer.lower(), - loss, - GPU, - self.use_gpu, - seed, - ) - ) - - if self.seed is not None: - np.random.seed(self.seed) - torch.manual_seed(self.seed) - - self.krnn_model = KRNNModel( - fea_dim=self.fea_dim, - cnn_dim=self.cnn_dim, - cnn_kernel_size=self.cnn_kernel_size, - rnn_dim=self.rnn_dim, - rnn_dups=self.rnn_dups, - rnn_layers=self.rnn_layers, - dropout=self.dropout, - device=self.device, - ) - if optimizer.lower() == "adam": - self.train_optimizer = optim.Adam(self.krnn_model.parameters(), lr=self.lr) - elif optimizer.lower() == "gd": - self.train_optimizer = optim.SGD(self.krnn_model.parameters(), lr=self.lr) - else: - raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) - - self.fitted = False - self.krnn_model.to(self.device) - - @property - def use_gpu(self): - return self.device != torch.device("cpu") - - def mse(self, pred, label): - loss = (pred - label) ** 2 - return torch.mean(loss) - - def loss_fn(self, pred, label): - mask = ~torch.isnan(label) - - if self.loss == "mse": - return self.mse(pred[mask], label[mask]) - - raise ValueError("unknown loss `%s`" % self.loss) - - def metric_fn(self, pred, label): - mask = torch.isfinite(label) - - if self.metric in ("", "loss"): - return -self.loss_fn(pred[mask], label[mask]) - - raise ValueError("unknown metric `%s`" % self.metric) - - def get_daily_inter(self, df, shuffle=False): - # organize the train data into daily batches - daily_count = df.groupby(level=0, group_keys=False).size().values - daily_index = np.roll(np.cumsum(daily_count), 1) - daily_index[0] = 0 - if shuffle: - # shuffle data - daily_shuffle = list(zip(daily_index, daily_count)) - np.random.shuffle(daily_shuffle) - daily_index, daily_count = zip(*daily_shuffle) - return daily_index, daily_count - - def train_epoch(self, x_train, y_train): - x_train_values = x_train.values - y_train_values = np.squeeze(y_train.values) - self.krnn_model.train() - - indices = np.arange(len(x_train_values)) - np.random.shuffle(indices) - - for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: - break - - feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device) - label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device) - - pred = self.krnn_model(feature) - loss = self.loss_fn(pred, label) - - self.train_optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_value_(self.krnn_model.parameters(), 3.0) - self.train_optimizer.step() - - def test_epoch(self, data_x, 
data_y): - # prepare training data - x_values = data_x.values - y_values = np.squeeze(data_y.values) - - self.krnn_model.eval() - - scores = [] - losses = [] - - indices = np.arange(len(x_values)) - - for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: - break - - feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device) - label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device) - - pred = self.krnn_model(feature) - loss = self.loss_fn(pred, label) - losses.append(loss.item()) - - score = self.metric_fn(pred, label) - scores.append(score.item()) - - return np.mean(losses), np.mean(scores) - - def fit( - self, - dataset: DatasetH, - evals_result=dict(), - save_path=None, - ): - df_train, df_valid, df_test = dataset.prepare( - ["train", "valid", "test"], - col_set=["feature", "label"], - data_key=DataHandlerLP.DK_L, - ) - if df_train.empty or df_valid.empty: - raise ValueError("Empty data from dataset, please check your dataset config.") - - x_train, y_train = df_train["feature"], df_train["label"] - x_valid, y_valid = df_valid["feature"], df_valid["label"] - - save_path = get_or_create_path(save_path) - stop_steps = 0 - train_loss = 0 - best_score = -np.inf - best_epoch = 0 - evals_result["train"] = [] - evals_result["valid"] = [] - - # train - self.logger.info("training...") - self.fitted = True - - for step in range(self.n_epochs): - self.logger.info("Epoch%d:", step) - self.logger.info("training...") - self.train_epoch(x_train, y_train) - self.logger.info("evaluating...") - train_loss, train_score = self.test_epoch(x_train, y_train) - val_loss, val_score = self.test_epoch(x_valid, y_valid) - self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) - evals_result["train"].append(train_score) - evals_result["valid"].append(val_score) - - if val_score > best_score: - best_score = val_score - stop_steps = 0 - best_epoch = step - best_param = copy.deepcopy(self.krnn_model.state_dict()) - else: - stop_steps += 1 - if stop_steps >= self.early_stop: - self.logger.info("early stop") - break - - self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) - self.krnn_model.load_state_dict(best_param) - torch.save(best_param, save_path) - - if self.use_gpu: - torch.cuda.empty_cache() - - def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): - if not self.fitted: - raise ValueError("model is not fitted yet!") - - x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) - index = x_test.index - self.krnn_model.eval() - x_values = x_test.values - sample_num = x_values.shape[0] - preds = [] - - for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: - end = sample_num - else: - end = begin + self.batch_size - x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) - with torch.no_grad(): - pred = self.krnn_model(x_batch).detach().cpu().numpy() - preds.append(pred) - - return pd.Series(np.concatenate(preds), index=index) +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ + +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +from typing import Text, Union +import copy +from ...utils import get_or_create_path +from ...log import get_module_logger + +import torch +import torch.nn as nn +import torch.optim as optim + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from .pytorch_utils import get_device + +######################################################################## +######################################################################## +######################################################################## + + +class CNNEncoderBase(nn.Module): + def __init__(self, input_dim, output_dim, kernel_size, device): + """Build a basic CNN encoder + + Parameters + ---------- + input_dim : int + The input dimension + output_dim : int + The output dimension + kernel_size : int + The size of convolutional kernels + """ + super().__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + self.kernel_size = kernel_size + self.device = device + + # set padding to ensure the same length + # it is correct only when kernel_size is odd, dilation is 1, stride is 1 + self.conv = nn.Conv1d(input_dim, output_dim, kernel_size, padding=(kernel_size - 1) // 2) + + def forward(self, x): + """ + Parameters + ---------- + x : torch.Tensor + input data + + Returns + ------- + torch.Tensor + Updated representations + """ + + # input shape: [batch_size, seq_len*input_dim] + # output shape: [batch_size, seq_len, input_dim] + x = x.view(x.shape[0], -1, self.input_dim).permute(0, 2, 1).to(self.device) + y = self.conv(x) # [batch_size, output_dim, conved_seq_len] + y = y.permute(0, 2, 1) # [batch_size, conved_seq_len, output_dim] + + return y + + +class KRNNEncoderBase(nn.Module): + def __init__(self, input_dim, output_dim, dup_num, rnn_layers, dropout, device): + """Build K parallel RNNs + + Parameters + ---------- + input_dim : int + The input dimension + output_dim : int + The output dimension + dup_num : int + The number of parallel RNNs + rnn_layers: int + The number of RNN layers + """ + super().__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + self.dup_num = dup_num + self.rnn_layers = rnn_layers + self.dropout = dropout + self.device = device + + self.rnn_modules = nn.ModuleList() + for _ in range(dup_num): + self.rnn_modules.append(nn.GRU(input_dim, output_dim, num_layers=self.rnn_layers, dropout=dropout)) + + def forward(self, x): + """ + Parameters + ---------- + x : torch.Tensor + Input data + n_id : torch.Tensor + Node indices + + Returns + ------- + torch.Tensor + Updated representations + """ + + # input shape: [batch_size, seq_len, input_dim] + # output shape: [batch_size, seq_len, output_dim] + # [seq_len, batch_size, input_dim] + batch_size, seq_len, input_dim = x.shape + x = x.permute(1, 0, 2).to(self.device) + + hids = [] + for rnn in self.rnn_modules: + h, _ = rnn(x) # [seq_len, batch_size, output_dim] + hids.append(h) + # [seq_len, batch_size, output_dim, num_dups] + hids = torch.stack(hids, dim=-1) + hids = hids.view(seq_len, batch_size, self.output_dim, self.dup_num) + hids = hids.mean(dim=3) + hids = hids.permute(1, 0, 2) + + return hids + + +class CNNKRNNEncoder(nn.Module): + def __init__( + self, cnn_input_dim, cnn_output_dim, cnn_kernel_size, rnn_output_dim, rnn_dup_num, rnn_layers, dropout, device + ): + """Build an encoder composed of CNN and KRNN + + Parameters + 
---------- + cnn_input_dim : int + The input dimension of CNN + cnn_output_dim : int + The output dimension of CNN + cnn_kernel_size : int + The size of convolutional kernels + rnn_output_dim : int + The output dimension of KRNN + rnn_dup_num : int + The number of parallel duplicates for KRNN + rnn_layers : int + The number of RNN layers + """ + super().__init__() + + self.cnn_encoder = CNNEncoderBase(cnn_input_dim, cnn_output_dim, cnn_kernel_size, device) + self.krnn_encoder = KRNNEncoderBase(cnn_output_dim, rnn_output_dim, rnn_dup_num, rnn_layers, dropout, device) + + def forward(self, x): + """ + Parameters + ---------- + x : torch.Tensor + Input data + n_id : torch.Tensor + Node indices + + Returns + ------- + torch.Tensor + Updated representations + """ + cnn_out = self.cnn_encoder(x) + krnn_out = self.krnn_encoder(cnn_out) + + return krnn_out + + +class KRNNModel(nn.Module): + def __init__(self, fea_dim, cnn_dim, cnn_kernel_size, rnn_dim, rnn_dups, rnn_layers, dropout, device, **params): + """Build a KRNN model + + Parameters + ---------- + fea_dim : int + The feature dimension + cnn_dim : int + The hidden dimension of CNN + cnn_kernel_size : int + The size of convolutional kernels + rnn_dim : int + The hidden dimension of KRNN + rnn_dups : int + The number of parallel duplicates + rnn_layers: int + The number of RNN layers + """ + super().__init__() + + self.encoder = CNNKRNNEncoder( + cnn_input_dim=fea_dim, + cnn_output_dim=cnn_dim, + cnn_kernel_size=cnn_kernel_size, + rnn_output_dim=rnn_dim, + rnn_dup_num=rnn_dups, + rnn_layers=rnn_layers, + dropout=dropout, + device=device, + ) + + self.out_fc = nn.Linear(rnn_dim, 1) + self.device = device + + def forward(self, x): + # x: [batch_size, node_num, seq_len, input_dim] + encode = self.encoder(x) + out = self.out_fc(encode[:, -1, :]).squeeze().to(self.device) + + return out + + +class KRNN(Model): + """KRNN Model + + Parameters + ---------- + d_feat : int + input dimension for each time step + metric: str + the evaluation metric used in early stop + optimizer : str + optimizer name + GPU : str + the GPU ID(s) used for training + """ + + def __init__( + self, + fea_dim=6, + cnn_dim=64, + cnn_kernel_size=3, + rnn_dim=64, + rnn_dups=3, + rnn_layers=2, + dropout=0, + n_epochs=200, + lr=0.001, + metric="", + batch_size=2000, + early_stop=20, + loss="mse", + optimizer="adam", + GPU=0, + seed=None, + **kwargs, + ): + # Set logger. + self.logger = get_module_logger("KRNN") + self.logger.info("KRNN pytorch version...") + + # set hyper-parameters. 
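As an aside on the encoder classes defined above: CNNEncoderBase reshapes a flattened [batch, seq_len * fea_dim] input into [batch, seq_len, fea_dim] before its 1-D convolution, KRNNEncoderBase averages the hidden states of its parallel GRUs, and KRNNModel regresses on the last time step. A small smoke test (illustrative only, not part of the patch; it assumes the classes above are in scope) would be:

    import torch

    # 4 samples, 60 time steps, 6 features per step, flattened per sample
    x = torch.randn(4, 60 * 6)
    model = KRNNModel(
        fea_dim=6, cnn_dim=16, cnn_kernel_size=3, rnn_dim=16,
        rnn_dups=2, rnn_layers=2, dropout=0.0, device=torch.device("cpu"),
    )
    print(model(x).shape)  # torch.Size([4])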
+ self.fea_dim = fea_dim + self.cnn_dim = cnn_dim + self.cnn_kernel_size = cnn_kernel_size + self.rnn_dim = rnn_dim + self.rnn_dups = rnn_dups + self.rnn_layers = rnn_layers + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.metric = metric + self.batch_size = batch_size + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.device = get_device(GPU) + self.seed = seed + + self.logger.info( + "KRNN parameters setting:" + "\nfea_dim : {}" + "\ncnn_dim : {}" + "\ncnn_kernel_size : {}" + "\nrnn_dim : {}" + "\nrnn_dups : {}" + "\nrnn_layers : {}" + "\ndropout : {}" + "\nn_epochs : {}" + "\nlr : {}" + "\nmetric : {}" + "\nbatch_size: {}" + "\nearly_stop : {}" + "\noptimizer : {}" + "\nloss_type : {}" + "\nvisible_GPU : {}" + "\nuse_GPU : {}" + "\nseed : {}".format( + fea_dim, + cnn_dim, + cnn_kernel_size, + rnn_dim, + rnn_dups, + rnn_layers, + dropout, + n_epochs, + lr, + metric, + batch_size, + early_stop, + optimizer.lower(), + loss, + GPU, + self.use_gpu, + seed, + ) + ) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.krnn_model = KRNNModel( + fea_dim=self.fea_dim, + cnn_dim=self.cnn_dim, + cnn_kernel_size=self.cnn_kernel_size, + rnn_dim=self.rnn_dim, + rnn_dups=self.rnn_dups, + rnn_layers=self.rnn_layers, + dropout=self.dropout, + device=self.device, + ) + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.krnn_model.parameters(), lr=self.lr) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.krnn_model.parameters(), lr=self.lr) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.krnn_model.to(self.device) + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def mse(self, pred, label): + loss = (pred - label) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + mask = torch.isfinite(label) + + if self.metric in ("", "loss"): + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def get_daily_inter(self, df, shuffle=False): + # organize the train data into daily batches + daily_count = df.groupby(level=0, group_keys=False).size().values + daily_index = np.roll(np.cumsum(daily_count), 1) + daily_index[0] = 0 + if shuffle: + # shuffle data + daily_shuffle = list(zip(daily_index, daily_count)) + np.random.shuffle(daily_shuffle) + daily_index, daily_count = zip(*daily_shuffle) + return daily_index, daily_count + + def train_epoch(self, x_train, y_train): + x_train_values = x_train.values + y_train_values = np.squeeze(y_train.values) + self.krnn_model.train() + + indices = np.arange(len(x_train_values)) + np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + + pred = self.krnn_model(feature) + loss = self.loss_fn(pred, label) + + self.train_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_value_(self.krnn_model.parameters(), 3.0) + self.train_optimizer.step() + + def test_epoch(self, data_x, 
data_y): + # prepare training data + x_values = data_x.values + y_values = np.squeeze(data_y.values) + + self.krnn_model.eval() + + scores = [] + losses = [] + + indices = np.arange(len(x_values)) + + for i in range(len(indices))[:: self.batch_size]: + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device) + + pred = self.krnn_model(feature) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + save_path=None, + ): + df_train, df_valid, df_test = dataset.prepare( + ["train", "valid", "test"], + col_set=["feature", "label"], + data_key=DataHandlerLP.DK_L, + ) + if df_train.empty or df_valid.empty: + raise ValueError("Empty data from dataset, please check your dataset config.") + + x_train, y_train = df_train["feature"], df_train["label"] + x_valid, y_valid = df_valid["feature"], df_valid["label"] + + save_path = get_or_create_path(save_path) + stop_steps = 0 + train_loss = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + evals_result["valid"] = [] + + # train + self.logger.info("training...") + self.fitted = True + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(x_train, y_train) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(x_train, y_train) + val_loss, val_score = self.test_epoch(x_valid, y_valid) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.krnn_model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.krnn_model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() + + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): + if not self.fitted: + raise ValueError("model is not fitted yet!") + + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) + index = x_test.index + self.krnn_model.eval() + x_values = x_test.values + sample_num = x_values.shape[0] + preds = [] + + for begin in range(sample_num)[:: self.batch_size]: + if sample_num - begin < self.batch_size: + end = sample_num + else: + end = begin + self.batch_size + x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) + with torch.no_grad(): + pred = self.krnn_model(x_batch).detach().cpu().numpy() + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=index) diff --git a/qlib/contrib/model/pytorch_localformer.py b/qlib/contrib/model/pytorch_localformer.py index b3df08350f..21ea1a585f 100644 --- a/qlib/contrib/model/pytorch_localformer.py +++ b/qlib/contrib/model/pytorch_localformer.py @@ -1,323 +1,325 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- - -from __future__ import division -from __future__ import print_function - -import numpy as np -import pandas as pd -from typing import Text, Union -import copy -import math -from ...utils import get_or_create_path -from ...log import get_module_logger - -import torch -import torch.nn as nn -import torch.optim as optim - -from ...model.base import Model -from ...data.dataset import DatasetH -from ...data.dataset.handler import DataHandlerLP -from .pytorch_utils import get_device -from torch.nn.modules.container import ModuleList - -# qrun examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml ” - - -class LocalformerModel(Model): - def __init__( - self, - d_feat: int = 20, - d_model: int = 64, - batch_size: int = 2048, - nhead: int = 2, - num_layers: int = 2, - dropout: float = 0, - n_epochs=100, - lr=0.0001, - metric="", - early_stop=5, - loss="mse", - optimizer="adam", - reg=1e-3, - n_jobs=10, - GPU=0, - seed=None, - **kwargs, - ): - # set hyper-parameters. - self.d_model = d_model - self.dropout = dropout - self.n_epochs = n_epochs - self.lr = lr - self.reg = reg - self.metric = metric - self.batch_size = batch_size - self.early_stop = early_stop - self.optimizer = optimizer.lower() - self.loss = loss - self.n_jobs = n_jobs - self.device = get_device(GPU) - self.seed = seed - self.logger = get_module_logger("TransformerModel") - self.logger.info("Naive Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device)) - - if self.seed is not None: - np.random.seed(self.seed) - torch.manual_seed(self.seed) - - self.model = Transformer(d_feat, d_model, nhead, num_layers, dropout, self.device) - if optimizer.lower() == "adam": - self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.reg) - elif optimizer.lower() == "gd": - self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr, weight_decay=self.reg) - else: - raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) - - self.fitted = False - self.model.to(self.device) - - @property - def use_gpu(self): - return self.device != torch.device("cpu") - - def mse(self, pred, label): - loss = (pred.float() - label.float()) ** 2 - return torch.mean(loss) - - def loss_fn(self, pred, label): - mask = ~torch.isnan(label) - - if self.loss == "mse": - return self.mse(pred[mask], label[mask]) - - raise ValueError("unknown loss `%s`" % self.loss) - - def metric_fn(self, pred, label): - mask = torch.isfinite(label) - - if self.metric in ("", "loss"): - return -self.loss_fn(pred[mask], label[mask]) - - raise ValueError("unknown metric `%s`" % self.metric) - - def train_epoch(self, x_train, y_train): - x_train_values = x_train.values - y_train_values = np.squeeze(y_train.values) - - self.model.train() - - indices = np.arange(len(x_train_values)) - np.random.shuffle(indices) - - for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: - break - - feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device) - label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device) - - pred = self.model(feature) - loss = self.loss_fn(pred, label) - - self.train_optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0) - self.train_optimizer.step() - - def test_epoch(self, data_x, data_y): - # prepare training data - x_values = data_x.values - y_values = np.squeeze(data_y.values) - - self.model.eval() - - 
scores = [] - losses = [] - - indices = np.arange(len(x_values)) - - for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: - break - - feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device) - label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device) - - with torch.no_grad(): - pred = self.model(feature) - loss = self.loss_fn(pred, label) - losses.append(loss.item()) - - score = self.metric_fn(pred, label) - scores.append(score.item()) - - return np.mean(losses), np.mean(scores) - - def fit( - self, - dataset: DatasetH, - evals_result=dict(), - save_path=None, - ): - df_train, df_valid, df_test = dataset.prepare( - ["train", "valid", "test"], - col_set=["feature", "label"], - data_key=DataHandlerLP.DK_L, - ) - if df_train.empty or df_valid.empty: - raise ValueError("Empty data from dataset, please check your dataset config.") - - x_train, y_train = df_train["feature"], df_train["label"] - x_valid, y_valid = df_valid["feature"], df_valid["label"] - - save_path = get_or_create_path(save_path) - stop_steps = 0 - train_loss = 0 - best_score = -np.inf - best_epoch = 0 - evals_result["train"] = [] - evals_result["valid"] = [] - - # train - self.logger.info("training...") - self.fitted = True - - for step in range(self.n_epochs): - self.logger.info("Epoch%d:", step) - self.logger.info("training...") - self.train_epoch(x_train, y_train) - self.logger.info("evaluating...") - train_loss, train_score = self.test_epoch(x_train, y_train) - val_loss, val_score = self.test_epoch(x_valid, y_valid) - self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) - evals_result["train"].append(train_score) - evals_result["valid"].append(val_score) - - if val_score > best_score: - best_score = val_score - stop_steps = 0 - best_epoch = step - best_param = copy.deepcopy(self.model.state_dict()) - else: - stop_steps += 1 - if stop_steps >= self.early_stop: - self.logger.info("early stop") - break - - self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) - self.model.load_state_dict(best_param) - torch.save(best_param, save_path) - - if self.use_gpu: - torch.cuda.empty_cache() - - def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): - if not self.fitted: - raise ValueError("model is not fitted yet!") - - x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) - index = x_test.index - self.model.eval() - x_values = x_test.values - sample_num = x_values.shape[0] - preds = [] - - for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: - end = sample_num - else: - end = begin + self.batch_size - - x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) - - with torch.no_grad(): - pred = self.model(x_batch).detach().cpu().numpy() - - preds.append(pred) - - return pd.Series(np.concatenate(preds), index=index) - - -class PositionalEncoding(nn.Module): - def __init__(self, d_model, max_len=1000): - super(PositionalEncoding, self).__init__() - pe = torch.zeros(max_len, d_model) - position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - pe = pe.unsqueeze(0).transpose(0, 1) - self.register_buffer("pe", pe) - - def forward(self, x): - # [T, N, F] - return x + self.pe[: x.size(0), :] - - -def 
_get_clones(module, N): - return ModuleList([copy.deepcopy(module) for i in range(N)]) - - -class LocalformerEncoder(nn.Module): - __constants__ = ["norm"] - - def __init__(self, encoder_layer, num_layers, d_model): - super(LocalformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.conv = _get_clones(nn.Conv1d(d_model, d_model, 3, 1, 1), num_layers) - self.num_layers = num_layers - - def forward(self, src, mask): - output = src - out = src - - for i, mod in enumerate(self.layers): - # [T, N, F] --> [N, T, F] --> [N, F, T] - out = output.transpose(1, 0).transpose(2, 1) - out = self.conv[i](out).transpose(2, 1).transpose(1, 0) - - output = mod(output + out, src_mask=mask) - - return output + out - - -class Transformer(nn.Module): - def __init__(self, d_feat=6, d_model=8, nhead=4, num_layers=2, dropout=0.5, device=None): - super(Transformer, self).__init__() - self.rnn = nn.GRU( - input_size=d_model, - hidden_size=d_model, - num_layers=num_layers, - batch_first=False, - dropout=dropout, - ) - self.feature_layer = nn.Linear(d_feat, d_model) - self.pos_encoder = PositionalEncoding(d_model) - self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout) - self.transformer_encoder = LocalformerEncoder(self.encoder_layer, num_layers=num_layers, d_model=d_model) - self.decoder_layer = nn.Linear(d_model, 1) - self.device = device - self.d_feat = d_feat - - def forward(self, src): - # src [N, F*T] --> [N, T, F] - src = src.reshape(len(src), self.d_feat, -1).permute(0, 2, 1) - src = self.feature_layer(src) - - # src [N, T, F] --> [T, N, F], [60, 512, 8] - src = src.transpose(1, 0) # not batch first - - mask = None - - src = self.pos_encoder(src) - output = self.transformer_encoder(src, mask) # [60, 512, 8] - - output, _ = self.rnn(output) - - # [T, N, F] --> [N, T*F] - output = self.decoder_layer(output.transpose(1, 0)[:, -1, :]) # [512, 1] - - return output.squeeze() +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +from typing import Text, Union +import copy +import math +from ...utils import get_or_create_path +from ...log import get_module_logger + +import torch +import torch.nn as nn +import torch.optim as optim + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from .pytorch_utils import get_device +from torch.nn.modules.container import ModuleList + +# qrun examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml ” + + +class LocalformerModel(Model): + def __init__( + self, + d_feat: int = 20, + d_model: int = 64, + batch_size: int = 2048, + nhead: int = 2, + num_layers: int = 2, + dropout: float = 0, + n_epochs=100, + lr=0.0001, + metric="", + early_stop=5, + loss="mse", + optimizer="adam", + reg=1e-3, + n_jobs=10, + GPU=0, + seed=None, + **kwargs, + ): + # set hyper-parameters. 
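The Transformer defined further down in this file differs from the plain variant in that LocalformerEncoder pairs every self-attention layer with a Conv1d over the time axis and adds that local context back onto the layer input (and onto the final output). Its forward still expects the flattened [batch, d_feat * seq_len] layout; a quick shape check (illustrative only, assuming the classes defined later in this file are in scope) would be:

    import torch

    model = Transformer(d_feat=6, d_model=8, nhead=4, num_layers=2, dropout=0.5)
    x = torch.randn(32, 6 * 60)  # 32 samples, 6 features x 60 steps, flattened
    print(model(x).shape)        # torch.Size([32])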
+ self.d_model = d_model + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.reg = reg + self.metric = metric + self.batch_size = batch_size + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.n_jobs = n_jobs + self.device = get_device(GPU) + self.seed = seed + self.logger = get_module_logger("TransformerModel") + self.logger.info("Naive Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device)) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.model = Transformer(d_feat, d_model, nhead, num_layers, dropout, self.device) + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.reg) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr, weight_decay=self.reg) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.model.to(self.device) + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def mse(self, pred, label): + loss = (pred.float() - label.float()) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + mask = torch.isfinite(label) + + if self.metric in ("", "loss"): + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def train_epoch(self, x_train, y_train): + x_train_values = x_train.values + y_train_values = np.squeeze(y_train.values) + + self.model.train() + + indices = np.arange(len(x_train_values)) + np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + + pred = self.model(feature) + loss = self.loss_fn(pred, label) + + self.train_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0) + self.train_optimizer.step() + + def test_epoch(self, data_x, data_y): + # prepare training data + x_values = data_x.values + y_values = np.squeeze(data_y.values) + + self.model.eval() + + scores = [] + losses = [] + + indices = np.arange(len(x_values)) + + for i in range(len(indices))[:: self.batch_size]: + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device) + + with torch.no_grad(): + pred = self.model(feature) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + save_path=None, + ): + df_train, df_valid, df_test = dataset.prepare( + ["train", "valid", "test"], + col_set=["feature", "label"], + data_key=DataHandlerLP.DK_L, + ) + if df_train.empty or df_valid.empty: + raise ValueError("Empty data from dataset, please check your dataset config.") + + x_train, y_train = df_train["feature"], 
df_train["label"] + x_valid, y_valid = df_valid["feature"], df_valid["label"] + + save_path = get_or_create_path(save_path) + stop_steps = 0 + train_loss = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + evals_result["valid"] = [] + + # train + self.logger.info("training...") + self.fitted = True + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(x_train, y_train) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(x_train, y_train) + val_loss, val_score = self.test_epoch(x_valid, y_valid) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() + + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): + if not self.fitted: + raise ValueError("model is not fitted yet!") + + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) + index = x_test.index + self.model.eval() + x_values = x_test.values + sample_num = x_values.shape[0] + preds = [] + + for begin in range(sample_num)[:: self.batch_size]: + if sample_num - begin < self.batch_size: + end = sample_num + else: + end = begin + self.batch_size + + x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) + + with torch.no_grad(): + pred = self.model(x_batch).detach().cpu().numpy() + + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=index) + + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, max_len=1000): + super(PositionalEncoding, self).__init__() + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer("pe", pe) + + def forward(self, x): + # [T, N, F] + return x + self.pe[: x.size(0), :] + + +def _get_clones(module, N): + return ModuleList([copy.deepcopy(module) for i in range(N)]) + + +class LocalformerEncoder(nn.Module): + __constants__ = ["norm"] + + def __init__(self, encoder_layer, num_layers, d_model): + super(LocalformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.conv = _get_clones(nn.Conv1d(d_model, d_model, 3, 1, 1), num_layers) + self.num_layers = num_layers + + def forward(self, src, mask): + output = src + out = src + + for i, mod in enumerate(self.layers): + # [T, N, F] --> [N, T, F] --> [N, F, T] + out = output.transpose(1, 0).transpose(2, 1) + out = self.conv[i](out).transpose(2, 1).transpose(1, 0) + + output = mod(output + out, src_mask=mask) + + return output + out + + +class Transformer(nn.Module): + def __init__(self, d_feat=6, d_model=8, nhead=4, num_layers=2, dropout=0.5, device=None): + super(Transformer, 
self).__init__() + self.rnn = nn.GRU( + input_size=d_model, + hidden_size=d_model, + num_layers=num_layers, + batch_first=False, + dropout=dropout, + ) + self.feature_layer = nn.Linear(d_feat, d_model) + self.pos_encoder = PositionalEncoding(d_model) + self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout) + self.transformer_encoder = LocalformerEncoder(self.encoder_layer, num_layers=num_layers, d_model=d_model) + self.decoder_layer = nn.Linear(d_model, 1) + self.device = device + self.d_feat = d_feat + + def forward(self, src): + # src [N, F*T] --> [N, T, F] + src = src.reshape(len(src), self.d_feat, -1).permute(0, 2, 1) + src = self.feature_layer(src) + + # src [N, T, F] --> [T, N, F], [60, 512, 8] + src = src.transpose(1, 0) # not batch first + + mask = None + + src = self.pos_encoder(src) + output = self.transformer_encoder(src, mask) # [60, 512, 8] + + output, _ = self.rnn(output) + + # [T, N, F] --> [N, T*F] + output = self.decoder_layer(output.transpose(1, 0)[:, -1, :]) # [512, 1] + + return output.squeeze() diff --git a/qlib/contrib/model/pytorch_localformer_ts.py b/qlib/contrib/model/pytorch_localformer_ts.py index cb659e2f0e..ebb7e5abfa 100644 --- a/qlib/contrib/model/pytorch_localformer_ts.py +++ b/qlib/contrib/model/pytorch_localformer_ts.py @@ -1,303 +1,305 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - - -from __future__ import division -from __future__ import print_function - -import numpy as np -import pandas as pd -import copy -import math -from ...utils import get_or_create_path -from ...log import get_module_logger - -import torch -import torch.nn as nn -import torch.optim as optim -from torch.utils.data import DataLoader - -from ...model.base import Model -from ...data.dataset import DatasetH -from ...data.dataset.handler import DataHandlerLP -from torch.nn.modules.container import ModuleList -from .pytorch_utils import get_device - - -class LocalformerModel(Model): - def __init__( - self, - d_feat: int = 20, - d_model: int = 64, - batch_size: int = 8192, - nhead: int = 2, - num_layers: int = 2, - dropout: float = 0, - n_epochs=100, - lr=0.0001, - metric="", - early_stop=5, - loss="mse", - optimizer="adam", - reg=1e-3, - n_jobs=10, - GPU=0, - seed=None, - **kwargs, - ): - # set hyper-parameters. 
- self.d_model = d_model - self.dropout = dropout - self.n_epochs = n_epochs - self.lr = lr - self.reg = reg - self.metric = metric - self.batch_size = batch_size - self.early_stop = early_stop - self.optimizer = optimizer.lower() - self.loss = loss - self.n_jobs = n_jobs - self.device = get_device(GPU) - self.seed = seed - self.logger = get_module_logger("TransformerModel") - self.logger.info( - "Improved Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device) - ) - - if self.seed is not None: - np.random.seed(self.seed) - torch.manual_seed(self.seed) - - self.model = Transformer(d_feat, d_model, nhead, num_layers, dropout, self.device) - if optimizer.lower() == "adam": - self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.reg) - elif optimizer.lower() == "gd": - self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr, weight_decay=self.reg) - else: - raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) - - self.fitted = False - self.model.to(self.device) - - @property - def use_gpu(self): - return self.device != torch.device("cpu") - - def mse(self, pred, label): - loss = (pred.float() - label.float()) ** 2 - return torch.mean(loss) - - def loss_fn(self, pred, label): - mask = ~torch.isnan(label) - - if self.loss == "mse": - return self.mse(pred[mask], label[mask]) - - raise ValueError("unknown loss `%s`" % self.loss) - - def metric_fn(self, pred, label): - mask = torch.isfinite(label) - - if self.metric in ("", "loss"): - return -self.loss_fn(pred[mask], label[mask]) - - raise ValueError("unknown metric `%s`" % self.metric) - - def train_epoch(self, data_loader): - self.model.train() - - for data in data_loader: - feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) - label = data[:, -1, -1].to(self.device, dtype=torch.float32) - - pred = self.model(feature) - loss = self.loss_fn(pred, label) - - self.train_optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0) - self.train_optimizer.step() - - def test_epoch(self, data_loader): - self.model.eval() - - scores = [] - losses = [] - - for data in data_loader: - feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) - label = data[:, -1, -1].to(self.device, dtype=torch.float32) - - with torch.no_grad(): - pred = self.model(feature) - loss = self.loss_fn(pred, label) - losses.append(loss.item()) - - score = self.metric_fn(pred, label) - scores.append(score.item()) - - return np.mean(losses), np.mean(scores) - - def fit( - self, - dataset: DatasetH, - evals_result=dict(), - save_path=None, - ): - dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) - dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) - if dl_train.empty or dl_valid.empty: - raise ValueError("Empty data from dataset, please check your dataset config.") - - dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader - dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader - - train_loader = DataLoader( - dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True - ) - valid_loader = DataLoader( - dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True - ) - - save_path = get_or_create_path(save_path) - - stop_steps = 0 - train_loss = 0 - best_score = -np.inf - best_epoch = 0 - evals_result["train"] = [] - 
evals_result["valid"] = [] - - # train - self.logger.info("training...") - self.fitted = True - - for step in range(self.n_epochs): - self.logger.info("Epoch%d:", step) - self.logger.info("training...") - self.train_epoch(train_loader) - self.logger.info("evaluating...") - train_loss, train_score = self.test_epoch(train_loader) - val_loss, val_score = self.test_epoch(valid_loader) - self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) - evals_result["train"].append(train_score) - evals_result["valid"].append(val_score) - - if val_score > best_score: - best_score = val_score - stop_steps = 0 - best_epoch = step - best_param = copy.deepcopy(self.model.state_dict()) - else: - stop_steps += 1 - if stop_steps >= self.early_stop: - self.logger.info("early stop") - break - - self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) - self.model.load_state_dict(best_param) - torch.save(best_param, save_path) - - if self.use_gpu: - torch.cuda.empty_cache() - - def predict(self, dataset): - if not self.fitted: - raise ValueError("model is not fitted yet!") - - dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I) - dl_test.config(fillna_type="ffill+bfill") - test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs) - self.model.eval() - preds = [] - - for data in test_loader: - feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) - - with torch.no_grad(): - pred = self.model(feature).detach().cpu().numpy() - - preds.append(pred) - - return pd.Series(np.concatenate(preds), index=dl_test.get_index()) - - -class PositionalEncoding(nn.Module): - def __init__(self, d_model, max_len=1000): - super(PositionalEncoding, self).__init__() - pe = torch.zeros(max_len, d_model) - position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - pe = pe.unsqueeze(0).transpose(0, 1) - self.register_buffer("pe", pe) - - def forward(self, x): - # [T, N, F] - return x + self.pe[: x.size(0), :] - - -def _get_clones(module, N): - return ModuleList([copy.deepcopy(module) for i in range(N)]) - - -class LocalformerEncoder(nn.Module): - __constants__ = ["norm"] - - def __init__(self, encoder_layer, num_layers, d_model): - super(LocalformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.conv = _get_clones(nn.Conv1d(d_model, d_model, 3, 1, 1), num_layers) - self.num_layers = num_layers - - def forward(self, src, mask): - output = src - out = src - - for i, mod in enumerate(self.layers): - # [T, N, F] --> [N, T, F] --> [N, F, T] - out = output.transpose(1, 0).transpose(2, 1) - out = self.conv[i](out).transpose(2, 1).transpose(1, 0) - - output = mod(output + out, src_mask=mask) - - return output + out - - -class Transformer(nn.Module): - def __init__(self, d_feat=6, d_model=8, nhead=4, num_layers=2, dropout=0.5, device=None): - super(Transformer, self).__init__() - self.rnn = nn.GRU( - input_size=d_model, - hidden_size=d_model, - num_layers=num_layers, - batch_first=False, - dropout=dropout, - ) - self.feature_layer = nn.Linear(d_feat, d_model) - self.pos_encoder = PositionalEncoding(d_model) - self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout) - self.transformer_encoder = LocalformerEncoder(self.encoder_layer, num_layers=num_layers, d_model=d_model) - 
self.decoder_layer = nn.Linear(d_model, 1) - self.device = device - self.d_feat = d_feat - - def forward(self, src): - # src [N, T, F], [512, 60, 6] - src = self.feature_layer(src) # [512, 60, 8] - - # src [N, T, F] --> [T, N, F], [60, 512, 8] - src = src.transpose(1, 0) # not batch first - - mask = None - - src = self.pos_encoder(src) - output = self.transformer_encoder(src, mask) # [60, 512, 8] - - output, _ = self.rnn(output) - - # [T, N, F] --> [N, T*F] - output = self.decoder_layer(output.transpose(1, 0)[:, -1, :]) # [512, 1] - - return output.squeeze() +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +import copy +import math +from ...utils import get_or_create_path +from ...log import get_module_logger + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from torch.nn.modules.container import ModuleList +from .pytorch_utils import get_device + + +class LocalformerModel(Model): + def __init__( + self, + d_feat: int = 20, + d_model: int = 64, + batch_size: int = 8192, + nhead: int = 2, + num_layers: int = 2, + dropout: float = 0, + n_epochs=100, + lr=0.0001, + metric="", + early_stop=5, + loss="mse", + optimizer="adam", + reg=1e-3, + n_jobs=10, + GPU=0, + seed=None, + **kwargs, + ): + # set hyper-parameters. + self.d_model = d_model + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.reg = reg + self.metric = metric + self.batch_size = batch_size + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.n_jobs = n_jobs + self.device = get_device(GPU) + self.seed = seed + self.logger = get_module_logger("TransformerModel") + self.logger.info( + "Improved Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device) + ) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.model = Transformer(d_feat, d_model, nhead, num_layers, dropout, self.device) + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.reg) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr, weight_decay=self.reg) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.model.to(self.device) + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def mse(self, pred, label): + loss = (pred.float() - label.float()) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + mask = torch.isfinite(label) + + if self.metric in ("", "loss"): + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def train_epoch(self, data_loader): + self.model.train() + + for data in data_loader: + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) + + pred = self.model(feature) + loss = self.loss_fn(pred, label) + + self.train_optimizer.zero_grad() + 
loss.backward() + torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0) + self.train_optimizer.step() + + def test_epoch(self, data_loader): + self.model.eval() + + scores = [] + losses = [] + + for data in data_loader: + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) + + with torch.no_grad(): + pred = self.model(feature) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + save_path=None, + ): + dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) + dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) + if dl_train.empty or dl_valid.empty: + raise ValueError("Empty data from dataset, please check your dataset config.") + + dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader + dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader + + train_loader = DataLoader( + dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True + ) + valid_loader = DataLoader( + dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True + ) + + save_path = get_or_create_path(save_path) + + stop_steps = 0 + train_loss = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + evals_result["valid"] = [] + + # train + self.logger.info("training...") + self.fitted = True + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(train_loader) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(train_loader) + val_loss, val_score = self.test_epoch(valid_loader) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() + + def predict(self, dataset): + if not self.fitted: + raise ValueError("model is not fitted yet!") + + dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I) + dl_test.config(fillna_type="ffill+bfill") + test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs) + self.model.eval() + preds = [] + + for data in test_loader: + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + + with torch.no_grad(): + pred = self.model(feature).detach().cpu().numpy() + + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=dl_test.get_index()) + + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, max_len=1000): + super(PositionalEncoding, self).__init__() + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = 
torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer("pe", pe) + + def forward(self, x): + # [T, N, F] + return x + self.pe[: x.size(0), :] + + +def _get_clones(module, N): + return ModuleList([copy.deepcopy(module) for i in range(N)]) + + +class LocalformerEncoder(nn.Module): + __constants__ = ["norm"] + + def __init__(self, encoder_layer, num_layers, d_model): + super(LocalformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.conv = _get_clones(nn.Conv1d(d_model, d_model, 3, 1, 1), num_layers) + self.num_layers = num_layers + + def forward(self, src, mask): + output = src + out = src + + for i, mod in enumerate(self.layers): + # [T, N, F] --> [N, T, F] --> [N, F, T] + out = output.transpose(1, 0).transpose(2, 1) + out = self.conv[i](out).transpose(2, 1).transpose(1, 0) + + output = mod(output + out, src_mask=mask) + + return output + out + + +class Transformer(nn.Module): + def __init__(self, d_feat=6, d_model=8, nhead=4, num_layers=2, dropout=0.5, device=None): + super(Transformer, self).__init__() + self.rnn = nn.GRU( + input_size=d_model, + hidden_size=d_model, + num_layers=num_layers, + batch_first=False, + dropout=dropout, + ) + self.feature_layer = nn.Linear(d_feat, d_model) + self.pos_encoder = PositionalEncoding(d_model) + self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout) + self.transformer_encoder = LocalformerEncoder(self.encoder_layer, num_layers=num_layers, d_model=d_model) + self.decoder_layer = nn.Linear(d_model, 1) + self.device = device + self.d_feat = d_feat + + def forward(self, src): + # src [N, T, F], [512, 60, 6] + src = self.feature_layer(src) # [512, 60, 8] + + # src [N, T, F] --> [T, N, F], [60, 512, 8] + src = src.transpose(1, 0) # not batch first + + mask = None + + src = self.pos_encoder(src) + output = self.transformer_encoder(src, mask) # [60, 512, 8] + + output, _ = self.rnn(output) + + # [T, N, F] --> [N, T*F] + output = self.decoder_layer(output.transpose(1, 0)[:, -1, :]) # [512, 1] + + return output.squeeze() diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index 78582f1f36..1a7899b2da 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -259,6 +259,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_lstm_ts.py b/qlib/contrib/model/pytorch_lstm_ts.py index 825c253a62..baf4a09fe4 100755 --- a/qlib/contrib/model/pytorch_lstm_ts.py +++ b/qlib/contrib/model/pytorch_lstm_ts.py @@ -273,6 +273,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_nn.py b/qlib/contrib/model/pytorch_nn.py index 3b325bfb12..ec985434a3 100644 --- a/qlib/contrib/model/pytorch_nn.py +++ b/qlib/contrib/model/pytorch_nn.py @@ -152,7 +152,6 @@ def __init__( mode="min", factor=0.5, patience=10, - verbose=True, threshold=0.0001, threshold_mode="rel", cooldown=0, @@ -330,6 +329,8 @@ def fit( 
self.dnn_model.load_state_dict(torch.load(save_path, map_location=self.device)) if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def get_lr(self): assert len(self.train_optimizer.param_groups) == 1 diff --git a/qlib/contrib/model/pytorch_sandwich.py b/qlib/contrib/model/pytorch_sandwich.py index 1a512b6344..f8706d353c 100644 --- a/qlib/contrib/model/pytorch_sandwich.py +++ b/qlib/contrib/model/pytorch_sandwich.py @@ -357,6 +357,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_sfm.py b/qlib/contrib/model/pytorch_sfm.py index 9146af1096..9fd7b0b51a 100644 --- a/qlib/contrib/model/pytorch_sfm.py +++ b/qlib/contrib/model/pytorch_sfm.py @@ -412,6 +412,8 @@ def fit( torch.save(best_param, save_path) if self.device != "cpu": torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def mse(self, pred, label): loss = (pred - label) ** 2 diff --git a/qlib/contrib/model/pytorch_tabnet.py b/qlib/contrib/model/pytorch_tabnet.py index 8395ee4aec..ef338f3e79 100644 --- a/qlib/contrib/model/pytorch_tabnet.py +++ b/qlib/contrib/model/pytorch_tabnet.py @@ -213,6 +213,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_tcn.py b/qlib/contrib/model/pytorch_tcn.py index 5a760402b9..ea2f8cc330 100755 --- a/qlib/contrib/model/pytorch_tcn.py +++ b/qlib/contrib/model/pytorch_tcn.py @@ -268,6 +268,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_tcn_ts.py b/qlib/contrib/model/pytorch_tcn_ts.py index ac750f1001..396ccade7a 100755 --- a/qlib/contrib/model/pytorch_tcn_ts.py +++ b/qlib/contrib/model/pytorch_tcn_ts.py @@ -262,6 +262,8 @@ def fit( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() def predict(self, dataset): if not self.fitted: diff --git a/qlib/contrib/model/pytorch_tcts.py b/qlib/contrib/model/pytorch_tcts.py index bb059fd872..68eb443f86 100644 --- a/qlib/contrib/model/pytorch_tcts.py +++ b/qlib/contrib/model/pytorch_tcts.py @@ -346,6 +346,8 @@ def training( if self.use_gpu: torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() return best_loss diff --git a/qlib/contrib/model/pytorch_transformer.py b/qlib/contrib/model/pytorch_transformer.py index eef57a8b80..360aba34db 100644 --- a/qlib/contrib/model/pytorch_transformer.py +++ b/qlib/contrib/model/pytorch_transformer.py @@ -1,286 +1,288 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
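The small hunks above repeat the same teardown in each model's fit()/training routine: empty the CUDA cache, and, when the MPS back end is present and available, empty the MPS cache as well. If that pattern keeps spreading, it could be collected into one helper, for example (a sketch only, not something this patch introduces):

    import torch

    def empty_device_cache() -> None:
        # Release cached allocator blocks on whichever accelerator is in use.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
            torch.mps.empty_cache()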
- - -from __future__ import division -from __future__ import print_function - -import numpy as np -import pandas as pd -from typing import Text, Union -import copy -import math -from ...utils import get_or_create_path -from ...log import get_module_logger - -import torch -import torch.nn as nn -import torch.optim as optim - -from ...model.base import Model -from ...data.dataset import DatasetH -from ...data.dataset.handler import DataHandlerLP -from .pytorch_utils import get_device - -# qrun examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml ” - - -class TransformerModel(Model): - def __init__( - self, - d_feat: int = 20, - d_model: int = 64, - batch_size: int = 2048, - nhead: int = 2, - num_layers: int = 2, - dropout: float = 0, - n_epochs=100, - lr=0.0001, - metric="", - early_stop=5, - loss="mse", - optimizer="adam", - reg=1e-3, - n_jobs=10, - GPU=0, - seed=None, - **kwargs, - ): - # set hyper-parameters. - self.d_model = d_model - self.dropout = dropout - self.n_epochs = n_epochs - self.lr = lr - self.reg = reg - self.metric = metric - self.batch_size = batch_size - self.early_stop = early_stop - self.optimizer = optimizer.lower() - self.loss = loss - self.n_jobs = n_jobs - self.device = get_device(GPU) - self.seed = seed - self.logger = get_module_logger("TransformerModel") - self.logger.info("Naive Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device)) - - if self.seed is not None: - np.random.seed(self.seed) - torch.manual_seed(self.seed) - - self.model = Transformer(d_feat, d_model, nhead, num_layers, dropout, self.device) - if optimizer.lower() == "adam": - self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.reg) - elif optimizer.lower() == "gd": - self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr, weight_decay=self.reg) - else: - raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) - - self.fitted = False - self.model.to(self.device) - - @property - def use_gpu(self): - return self.device != torch.device("cpu") - - def mse(self, pred, label): - loss = (pred.float() - label.float()) ** 2 - return torch.mean(loss) - - def loss_fn(self, pred, label): - mask = ~torch.isnan(label) - - if self.loss == "mse": - return self.mse(pred[mask], label[mask]) - - raise ValueError("unknown loss `%s`" % self.loss) - - def metric_fn(self, pred, label): - mask = torch.isfinite(label) - - if self.metric in ("", "loss"): - return -self.loss_fn(pred[mask], label[mask]) - - raise ValueError("unknown metric `%s`" % self.metric) - - def train_epoch(self, x_train, y_train): - x_train_values = x_train.values - y_train_values = np.squeeze(y_train.values) - - self.model.train() - - indices = np.arange(len(x_train_values)) - np.random.shuffle(indices) - - for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: - break - - feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device) - label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device) - - pred = self.model(feature) - loss = self.loss_fn(pred, label) - - self.train_optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0) - self.train_optimizer.step() - - def test_epoch(self, data_x, data_y): - # prepare training data - x_values = data_x.values - y_values = np.squeeze(data_y.values) - - self.model.eval() - - scores = [] - losses = [] - - indices = 
np.arange(len(x_values)) - - for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: - break - - feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device) - label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device) - - with torch.no_grad(): - pred = self.model(feature) - loss = self.loss_fn(pred, label) - losses.append(loss.item()) - - score = self.metric_fn(pred, label) - scores.append(score.item()) - - return np.mean(losses), np.mean(scores) - - def fit( - self, - dataset: DatasetH, - evals_result=dict(), - save_path=None, - ): - df_train, df_valid, df_test = dataset.prepare( - ["train", "valid", "test"], - col_set=["feature", "label"], - data_key=DataHandlerLP.DK_L, - ) - if df_train.empty or df_valid.empty: - raise ValueError("Empty data from dataset, please check your dataset config.") - - x_train, y_train = df_train["feature"], df_train["label"] - x_valid, y_valid = df_valid["feature"], df_valid["label"] - - save_path = get_or_create_path(save_path) - stop_steps = 0 - train_loss = 0 - best_score = -np.inf - best_epoch = 0 - evals_result["train"] = [] - evals_result["valid"] = [] - - # train - self.logger.info("training...") - self.fitted = True - - for step in range(self.n_epochs): - self.logger.info("Epoch%d:", step) - self.logger.info("training...") - self.train_epoch(x_train, y_train) - self.logger.info("evaluating...") - train_loss, train_score = self.test_epoch(x_train, y_train) - val_loss, val_score = self.test_epoch(x_valid, y_valid) - self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) - evals_result["train"].append(train_score) - evals_result["valid"].append(val_score) - - if val_score > best_score: - best_score = val_score - stop_steps = 0 - best_epoch = step - best_param = copy.deepcopy(self.model.state_dict()) - else: - stop_steps += 1 - if stop_steps >= self.early_stop: - self.logger.info("early stop") - break - - self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) - self.model.load_state_dict(best_param) - torch.save(best_param, save_path) - - if self.use_gpu: - torch.cuda.empty_cache() - - def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): - if not self.fitted: - raise ValueError("model is not fitted yet!") - - x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) - index = x_test.index - self.model.eval() - x_values = x_test.values - sample_num = x_values.shape[0] - preds = [] - - for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: - end = sample_num - else: - end = begin + self.batch_size - - x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) - - with torch.no_grad(): - pred = self.model(x_batch).detach().cpu().numpy() - - preds.append(pred) - - return pd.Series(np.concatenate(preds), index=index) - - -class PositionalEncoding(nn.Module): - def __init__(self, d_model, max_len=1000): - super(PositionalEncoding, self).__init__() - pe = torch.zeros(max_len, d_model) - position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - pe = pe.unsqueeze(0).transpose(0, 1) - self.register_buffer("pe", pe) - - def forward(self, x): - # [T, N, F] - return x + self.pe[: x.size(0), :] - - -class Transformer(nn.Module): - def 
__init__(self, d_feat=6, d_model=8, nhead=4, num_layers=2, dropout=0.5, device=None): - super(Transformer, self).__init__() - self.feature_layer = nn.Linear(d_feat, d_model) - self.pos_encoder = PositionalEncoding(d_model) - self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout) - self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers) - self.decoder_layer = nn.Linear(d_model, 1) - self.device = device - self.d_feat = d_feat - - def forward(self, src): - # src [N, F*T] --> [N, T, F] - src = src.reshape(len(src), self.d_feat, -1).permute(0, 2, 1) - src = self.feature_layer(src) - - # src [N, T, F] --> [T, N, F], [60, 512, 8] - src = src.transpose(1, 0) # not batch first - - mask = None - - src = self.pos_encoder(src) - output = self.transformer_encoder(src, mask) # [60, 512, 8] - - # [T, N, F] --> [N, T*F] - output = self.decoder_layer(output.transpose(1, 0)[:, -1, :]) # [512, 1] - - return output.squeeze() +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +from typing import Text, Union +import copy +import math +from ...utils import get_or_create_path +from ...log import get_module_logger + +import torch +import torch.nn as nn +import torch.optim as optim + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from .pytorch_utils import get_device + +# qrun examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml ” + + +class TransformerModel(Model): + def __init__( + self, + d_feat: int = 20, + d_model: int = 64, + batch_size: int = 2048, + nhead: int = 2, + num_layers: int = 2, + dropout: float = 0, + n_epochs=100, + lr=0.0001, + metric="", + early_stop=5, + loss="mse", + optimizer="adam", + reg=1e-3, + n_jobs=10, + GPU=0, + seed=None, + **kwargs, + ): + # set hyper-parameters. 
+ self.d_model = d_model + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.reg = reg + self.metric = metric + self.batch_size = batch_size + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.n_jobs = n_jobs + self.device = get_device(GPU) + self.seed = seed + self.logger = get_module_logger("TransformerModel") + self.logger.info("Naive Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device)) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.model = Transformer(d_feat, d_model, nhead, num_layers, dropout, self.device) + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.reg) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr, weight_decay=self.reg) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.model.to(self.device) + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def mse(self, pred, label): + loss = (pred.float() - label.float()) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + mask = torch.isfinite(label) + + if self.metric in ("", "loss"): + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def train_epoch(self, x_train, y_train): + x_train_values = x_train.values + y_train_values = np.squeeze(y_train.values) + + self.model.train() + + indices = np.arange(len(x_train_values)) + np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + + pred = self.model(feature) + loss = self.loss_fn(pred, label) + + self.train_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0) + self.train_optimizer.step() + + def test_epoch(self, data_x, data_y): + # prepare training data + x_values = data_x.values + y_values = np.squeeze(data_y.values) + + self.model.eval() + + scores = [] + losses = [] + + indices = np.arange(len(x_values)) + + for i in range(len(indices))[:: self.batch_size]: + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device) + + with torch.no_grad(): + pred = self.model(feature) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + save_path=None, + ): + df_train, df_valid, df_test = dataset.prepare( + ["train", "valid", "test"], + col_set=["feature", "label"], + data_key=DataHandlerLP.DK_L, + ) + if df_train.empty or df_valid.empty: + raise ValueError("Empty data from dataset, please check your dataset config.") + + x_train, y_train = df_train["feature"], 
df_train["label"] + x_valid, y_valid = df_valid["feature"], df_valid["label"] + + save_path = get_or_create_path(save_path) + stop_steps = 0 + train_loss = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + evals_result["valid"] = [] + + # train + self.logger.info("training...") + self.fitted = True + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(x_train, y_train) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(x_train, y_train) + val_loss, val_score = self.test_epoch(x_valid, y_valid) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() + + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): + if not self.fitted: + raise ValueError("model is not fitted yet!") + + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) + index = x_test.index + self.model.eval() + x_values = x_test.values + sample_num = x_values.shape[0] + preds = [] + + for begin in range(sample_num)[:: self.batch_size]: + if sample_num - begin < self.batch_size: + end = sample_num + else: + end = begin + self.batch_size + + x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) + + with torch.no_grad(): + pred = self.model(x_batch).detach().cpu().numpy() + + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=index) + + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, max_len=1000): + super(PositionalEncoding, self).__init__() + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer("pe", pe) + + def forward(self, x): + # [T, N, F] + return x + self.pe[: x.size(0), :] + + +class Transformer(nn.Module): + def __init__(self, d_feat=6, d_model=8, nhead=4, num_layers=2, dropout=0.5, device=None): + super(Transformer, self).__init__() + self.feature_layer = nn.Linear(d_feat, d_model) + self.pos_encoder = PositionalEncoding(d_model) + self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout) + self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers) + self.decoder_layer = nn.Linear(d_model, 1) + self.device = device + self.d_feat = d_feat + + def forward(self, src): + # src [N, F*T] --> [N, T, F] + src = src.reshape(len(src), self.d_feat, -1).permute(0, 2, 1) + src = self.feature_layer(src) + + # src [N, T, F] --> [T, N, F], [60, 512, 8] + src = src.transpose(1, 0) # not batch first + + mask = None + + src = self.pos_encoder(src) + output = self.transformer_encoder(src, mask) # [60, 512, 
8] + + # [T, N, F] --> [N, T*F] + output = self.decoder_layer(output.transpose(1, 0)[:, -1, :]) # [512, 1] + + return output.squeeze() diff --git a/qlib/contrib/model/pytorch_transformer_ts.py b/qlib/contrib/model/pytorch_transformer_ts.py index 5364bc89d8..3c3861fa2b 100644 --- a/qlib/contrib/model/pytorch_transformer_ts.py +++ b/qlib/contrib/model/pytorch_transformer_ts.py @@ -1,265 +1,267 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - - -from __future__ import division -from __future__ import print_function - -import numpy as np -import pandas as pd -import copy -import math -from ...utils import get_or_create_path -from ...log import get_module_logger - -import torch -import torch.nn as nn -import torch.optim as optim -from torch.utils.data import DataLoader - -from ...model.base import Model -from ...data.dataset import DatasetH -from ...data.dataset.handler import DataHandlerLP -from .pytorch_utils import get_device - - -class TransformerModel(Model): - def __init__( - self, - d_feat: int = 20, - d_model: int = 64, - batch_size: int = 8192, - nhead: int = 2, - num_layers: int = 2, - dropout: float = 0, - n_epochs=100, - lr=0.0001, - metric="", - early_stop=5, - loss="mse", - optimizer="adam", - reg=1e-3, - n_jobs=10, - GPU=0, - seed=None, - **kwargs, - ): - # set hyper-parameters. - self.d_model = d_model - self.dropout = dropout - self.n_epochs = n_epochs - self.lr = lr - self.reg = reg - self.metric = metric - self.batch_size = batch_size - self.early_stop = early_stop - self.optimizer = optimizer.lower() - self.loss = loss - self.n_jobs = n_jobs - self.device = get_device(GPU) - self.seed = seed - self.logger = get_module_logger("TransformerModel") - self.logger.info("Naive Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device)) - - if self.seed is not None: - np.random.seed(self.seed) - torch.manual_seed(self.seed) - - self.model = Transformer(d_feat, d_model, nhead, num_layers, dropout, self.device) - if optimizer.lower() == "adam": - self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.reg) - elif optimizer.lower() == "gd": - self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr, weight_decay=self.reg) - else: - raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) - - self.fitted = False - self.model.to(self.device) - - @property - def use_gpu(self): - return self.device != torch.device("cpu") - - def mse(self, pred, label): - loss = (pred.float() - label.float()) ** 2 - return torch.mean(loss) - - def loss_fn(self, pred, label): - mask = ~torch.isnan(label) - - if self.loss == "mse": - return self.mse(pred[mask], label[mask]) - - raise ValueError("unknown loss `%s`" % self.loss) - - def metric_fn(self, pred, label): - mask = torch.isfinite(label) - - if self.metric in ("", "loss"): - return -self.loss_fn(pred[mask], label[mask]) - - raise ValueError("unknown metric `%s`" % self.metric) - - def train_epoch(self, data_loader): - self.model.train() - - for data in data_loader: - feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) - label = data[:, -1, -1].to(self.device, dtype=torch.float32) - - pred = self.model(feature) - loss = self.loss_fn(pred, label) - - self.train_optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0) - self.train_optimizer.step() - - def test_epoch(self, data_loader): - self.model.eval() - - scores = [] - losses = [] - - for data in data_loader: - feature = 
data[:, :, 0:-1].to(self.device, dtype=torch.float32) - label = data[:, -1, -1].to(self.device, dtype=torch.float32) - - with torch.no_grad(): - pred = self.model(feature) - loss = self.loss_fn(pred, label) - losses.append(loss.item()) - - score = self.metric_fn(pred, label) - scores.append(score.item()) - - return np.mean(losses), np.mean(scores) - - def fit( - self, - dataset: DatasetH, - evals_result=dict(), - save_path=None, - ): - dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) - dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) - - if dl_train.empty or dl_valid.empty: - raise ValueError("Empty data from dataset, please check your dataset config.") - - dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader - dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader - - train_loader = DataLoader( - dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True - ) - valid_loader = DataLoader( - dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True - ) - - save_path = get_or_create_path(save_path) - - stop_steps = 0 - train_loss = 0 - best_score = -np.inf - best_epoch = 0 - evals_result["train"] = [] - evals_result["valid"] = [] - - # train - self.logger.info("training...") - self.fitted = True - - for step in range(self.n_epochs): - self.logger.info("Epoch%d:", step) - self.logger.info("training...") - self.train_epoch(train_loader) - self.logger.info("evaluating...") - train_loss, train_score = self.test_epoch(train_loader) - val_loss, val_score = self.test_epoch(valid_loader) - self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) - evals_result["train"].append(train_score) - evals_result["valid"].append(val_score) - - if val_score > best_score: - best_score = val_score - stop_steps = 0 - best_epoch = step - best_param = copy.deepcopy(self.model.state_dict()) - else: - stop_steps += 1 - if stop_steps >= self.early_stop: - self.logger.info("early stop") - break - - self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) - self.model.load_state_dict(best_param) - torch.save(best_param, save_path) - - if self.use_gpu: - torch.cuda.empty_cache() - - def predict(self, dataset): - if not self.fitted: - raise ValueError("model is not fitted yet!") - - dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I) - dl_test.config(fillna_type="ffill+bfill") - test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs) - self.model.eval() - preds = [] - - for data in test_loader: - feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) - - with torch.no_grad(): - pred = self.model(feature).detach().cpu().numpy() - - preds.append(pred) - - return pd.Series(np.concatenate(preds), index=dl_test.get_index()) - - -class PositionalEncoding(nn.Module): - def __init__(self, d_model, max_len=1000): - super(PositionalEncoding, self).__init__() - pe = torch.zeros(max_len, d_model) - position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - pe = pe.unsqueeze(0).transpose(0, 1) - self.register_buffer("pe", pe) - - def forward(self, x): - # [T, N, F] - return x + self.pe[: x.size(0), :] - - -class 
Transformer(nn.Module): - def __init__(self, d_feat=6, d_model=8, nhead=4, num_layers=2, dropout=0.5, device=None): - super(Transformer, self).__init__() - self.feature_layer = nn.Linear(d_feat, d_model) - self.pos_encoder = PositionalEncoding(d_model) - self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout) - self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers) - self.decoder_layer = nn.Linear(d_model, 1) - self.device = device - self.d_feat = d_feat - - def forward(self, src): - # src [N, T, F], [512, 60, 6] - src = self.feature_layer(src) # [512, 60, 8] - - # src [N, T, F] --> [T, N, F], [60, 512, 8] - src = src.transpose(1, 0) # not batch first - - mask = None - - src = self.pos_encoder(src) - output = self.transformer_encoder(src, mask) # [60, 512, 8] - - # [T, N, F] --> [N, T*F] - output = self.decoder_layer(output.transpose(1, 0)[:, -1, :]) # [512, 1] - - return output.squeeze() +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +import copy +import math +from ...utils import get_or_create_path +from ...log import get_module_logger + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from .pytorch_utils import get_device + + +class TransformerModel(Model): + def __init__( + self, + d_feat: int = 20, + d_model: int = 64, + batch_size: int = 8192, + nhead: int = 2, + num_layers: int = 2, + dropout: float = 0, + n_epochs=100, + lr=0.0001, + metric="", + early_stop=5, + loss="mse", + optimizer="adam", + reg=1e-3, + n_jobs=10, + GPU=0, + seed=None, + **kwargs, + ): + # set hyper-parameters. 
+ self.d_model = d_model + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.reg = reg + self.metric = metric + self.batch_size = batch_size + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.n_jobs = n_jobs + self.device = get_device(GPU) + self.seed = seed + self.logger = get_module_logger("TransformerModel") + self.logger.info("Naive Transformer:" "\nbatch_size : {}" "\ndevice : {}".format(self.batch_size, self.device)) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.model = Transformer(d_feat, d_model, nhead, num_layers, dropout, self.device) + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.reg) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr, weight_decay=self.reg) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.model.to(self.device) + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def mse(self, pred, label): + loss = (pred.float() - label.float()) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + mask = torch.isfinite(label) + + if self.metric in ("", "loss"): + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def train_epoch(self, data_loader): + self.model.train() + + for data in data_loader: + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) + + pred = self.model(feature) + loss = self.loss_fn(pred, label) + + self.train_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0) + self.train_optimizer.step() + + def test_epoch(self, data_loader): + self.model.eval() + + scores = [] + losses = [] + + for data in data_loader: + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + label = data[:, -1, -1].to(self.device, dtype=torch.float32) + + with torch.no_grad(): + pred = self.model(feature) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + save_path=None, + ): + dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) + dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) + + if dl_train.empty or dl_valid.empty: + raise ValueError("Empty data from dataset, please check your dataset config.") + + dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader + dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader + + train_loader = DataLoader( + dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True + ) + valid_loader = DataLoader( + dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True + ) + + save_path = get_or_create_path(save_path) + + stop_steps = 0 + train_loss = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + 
evals_result["valid"] = [] + + # train + self.logger.info("training...") + self.fitted = True + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(train_loader) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(train_loader) + val_loss, val_score = self.test_epoch(valid_loader) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + torch.mps.empty_cache() + + def predict(self, dataset): + if not self.fitted: + raise ValueError("model is not fitted yet!") + + dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I) + dl_test.config(fillna_type="ffill+bfill") + test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs) + self.model.eval() + preds = [] + + for data in test_loader: + feature = data[:, :, 0:-1].to(self.device, dtype=torch.float32) + + with torch.no_grad(): + pred = self.model(feature).detach().cpu().numpy() + + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=dl_test.get_index()) + + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, max_len=1000): + super(PositionalEncoding, self).__init__() + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer("pe", pe) + + def forward(self, x): + # [T, N, F] + return x + self.pe[: x.size(0), :] + + +class Transformer(nn.Module): + def __init__(self, d_feat=6, d_model=8, nhead=4, num_layers=2, dropout=0.5, device=None): + super(Transformer, self).__init__() + self.feature_layer = nn.Linear(d_feat, d_model) + self.pos_encoder = PositionalEncoding(d_model) + self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout) + self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers) + self.decoder_layer = nn.Linear(d_model, 1) + self.device = device + self.d_feat = d_feat + + def forward(self, src): + # src [N, T, F], [512, 60, 6] + src = self.feature_layer(src) # [512, 60, 8] + + # src [N, T, F] --> [T, N, F], [60, 512, 8] + src = src.transpose(1, 0) # not batch first + + mask = None + + src = self.pos_encoder(src) + output = self.transformer_encoder(src, mask) # [60, 512, 8] + + # [T, N, F] --> [N, T*F] + output = self.decoder_layer(output.transpose(1, 0)[:, -1, :]) # [512, 1] + + return output.squeeze()