### Predict Readmission using Pytorch's Logistic Regression and DAN

#### Logistic Regression

In [64]:
import argparse
from argparse import ArgumentParser
from datetime import datetime
import json
import logging
from pathlib import Path
import shutil
from typing import Dict, List, Tuple, Type

import numpy as np
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from scipy import sparse
import torch
from torch import nn
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from torchmetrics import Accuracy, Precision


class LogisticRegressionModel(nn.Module):
    """
    Logistic regression binary classification model
    """

    def __init__(self, num_features):
        """
        # Parameters
        num_features : `int`, required.
            Number of the features.
        # Returns
            `None`
        """
        super().__init__()
        # Hw-TODO: Add a linear layer to weight the features.
        self.linear = nn.Linear(in_features=num_features, out_features=1, bias=True)
        self.dropout = nn.Dropout(0.6)

    def forward(self, features):
        """
        Returns the logits of the model given features. 
        Note that model predictions should be either 0 or 1 based on a threshold.
        # Parameters
        features : `torch.FloatTensor`, required.
            The tensor of features with the shape (batch_size, num_of_features)
        # Returns
        probs : `torch.FloatTensor`, required.
            The tensor of probabilities with the shape (batch_size, 1) or (batch_size,)
        """
        # Hw-TODO: Use `self.linear` you created in `__init__`
        #          and appropriate nonlinearity/activation-function to compute
        #          and return the probabilities of belonging to a class in the logistic regression.
        out = self.linear(features)
        out = self.dropout(out)
        probs = torch.sigmoid(out)
        return probs

In [65]:
class BinaryClassificationLModule(pl.LightningModule):

    def __init__(self, **kwargs):
        super().__init__()

        # Save arguments to `hparams` attribute, see the doc [here](https://pytorch-lightning.readthedocs.io/en/latest/common/hyperparameters.html).
        self.save_hyperparameters()
        data_dir = Path(self.hparams.data_dir)
        #self.hparams.vocab = json.load(
        #    open(data_dir.joinpath(self.hparams.vocab_filename)))
        #self.hparams.vocab_size = len(self.hparams.vocab)

        self.model = self.get_model()
        self.step_count = 0
        self.accuracy = Accuracy()
        self.pr = Precision(threshold=0.5,average='macro',num_classes=1,multiclass=False)

    def forward(self, *args, **kwargs):
        return self.model(*args, **kwargs)

    def training_step(self, batch, batch_idx):
        input = self.batch2input(batch)
        labels = self.batch2labels(batch)
        probs = self(**input)
        probs = probs.squeeze()
        # Hw-TODO: Given probs in shape (batch_size,)
        #          and labels of the same shape,
        #          compute the binary cross entropy loss.
        loss = nn.functional.binary_cross_entropy(probs, labels)

        self.log('train_loss', loss, prog_bar=True)
        self.log('train_acc', self.accuracy(probs, labels.int()), prog_bar=True)
        self.log('train_pr', self.pr(probs, labels.int()), prog_bar=True)
        output_dict = {'loss': loss}
        return output_dict

    def validation_step(self, batch, batch_idx):
        input = self.batch2input(batch)
        labels = self.batch2labels(batch)
        probs = self(**input)
        probs = probs.squeeze()

        # Hw-TODO: Given probs in shape (batch_size,)
        #          and labels of the same shape,
        #          compute the binary cross entropy loss.
        loss = nn.functional.binary_cross_entropy(probs, labels)

        self.log('val_loss', loss)
        self.log('val_acc', self.accuracy(probs, labels.int()))
        self.log('val_pr', self.pr(probs, labels.int()))

    def test_step(self, batch, batch_idx):
        input = self.batch2input(batch)
        labels = self.batch2labels(batch)
        probs = self(**input)
        probs = probs.squeeze()

        # Hw-TODO: Given probs in shape (batch_size,)
        #          and labels of the same shape,
        #          compute the binary cross entropy loss.
        loss = nn.functional.binary_cross_entropy(probs, labels)

        self.log('test_loss', loss)
        self.log('test_acc', self.accuracy(probs, labels.int()))
        self.log('test_pr', self.pr(probs, labels.int()))

    def configure_optimizers(self):
        if self.hparams.optimizer == 'sgd':
            optimizer = torch.optim.SGD(self.model.parameters(),
                                        lr=self.hparams.learning_rate)
        elif self.hparams.optimizer == 'adam':
            if self.hparams.l2_regularization:
                optimizer = torch.optim.Adam(self.model.parameters(),
                                         lr=self.hparams.learning_rate,
                                         weight_decay=1e-5)
            else:
                optimizer = torch.optim.Adam(self.model.parameters(),
                                         lr=self.hparams.learning_rate)
        else:
            raise NotImplementedError
        return optimizer

    def train_dataloader(self):
        return self.get_dataloader('train', self.hparams.train_batch_size, shuffle=True)

    def val_dataloader(self):
        return self.get_dataloader('dev', self.hparams.eval_batch_size, shuffle=False)

    def test_dataloader(self):
        return self.get_dataloader('test', self.hparams.eval_batch_size, shuffle=False)

    def get_model(self) -> nn.Module:
        # To be overridden by inherited classes.
        raise NotImplementedError

    def batch2input(self, batch: Tuple[torch.Tensor]) -> Dict[str, torch.Tensor]:
        # To be overridden by inherited classes.
        raise NotImplementedError

    def batch2labels(self, batch: Tuple[torch.Tensor]) -> torch.Tensor:
        # To be overridden by inherited classes.
        raise NotImplementedError

    def get_dataloader(self,
                       split: str,
                       batch_size: int,
                       shuffle: bool = False) -> DataLoader:
        # To be overridden by inherited classes.
        raise NotImplementedError

    @classmethod
    def add_model_specific_args(cls, parser: ArgumentParser) -> ArgumentParser:
        """
        Add arguments to the parser and return the parser.
        """
        # Required arguments:
        parser.add_argument('--vocab_filename',
                            default=None,
                            type=str,
                            required=False,  # changing to false since we're not using vocab
                            help="File name of the feature.")
        # Optional arguments:
        parser.add_argument('--optimizer',
                            default='adam',
                            type=str,
                            help="The optimizer to use, such as sgd or adam.")
        parser.add_argument('--learning_rate',
                            default=1e-3,
                            type=float,
                            help="The initial learning rate for training.")
        parser.add_argument('--l2_regularization',
                            default=False,
                            help="Whether to add L2 regularization.")
        parser.add_argument('--max_epochs',
                            default=10,
                            type=int,
                            help="The number of epochs to train your model.")
        parser.add_argument('--train_batch_size', default=32, type=int)
        parser.add_argument('--eval_batch_size', default=32, type=int)
        parser.add_argument('--seed',
                            type=int,
                            default=42,
                            help="The random seed for initialization")
        parser.add_argument('--do_train',
                            action="store_true",
                            default=True,
                            help="Whether to run training.")
        parser.add_argument('--do_predict',
                            action="store_true",
                            help="Whether to run predictions on the test set.")
        parser.add_argument('--data_dir',
                            default="data",
                            type=str,
                            help="The input data dir. Should contain the training files.")
        parser.add_argument('--output_dir',
                            type=str,
                            help=("The output directory where the model predictions "
                                  "and checkpoints will be written."))
        # NOTE: Set --gpus 0 or change the default value to 0 if not using GPUS.
        # See this [link](https://pytorch-lightning.readthedocs.io/en/latest/accelerators/gpu.html) for usage of this argument.
        parser.add_argument('--gpus',
                            default=1,
                            type=int,
                            help="The number of GPUs allocated for this, 0 meaning none")
        parser.add_argument('--num_workers',
                            default=8,
                            type=int,
                            help="Config `DataLoader` of pytorch")
        return parser


def generic_train(args: argparse.Namespace,
                  model_class: Type[pl.LightningModule]) -> Dict:
    """
        Train (and optionally predict) and return dict results.
        # Parameters
        args : `argparse.Namespace`, required.
            Configuration of the training and the model
        model_class : `Type[pl.LightningModule]`, required.
            Class of the model to be trained.
        # Returns
        A `dict` object containing the following keys and types.
            trainer: `pl.Trainer`
            model: `pl.LightningModule`
            val_results_best: `list[dict]`
                If `args.do_predict==True`
            test_results_best: `list[dict]`
                If `args.do_predict==True`
            best_model_path: `Path`
                Path to the checkpoint of the best model.
        """
    pl.seed_everything(args.seed)

    tensorboard_log_dir = Path(args.output_dir).joinpath('tensorboard_logs')
    tensorboard_log_dir.mkdir(parents=True, exist_ok=True)

    # Tensorboard logger
    tensorboard_logger = pl_loggers.TensorBoardLogger(
        save_dir=tensorboard_log_dir,
        version='version_' + datetime.now().strftime('%Y%m%d-%H%M%S'),
        name='',
        default_hp_metric=True)
    # Checkpoint callback
    checkpoint_dir = Path(args.output_dir).joinpath(tensorboard_logger.version,
                                                    'checkpoints')
    checkpoint_callback = pl.callbacks.ModelCheckpoint(dirpath=checkpoint_dir,
                                                       filename='{epoch}-{val_acc:.2f}',
                                                       monitor='val_pr',
                                                       mode='max',
                                                       save_top_k=1,
                                                       verbose=True)

    dict_args = vars(args)
    model = model_class(**dict_args)
    trainer = pl.Trainer.from_argparse_args(args,
                                            logger=tensorboard_logger,
                                            callbacks=[checkpoint_callback])

    output_dict = {'trainer': trainer, 'model': model}

    if args.do_train:
        trainer.fit(model=model)
        # Track model performance under differnt hparams settings in "Hparams" of TensorBoard
        tensorboard_logger.log_hyperparams(
            params=model.hparams,
            metrics={'hp_metric': checkpoint_callback.best_model_score.item()})
        tensorboard_logger.save()

        # Save the best model to `best_model.ckpt`
        best_model_path = checkpoint_dir.joinpath('best_model.ckpt')
        logger.info(f"Copy best model from {checkpoint_callback.best_model_path} "
                    f"to {best_model_path}.")
        shutil.copy(checkpoint_callback.best_model_path, best_model_path)

        output_dict.update({
            'trainer': trainer,
            'model': model,
            'best_model_path': best_model_path
        })

    # Optionally, predict on test set.
    if args.do_predict:
        best_model_path = checkpoint_dir.joinpath('best_model.ckpt')
        model = model.load_from_checkpoint(best_model_path)
        val_results_best = trainer.validate(model, verbose=True)
        test_results_best = trainer.test(model, verbose=True)
        print("Validation accuracy on the best model: {: .4f}".format(
            val_results_best[0]['val_acc']))
        print("Validation precision on the best model: {: .4f}".format(
            val_results_best[0]['val_pr']))
        print('val_results_best all',val_results_best[0])
        #print('val_results_best_trulyall',val_results_best)
        print("Test accuracy on the best model: {: .4f}".format(
            test_results_best[0]['test_acc']))
        print("Test precision on the best model: {: .4f}".format(
            test_results_best[0]['test_pr']))
        output_dict.update({
            'val_results_best': val_results_best,
            'test_results_best': test_results_best,
        })

    return output_dict


In [66]:
class FeatureBasedBinaryClassificationLModule(BinaryClassificationLModule):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_model(self) -> nn.Module:
        return LogisticRegressionModel(num_features=764)

    def batch2input(self, batch):
        return {'features': batch[0]}

    def batch2labels(self, batch):
        return batch[1]

    def get_dataloader(self,
                       split: str,
                       batch_size: int,
                       shuffle: bool = False) -> DataLoader:
        # NOTE: In order to use different features, change feature_name by
        # passing `--feature_name <feature_name>` in the training loop in
        # the cell below, or revise the code here for correct paths if needed.
        data_dir = Path(self.hparams.data_dir)
        features_filepath = data_dir.joinpath(
            f"{split}_{self.hparams.feature_name}_features.npz")
        labels_filepath = data_dir.joinpath(f"{split}_{self.hparams.label_name}_labels.npz")
        features = sparse.load_npz(features_filepath).todense()
        print('features shape:',features.shape)
        #labels = np.load(labels_filepath, allow_pickle=True)["arr_0"]
        labels = np.asarray(sparse.load_npz(labels_filepath).todense()).ravel()
        print('labels shape:',labels.shape)
        dataset = torch.utils.data.TensorDataset(
            torch.from_numpy(features).float(),
            torch.from_numpy(labels).float())

        logger.info(f"Loading {split} features and labels "
                    f"from {features_filepath} and {labels_filepath}")
        data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                                  batch_size=batch_size,
                                                  shuffle=shuffle,
                                                  num_workers=self.hparams.num_workers)
        return data_loader

    @classmethod
    def add_model_specific_args(cls, parser: ArgumentParser) -> ArgumentParser:
        parser = super().add_model_specific_args(parser)
        # Required arguments:
        parser.add_argument('--feature_name',
                            default=None,
                            type=str,
                            required=True,
                            help="Name of the feature")
                            
        parser.add_argument('--label_name',
                            default=None,
                            type=str,
                            required=True,
                            help="Name of the label")
        # Optional arguments:
        parser.add_argument('--task',
                            default='featurebinarycls',
                            type=str,
                            help="Name of the task.")
        return parser


In [32]:
labels = np.asarray(sparse.load_npz('./data/train_labels.npz').todense()).ravel()
np.unique(labels)

array([0., 1., 2., 3.])

### Predict using RoBERTa

In [68]:
# num_classes=1, multiclass=False, dropout=0.6

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# Load hyperparameters
parser = ArgumentParser()
parser = FeatureBasedBinaryClassificationLModule.add_model_specific_args(parser)

# NOTE: You should replace `unigram_binary` in the assignment statement of `args_str =...`
# with whatever feature that you are experimented with.
# You can also configure other options listed in the method of add_model_specific_args of
# the pytorch-lightning model `FeatureBasedBinaryClassificationLModule`.
args_str = ("--feature_name roberta --max_epochs 10 --label_name readmit "
            "--output_dir output/ftrlogistic --optimizer adam --do_train --do_predict")

args = parser.parse_args(args_str.split())

# If output_dir not provided, a folder is generated
if args.output_dir is None:
    args.output_dir = str(
        Path('output').joinpath(
            f"{args.task}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"))
Path(args.output_dir).mkdir(parents=True, exist_ok=True)

print(f"Parsed arguments: {args}")

training_outout = generic_train(args=args,
                                model_class=FeatureBasedBinaryClassificationLModule)

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                    | Params
-----------------------------------------------------
0 | model    | LogisticRegressionModel | 765   
1 | accuracy | Accuracy                | 0     
2 | pr       | Precision               | 0     
-----------------------------------------------------
765       Trainable params
0         Non-trainable params
765       Total params
0.003     Total estimated model params size (MB)


Parsed arguments: Namespace(vocab_filename=None, optimizer='adam', learning_rate=0.001, l2_regularization=False, max_epochs=10, train_batch_size=32, eval_batch_size=32, seed=42, do_train=True, do_predict=True, data_dir='data', output_dir='output/ftrlogistic', gpus=1, num_workers=8, feature_name='roberta', label_name='readmit', task='featurebinarycls')


Validation sanity check: 0it [00:00, ?it/s]

2022-03-14 23:46:53 - INFO - __main__ - Loading dev features and labels from data/dev_roberta_features.npz and data/dev_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Global seed set to 42
2022-03-14 23:46:54 - INFO - __main__ - Loading train features and labels from data/train_roberta_features.npz and data/train_readmit_labels.npz


features shape: (34416, 764)
labels shape: (34416,)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 1075: val_pr reached 0.01125 (best 0.01125), saving model to "/mnt/c/Users/natra/Documents/Education/UChicago/NLP/n2c2-track2-nlp-uchicago/output/ftrlogistic/version_20220314-234653/checkpoints/epoch=0-val_acc=0.93.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 2151: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 3227: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 4303: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 5379: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 6455: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 7531: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 8607: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 9683: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 10759: val_pr was not in top 1
2022-03-14 23:50:46 - INFO - __main__ - Copy best model from /mnt/c/Users/natra/Documents/Education/UChicago/NLP/n2c2-track2-nlp-uchicago/output/ftrlogistic/version_20220314-234653/checkpoints/epoch=0-val_acc=0.93.ckpt to output/ftrlogistic/version_20220314-234653/checkpoints/best_model.ckpt.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
2022-03-14 23:50:47 - INFO - __main__ - Loading dev features and labels from data/dev_roberta_features.npz and data/dev_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9340190291404724,
 'val_loss': 6.598099708557129,
 'val_pr': 0.0112496018409729}
--------------------------------------------------------------------------------


2022-03-14 23:50:52 - INFO - __main__ - Loading test features and labels from data/test_roberta_features.npz and data/test_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9347162842750549,
 'test_loss': 6.479560852050781,
 'test_pr': 0.00790261197835207}
--------------------------------------------------------------------------------
Validation accuracy on the best model:  0.9340
Validation precision on the best model:  0.0112
val_results_best all {'val_loss': 6.598099708557129, 'val_acc': 0.9340190291404724, 'val_pr': 0.0112496018409729}
Test accuracy on the best model:  0.9347
Test precision on the best model:  0.0079


In [69]:
# dropout=0.6, using l2 regularization

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# Load hyperparameters
parser = ArgumentParser()
parser = FeatureBasedBinaryClassificationLModule.add_model_specific_args(parser)

# NOTE: You should replace `unigram_binary` in the assignment statement of `args_str =...`
# with whatever feature that you are experimented with.
# You can also configure other options listed in the method of add_model_specific_args of
# the pytorch-lightning model `FeatureBasedBinaryClassificationLModule`.
args_str = ("--feature_name roberta --max_epochs 10 --label_name readmit "
            "--output_dir output/ftrlogistic --optimizer adam --do_train --do_predict "
            "--l2_regularization True ")

args = parser.parse_args(args_str.split())

# If output_dir not provided, a folder is generated
if args.output_dir is None:
    args.output_dir = str(
        Path('output').joinpath(
            f"{args.task}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"))
Path(args.output_dir).mkdir(parents=True, exist_ok=True)

print(f"Parsed arguments: {args}")

training_outout = generic_train(args=args,
                                model_class=FeatureBasedBinaryClassificationLModule)

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                    | Params
-----------------------------------------------------
0 | model    | LogisticRegressionModel | 765   
1 | accuracy | Accuracy                | 0     
2 | pr       | Precision               | 0     
-----------------------------------------------------
765       Trainable params
0         Non-trainable params
765       Total params
0.003     Total estimated model params size (MB)


Parsed arguments: Namespace(vocab_filename=None, optimizer='adam', learning_rate=0.001, l2_regularization='True', max_epochs=10, train_batch_size=32, eval_batch_size=32, seed=42, do_train=True, do_predict=True, data_dir='data', output_dir='output/ftrlogistic', gpus=1, num_workers=8, feature_name='roberta', label_name='readmit', task='featurebinarycls')


Validation sanity check: 0it [00:00, ?it/s]

2022-03-14 23:51:27 - INFO - __main__ - Loading dev features and labels from data/dev_roberta_features.npz and data/dev_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Global seed set to 42
2022-03-14 23:51:28 - INFO - __main__ - Loading train features and labels from data/train_roberta_features.npz and data/train_readmit_labels.npz


features shape: (34416, 764)
labels shape: (34416,)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 1075: val_pr reached 0.01497 (best 0.01497), saving model to "/mnt/c/Users/natra/Documents/Education/UChicago/NLP/n2c2-track2-nlp-uchicago/output/ftrlogistic/version_20220314-235127/checkpoints/epoch=0-val_acc=0.93.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 2151: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 3227: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 4303: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 5379: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 6455: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 7531: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 8607: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 9683: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 10759: val_pr was not in top 1
2022-03-14 23:55:26 - INFO - __main__ - Copy best model from /mnt/c/Users/natra/Documents/Education/UChicago/NLP/n2c2-track2-nlp-uchicago/output/ftrlogistic/version_20220314-235127/checkpoints/epoch=0-val_acc=0.93.ckpt to output/ftrlogistic/version_20220314-235127/checkpoints/best_model.ckpt.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
2022-03-14 23:55:27 - INFO - __main__ - Loading dev features and labels from data/dev_roberta_features.npz and data/dev_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9259130358695984,
 'val_loss': 7.400588512420654,
 'val_pr': 0.014968477189540863}
--------------------------------------------------------------------------------


2022-03-14 23:55:31 - INFO - __main__ - Loading test features and labels from data/test_roberta_features.npz and data/test_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.926174521446228,
 'test_loss': 7.374264717102051,
 'test_pr': 0.00883233081549406}
--------------------------------------------------------------------------------
Validation accuracy on the best model:  0.9259
Validation precision on the best model:  0.0150
val_results_best all {'val_loss': 7.400588512420654, 'val_acc': 0.9259130358695984, 'val_pr': 0.014968477189540863}
Test accuracy on the best model:  0.9262
Test precision on the best model:  0.0088


In [70]:
# dropout=0.6, using sgd

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# Load hyperparameters
parser = ArgumentParser()
parser = FeatureBasedBinaryClassificationLModule.add_model_specific_args(parser)

# NOTE: You should replace `unigram_binary` in the assignment statement of `args_str =...`
# with whatever feature that you are experimented with.
# You can also configure other options listed in the method of add_model_specific_args of
# the pytorch-lightning model `FeatureBasedBinaryClassificationLModule`.
args_str = ("--feature_name roberta --max_epochs 10 --label_name readmit "
            "--output_dir output/ftrlogistic --optimizer sgd --do_train --do_predict "
            )

args = parser.parse_args(args_str.split())

# If output_dir not provided, a folder is generated
if args.output_dir is None:
    args.output_dir = str(
        Path('output').joinpath(
            f"{args.task}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"))
Path(args.output_dir).mkdir(parents=True, exist_ok=True)

print(f"Parsed arguments: {args}")

training_outout = generic_train(args=args,
                                model_class=FeatureBasedBinaryClassificationLModule)

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                    | Params
-----------------------------------------------------
0 | model    | LogisticRegressionModel | 765   
1 | accuracy | Accuracy                | 0     
2 | pr       | Precision               | 0     
-----------------------------------------------------
765       Trainable params
0         Non-trainable params
765       Total params
0.003     Total estimated model params size (MB)


Parsed arguments: Namespace(vocab_filename=None, optimizer='sgd', learning_rate=0.001, l2_regularization=False, max_epochs=10, train_batch_size=32, eval_batch_size=32, seed=42, do_train=True, do_predict=True, data_dir='data', output_dir='output/ftrlogistic', gpus=1, num_workers=8, feature_name='roberta', label_name='readmit', task='featurebinarycls')


Validation sanity check: 0it [00:00, ?it/s]

2022-03-14 23:59:30 - INFO - __main__ - Loading dev features and labels from data/dev_roberta_features.npz and data/dev_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Global seed set to 42
2022-03-14 23:59:31 - INFO - __main__ - Loading train features and labels from data/train_roberta_features.npz and data/train_readmit_labels.npz


features shape: (34416, 764)
labels shape: (34416,)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 1075: val_pr reached 0.00000 (best 0.00000), saving model to "/mnt/c/Users/natra/Documents/Education/UChicago/NLP/n2c2-track2-nlp-uchicago/output/ftrlogistic/version_20220314-235930/checkpoints/epoch=0-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 2151: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 3227: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 4303: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 5379: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 6455: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 7531: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 8607: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 9683: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 10759: val_pr was not in top 1
2022-03-15 00:03:24 - INFO - __main__ - Copy best model from /mnt/c/Users/natra/Documents/Education/UChicago/NLP/n2c2-track2-nlp-uchicago/output/ftrlogistic/version_20220314-235930/checkpoints/epoch=0-val_acc=0.96.ckpt to output/ftrlogistic/version_20220314-235930/checkpoints/best_model.ckpt.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
2022-03-15 00:03:24 - INFO - __main__ - Loading dev features and labels from data/dev_roberta_features.npz and data/dev_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9569423794746399, 'val_loss': 4.305761337280273, 'val_pr': 0.0}
--------------------------------------------------------------------------------


2022-03-15 00:03:29 - INFO - __main__ - Loading test features and labels from data/test_roberta_features.npz and data/test_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9591214060783386, 'test_loss': 4.0878586769104, 'test_pr': 0.0}
--------------------------------------------------------------------------------
Validation accuracy on the best model:  0.9569
Validation precision on the best model:  0.0000
val_results_best all {'val_loss': 4.305761337280273, 'val_acc': 0.9569423794746399, 'val_pr': 0.0}
Test accuracy on the best model:  0.9591
Test precision on the best model:  0.0000


In [71]:
# dropout=0.6, using l2 regularization, 20 epochs

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# Load hyperparameters
parser = ArgumentParser()
parser = FeatureBasedBinaryClassificationLModule.add_model_specific_args(parser)

# NOTE: You should replace `unigram_binary` in the assignment statement of `args_str =...`
# with whatever feature that you are experimented with.
# You can also configure other options listed in the method of add_model_specific_args of
# the pytorch-lightning model `FeatureBasedBinaryClassificationLModule`.
args_str = ("--feature_name roberta --max_epochs 20 --label_name readmit "
            "--output_dir output/ftrlogistic --optimizer adam --do_train --do_predict "
            "--l2_regularization True ")

args = parser.parse_args(args_str.split())

# If output_dir not provided, a folder is generated
if args.output_dir is None:
    args.output_dir = str(
        Path('output').joinpath(
            f"{args.task}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"))
Path(args.output_dir).mkdir(parents=True, exist_ok=True)

print(f"Parsed arguments: {args}")

training_outout = generic_train(args=args,
                                model_class=FeatureBasedBinaryClassificationLModule)

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                    | Params
-----------------------------------------------------
0 | model    | LogisticRegressionModel | 765   
1 | accuracy | Accuracy                | 0     
2 | pr       | Precision               | 0     
-----------------------------------------------------
765       Trainable params
0         Non-trainable params
765       Total params
0.003     Total estimated model params size (MB)


Parsed arguments: Namespace(vocab_filename=None, optimizer='adam', learning_rate=0.001, l2_regularization='True', max_epochs=20, train_batch_size=32, eval_batch_size=32, seed=42, do_train=True, do_predict=True, data_dir='data', output_dir='output/ftrlogistic', gpus=1, num_workers=8, feature_name='roberta', label_name='readmit', task='featurebinarycls')


Validation sanity check: 0it [00:00, ?it/s]

2022-03-15 00:13:36 - INFO - __main__ - Loading dev features and labels from data/dev_roberta_features.npz and data/dev_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Global seed set to 42
2022-03-15 00:13:38 - INFO - __main__ - Loading train features and labels from data/train_roberta_features.npz and data/train_readmit_labels.npz


features shape: (34416, 764)
labels shape: (34416,)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 1075: val_pr reached 0.01497 (best 0.01497), saving model to "/mnt/c/Users/natra/Documents/Education/UChicago/NLP/n2c2-track2-nlp-uchicago/output/ftrlogistic/version_20220315-001335/checkpoints/epoch=0-val_acc=0.93.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 2151: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 3227: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 4303: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 5379: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 6455: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 7531: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 8607: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 9683: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 10759: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 11835: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 11, global step 12911: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 12, global step 13987: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 13, global step 15063: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 14, global step 16139: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 15, global step 17215: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 16, global step 18291: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 17, global step 19367: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 18, global step 20443: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 19, global step 21519: val_pr was not in top 1
2022-03-15 00:21:37 - INFO - __main__ - Copy best model from /mnt/c/Users/natra/Documents/Education/UChicago/NLP/n2c2-track2-nlp-uchicago/output/ftrlogistic/version_20220315-001335/checkpoints/epoch=0-val_acc=0.93.ckpt to output/ftrlogistic/version_20220315-001335/checkpoints/best_model.ckpt.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
2022-03-15 00:21:37 - INFO - __main__ - Loading dev features and labels from data/dev_roberta_features.npz and data/dev_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9259130358695984,
 'val_loss': 7.400588512420654,
 'val_pr': 0.014968477189540863}
--------------------------------------------------------------------------------


2022-03-15 00:21:42 - INFO - __main__ - Loading test features and labels from data/test_roberta_features.npz and data/test_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.926174521446228,
 'test_loss': 7.374264717102051,
 'test_pr': 0.00883233081549406}
--------------------------------------------------------------------------------
Validation accuracy on the best model:  0.9259
Validation precision on the best model:  0.0150
val_results_best all {'val_loss': 7.400588512420654, 'val_acc': 0.9259130358695984, 'val_pr': 0.014968477189540863}
Test accuracy on the best model:  0.9262
Test precision on the best model:  0.0088


In [None]:
# You may uncomment and run the commands below to use Tensorboard in a notebook.
#%reload_ext  tensorboard
#%tensorboard --logdir output/dan

### Predict using BioClinicalBERT

In [72]:
# dropout=0.6, using l2 regularization

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# Load hyperparameters
parser = ArgumentParser()
parser = FeatureBasedBinaryClassificationLModule.add_model_specific_args(parser)

# NOTE: You should replace `unigram_binary` in the assignment statement of `args_str =...`
# with whatever feature that you are experimented with.
# You can also configure other options listed in the method of add_model_specific_args of
# the pytorch-lightning model `FeatureBasedBinaryClassificationLModule`.
args_str = ("--feature_name bc_512 --max_epochs 10 --label_name readmit "
            "--output_dir output/bc_512_logistic --optimizer adam --do_train --do_predict "
            "--l2_regularization True ")

args = parser.parse_args(args_str.split())

# If output_dir not provided, a folder is generated
if args.output_dir is None:
    args.output_dir = str(
        Path('output').joinpath(
            f"{args.task}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"))
Path(args.output_dir).mkdir(parents=True, exist_ok=True)

print(f"Parsed arguments: {args}")

training_outout = generic_train(args=args,
                                model_class=FeatureBasedBinaryClassificationLModule)

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                    | Params
-----------------------------------------------------
0 | model    | LogisticRegressionModel | 765   
1 | accuracy | Accuracy                | 0     
2 | pr       | Precision               | 0     
-----------------------------------------------------
765       Trainable params
0         Non-trainable params
765       Total params
0.003     Total estimated model params size (MB)


Parsed arguments: Namespace(vocab_filename=None, optimizer='adam', learning_rate=0.001, l2_regularization='True', max_epochs=10, train_batch_size=32, eval_batch_size=32, seed=42, do_train=True, do_predict=True, data_dir='data', output_dir='output/bc_512_logistic', gpus=1, num_workers=8, feature_name='bc_512', label_name='readmit', task='featurebinarycls')


Validation sanity check: 0it [00:00, ?it/s]

2022-03-15 00:30:56 - INFO - __main__ - Loading dev features and labels from data/dev_bc_512_features.npz and data/dev_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Global seed set to 42
2022-03-15 00:30:58 - INFO - __main__ - Loading train features and labels from data/train_bc_512_features.npz and data/train_readmit_labels.npz


features shape: (34416, 764)
labels shape: (34416,)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 1075: val_pr reached 0.00000 (best 0.00000), saving model to "/mnt/c/Users/natra/Documents/Education/UChicago/NLP/n2c2-track2-nlp-uchicago/output/bc_512_logistic/version_20220315-003056/checkpoints/epoch=0-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 2151: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 3227: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 4303: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 5379: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 6455: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 7531: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 8607: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 9683: val_pr was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 10759: val_pr was not in top 1
2022-03-15 00:34:57 - INFO - __main__ - Copy best model from /mnt/c/Users/natra/Documents/Education/UChicago/NLP/n2c2-track2-nlp-uchicago/output/bc_512_logistic/version_20220315-003056/checkpoints/epoch=0-val_acc=0.96.ckpt to output/bc_512_logistic/version_20220315-003056/checkpoints/best_model.ckpt.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
2022-03-15 00:34:58 - INFO - __main__ - Loading dev features and labels from data/dev_bc_512_features.npz and data/dev_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9569423794746399, 'val_loss': 4.305761337280273, 'val_pr': 0.0}
--------------------------------------------------------------------------------


2022-03-15 00:35:02 - INFO - __main__ - Loading test features and labels from data/test_bc_512_features.npz and data/test_readmit_labels.npz


features shape: (11473, 764)
labels shape: (11473,)


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9591214060783386, 'test_loss': 4.0878586769104, 'test_pr': 0.0}
--------------------------------------------------------------------------------
Validation accuracy on the best model:  0.9569
Validation precision on the best model:  0.0000
val_results_best all {'val_loss': 4.305761337280273, 'val_acc': 0.9569423794746399, 'val_pr': 0.0}
Test accuracy on the best model:  0.9591
Test precision on the best model:  0.0000


### DAN

In [7]:
class DeepAveragingNetworksModel(nn.Module):

    def __init__(self,
                 #vocab,
                 vocab_size: int,
                 word_embedding_size: int,
                 hidden_size: int,
                 num_intermediate_layers: int,
                 dropout_rate: float,
                 use_glove: bool = False):
        """
        # Parameters
        vocab : `dict[str, int]`, required.
            A map from the word type to the index of the word.
        vocab_size : `int`, required.
            Size of the vocabulary.
        word_embedding_size : `int`, required.
            Size of word embeddings.
        hidden_size : `int`, required.
            Size of hidden layer or number of hidden units per layer.
        num_intermediate_layers : `int`, required.
            Number of intermediate layers, the arg takes 0 or greater integers.
        dropout_rate : `float`, required.
            Dropout rate.
        use_glove : `bool`, optional.
            Whether or not to use Glove embeddings instead of randomly initialized ones.
        """
        super().__init__()
        # Return zero vector for input with padding_idx (0)
        self.embedding = nn.Embedding(vocab_size, word_embedding_size)

        # Hw-TODO: Add the intermediate layers, output layer, dropout layer,
        #          and activation function according to DAN.
        #          You may find [nn.Modulelist](https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html)
        #          useful to have multiple intermediate layers.
        if num_intermediate_layers == 0:
            self.hidden_layers = None
            self.output_layer = nn.Linear(word_embedding_size, 1)
        else:
            self.hidden_layers = nn.ModuleList(
                [nn.Linear(word_embedding_size, hidden_size)] + [
                    nn.Linear(hidden_size, hidden_size)
                    for _ in range(num_intermediate_layers - 1)
                ])
            self.output_layer = nn.Linear(hidden_size, 1)
        self.activation = nn.Tanh()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids, lengths):
        """
        # Parameters
        input_ids : `torch.Tensor`, required.
            Tensor of shape (batch_size, feature_length).
            Each row is a datapoint represented by input words.
        lengths: `torch.Tensor`, required.
            Tensor of shape (batch_size, 1). Token length of input text.
            Used to compute average word embeddings.
        # Returns
        probs : `torch.Tensor`
            Tensor of shape (batch_size)
        """
        print('input ids shape:',input_ids.shape)
        out = self.embedding(input_ids)  # shape: (batch_sz, max_len, embedding_sz)
        
        # Hw-TODO: Use the intermediate layers, output layer, dropout layer,
        #          and activation function you created in __init__
        #          and other appropriate non-linearity for the output layer
        #          to compute the probabilies of a class, assign these probabilities
        #          to a variable named "probs".
 
        out = torch.sum(input_ids, dim=1) / lengths  # shape: (batch_sz, embedding_sz)
        if self.hidden_layers is not None:
            for hidden_layer in self.hidden_layers:
                out = hidden_layer(out)
                out = self.activation(out)
                out = self.dropout(out)
        out = self.output_layer(out)
        probs = torch.sigmoid(out)

        return probs # you will define this variable in the preceding code.


In [8]:
"""
class SST2Dataset(Dataset):


    def __init__(self, data, tokenizer):
        self.data = data
        #self.vocab = vocab
        self.max_len = 512  # change when running full bioclinical bert
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        note = []
        #label, text = int(self.data[index][0]), self.data[index][1]
        #tokens = self.tokenizer.tokenize(text.lower())
        # If word does not exist, give <unk> token id
        #token_ids = [self.vocab.get(t, 1) for t in tokens]
        length = self.max_len
        features = sparse.load_npz(features_filepath).todense()
        print('features shape:',features.shape)
        #labels = np.load(labels_filepath, allow_pickle=True)["arr_0"]
        labels = np.asarray(sparse.load_npz(labels_filepath).todense()).ravel()
        
        # Truncate or pad to max length
        #padded_token_ids = token_ids[:50] + [0] * (self.max_len - length)
        return padded_token_ids, length, label

    def collate_fn(self, batch_data):
        padded_token_ids, lengths, labels = list(zip(*batch_data))
        return (
            torch.LongTensor(padded_token_ids).view(-1, self.max_len),
            #torch.FloatTensor(lengths).view(-1, 1),
            torch.FloatTensor(labels).view(-1, 1),
        )

    def __len__(self):
        return len(self.data)
"""

class DeepAveragingBinaryClassificationLModule(BinaryClassificationLModule):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_model(self) -> nn.Module:
        return DeepAveragingNetworksModel(
            #vocab=self.hparams.vocab,
            vocab_size=self.hparams.train_batch_size,
            word_embedding_size=self.hparams.word_embedding_size,
            hidden_size=self.hparams.hidden_size,
            num_intermediate_layers=self.hparams.num_intermediate_layers,
            dropout_rate=self.hparams.dropout_rate,
            use_glove=self.hparams.use_glove)

    def batch2input(self, batch):
        return {'input_ids': batch[0], 'lengths': 512}

    def batch2labels(self, batch):
        return batch[1].squeeze()

    def get_dataloader(self, split, batch_size, shuffle=False) -> DataLoader:
        data_dir = Path(self.hparams.data_dir)
        features_filepath = data_dir.joinpath(
            f"{split}_{self.hparams.feature_name}_features.npz")
        labels_filepath = data_dir.joinpath(split + "_labels.npz")
        features = sparse.load_npz(features_filepath).todense()
        print('features shape:',features.shape)
        #labels = np.load(labels_filepath, allow_pickle=True)["arr_0"]
        labels = np.asarray(sparse.load_npz(labels_filepath).todense()).ravel()
        print('labels shape:',labels.shape)
        dataset = torch.utils.data.TensorDataset(
            torch.from_numpy(features).int(),
            torch.from_numpy(labels).float())

        logger.info(f"Loading {split} data and labels from {labels_filepath}")
        data_loader = DataLoader(dataset=dataset,
                                 batch_size=batch_size,
                                 shuffle=shuffle,
                                 num_workers=self.hparams.num_workers
                                 #,collate_fn=dataset.collate_fn
                                 )

        return data_loader

    def configure_optimizers(self):
        if self.hparams.optimizer == 'sgd':
            optimizer = torch.optim.SGD(self.model.parameters(),
                                        lr=self.hparams.learning_rate)
        elif self.hparams.optimizer == 'adam':
            optimizer = torch.optim.Adam(self.model.parameters(),
                                         lr=self.hparams.learning_rate)
        else:
            raise NotImplementedError
        # Hw-TODO: Add more optimizers and experiment with at least 2
        #          optimizers other than vanilla SGD.
        #          You can configure which optimizer to use by modifying
        #          args_str or args passted to the function generic_train.
        return optimizer

    @classmethod
    def add_model_specific_args(cls, parser: ArgumentParser) -> ArgumentParser:
        parser = super().add_model_specific_args(parser)

        # Required arguments
        parser.add_argument('--num_intermediate_layers',
                            type=int,
                            help="number of intermediate layers")
        # Optional arguments
        parser.add_argument('--dropout_rate',
                            default=0.5,
                            type=float,
                            help="Dropout rate")
        parser.add_argument('--word_embedding_size',
                            default=300,
                            type=int,
                            help="Size of word embeddings")
        parser.add_argument('--hidden_size',
                            default=300,
                            type=int,
                            help="Size of hidden layer")
        parser.add_argument('--use_glove',
                            action="store_true",
                            help="Whether to run predictions on the test set.")
        parser.add_argument('--task',
                            default='danbinarycls',
                            type=str,
                            help="Name of the task.")
        parser.add_argument('--feature_name',
                            default=None,
                            type=str,
                            required=True,
                            help="Name of the feature")
        return parser

In [9]:
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# Load hyperparameters
parser = ArgumentParser()
parser = DeepAveragingBinaryClassificationLModule.add_model_specific_args(parser)

# NOTE: You should replace --optimizer <optimizer> with the name of the optimizer
# with which you are experimenting with, and the same goes for word_embedding_size.
# You can also configure other options listed in the method of add_model_specific_args of
# the pytorch-lightning module `DeepAveragingBinaryClassificationLModule`.
args_str = ("--max_epochs 2 "
            "--optimizer sgd --num_intermediate_layers 1 --feature_name roberta_readmit "
            "--output_dir output/dan  --do_train --do_predict --gpus 0")
args = parser.parse_args(args_str.split())

# If output_dir not provided, a folder is generated
if args.output_dir is None:
    args.output_dir = str(
        Path('output').joinpath(
            f"{args.task}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"))
Path(args.output_dir).mkdir(parents=True, exist_ok=True)

print(f"Parsed arguments: {args}")

training_outout = generic_train(args=args,
                                model_class=DeepAveragingBinaryClassificationLModule)

Global seed set to 42
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(

  | Name     | Type                       | Params
--------------------------------------------------------
0 | model    | DeepAveragingNetworksModel | 100 K 
1 | accuracy | Accuracy                   | 0     
--------------------------------------------------------
100 K     Trainable params
0         Non-trainable params
100 K     Total params
0.401     Total estimated model params size (MB)


Parsed arguments: Namespace(vocab_filename=None, optimizer='sgd', learning_rate=0.001, max_epochs=2, train_batch_size=32, eval_batch_size=32, seed=42, do_train=True, do_predict=True, data_dir='data', output_dir='output/dan', gpus=0, num_workers=8, num_intermediate_layers=1, dropout_rate=0.5, word_embedding_size=300, hidden_size=300, use_glove=False, task='danbinarycls', feature_name='roberta_readmit')


Validation sanity check: 0it [00:00, ?it/s]

2022-03-14 19:12:05 - INFO - __main__ - Loading dev data and labels from data/dev_labels.npz


features shape: (58328, 512)
labels shape: (58328,)
input ids shape: torch.Size([32, 512])


IndexError: index out of range in self

### References

[stackoverflow: python - Adding L1/L2 regularization in PyTorch?](https://stackoverflow.com/questions/42704283/adding-l1-l2-regularization-in-pytorch)  
