In [1]:
import shlex

import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to imported source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

- Stratify negatives by sequence length

- Marginalize out fragment types

In [16]:
# Single source of truth for the run configuration. Every entry is later
# rendered as a `--key value` command-line flag, so insertion order and the
# exact key spelling matter.
hparams = {
    # --- data module ---
    'hdf_path': './data/ProteomeTools.hdf',
    'batch_size': 1024,
    'train_val_split': 0.9,
    'cdhit_threshold': 0.5,
    'cdhit_word_length': 3,
    'tmp_env': 'TMPDIR',
    'num_workers': 20,
    'random_state': 0,

    # --- model ---
    'model_dim': 256,
    'model_depth': 4,
    'num_heads': 4,
    'lr': 1e-4,
    'dropout': 0.1,
    'max_length': 100,
    'temperature': 1.0,
    'negative_sampling': True,

    # --- trainer ---
    'num_gpus': 1,
    'max_epochs': 100,
    'precision': 32,
    'strategy': 'ddp',

    # --- cluster (SLURM resources) ---
    'num_nodes': 4,
    'num_cpus': 20,
    'conda_env': 'MSPretraining',
    'time': '0-24:00:00',

    # --- tensorboard ---
    'login_node': 'login-2',
}

In [17]:
args = ' '.join([f'--{k} {v}' for k,v in hparams.items()])

## SBATCH --signal=SIGUSR1@90

slurm = f'''#!/bin/bash -l 

#SBATCH --nodes={hparams['num_nodes']}
#SBATCH --gres=gpu:volta:{hparams['num_gpus']}
#SBATCH --ntasks-per-node={max(1,hparams['num_gpus'])}
#SBATCH --cpus-per-task={hparams['num_cpus']}
#SBATCH --time={hparams['time']}

source activate {hparams['conda_env']}
''' + '''
export NCCL_DEBUG=INFO
export PYTHONFAULTHANDLER=1

# Set some environment variables needed by torch.distributed 
export MASTER_ADDR=$(hostname -s)
# Get unused port
export MASTER_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')

echo "MASTER_ADDR : ${MASTER_ADDR}"
echo "MASTER_PORT : ${MASTER_PORT}"
''' + f'''
srun python train.py {args}
'''

%store slurm >submit.sh

Writing 'slurm' (str) to file 'submit.sh'.


In [18]:
# Submit the batch script submit.sh to the SLURM scheduler; the job id is
# printed in the cell output.
!sbatch submit.sh

Submitted batch job 16627573


In [None]:
from src.torch_helpers import start_tensorboard

# Start TensorBoard through the project helper, passing the login node
# configured in `hparams`.
# NOTE(review): presumably the helper sets up access via that login node —
# confirm against src/torch_helpers.
start_tensorboard(login_node=hparams['login_node'])

In [6]:
from src.datamodule import MSDataModule
from src.model import MSTransformer

# Alternative (disabled): locate the newest checkpoint file and restore the
# model from it instead of constructing a fresh one.
# [last_ckpt] = !ls -t1 ./lightning_logs/large/checkpoints/*.ckpt | head -n1
# print(last_ckpt)

# model = MSTransformer.load_from_checkpoint(last_ckpt)

# Build a fresh model from the complete hyperparameter dict (requires the
# `hparams` cell to have been run first).
model = MSTransformer(**hparams)
# Construct the data module from the hyperparameters stored on the model, so
# model and data share one configuration.
dm = MSDataModule(**dict(model.hparams))

# Prepare the data module's datasets.
# NOTE(review): exact work done by setup() is defined in src/datamodule — confirm.
dm.setup()
# Disabled: move the model to CPU and switch to evaluation mode for inspection.
# model = model.cpu()
# model.eval();

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Disabled: in-notebook training run with no GPUs (gpus=0) at 32-bit
# precision. Uncomment to fit `model` on `dm` directly from this kernel
# instead of submitting a batch job.

# from pytorch_lightning import Trainer

# trainer = Trainer(
#     gpus=0,
#     precision=32
# )

# trainer.fit(model, dm)