In [5]:
import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'

In [6]:
# disable for training!
# %load_ext autoreload
# %autoreload 2

In [29]:
# List the checkpoint files saved under lightning_logs for run version_17869980.
!ls ./lightning_logs/version_17869980/checkpoints/

'epoch=92-step=114948-best.ckpt'


In [2]:
import shlex

# Single source of truth for the run configuration. The same dict is
# rendered into `--key value` CLI flags for train.py (below) and is also
# unpacked directly into the datamodule / model constructors in later cells.
hparams = dict(
    # datamodule
    hdf_path='./data/ProteomeTools.hdf',
    batch_size=512,
    train_split=0.85,
    val_split=0.05,
    cdhit_threshold=0.5,
    cdhit_word_length=3,
    tmp_env='TMPDIR',
    num_workers=4,  # don't need many workers when everything is loaded into RAM
    random_state=0,

    # model
    model_dim=128,  # same size as CARP-600k
    model_depth=16,
    lr=5e-4,
    dropout=0,  # currently only applied to spectra

    # trainer
    num_gpus=1,
    max_epochs=1000,
    precision=32,
    strategy='ddp_find_unused_parameters_false',
#     strategy='dp',
    es_monitor='val_cross_entropy',
    es_mode='min',
    es_patience=10,
#     val_check_interval=1,
    # Values are shell-quoted when rendered below, so a path must NOT carry
    # its own embedded quotes:
#     resume_from_checkpoint='./lightning_logs/version_17869980/checkpoints/epoch=92-step=114948-best.ckpt',
    resume_from_checkpoint=None,

    # cluster
    num_nodes=4,
    num_cpus=8,
    conda_env='MSPretraining',
    time='0-6:00:00',

    # tensorboard
    login_node='login-2'
)

# Render every entry as `--key value`. Each value is passed through
# shlex.quote so that strings containing spaces or shell metacharacters
# survive being pasted into a shell; values made only of safe characters
# (all of the current ones) are emitted unchanged.
# NOTE(review): a None value is rendered as the literal text `None`;
# confirm that train.py maps that back to a real None.
args = ' '.join(f'--{k} {shlex.quote(str(v))}' for k, v in hparams.items())

print(f'python train.py {args}')

python train.py --hdf_path ./data/ProteomeTools.hdf --batch_size 512 --train_split 0.85 --val_split 0.05 --cdhit_threshold 0.5 --cdhit_word_length 3 --tmp_env TMPDIR --num_workers 4 --random_state 0 --model_dim 128 --model_depth 16 --lr 0.0005 --dropout 0 --num_gpus 1 --max_epochs 1000 --precision 32 --strategy ddp_find_unused_parameters_false --es_monitor val_cross_entropy --es_mode min --es_patience 10 --resume_from_checkpoint None --num_nodes 4 --num_cpus 8 --conda_env MSPretraining --time 0-6:00:00 --login_node login-2


In [3]:
args = ' '.join([f'--{k} {v}' for k,v in hparams.items()])

#SBATCH --signal=SIGUSR1@90

slurm = f'''#!/bin/bash -l 

#SBATCH --nodes={hparams['num_nodes']}
#SBATCH --gres=gpu:volta:{hparams['num_gpus']}
#SBATCH --ntasks-per-node={max(1,hparams['num_gpus'])}
#SBATCH --cpus-per-task={hparams['num_cpus']}
#SBATCH --time={hparams['time']}

source activate {hparams['conda_env']}
''' + '''
export NCCL_DEBUG=INFO
export PYTHONFAULTHANDLER=1

# Set some environment variables needed by torch.distributed 
export MASTER_ADDR=$(hostname -s)
# Get unused port
export MASTER_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')

echo "MASTER_ADDR : ${MASTER_ADDR}"
echo "MASTER_PORT : ${MASTER_PORT}"
''' + f'''
srun python train.py {args}
'''

%store slurm >submit.sh

!sbatch submit.sh

Writing 'slurm' (str) to file 'submit.sh'.
Submitted batch job 17908562


In [4]:
from src.torch_helpers import start_tensorboard

# Launch TensorBoard via the project helper, pointing it at the login node
# named in the run configuration.
tensorboard_login_node = hparams['login_node']
start_tensorboard(login_node=tensorboard_login_node)

RuntimeError: KeyboardInterrupt: 

In [1]:
from src.datamodule import MSDataModule
from src.model import MSTransformer

# Build the datamodule from the full hparams dict. Every entry is passed as
# a keyword argument, including the model / trainer / cluster keys.
# NOTE(review): presumably MSDataModule accepts and ignores the keys it does
# not use — confirm against src.datamodule.
dm = MSDataModule(**hparams)

KeyboardInterrupt: 

In [None]:
import torch
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from src.torch_helpers import NoValProgressBar

seed_everything(hparams['random_state'], workers=True)

# torch.autograd.set_detect_anomaly(True) 

model = MSTransformer(**hparams)

!rm -rf ./lightning_logs/version_$SLURM_JOBID

trainer = Trainer(
    gpus=1,
    precision=hparams['precision'],
#     val_check_interval=hparams['val_check_interval'],
    max_epochs=1000,
    max_time="00:6:00:00",
    callbacks=[
#         EarlyStopping(
#             monitor=hparams['es_monitor'],
#             mode=hparams['es_mode'],
#             patience=hparams['es_patience']
#         ),
        NoValProgressBar(),
        ModelCheckpoint(
            monitor=hparams['es_monitor'],
            mode=hparams['es_mode'],
            save_top_k=1,
            filename='{epoch}-{step}-best'
        )
    ]
)

trainer.fit(model, dm)

# rename it

Global seed set to 0
Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-433d5424-0d96-1e9d-aaab-7d900550b968]

  | Name      | Type         | Params
-------------------------------------------
0 | x_encoder | ByteNet      | 603 K 
1 | y_encoder | ByteNet      | 604 K 
2 | conv1     | MaskedConv1d | 32.9 K
3 | relu      | ReLU         | 0     
4 | conv2     | MaskedConv1d | 3.1 K 
-------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.977     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 5:  78%|███████▊  | 4068/5234 [54:37<15:39,  1.24it/s, loss=2.55, v_num=1.79e+7]7]  

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 27:  80%|███████▉  | 4178/5234 [4:16:29<1:04:49,  3.68s/it, loss=2.4, v_num=1.79e+7]  

In [None]:
# from tqdm import tqdm
# train_seqs = {item['sequence'] for item,_ in tqdm(zip(dm.train_dataset,range(1000)),position=0)}
# val_seqs = {item['sequence'] for item,_ in tqdm(zip(dm.val_dataset,range(1000)),position=0)}
# test_seqs = {item['sequence'] for item,_ in tqdm(zip(dm.test_dataset,range(1000)),position=0)}
# train_seqs&val_seqs, train_seqs&test_seqs, test_seqs&val_seqs

In [None]:
# from src.datamodule import MSDataModule
# from src.model import MSTransformer
# from src.plotting import faststem
# from src.spectrum import fragment_mz_tensor
# from tqdm import tqdm

# # [last_ckpt] = !ls -t1 ./lightning_logs/*/checkpoints/*.ckpt | head -n1
# # print(last_ckpt)
# # model = MSTransformer.load_from_checkpoint(last_ckpt)
# # dm = MSDataModule(**dict(model.hparams))

# dm.setup()

# model = model.cpu()
# model.eval();

# for i, batch in enumerate(dm.predict_dataloader()):
#     batch['y_pred'] = model.predict_step(batch)

#     mz = fragment_mz_tensor(batch['sequence'][0]).ravel()
#     y = batch['y'][0].detach().cpu().numpy().ravel()
#     y_pred = batch['y_pred'][0].detach().cpu().numpy().ravel()
    
#     plt.figure(figsize=(6,3))
#     faststem(mz,y)
#     faststem(mz,-y_pred)
#     yl = max(np.abs(plt.ylim()))
#     plt.ylim([-yl,yl])
#     plt.title(f"{batch['sequence'][0]} {batch['charge'][0]}+")
    
#     if i == 10:
#         break

In [None]:
# the CNN does not shrink unseen peaks to zero, while the transformer does