In [1]:
import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'

In [2]:
# disable for training!
# %load_ext autoreload
# %autoreload 2

In [3]:
# Single source of truth for one run. The whole dict is unpacked into
# MSDataModule / MSTransformer and is also rendered, in insertion order, as
# `--key value` flags for train.py, so keep the key order stable.
hparams = {
    # --- datamodule ---
    'hdf_path': './data/ProteomeTools.hdf',
    'batch_size': 1024,
    'train_split': 0.85,
    'val_split': 0.05,
    # alternative, much smaller split fractions (currently disabled):
    # 'train_split': 0.05,
    # 'val_split': 0.01,
    'cdhit_threshold': 0.5,
    'cdhit_word_length': 3,
    'tmp_env': 'TMPDIR',
    'num_workers': 8,  # don't need many when loading everything into RAM
    'random_state': 0,

    # --- model ---
    'model_dim': 128,  # same size as CARP-600k
    'model_depth': 16,  # HALF - temporary
    'lr': 1e-4,
    'dropout': 0.0,

    # --- trainer ---
    'num_gpus': 1,
    'max_epochs': 1000,
    'precision': 32,
    'strategy': 'ddp',
    'es_monitor': 'val_cross_entropy',
    'es_mode': 'min',
    'es_patience': 3,
    'val_check_interval': 0.1,

    # --- cluster ---
    'num_nodes': 1,
    'num_cpus': 20,
    'conda_env': 'MSPretraining',
    'time': '0-12:00:00',

    # --- tensorboard ---
    'login_node': 'login-2',
}

In [4]:
args = ' '.join([f'--{k} {v}' for k,v in hparams.items()])

slurm = f'''#!/bin/bash -l 

#SBATCH --nodes={hparams['num_nodes']}
#SBATCH --gres=gpu:volta:{hparams['num_gpus']}
#SBATCH --ntasks-per-node={max(1,hparams['num_gpus'])}
#SBATCH --cpus-per-task={hparams['num_cpus']}
#SBATCH --time={hparams['time']}

#SBATCH --signal=SIGUSR1@90

source activate {hparams['conda_env']}
''' + '''
export NCCL_DEBUG=INFO
export PYTHONFAULTHANDLER=1

# Set some environment variables needed by torch.distributed 
export MASTER_ADDR=$(hostname -s)
# Get unused port
export MASTER_PORT=$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')

echo "MASTER_ADDR : ${MASTER_ADDR}"
echo "MASTER_PORT : ${MASTER_PORT}"
''' + f'''
srun python train.py {args}
'''

%store slurm >submit.sh

# !sbatch submit.sh

Writing 'slurm' (str) to file 'submit.sh'.


In [5]:
# from src.torch_helpers import start_tensorboard

# start_tensorboard(login_node=hparams['login_node'])

In [6]:
# Data module and model classes from the project's `src` package.
# MSTransformer is imported here but first used in the training cell below.
from src.datamodule import MSDataModule
from src.model import MSTransformer

# Build the data module from the shared hyperparameter dict. The full dict is
# unpacked, so MSDataModule is handed the model/trainer/cluster keys as well
# (presumably it ignores the ones it does not need - confirm in src.datamodule).
dm = MSDataModule(**hparams)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from src.torch_helpers import NoValProgressBar

seed_everything(hparams['random_state'], workers=True)

torch.autograd.set_detect_anomaly(True) 

model = MSTransformer(**hparams)

!rm -rf ./lightning_logs/version_$SLURM_JOBID

trainer = Trainer(
    gpus=1,
    precision=hparams['precision'],
    val_check_interval=hparams['val_check_interval'],
    max_epochs=1,
#     gradient_clip_val=0.1,
    callbacks=[
        EarlyStopping(
            monitor=hparams['es_monitor'],
            mode=hparams['es_mode'],
            patience=hparams['es_patience']
        ),
        NoValProgressBar(),
        # checkpoitn the best so far
    ]
)

trainer.fit(model, dm)

# rename it

Global seed set to 0
Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name      | Type         | Params
-------------------------------------------
0 | x_encoder | ByteNet      | 603 K 
1 | y_encoder | ByteNet      | 604 K 
2 | conv1     | MaskedConv1d | 32.9 K
3 | relu      | ReLU         | 0     
4 | conv2     | MaskedConv1d | 3.1 K 
-------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.977     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 0:   2%|▏         | 66/3923 [00:25<25:18,  2.54it/s, loss=2.75, v_num=1.79e+7]  

In [None]:
# Grab the first training batch for interactive inspection.
# next(iter(...)) raises StopIteration if the loader yields nothing, instead of
# silently leaving `batch` undefined (or stale from an earlier execution) the
# way a `for ...: break` loop does.
batch = next(iter(dm.train_dataloader()))

In [None]:
# Run a single model step on the batch fetched in the previous cell, tagged as
# the 'train' stage; the return value is displayed as the cell output.
model.step(batch,'train')

In [None]:
# from tqdm import tqdm
# train_seqs = {item['sequence'] for item,_ in tqdm(zip(dm.train_dataset,range(1000)),position=0)}
# val_seqs = {item['sequence'] for item,_ in tqdm(zip(dm.val_dataset,range(1000)),position=0)}
# test_seqs = {item['sequence'] for item,_ in tqdm(zip(dm.test_dataset,range(1000)),position=0)}
# train_seqs&val_seqs, train_seqs&test_seqs, test_seqs&val_seqs

In [None]:
# from src.datamodule import MSDataModule
# from src.model import MSTransformer
# from src.plotting import faststem
# from src.spectrum import fragment_mz_tensor
# from tqdm import tqdm

# # [last_ckpt] = !ls -t1 ./lightning_logs/*/checkpoints/*.ckpt | head -n1
# # print(last_ckpt)
# # model = MSTransformer.load_from_checkpoint(last_ckpt)
# # dm = MSDataModule(**dict(model.hparams))

# dm.setup()

# model = model.cpu()
# model.eval();

# for i, batch in enumerate(dm.predict_dataloader()):
#     batch['y_pred'] = model.predict_step(batch)

#     mz = fragment_mz_tensor(batch['sequence'][0]).ravel()
#     y = batch['y'][0].detach().cpu().numpy().ravel()
#     y_pred = batch['y_pred'][0].detach().cpu().numpy().ravel()
    
#     plt.figure(figsize=(6,3))
#     faststem(mz,y)
#     faststem(mz,-y_pred)
#     yl = max(np.abs(plt.ylim()))
#     plt.ylim([-yl,yl])
#     plt.title(f"{batch['sequence'][0]} {batch['charge'][0]}+")
    
#     if i == 10:
#         break

In [None]:
# the CNN does not shrink unseen peaks to zero, while the transformer does