In [1]:
import pandas as pd
import torch
import pytorch_lightning as pl
import sys

sys.path.append("../../lsm/")
from msdatasets import MSDataset
from pretrain_MAE import SSModel
import numpy as np
from tqdm import tqdm
import os

path = '' # define path to data

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  from .autonotebook import tqdm as notebook_tqdm


### Import model

In [2]:
lsm_checkpoint = '' # define checkpoint path 
model = SSModel.load_from_checkpoint(f'{path}/trained_models/{lsm_checkpoint}').cuda().eval()
device = torch.device('cuda')



### Import test data

In [3]:
test_unknown = MSDataset(f'{path}/datasets/test/test.zarr/')
test_known = MSDataset(f'{path}/datasets/test/disjoint_test.zarr/')
test_casmi = MSDataset(f'{path}/datasets/test/casmi.zarr/')

Data is (12274, 5) dimensions
Data is (1000, 5) dimensions
Data is (464, 5) dimensions


### Generate predictions

In [4]:
unknown_dataloader = DataLoader(test_unknown, batch_size=256, shuffle=False, num_workers=0)
known_dataloader = DataLoader(test_known, batch_size=256, shuffle=False, num_workers=0)
casmi_dataloader = DataLoader(test_casmi, batch_size=256, shuffle=False, num_workers=0)

In [5]:
def get_preds(model, dataloader, dataset):
    pl.seed_everything(42)
    
    #unknown predictions
    unknown_gt_mz = np.zeros((len(dataset), 64))
    unknown_gt_inty = np.zeros((len(dataset), 64))
    preds_mz = np.zeros((len(dataset), 64))
    preds_inty = np.zeros((len(dataset), 64))

    masked_inty_preds = []
    masked_inty_gt = []
    masked_mz_preds = []
    masked_mz_gt = []

    j = 0
    for batch in tqdm(dataloader):
        mz, inty, precursormz, pad_idx = batch['mz'], batch['inty'], batch['precursormz'], batch['pad_idx']
        mz, inty, precursormz, pad_idx = mz.to(device), inty.to(device), precursormz.to(device), pad_idx.to(device)
        
        mz1_logits, mz2_logits, inty_logits, mask = model(precursormz, mz, inty)
        mask = mask.detach().cpu().numpy()
        mz1_logits = mz1_logits.detach().cpu().numpy()
        mz2_logits = mz2_logits.detach().cpu().numpy()
        inty_logits = inty_logits.detach().cpu().numpy()
        mz = mz.detach().cpu().numpy()
        inty = inty.detach().cpu().numpy()
        
        
        # convert logits to classifications
        mz1 = np.argmax(mz1_logits, axis=-1)
        mz2 = np.argmax(mz2_logits, axis=-1)
        inty_logits = np.argmax(inty_logits, axis=-1)    
        pred_mz = mz1 + mz2/1000
        pred_inty = inty_logits
        # add to predictions list 
        for i in range(len(pad_idx)):      
            unknown_gt_mz[i+j, :pad_idx[i]] = mz[i, :pad_idx[i]]
            unknown_gt_inty[i+j, :pad_idx[i]] = inty[i, :pad_idx[i]]
            preds_mz[i+j, :pad_idx[i]] = pred_mz[i, :pad_idx[i]]
            preds_inty[i+j, :pad_idx[i]] = pred_inty[i, :pad_idx[i]]


            unpadded_pred_mz = pred_mz[i, :pad_idx[i]]
            unpadded_pred_inty = pred_inty[i, :pad_idx[i]]
            unpadded_gt_mz = mz[i, :pad_idx[i]]
            unpadded_gt_inty = inty[i, :pad_idx[i]]
            unpadded_mask = mask[i, :pad_idx[i]]
            
            # add masked values to list
            masked_mz_preds.append(unpadded_pred_mz[unpadded_mask])
            masked_mz_gt.append(unpadded_gt_mz[unpadded_mask])
            masked_inty_preds.append(unpadded_pred_inty[unpadded_mask])
            masked_inty_gt.append(unpadded_gt_inty[unpadded_mask])

        j += len(pred_inty)

    masked_mz_preds = np.concatenate(np.array(masked_mz_preds).flatten())
    masked_mz_gt = np.concatenate(np.array(masked_mz_gt).flatten())
    masked_inty_preds = np.concatenate(np.array(masked_inty_preds).flatten())
    masked_inty_gt = np.concatenate(np.array(masked_inty_gt).flatten())
    
    return unknown_gt_mz, unknown_gt_inty, preds_mz, preds_inty, masked_mz_preds, masked_mz_gt, masked_inty_preds, masked_inty_gt

In [9]:
unknown_gt_mz, unknown_gt_inty, preds_mz, preds_inty, masked_mz_preds, masked_mz_gt, masked_inty_preds, masked_inty_gt = get_preds(model, unknown_dataloader, test_unknown)

#get number of non-zero peaks in unknown_gt
non_zero = np.sum(np.count_nonzero(unknown_gt_inty, axis=-1))

#calculate the MAE of the model for both mz and inty
mae_inty = np.mean(np.abs(preds_inty - unknown_gt_inty)) / (non_zero/(unknown_gt_mz.shape[0] * unknown_gt_mz.shape[1]))
mae_mz  = np.mean(np.abs(preds_mz - unknown_gt_mz)) / (non_zero/(unknown_gt_mz.shape[0] * unknown_gt_mz.shape[1]))
print(mae_inty, mae_mz)

#calculate the MAE of the model for both mz and inty on masked data
mae_inty_masked = np.mean(np.abs(masked_inty_preds - masked_inty_gt))
mae_mz_masked  = np.mean(np.abs(masked_mz_preds - masked_mz_gt))
print(mae_inty_masked, mae_mz_masked)

Global seed set to 42
100%|██████████| 48/48 [00:15<00:00,  3.17it/s]

31.76453322136245 1.7797870946598928
120.4629590256052 6.758868442819854



  masked_mz_preds = np.concatenate(np.array(masked_mz_preds).flatten())
  masked_mz_gt = np.concatenate(np.array(masked_mz_gt).flatten())
  masked_inty_preds = np.concatenate(np.array(masked_inty_preds).flatten())
  masked_inty_gt = np.concatenate(np.array(masked_inty_gt).flatten())


In [42]:
# save these to pickles
inty_gt = pd.DataFrame(unknown_gt_inty)
inty_preds = pd.DataFrame(preds_inty)
mz_gt = pd.DataFrame(unknown_gt_mz)
mz_preds = pd.DataFrame(preds_mz)
masked_inty_gt = pd.DataFrame(masked_inty_gt)
masked_inty_preds = pd.DataFrame(masked_inty_preds)
masked_mz_gt = pd.DataFrame(masked_mz_gt)
masked_mz_preds = pd.DataFrame(masked_mz_preds)

# print the shapes of all of the dataframes
print(inty_gt.shape)
print(inty_preds.shape)
print(mz_gt.shape)
print(mz_preds.shape)
print(masked_inty_gt.shape)
print(masked_inty_preds.shape)
print(masked_mz_gt.shape)
print(masked_mz_preds.shape)

os.makedirs('../../results/pretrain/', exist_ok=True)

inty_gt.to_pickle(f'../../results/pretrain/unknown_inty_gt.pkl')
inty_preds.to_pickle(f'../../results/pretrain/unknown_inty_preds.pkl')
mz_gt.to_pickle(f'../../results/pretrain/unknown_mz_gt.pkl')
mz_preds.to_pickle(f'../../results/pretrain/unknown_mz_preds.pkl')
masked_inty_gt.to_pickle(f'../../results/pretrain/unknown_masked_inty_gt.pkl')
masked_inty_preds.to_pickle(f'../../results/pretrain/unknown_masked_inty_preds.pkl')
masked_mz_gt.to_pickle(f'../../results/pretrain/unknown_masked_mz_gt.pkl')
masked_mz_preds.to_pickle(f'../../results/pretrain/unknown_masked_mz_preds.pkl')

# make small df with mae values
mae_df = pd.DataFrame({'MAE_inty': [mae_inty], 'MAE_mz': [mae_mz], 'MAE_inty_masked': [mae_inty_masked], 'MAE_mz_masked': [mae_mz_masked]})
mae_df.to_csv(f'../../results/pretrain/unknown_mae_df.csv')

(12274, 64)
(12274, 64)
(12274, 64)
(12274, 64)
(65846, 1)
(65846, 1)
(65846, 1)
(65846, 1)


In [39]:
known_gt_mz, known_gt_inty, preds_mz, preds_inty, masked_mz_preds, masked_mz_gt, masked_inty_preds, masked_inty_gt = get_preds(model, known_dataloader, test_known)

#get number of non-zero peaks in unknown_gt
non_zero = np.sum(np.count_nonzero(known_gt_inty, axis=-1))
print(non_zero)

#calculate the MAE of the model for both mz and inty
mae_inty = np.mean(np.abs(preds_inty - known_gt_inty)) / (non_zero/(known_gt_mz.shape[0] * known_gt_mz.shape[1]))
mae_mz  = np.mean(np.abs(preds_mz - known_gt_mz)) / (non_zero/(known_gt_mz.shape[0] * known_gt_mz.shape[1]))
print(mae_inty, mae_mz)

#calculate the MAE of the model for both mz and inty on masked data
mae_inty_masked = np.mean(np.abs(masked_inty_preds - masked_inty_gt))
mae_mz_masked  = np.mean(np.abs(masked_mz_preds - masked_mz_gt))
print(mae_inty_masked, mae_mz_masked)

Global seed set to 42
100%|██████████| 4/4 [00:01<00:00,  3.64it/s]

11451
30.49017553052135 1.3373643115507274
122.56621499548329 5.49030040877014



  masked_mz_preds = np.concatenate(np.array(masked_mz_preds).flatten())
  masked_mz_gt = np.concatenate(np.array(masked_mz_gt).flatten())
  masked_inty_preds = np.concatenate(np.array(masked_inty_preds).flatten())
  masked_inty_gt = np.concatenate(np.array(masked_inty_gt).flatten())


In [40]:
# save these to pickles
inty_gt = pd.DataFrame(known_gt_inty)
inty_preds = pd.DataFrame(preds_inty)
mz_gt = pd.DataFrame(known_gt_mz)
mz_preds = pd.DataFrame(preds_mz)
masked_inty_gt = pd.DataFrame(masked_inty_gt)
masked_inty_preds = pd.DataFrame(masked_inty_preds)
masked_mz_gt = pd.DataFrame(masked_mz_gt)
masked_mz_preds = pd.DataFrame(masked_mz_preds)

# print the shapes of all of the dataframes
print(inty_gt.shape)
print(inty_preds.shape)
print(mz_gt.shape)
print(mz_preds.shape)
print(masked_inty_gt.shape)
print(masked_inty_preds.shape)
print(masked_mz_gt.shape)
print(masked_mz_preds.shape)

os.makedirs('../../results/pretrain/', exist_ok=True)

inty_gt.to_pickle(f'../../results/pretrain/known_inty_gt.pkl')
inty_preds.to_pickle(f'../../results/pretrain/known_inty_preds.pkl')
mz_gt.to_pickle(f'../../results/pretrain/known_mz_gt.pkl')
mz_preds.to_pickle(f'../../results/pretrain/known_mz_preds.pkl')
masked_inty_gt.to_pickle(f'../../results/pretrain/known_masked_inty_gt.pkl')
masked_inty_preds.to_pickle(f'../../results/pretrain/known_masked_inty_preds.pkl')
masked_mz_gt.to_pickle(f'../../results/pretrain/known_masked_mz_gt.pkl')
masked_mz_preds.to_pickle(f'../../results/pretrain/known_masked_mz_preds.pkl')

# make small df with mae values
mae_df = pd.DataFrame({'MAE_inty': [mae_inty], 'MAE_mz': [mae_mz], 'MAE_inty_masked': [mae_inty_masked], 'MAE_mz_masked': [mae_mz_masked]})
mae_df.to_csv(f'../../results/pretrain/known_mae_df.csv')

(1000, 64)
(1000, 64)
(1000, 64)
(1000, 64)
(5535, 1)
(5535, 1)
(5535, 1)
(5535, 1)


In [48]:
casmi_gt_mz, casmi_gt_inty, preds_mz, preds_inty, masked_mz_preds, masked_mz_gt, masked_inty_preds, masked_inty_gt = get_preds(model, casmi_dataloader, test_casmi)

#get number of non-zero peaks in unknown_gt
non_zero = np.sum(np.count_nonzero(casmi_gt_inty, axis=-1))
print(non_zero)

#calculate the MAE of the model for both mz and inty
mae_inty = np.mean(np.abs(preds_inty - casmi_gt_inty)) / (non_zero/(casmi_gt_mz.shape[0] * casmi_gt_mz.shape[1]))
mae_mz  = np.mean(np.abs(preds_mz - casmi_gt_mz)) / (non_zero/(casmi_gt_mz.shape[0] * casmi_gt_mz.shape[1]))
print(mae_inty, mae_mz)

#calculate the MAE of the model for both mz and inty on masked data
mae_inty_masked = np.mean(np.abs(masked_inty_preds - masked_inty_gt))
mae_mz_masked  = np.mean(np.abs(masked_mz_preds - masked_mz_gt))
print(mae_inty_masked, mae_mz_masked)

Global seed set to 42
100%|██████████| 2/2 [00:00<00:00,  3.70it/s]

10346
33.17910303498937 2.012001481326664
122.69696176008381 7.974461207849934



  masked_mz_preds = np.concatenate(np.array(masked_mz_preds).flatten())
  masked_mz_gt = np.concatenate(np.array(masked_mz_gt).flatten())
  masked_inty_preds = np.concatenate(np.array(masked_inty_preds).flatten())
  masked_inty_gt = np.concatenate(np.array(masked_inty_gt).flatten())


In [38]:
# save these to pickles
inty_gt = pd.DataFrame(casmi_gt_inty)
inty_preds = pd.DataFrame(preds_inty)
mz_gt = pd.DataFrame(casmi_gt_mz)
mz_preds = pd.DataFrame(preds_mz)
masked_inty_gt = pd.DataFrame(masked_inty_gt)
masked_inty_preds = pd.DataFrame(masked_inty_preds)
masked_mz_gt = pd.DataFrame(masked_mz_gt)
masked_mz_preds = pd.DataFrame(masked_mz_preds)

# print the shapes of all of the dataframes
print(inty_gt.shape)
print(inty_preds.shape)
print(mz_gt.shape)
print(mz_preds.shape)
print(masked_inty_gt.shape)
print(masked_inty_preds.shape)
print(masked_mz_gt.shape)
print(masked_mz_preds.shape)

os.makedirs('../../results/pretrain/', exist_ok=True)

inty_gt.to_pickle(f'../../results/pretrain/casmi_inty_gt.pkl')
inty_preds.to_pickle(f'../../results/pretrain/casmi_inty_preds.pkl')
mz_gt.to_pickle(f'../../results/pretrain/casmi_mz_gt.pkl')
mz_preds.to_pickle(f'../../results/pretrain/casmi_mz_preds.pkl')
masked_inty_gt.to_pickle(f'../../results/pretrain/casmi_masked_inty_gt.pkl')
masked_inty_preds.to_pickle(f'../../results/pretrain/casmi_masked_inty_preds.pkl')
masked_mz_gt.to_pickle(f'../../results/pretrain/casmi_masked_mz_gt.pkl')
masked_mz_preds.to_pickle(f'../../results/pretrain/casmi_masked_mz_preds.pkl')

# make small df with mae values
mae_df = pd.DataFrame({'MAE_inty': [mae_inty], 'MAE_mz': [mae_mz], 'MAE_inty_masked': [mae_inty_masked], 'MAE_mz_masked': [mae_mz_masked]})
mae_df.to_csv(f'../../results/pretrain/casmi_mae_df.csv')

(464, 64)
(464, 64)
(464, 64)
(464, 64)
(3818, 1)
(3818, 1)
(3818, 1)
(3818, 1)
