# FF model


## Settings

In [59]:
from prediction_utils import get_galaxy_filename

base = "../data"
dataset = "nist"
kind = "in_database"


wv_path = f"gas2vec/{kind}.model"

data_train_path =f"{base}/{dataset}/{kind}/train.msp"
data_val_path =f"{base}/{dataset}/{kind}/val.msp"
data_test_path =f"{base}/{dataset}/{kind}/test.msp"

data_realtest_path = get_galaxy_filename('RI using kovats of Mass spectra from RAMClustR', base, ["enh", "pred"])

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data loading

In [60]:
# from matchms.importing import load_from_msp

from matchms.importing import load_from_msp
from helpers import get_mz_vector, get_his_size
from data_utils import spectrum_processing
from data_utils import FixedSizeDS
from torch.utils.data import Dataset, DataLoader
import numpy as np

import torch
import torch.nn as nn
import tqdm
import torch.nn.functional as F

# only for visual 
from spec2vec import SpectrumDocument
from data_utils import BasicCoder


In [61]:
def load_process_documents(path):
    """Load an MSP file, filter its spectra and wrap them as spectrum documents.

    Parameters
    ----------
    path : path of the MSP file to read.

    Returns
    -------
    (documents, spectrums) : two lists of equal length and order. ``spectrums``
        holds every spectrum for which ``spectrum_processing`` did not return
        ``None``; ``documents`` holds one ``SpectrumDocument`` (built with
        ``n_decimals=0``) per kept spectrum.
    """
    spectrums = []
    for raw_spectrum in load_from_msp(path, metadata_harmonization=False):
        processed = spectrum_processing(raw_spectrum, min_rel_int=None, n_required_peaks=1)
        # spectrum_processing signals "did not qualify" by returning None
        if processed is not None:
            spectrums.append(processed)

    documents = [SpectrumDocument(spectrum, n_decimals=0) for spectrum in spectrums]
    return documents, spectrums

# Loading of the train / val / test splits is disabled; only the real-world
# ("supertest") file is loaded in this notebook.
# NOTE(review): load_process_documents returns a (documents, spectrums) tuple, so the
# disabled lines below would have to unpack two values if they are re-enabled.
# documents_train = load_process_documents(data_train_path)
# documents_val = load_process_documents(data_val_path)
# documents_test = load_process_documents(data_test_path)
documents_supertest, spectrums_supertest = load_process_documents(data_realtest_path)

In [62]:

# Fixed-size datasets keyed by name.
# NOTE(review): "fixed_train" is built from the *supertest* spectra, not from training
# data (the training split is not loaded in this notebook). In the visible cells it is
# only used for a quick sanity check and to read `.max_mz` — confirm this is intended.
datasets = {
    "fixed_train": FixedSizeDS(spectrums_supertest),
    "fixed_supertest": FixedSizeDS(spectrums_supertest)
}

# Left empty: no DataLoader is created in the visible cells.
dataloaders = {
}

In [63]:
datasets["fixed_train"][0][1].sum()

tensor(0.)

## Models definition

In [64]:
class Linear(nn.Module):
    """Single fully connected layer followed by a sigmoid.

    Input and output both have ``max_mz`` features.
    """

    def __init__(self, max_mz=1001):
        super().__init__()
        # the layer is square: one output per input bin
        self.in_features = self.out_features = max_mz

        self.linear = nn.Linear(self.in_features, self.out_features)
        self.sigm = nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(linear(x))``."""
        return self.sigm(self.linear(x))
    
class MLP(nn.Module):
    """Feed-forward network: hidden linear layers with ReLU, sigmoid output.

    Parameters
    ----------
    max_mz : number of input features and of output features.
    hidden_layers : non-empty sequence with the width of each hidden layer.
    """

    def __init__(self, max_mz=1001, hidden_layers=(2000,)):
        super().__init__()
        self.in_features = self.out_features = max_mz

        assert len(hidden_layers) != 0
        # one ReLU instance is shared by every activation position
        self.relu = nn.ReLU()

        # widths of consecutive layers: input -> hidden_1 -> ... -> hidden_k
        widths = (self.in_features, *hidden_layers)
        stack = []
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            if position > 0:
                # activation goes between hidden layers, not before the first one
                stack.append(self.relu)
            stack.append(nn.Linear(fan_in, fan_out))

        self.layers = nn.Sequential(*stack)
        self.linear = nn.Linear(hidden_layers[-1], self.out_features)
        self.sigm = nn.Sigmoid()

    def forward(self, x):
        """Run the hidden stack, a final ReLU, the output layer and the sigmoid."""
        hidden = self.relu(self.layers(x))
        return self.sigm(self.linear(hidden))
    

In [65]:
# Baseline: one linear layer.
linear = Linear(datasets["fixed_train"].max_mz)

# MLPs with 1-3 hidden layers, first 1000 units wide (mlp_N), then 500 units wide (mlp_Ns).
mlp_1, mlp_2, mlp_3, mlp_1s, mlp_2s, mlp_3s = (
    MLP(datasets["fixed_train"].max_mz, (width,) * depth)
    for width in (1000, 500)
    for depth in (1, 2, 3)
)



In [66]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)


cpu


## Prediction

In [67]:
class PredictorFF():
    """Inference wrapper around a feed-forward model.

    Attributes
    ----------
    model : callable mapping a float tensor to a tensor of per-bin scores.
    max_mz : length of the vectors requested from ``get_mz_vector``.
    threshold : score above which a bin counts as predicted.
    """

    def __init__(self, model, max_mz=1001, threshold=.5):
        self.model = model
        self.max_mz = max_mz
        self.threshold = threshold

    @classmethod
    def from_file(cls, file, device="cpu", max_mz=1001, threshold=.5):
        """Build a predictor from a model stored with ``torch.save``.

        ``device`` is used as ``map_location`` for the load.
        """
        # SECURITY: torch.load unpickles the file, which can execute arbitrary code.
        # Only load model files that come from a trusted source.
        model = torch.load(file, map_location=torch.device(device))
        return cls(model, max_mz=max_mz, threshold=threshold)

    def __call__(self, X):
        """Run the model on ``X`` with gradient tracking disabled."""
        with torch.no_grad():
            return self.model(X)

    def predict_random_all(self, ds, probs, cum_level=.95, filtered=True,
                           device="cpu", **kwargs):
        """Randomly mask bins of every spectrum and predict the missing ones.

        For each spectrum in ``ds.spectrums`` and each probability ``p`` in
        ``probs``, every bin is zeroed independently with probability ``p``
        and the model is run on the masked vector.

        Parameters
        ----------
        ds : object exposing a ``spectrums`` sequence.
        probs : sequence of masking probabilities.
        cum_level : forwarded to ``get_his_size`` to size the high-intensity set.
        filtered : when true, scores of bins still present in the model input
            are set to 0, so only absent bins can be predicted.
        device : device the input tensor is moved to. NOTE(review): the model
            itself is not moved here; the caller must ensure it already lives
            on this device.
        **kwargs : accepted and ignored.

        Returns
        -------
        (some_pred_per_p, m_pred_per_p, m_y_per_p) : three lists indexed as
        ``[prob_index][spectrum_index]``, each entry an index array:

        * ``some_pred_per_p`` - bins whose score exceeds ``self.threshold``;
        * ``m_pred_per_p`` - the ``k`` highest-scoring bins, where ``k`` is the
          length of the matching ``m_y_per_p`` entry;
        * ``m_y_per_p`` - bins that were masked and belong to the
          high-intensity set.
        """
        # reset rng before each prediction to have comparable results
        self.rng = np.random.default_rng(42)

        spectrums = ds.spectrums
        n_spectrums, n_probs = len(spectrums), len(probs)
        m_pred_per_p = [[None] * n_spectrums for _ in range(n_probs)]
        m_y_per_p = [[None] * n_spectrums for _ in range(n_probs)]
        some_pred_per_p = [[None] * n_spectrums for _ in range(n_probs)]

        for i, spec in enumerate(spectrums):
            # presumably a dense per-bin intensity vector of length max_mz — TODO confirm in helpers
            vect = get_mz_vector(spec, self.max_mz)

            mat = np.zeros(shape=(n_probs, vect.shape[0]), dtype=np.float32)

            # Indices of the `his_size` largest entries of the vector.
            # A size of 0 must be handled explicitly: `[-0:]` is `[0:]` and would
            # silently select *every* bin instead of none.
            his_size = get_his_size(spec, cum_level)
            if his_size > 0:
                his_ind = np.argpartition(vect, -his_size)[-his_size:]
            else:
                his_ind = np.empty(0, dtype=np.intp)

            # The high-intensity mask does not depend on p, so build it once per spectrum.
            his_mask = np.zeros(vect.shape, dtype=bool)
            his_mask[his_ind] = True

            for m, p in enumerate(probs):
                # start from the intact vector, then cripple it
                mat[m] = vect

                # assumes vect has exactly self.max_mz entries, otherwise the mask cannot index it
                mask_missing = self.rng.uniform(0, 1, self.max_mz) < p
                mat[m][mask_missing] = 0

                m_y_per_p[m][i] = np.where(mask_missing & his_mask)[0]

            # get predictions
            with torch.no_grad():
                mat_ = torch.from_numpy(mat).to(device)
                pred = self.model(mat_)
                pred = pred.cpu().numpy()

            if filtered:
                pred[mat != 0] = 0

            for m in range(n_probs):
                # get best peaks above threshold (except the given ones)
                some_pred_per_p[m][i] = np.where(pred[m] > self.threshold)[0]

                # get best m peaks (except the given ones)
                n_wanted = len(m_y_per_p[m][i])
                m_pred_per_p[m][i] = np.argsort(pred[m])[::-1][:n_wanted]

        return some_pred_per_p, m_pred_per_p, m_y_per_p



## Model Selection
Model selection is done on the validation set.

In [68]:
# Folder path for predictions of the selected scenario.
P_FOLDER = f"predictions/{kind}"
# NOTE(review): presumably the masking probabilities to evaluate with
# predict_random_all; this list is not passed to anything in the visible cells.
probs = [0, .05, .1, .15, .2, .25, .3, .35, .4, .45, .5]

In [69]:
import os
from metrics import metrics_klj, metrics_intlj

# One predictor per decision threshold; each entry loads the linear model from disk anew.
predictors = {
    name: PredictorFF.from_file(f"models/{kind}/linear", max_mz=1001, device=device, threshold=thr)
    for name, thr in (
        ("linear_.8", .8),
        ("linear_.5", .5),
        ("linear_.3", .3),
        ("linear_.1", .1),
    )
}


In [70]:
# Selected linear predictor; the name must be a key of `predictors`
# (keys are spelled "linear_.3", with a dot before the threshold digits).
best_p_name = "linear_.3"

# MLP

In [71]:
# NOTE(review): this cell repeats the identical P_FOLDER / probs definitions from the
# linear model-selection section; the values are unchanged.
P_FOLDER = f"predictions/{kind}"
probs = [0, .05, .1, .15, .2, .25, .3, .35, .4, .45, .5]

In [72]:
import os
from metrics import metrics_klj, metrics_intlj

# One predictor per (architecture, threshold) pair; every entry loads its model from
# disk anew. This rebinds `predictors`, replacing the linear predictors defined earlier.
predictors = {
    name: PredictorFF.from_file(path, max_mz=1001, device=device, threshold=thr)
    for path, named_thresholds in (
        (f"models/{kind}/mlp_1",
         (("mlp_1_.8", .8), ("mlp_1_.5", .5), ("mlp_1_.3", .3), ("mlp_1_.1", .1))),
        (f"models/{kind}/mlp_2",
         (("mlp_2_.8", .8), ("mlp_2_.5", .5), ("mlp_2_.3", .3), ("mlp_2_.1", .1))),
        (f"models/{kind}/mlp_3",
         (("mlp_3_.8", .8), ("mlp_3_.5", .5), ("mlp_3_.3", .3), ("mlp_3_.1", .1))),
        (f"models/{kind}/mlp_2s",
         (("mlp_2s_.8", .8), ("mlp_2s_.5", .5), ("mlp_2s_.3", .3), ("mlp_2s_.1", .1))),
    )
    for name, thr in named_thresholds
}


In [73]:
P_FOLDER

'predictions/in_database'

In [85]:
# Name of the predictor used for the supertest run; must be a key of `predictors`.
best_p_name = "mlp_2_.3"

## Supertest

In [86]:
# Dataset holding the real-world spectra.
ds = datasets["fixed_supertest"]
# Only used by the loader_gen scratch cells further down.
batch_size = 1
# Predictor selected by name in the previous cell.
predictor = predictors[best_p_name]

In [87]:
# A single masking probability of 0 removes nothing, so the model sees each spectrum intact.
# With the default filtered=True, bins already present in the input are excluded from the result.
some_pred_per_p, _, _ = predictor.predict_random_all(ds,[0])
# Per spectrum: indices of the bins whose score exceeds the predictor's threshold.
preds = some_pred_per_p[0]

In [88]:
from prediction_utils import enhance_spectra, predict_spectra

# Collect the results of both helpers into lists.
# NOTE(review): the meaning of the literal 5 is defined in prediction_utils — confirm and name it.
enhanced_spectra = list(enhance_spectra(spectrums_supertest, preds, 5))
predicted_spectra = list(predict_spectra(spectrums_supertest, preds, 5))


In [89]:
from matchms.exporting import save_as_msp

In [90]:
# Write both result sets next to the input file; `[:-4]` drops the last four characters
# of the path (assumes it ends in ".msp" — TODO confirm).
# NOTE(review): depending on the matchms version, save_as_msp may append to an existing
# file, so re-running this cell could duplicate spectra — confirm.
save_as_msp(enhanced_spectra, f"{data_realtest_path[:-4]}_enh_{best_p_name}.msp")
save_as_msp(predicted_spectra, f"{data_realtest_path[:-4]}_pred_{best_p_name}.msp")

# NOTE(review): `put` is not defined or imported in the visible cells; presumably it is
# provided by the hosting environment to upload the files — confirm.
put(f"{data_realtest_path[:-4]}_enh_{best_p_name}.msp")
put(f"{data_realtest_path[:-4]}_pred_{best_p_name}.msp")

In [39]:
some_pred_per_p

[[array([79]),
  array([77]),
  array([], dtype=int64),
  array([ 81, 156, 226, 241]),
  array([279]),
  array([174]),
  array([250]),
  array([137, 140, 167, 203]),
  array([191]),
  array([221]),
  array([], dtype=int64),
  array([159, 242]),
  array([229]),
  array([], dtype=int64),
  array([186, 187]),
  array([], dtype=int64),
  array([190]),
  array([122, 158, 230]),
  array([], dtype=int64),
  array([181, 194, 197, 224, 226]),
  array([159]),
  array([104, 133, 257]),
  array([105, 249]),
  array([131]),
  array([], dtype=int64),
  array([219]),
  array([328]),
  array([165]),
  array([157]),
  array([315]),
  array([], dtype=int64),
  array([], dtype=int64),
  array([], dtype=int64),
  array([232]),
  array([254, 312]),
  array([296, 401]),
  array([316]),
  array([266, 286]),
  array([155, 212, 223, 348]),
  array([260]),
  array([274]),
  array([], dtype=int64),
  array([133, 152, 212]),
  array([ 90, 134]),
  array([120]),
  array([268]),
  array([134, 292]),
  array([218]),

In [53]:
# NOTE(review): looks like a deliberate guard that stops "Run All" before the scratch
# cells below — confirm, or remove it together with those cells.
assert False

AssertionError: 

In [33]:
def loader_gen(ds, batch_size, n_features=1001):
    """Yield ``(inputs, targets)`` batches built from an indexable dataset.

    Parameters
    ----------
    ds : sized, indexable dataset; ``ds[k][0]`` is the input row and
        ``ds[k][1]`` the target row of sample ``k``.
    batch_size : number of rows per batch; must be positive.
    n_features : width of every row (defaults to 1001).

    Yields
    ------
    Tuple of two ``(batch_size, n_features)`` tensors. Every sample appears
    exactly once. When ``len(ds)`` is not a multiple of ``batch_size``, the
    last batch is yielded once, with the unused trailing rows left at zero.

    Raises
    ------
    ValueError : if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    n_samples = len(ds)
    for start in range(0, n_samples, batch_size):
        inputs = torch.zeros(batch_size, n_features)
        targets = torch.zeros(batch_size, n_features)

        stop = min(start + batch_size, n_samples)
        for row, index in enumerate(range(start, stop)):
            # fetch each sample once so input and target always belong together
            sample = ds[index]
            inputs[row] = sample[0]
            targets[row] = sample[1]

        yield inputs, targets
            
loader = loader_gen(ds, batch_size)
# Number of batches is ceil(len(ds) / batch_size); the former `// batch_size + 1`
# over-counted by one whenever len(ds) is an exact multiple of batch_size.
len_loader = (len(ds) + batch_size - 1) // batch_size


# Peak intensities of every spectrum, sorted in descending order.
X_intens = [np.sort(s.peaks.intensities)[::-1] for s in ds.spectrums]



In [35]:
len(ds)

68

In [36]:
# NOTE(review): this cell cannot run as written:
#   * PredictorFF, as defined in this notebook, has no `predict_l_next` method;
#   * `X_dict_batch` is only assigned in a commented-out line, the loop variable is `X_dict_batch_`;
#   * `preds = []` discards the supertest predictions stored in `preds` earlier;
#   * the import below rebinds the name `tqdm` from the module (imported at the top) to the function.
# It looks like a leftover from a different predictor — fix or delete.
from tqdm import tqdm
preds = []
for b, X_dict_batch_ in enumerate(tqdm(loader)):
    #print(X_dict_batch_)
#     X_dict_batch = {"input_ids": torch.atleast_2d(X_dict_batch_["input_ids"][X_dict_batch_["attention_mask"] == 1])}
    #print(X_dict_batch)
    pred = predictor.predict_l_next(X_dict_batch, l=10)[0]
#     pred = predictor.coder.index2mz[pred]
    preds.append(pred)

0it [00:00, ?it/s]


AttributeError: 'PredictorFF' object has no attribute 'predict_l_next'

In [None]:
len(preds)

In [None]:
hugg_ds.ref_docs[0]

In [None]:
spectrums_supertest[1].metadata

In [None]:
spectrums_supertest[0].peaks.mz