In [1]:
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np
import random as rand
import math
import queue
import csv
from collections import OrderedDict
from IPython.display import clear_output
import csv
from heapq import merge
from sklearn import preprocessing
import gc
import os
from os import listdir
from os.path import isfile, join
import shutil
from configparser import ConfigParser
import ast
import sys
import re
from pathlib import Path
import pickle
import progressbar

from sklearn.datasets import make_circles, make_moons
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from IPython.core.debugger import set_trace
from torch.utils import data
from torch.utils.data.sampler import SubsetRandomSampler
from torch.autograd import Variable

rand.seed(37)

#from src.snapconfig import config
from src.snaputils import simulatespectra as sim
from src.snaptrain import process, dataset, model
from src.snaputils import reader

In [2]:
# Temporary config func. Original one in the project.
class config:
    """Constants and configuration-file access used throughout the notebook.

    The class is used as a namespace: all attributes are class-level and
    ``get_config`` is a static method, so nothing needs to be instantiated.
    """
    # Residue / modification symbol -> mass table.
    AAMass = OrderedDict([('A', 71.037114), ('C', 103.009185), ('D', 115.026943), ('E', 129.042593),
                          ('F', 147.068414), ('G', 57.021464), ('H', 137.058912), ('I', 113.084064),
                          ('K', 128.094963), ('L', 113.084064), ('M', 131.040485), ('N', 114.042927),
                          ('P', 97.052764), ('Q', 128.058578), ('R', 156.101111), ('S', 87.032028),
                          ('T', 101.047679), ('V', 99.068414), ('W', 186.079313), ('Y', 163.0633),
                          ('p', 79.97), ('o', 15.99), ('h', 0.98), ('c', 57.02), ('a', 42.01),
                          ('r', -17.03), ('y', 43.01), ('d', -18.01), ('t', 26.02)])

    # Modification name -> mass shift.
    ModMass = {"Oxidation": 15.994915, "CAM": 57.02146, "Carbamidomethyl": 57.02146, "ICAT_light": 227.12,
               "ICAT_heavy": 236.12, "AB_old_ICATd0": 442.20, "AB_old_ICATd8": 450.20, "Acetyl": 42.0106,
               "Deamidation": 0.9840, "Pyro-cmC": -17.026549, "Pyro-glu": -17.026549, "Pyro_glu": -18.010565,
               "Amide": -0.984016, "Phospho": 79.9663, "Methyl": 14.0157, "Carbamyl": 43.00581}

    # Mass shift written as a string -> single-character modification symbol
    # (the symbols are the lowercase keys of AAMass).
    ModCHAR = OrderedDict([("15.99", "o"), ("0.98", "h"), ("57.02", "c"), ("42.01", "a"), ("-17.03", "r"),
                           ("79.97", "p"), ("43.01", "y"), ("-18.01", "d"), ("26.02", "t")])
    # ModCHAR = {"15.99": "o", "0.98": "h", "57.02": "c", "42.01": "a", "-17.03": "r", "79.97": "p"}
    Ignore = ["U", "X"]
    Mods = [{"mod_char": "p", "aas": ["S", "T", "Y"]}
            # {"mod_char": "o", "aas": ["nt", "M"]}
           ]
    H2O = 18.015
    NH3 = 17.031
    PROTON = 1.00727647
    # Config file used when PARAM_PATH is not set: <current working dir>/config.ini
    DEFAULT_PARAM_PATH = os.path.join(os.getcwd(), 'config.ini')
    PARAM_PATH = None
    # Cache of the parsed configuration; filled on the first get_config call.
    l_config = None


    @staticmethod
    def get_config(section='input', key=None):
        """Read the configuration parameters and return a value or dictionary.

        The file at ``config.PARAM_PATH`` (or ``config.DEFAULT_PARAM_PATH`` when
        that is unset) is parsed while the cache ``config.l_config`` is empty.
        Every value is converted with ``ast.literal_eval`` and kept as the raw
        string when that conversion fails.

        Declared as a static method so that it works both as
        ``config.get_config(...)`` and on an instance.

        Args:
            section: name of the config section to look up.
            key: name of the option inside ``section``.

        Returns:
            * the option value when both ``section`` and ``key`` are found;
            * the whole section dict when ``section`` is found but ``key`` is
              falsy or missing (note: a misspelt key therefore yields a dict,
              not an error);
            * the complete ``{section: {key: value}}`` dict otherwise.
        """

        # If file path is given use it otherwise use default.
        file_path = config.PARAM_PATH if config.PARAM_PATH else config.DEFAULT_PARAM_PATH

        # Read config and convert each value to appropriate type.
        # Only for the first time.
        if not config.l_config:
            config.l_config = dict()
            config_ = ConfigParser()
            assert isinstance(file_path, str)
            config_.read(file_path)
            for section_ in config_.sections():
                config.l_config[section_] = dict()
                for key_ in config_[section_]:
                    try:
                        config.l_config[section_][key_] = ast.literal_eval(config_[section_][key_])
                    except (ValueError, SyntaxError):
                        # Not a Python literal (e.g. a bare path): keep the string.
                        config.l_config[section_][key_] = config_[section_][key_]

        if section and section in config.l_config:
            if key and key in config.l_config[section]:
                return config.l_config[section][key]
            return config.l_config[section]
        return config.l_config

In [3]:
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12344'
    dist.init_process_group(backend='nccl', world_size=world_size, rank=rank)
    # dist.init_process_group(backend='nccl', world_size=world_size, rank=rank)

def cleanup():
    """Destroy the torch.distributed process group (no arguments are passed)."""
    dist.destroy_process_group()


def apply_filter(filt, file_name):
    """Decide whether a spectrum file passes the charge / modification filter.

    The charge and the number of modifications are parsed from the 4th and
    5th numeric fields of ``file_name``, which must look like
    ``<int>-<int>-<number>-<charge>-<mods>.pt`` (or ``.npy``).

    Args:
        filt: dict with the entries ``"charge"`` and ``"mods"``. A
            ``"charge"`` of 0 disables the charge check; otherwise the file's
            charge must not exceed it. The file's modification count must not
            exceed ``"mods"``.
        file_name: name of the spectrum file.

    Returns:
        True when the file satisfies the filter, False otherwise. A name that
        does not match the expected pattern is reported and rejected instead
        of failing later on undefined variables.
    """
    # The extension is matched as a real alternation with an escaped dot; the
    # original "[pt|npy]" was a one-character class. The numeric fields are
    # left exactly as permissive as before.
    file_parts = re.search(r"(\d+)-(\d+)-(\d+.\d+)-(\d+)-(\d+)\.(?:pt|npy)", file_name)
    if file_parts is None:
        print("Skipping file with unrecognised name: {}".format(file_name))
        return False

    charge = int(file_parts[4])
    mods = int(file_parts[5])

    charge_ok = filt["charge"] == 0 or charge <= filt["charge"]
    return charge_ok and mods <= filt["mods"]


def load_file_names(filt, listing_path):
    """Load the peptide and spectra file names that satisfy the filter.

    Args:
        filt: filter dict handed unchanged to ``apply_filter``.
        listing_path: path of a pickle file holding an iterable of
            ``(peptide_file, [spectrum_file, ...])`` pairs.

    Returns:
        ``(pep_file_names, spec_file_names_lists)``: two parallel lists. A
        peptide is kept only when at least one of its spectra passes the
        filter, and only the passing spectra are listed for it.
    """
    # NOTE: pickle.load can execute arbitrary code - only open trusted listings.
    with open(listing_path, 'rb') as listing_file:
        dir_listing = pickle.load(listing_file)

    kept_pairs = []
    for pep, spec_list in dir_listing:
        passing = [spec for spec in spec_list if apply_filter(filt, spec)]
        if passing:
            kept_pairs.append((pep, passing))

    pep_file_names = [pep for pep, _ in kept_pairs]
    spec_file_names_lists = [passing for _, passing in kept_pairs]

    assert len(pep_file_names) == len(spec_file_names_lists)
    return pep_file_names, spec_file_names_lists


def psm_collate(batch):
    """Collate dataset items into one batch for the DataLoader.

    Args:
        batch: sequence of items where ``item[0]`` is a tensor of spectra,
            ``item[1]`` a peptide tensor, ``item[2]`` a decoy-peptide tensor
            (possibly empty) and ``item[3]`` a count.

    Returns:
        ``[specs, peps, dpeps, counts]`` where
        * ``specs`` is every ``item[0]`` concatenated along dim 0,
        * ``peps`` is every ``item[1]`` stacked along dim 0,
        * ``dpeps`` holds the distinct non-empty decoys that do not also occur
          in ``peps``, as a ``torch.long`` tensor; when none remain it is an
          empty tensor of shape ``(0, peps.shape[1])``,
        * ``counts`` is a numpy array of every ``item[3]``.
    """
    specs = torch.cat([item[0] for item in batch], 0)
    peps = torch.stack([item[1] for item in batch], 0)

    # torch.stack raises on an empty list, so only stack when at least one
    # item actually carries a decoy.
    decoys = [item[2] for item in batch if len(item[2]) > 0]
    if decoys:
        peps_set = set(map(tuple, peps.tolist()))
        dpeps_set = set(map(tuple, torch.stack(decoys).tolist()))
        # Drop decoys that coincide with a target peptide of this batch.
        dpeps_list = list(dpeps_set - dpeps_set.intersection(peps_set))
    else:
        dpeps_list = []

    if dpeps_list:
        dpeps = torch.tensor(dpeps_list, dtype=torch.long)
    else:
        # Keep the tensor 2-D so it can be concatenated with padding rows.
        dpeps = torch.zeros((0, peps.shape[1]), dtype=torch.long)

    counts = np.array([item[3] for item in batch])
    return [specs, peps, dpeps, counts]

In [4]:
# Run configuration for the inference cell that follows.
# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): this hard-coded path is overwritten by the config lookup
# further down in this cell, so it currently has no effect.
in_tensor_dir = "./data/train-ready/train_lstm_mods_mass_hcd_all/"
# Single-process run: rank 0 of a world of size 1.
rank = 0
world_size = 1
setup(rank, world_size)

# Hyper-parameters and filter settings come from the config file.
batch_size  = config.get_config(section="ml", key="batch_size")
#batch_size = 512
charge      = config.get_config(section='input', key='charge')
use_mods    = config.get_config(section='input', key='use_mods')
num_mods    = config.get_config(section='input', key='num_mods')
# Filter handed to load_file_names; no modifications allowed unless use_mods is set.
filt        = {'charge': charge, 'mods': num_mods if use_mods else 0}
test_size   = config.get_config(section='ml', key='test_size')

in_tensor_dir = config.get_config(section='preprocess', key='in_tensor_dir')

print(in_tensor_dir)

# Pickled peptide -> spectra listing inside the tensor directory.
listing_path = join(in_tensor_dir, 'pep_spec.pkl')

./data/train-ready/train_lstm_mods_mass_hcd_all/


In [5]:
# Embed every spectrum (Q), peptide (P) and decoy peptide (D) with the trained model.
pep_file_names, spec_file_names_lists = load_file_names(filt=filt, listing_path=listing_path)

in_dataset = dataset.LabeledSpectra(in_tensor_dir, pep_file_names, spec_file_names_lists)

vocab_size = in_dataset.vocab_size

in_loader = torch.utils.data.DataLoader(dataset=in_dataset, batch_size=batch_size, num_workers=8, collate_fn=psm_collate)

# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model_ = torch.load("./models/model-all-42-0.3-0.0005.pt")
# Inference only: put the model in evaluation mode so that layers such as
# dropout or batch-norm (if the model has any) behave deterministically.
model_.eval()

# `batch_size` is the value the loader above was built with. It is not re-read
# from the config here, so the hidden state and the padding below always
# agree with the loader.
h = model_.module.init_hidden(batch_size)

# Loop-invariant: peptide sequence length used for the zero padding.
seq_len = config.get_config(section='ml', key='pep_seq_len')

g_counts = []
Q = []
P = []
D = []
with torch.no_grad():
    with progressbar.ProgressBar(max_value=len(in_loader)) as p_bar:
        # The loop variable is deliberately not called `data`, which would
        # shadow the `torch.utils.data` module imported at the top.
        for idx, batch in enumerate(in_loader):
            h = tuple([e.data for e in h])
            specs, peps, dpeps, counts = batch
            p_len = len(peps)
            d_len = len(dpeps)

            # Pad peptides / decoys with all-zero rows up to batch_size; the
            # padded rows are cut off from the outputs again below.
            if batch_size > p_len:
                zero_pad = torch.zeros(batch_size - p_len, seq_len, dtype=torch.long)
                peps = torch.cat((peps, zero_pad))

            if batch_size > d_len:
                zero_pad = torch.zeros(batch_size - d_len, seq_len, dtype=torch.long)
                dpeps = torch.cat((dpeps, zero_pad))

            specs = specs.to(device)
            peps = peps.to(device)
            dpeps = dpeps.to(device)
            g_counts.extend(counts)

            model_out = model_([specs, peps, dpeps], h) # Q, P, D, h
            Q.append(model_out[0].to("cpu"))
            P.append(model_out[1][:p_len].to("cpu"))
            D.append(model_out[2][:d_len].to("cpu"))
            del model_out
            p_bar.update(idx)

Q = torch.cat(Q, 0)
P = torch.cat(P, 0)
D = torch.cat(D, 0)

Vocabulary Size: 30
dataset size: 2003627


100% (1957 of 1957) |####################| Elapsed Time: 0:04:00 Time:  0:04:00


RuntimeError: DataLoader worker (pid(s) 59156) exited unexpectedly

In [28]:
# Scratch check: torch.cat along dim 0 joins a (2, 3) and a (1, 3) tensor
# into a single (3, 3) tensor, as the printed result shows.
a = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
b = torch.tensor([[7., 8., 9.]])
l = [a, b]
print(torch.cat(l, 0))

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])


In [19]:
# Scratch check of list.extend with `a` and `b` from another cell.
# NOTE(review): the saved output (a flat list of ints ending in 0) does not
# match the 2-D tensors `a` / `b` defined in the cell above, and this cell's
# execution count is lower than that cell's - the output looks stale;
# re-run from a fresh kernel to confirm.
c = []
c.extend(a)
c.extend(b)
print(c)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
