In [1]:
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np
import random as rand
import queue
import csv
from collections import OrderedDict
from IPython.display import clear_output
import csv
from heapq import merge
from sklearn import preprocessing
import gc
import os
from os import listdir
from os.path import isfile, join
import shutil
from configparser import ConfigParser
import ast
import sys
import re
from pathlib import Path

from sklearn.datasets import make_circles, make_moons
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from IPython.core.debugger import set_trace
from torch.utils import data
from torch.utils.data.sampler import SubsetRandomSampler
from torch.autograd import Variable

rand.seed(37)

#from src.snapconfig import config
from src.snapprocess import simulatespectra as sim
from src.snapprocess import process
from src.snaputils import reader

In [2]:
# Temporary config func. Original one in the project.
class config:
    """Define constants"""
    AAMass = {'A': 71.037114, 'C': 103.009185, 'D': 115.026943, 'E': 129.042593, 'F': 147.068414, 'G': 57.021464,
              'H': 137.058912, 'I': 113.084064, 'K': 128.094963, 'L': 113.084064, 'M': 131.040485, 'N': 114.042927,
              'P': 97.052764, 'Q': 128.058578, 'R': 156.101111, 'S': 87.032028, 'T': 101.047679, 'V': 99.068414,
              'W': 186.079313, 'Y': 163.0633}

    H2O = 18.015
    NH3 = 17.031
    PROTON = 1.00727647
    DEFAULT_PARAM_PATH = os.path.join(os.getcwd(), 'config.ini')
    PARAM_PATH = None
    l_config = None


    def get_config(section='input', key=None):
        """Read the configuration parameters and return a dictionary."""

        # If file path is given use it otherwise use default.
        file_path = config.PARAM_PATH if config.PARAM_PATH else config.DEFAULT_PARAM_PATH

        # Read config and convert each value to appropriate type.
        # Only for the first time.
        if not config.l_config:
            config.l_config = dict()
            config_ = ConfigParser()
            assert isinstance(file_path, str)
            config_.read(file_path)
            for section_ in config_.sections():
                config.l_config[section_] = dict()
                for key_ in config_[section_]:
                    try:
                        config.l_config[section_][key_] = ast.literal_eval(config_[section_][key_])
                    except (ValueError, SyntaxError):
                        config.l_config[section_][key_] = config_[section_][key_]

        if section and section in config.l_config:
            if key and key in config.l_config[section]:
                return config.l_config[section][key]
            return config.l_config[section]
        return config.l_config

In [3]:
def preprocess_msps(msp_dir, out_dir):
    in_path = Path(msp_dir)
    assert in_path.exists() and in_path.is_dir()
    
    msp_files = [join(msp_dir, f) for f in listdir(msp_dir) if
                 isfile(join(msp_dir, f)) and f.split('.')[-1] == 'msp']
    assert len(msp_files) > 0
    
    out_path = Path(out_dir)
    if out_path.exists() and out_path.is_dir():
        shutil.rmtree(out_path)
    out_path.mkdir()
    Path(join(out_path, 'spectra')).mkdir()
    Path(join(out_path, 'peptides')).mkdir()
        
    print('reading {} files'.format(len(msp_files)))
    
    count = 0
    max_peaks = max_moz = 0
    for species_id, msp_file in enumerate(msp_files):
        print('Reading: {}'.format(msp_file))
        
        f = open(msp_file, "r")
        lines = f.readlines()
        f.close()

        pep_list = []
        dataset = []
        label = []

        # FIXME: config should use only one get_config call.
        spec_size = config.get_config(section='input', key='spec_size')
        charge = config.get_config(section='input', key='charge')
        use_mods = config.get_config(section='input', key='use_mods')
        num_species = config.get_config(section='input', key='num_species')
        seq_len = config.get_config(section='ml', key='pep_seq_len')

        print('len of file: ' + str(len(lines)))
        limit = 200000
        pep = []
        spec = []
        pep_set = set()
        is_name = is_mw = is_num_peaks = False
        prev = 0
        i = 0
        while i < len(lines) and limit > 0:
            line = lines[i]
            i += 1
            if line.startswith('Name:'):
                name_groups = re.search(r"Name:\s(?P<pep>[a-zA-Z]+)/(?P<charge>\d+)"
                                        r"(?:_(?P<num_mods>\d+)(?P<mods>.*))?", line)
                if not name_groups:
                    continue
                    
                pep = name_groups['pep']
                if len(pep) + 1 > seq_len:
                    continue
                    
                l_charge = int(name_groups['charge'])
                num_mods = int(name_groups['num_mods'])

                is_name = True

            if is_name and line.startswith('MW:'):
                mass = float(re.findall(r"MW:\s([-+]?[0-9]*\.?[0-9]*)", line)[0])
                if round(mass) < spec_size:
                    is_mw = True
                    # limit = limit - 1
                else:
                    is_name = is_mw = is_num_peaks = False
                    continue

            if is_name and is_mw and line.startswith('Num peaks:'):
                num_peaks = int(re.findall(r"Num peaks:\s([0-9]*\.?[0-9]*)", line)[0])
                if num_peaks > max_peaks:
                    max_peaks = num_peaks

                spec = np.zeros(spec_size)
                while lines[i] != '\n':
                    mz_line = lines[i]
                    i += 1
                    mz_splits = mz_line.split('\t')
                    moz, intensity = float(mz_splits[0]), float(mz_splits[1])
                    if moz > max_moz:
                        max_moz = moz
                    spec[round(moz)] += round(intensity)

                # for k in range(1, charge + 1):
                #     spec[-k] = 0
                # spec[-l_charge] = 1000.0
                spec = np.clip(spec, None, 1000.0)
                # spec = preprocessing.scale(spec)

                is_num_peaks = True

            if is_name and is_mw and is_num_peaks:
                is_name = is_mw = is_num_peaks = False
                
                #pep = '{}{}{}'.format(charge, species_id, pep)

                """output the data to """
                spec_tensor = torch.tensor((np.asarray(spec) - 3.725) / 51.479, dtype=torch.float)
                
                torch.save(spec_tensor, 
                           join(out_dir, 'spectra', '{}-{}-{}-{}-{}.pt'
                                .format(count, species_id, mass, l_charge, int(num_mods > 0))))
                
                pep_file_name = '{}-{}-{}-{}-{}.pep'.format(count, species_id, mass, l_charge, int(num_mods > 0))
                    
                with open(join(out_path, 'peptides', pep_file_name), 'w+') as f:
                    f.write(pep)

                count = count + 1
                pep = 0
                spec = []
                new = int((i / len(lines)) * 100)
                if new > prev:
                    clear_output(wait=True)
                    print(str(new) + '%')
                    prev = new

        print('max peaks: ' + str(max_peaks))
        print('count: ' + str(count))
        print('max moz: ' + str(max_moz))
#         return pep_list, dataset, label
#         tmp_pep_list, tmp_dataset, tmp_labels = read_msp(msp_file, species_id, decoy)
#         pep_list.extend(tmp_dataset)
#         dataset.extend(tmp_dataset)
#         label.extend(tmp_labels)

In [4]:
msp_dir = "/disk/raptor/lclhome/mtari008/Proteogenomics/DeepSNAP/data/msp-labeled/"
in_tensor_dir = "/disk/raptor/lclhome/mtari008/Proteogenomics/DeepSNAP/data/train_lstm/"

In [5]:
# transformer = transforms.Normalize(mean=[3.725], std=[51.479])
# preprocess_msps(msp_dir, in_tensor_dir)

In [168]:
class LabeledSpectra(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, dir_path, filt, test=False):
        'Initialization'
        
        self.aas         = ['_PAD'] + list(config.AAMass.keys())
        self.aa2idx      = {a:i for i, a in enumerate(self.aas)}
        self.idx2aa      = {i:a for i, a in enumerate(self.aas)}
        
        self.spec_path   = join(dir_path, 'spectra')
        self.pep_path    = join(dir_path, 'peptides')
        self.charge      = filt['charge'] if 'charge' in filt else config.get_config(section='input', key='charge')
        self.num_species = config.get_config(section='input', key='num_species')
        # self.vocab_size  = len(self.aa2idx) + self.charge + self.num_species + 1
        self.vocab_size  = round(max(config.AAMass.values())) + 1
        self.seq_len     = config.get_config(section='ml', key='pep_seq_len')
        self.modified    = filt['modified'] if 'modified' in filt else False
        self.test_size   = config.get_config(section='ml', key='test_size')
        self.test        = test
        
        self.file_names  = []
        for file in listdir(self.spec_path):
            if self.apply_filter(file):
                self.file_names.append(file)
        
        print('dataset size: {}'.format(len(self.file_names)))        
        
        self.train_files, self.test_files = train_test_split(
            self.file_names, test_size = self.test_size, random_state = rand.randint(0, 1000), shuffle = True)
        
        if self.test:
            print('test size: {}'.format(len(self.test_files)))
        else:
            print('train size: {}'.format(len(self.train_files)))
        
    def __len__(self):
        'Denotes the total number of samples'
        if self.test:
            return len(self.test_files)
        else:
            return len(self.train_files)

    def __getitem__(self, index):
        'Generates one sample of data'
        file_name = ''
        # Select sample
        if self.test:
            file_name = self.test_files[index]
        else:
            file_name = self.train_files[index]
            
        spec_file_name = join(self.spec_path, file_name)
        pep_file_name  = join(self.pep_path, file_name.replace('.pt', '.pep'))
        
        # Load data and get label
        spec_torch = torch.load(spec_file_name)
        
        # Load peptide and convert to idx array
        f = open(pep_file_name, "r")
        pep = f.readlines()[0].strip()
        f.close()
        
        pepl = np.zeros(len(pep))
        file_parts = re.search(r"(\d+)-(\d+)-(\d+.\d+)-(\d)-(0|1).pt", file_name)
        pepl[0] = int(file_parts[4]) + len(self.aas)  # coded value of charge
        pepl[1] = int(file_parts[2]) + self.charge + 1 + len(self.aas) # coded value of specie id
        
        # for i in range(2, len(pep)):
        #     pepl[i] = self.aa2idx[pep[i]]
        for i, aa in enumerate(pep[2:]):
            pepl[i + 2] = self.aa2idx[aa]
            # pepl[i + 2] = round(config.AAMass[aa])
        
        pepl = self.pad_left(pepl, self.seq_len)
        pep_torch = torch.tensor(pepl, dtype=torch.long)
        
        return [spec_torch, pep_torch]
    
    def apply_filter(self, file_name):
        file_parts = re.search(r"(\d+)-(\d+)-(\d+.\d+)-(\d)-(0|1).pt", file_name)
        charge = int(file_parts[4])
        modified = bool(int(file_parts[5]))
        
        if ((self.charge == 0 or charge <= self.charge)
            and (self.modified or self.modified == modified)):
            return True
        
        return False
    
    def pad_left(self, arr, size):
        out = np.zeros(size)
        out[-len(arr):] = arr
        return out

In [7]:
# data_mean = np.mean(dataset)
# data_std = np.std(dataset)

In [8]:
# batch_size = config.get_config(section='ml', key='batch_size')
batch_size = 256
charge = config.get_config(section='input', key='charge')
use_mods = config.get_config(section='input', key='use_mods')
filt = {'charge':charge, 'modified':use_mods}


train_dataset = LabeledSpectra(in_tensor_dir, filt, test=False)
test_dataset = LabeledSpectra(in_tensor_dir, filt, test=True)

train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=8)

test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=batch_size, shuffle=False, drop_last=True, num_workers=8)

dataset size: 587162
train size: 469729
dataset size: 587162
test size: 117433


In [176]:
do_learn = True
save_frequency = 2
lr = 0.0001
num_epochs = 200
weight_decay = 0.0001
margin = 0.2
vocab_size = train_dataset.vocab_size
#torch.manual_seed(0)
#torch.cuda.manual_seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cuda')

In [177]:
#train_loader = torch.load('train_loader.pt')
#test_loader = torch.load('test_loader.pt')

In [178]:
class Net(nn.Module):
    def __init__(self, vocab_size, output_size=512, embedding_dim=512, hidden_lstm_dim=1024, lstm_layers=2, drop_prob=0.5):
        super(Net, self).__init__()
        
        self.spec_size = config.get_config(section='input', key='spec_size')
        self.output_size = output_size
        self.lstm_layers = lstm_layers
        self.hidden_lstm_dim = hidden_lstm_dim
        self.embedding_dim = embedding_dim
        self.searching = False
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, self.hidden_lstm_dim, self.lstm_layers, 
                            dropout=drop_prob, batch_first=True, bidirectional=True)
        
        self.linear1_1 = nn.Linear(self.spec_size, 1024)
        self.linear1_2 = nn.Linear(1024, 512)
        self.linear1_3 = nn.Linear(512, 256)
        
        self.linear2_1 = nn.Linear(2048, 1024)
        self.linear2_2 = nn.Linear(1024, 512)
        self.linear2_3 = nn.Linear(512, 256)
        
        self.dropout1 = nn.Dropout(0.5)
        self.dropout2 = nn.Dropout(0.2)
        #self.dropout3 = nn.Dropout(0.3)
        
    def forward(self, data, hidden):
        specs = data[0]
        peps = data[1]
        # print(peps.type())
        # print('Input to the model size: {}'.format(specs.size()))
        # print('Input to the model size: {}'.format(peps.size()))
        # peps = peps.unsqueeze(-1).float()
        # print(peps.type())
        
        embeds = self.embedding(peps)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # print(lstm_out.size())
        lstm_out = lstm_out[:, -1, :]
        # print(lstm_out.size())
        #lstm_out = torch.mean(lstm_out, dim=1)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_lstm_dim * 2)
        out = self.dropout2(lstm_out)
        
        out = self.linear2_1(out)
        out = F.relu(out)
        out = self.dropout2(out)
        
        out = self.linear2_2(out)
        out = F.relu(out)
        
        # out = out.view(batch_size, peps.size()[1], 512)
        # out = out[:,-1,:]
        out_pep = F.normalize(out)
        
        out = self.linear1_1(specs.view(-1, self.spec_size))
        out = F.relu(out)
        out = self.dropout2(out)
        out = self.linear1_2(out)
        out = F.relu(out)
        out_spec = F.normalize(out)
        
        res = out_spec, out_pep, hidden
        return res
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.lstm_layers * 2, batch_size, self.hidden_lstm_dim).zero_().to(device),
                      weight.new(self.lstm_layers * 2, batch_size, self.hidden_lstm_dim).zero_().to(device))
        return hidden
    
    def name(self):
        return "Net"

In [179]:
#hinge = torch.nn.HingeEmbeddingLoss()
triplet_loss = nn.TripletMarginLoss(margin=margin, p=2, reduction='sum')
l2_squared = nn.MSELoss(reduction='none')
pdist = nn.PairwiseDistance(p=2)
zero_tensor = torch.tensor(0.).to(device)
clip = 5

In [180]:
def train(model, device, train_loader, epoch, optimizer):
    model.train()
    h = model.init_hidden(batch_size)
    
    accurate_labels = 0
    all_labels = 0
    for (batch_idx, data) in enumerate(train_loader):
        h = tuple([e.data for e in h])
        data[0], data[1] = data[0].to(device), data[1].to(device)
        
        optimizer.zero_grad()
        
        Q, P, h = model(data, h)
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        """Mine the hardest triplets. Get rid of N.""" 
        QxQ = process.pairwise_distances(Q)    # calculate distance matrix for spectra
        PxP = process.pairwise_distances(P)    # calculate distance matrix for peptides
        QxP_ = process.pairwise_distances(Q, P) # calculate distance matrix for spectra-peptides
        
        # Set the diagonal of all distance matrices to inf so we don't get self as the closest negative.
        QxQ.fill_diagonal_(float("inf"))
        PxP.fill_diagonal_(float("inf"))
        QxP = QxP_.clone()
        QxP.fill_diagonal_(float("inf"))
        
        #print(QP.argmin(1)[:100])
        
#         pos = torch.sum(l2_squared(Q, P), dim=1) + margin
        
#         QxQ_min = QxQ.gather(1, torch.randint(len(Q), (len(Q),), device=device).view(-1,1))             # farthest spectrum for each spectrum
#         PxP_min = PxP.gather(1, torch.randint(len(Q), (len(Q),), device=device).view(-1,1))             # farthest peptide for each peptide
#         QxP_min = QxP.gather(1, torch.randint(len(Q), (len(Q),), device=device).view(-1,1))             # farthest peptide for each spectrum
#         PxQ_min = QxP.gather(0, torch.randint(len(Q), (len(Q),), device=device).view(1,-1))             # farthest spectrum for each peptide
        
        QxQ_min = QxQ.min(1).values              # nearest spectrum for each spectrum
        PxP_min = PxP.min(1).values              # nearest peptide for each peptide
        QxP_min = QxP.min(1).values              # nearest peptide for each spectrum
        PxQ_min = QxP.min(0).values              # nearest spectrum for each peptide
        
        #neg = QxQ_min + PxP_min + QxP_min + PxQ_min
        
#         divider = torch.tensor(float(len(pos)))
#         loss = torch.sum(torch.max(pos - QxQ_min, zero_tensor)) / divider
#         loss += torch.sum(torch.max(pos - PxP_min, zero_tensor)) / divider
#         loss += torch.sum(torch.max(pos - QxP_min, zero_tensor)) / divider
#         loss += torch.sum(torch.max(pos - PxQ_min, zero_tensor)) / divider
        
        #divider = torch.sum(pos - neg > 0)
        #loss = torch.sum(torch.max(pos - neg, zero_tensor)) / divider
        
        QxQ_min = Q[QxQ.min(1).indices]              # nearest spectrum for each spectrum
        PxP_min = P[PxP.min(1).indices]              # nearest peptide for each peptide
        QxP_min = P[QxP.min(1).indices]              # nearest peptide for each spectrum
        PxQ_min = Q[QxP.min(0).indices]              # nearest spectrum for each peptide
        loss = triplet_loss(Q, P, QxQ_min)           # spectrum-spectrum negatives
        loss += triplet_loss(Q, P, QxP_min)          # spectrum-peptide negatives
        loss += triplet_loss(P, Q, PxP_min)          # peptide-peptide negatives
        loss += triplet_loss(P, Q, PxQ_min)          # peptide-spectrum negatives
        
        loss = loss / 4
                
        loss.backward()
            
        optimizer.step()
        
        seq = torch.arange(0, len(Q), step=1, device=device, requires_grad=False)
        accurate_labels = accurate_labels + torch.sum(QxP_.argmin(1) == seq) # use QP_ since it doesn't have diag set to zero
        
        all_labels = all_labels + len(Q)  
    
    accuracy = 100. * float(accurate_labels) / all_labels
    train_accuracy.append(accuracy)
    train_loss.append(loss)
    print('Epoch: ' + str(epoch))
    print('Train accuracy: {}/{} ({:.3f}%)\tLoss: {:.6f}'.format(accurate_labels, all_labels, accuracy, loss))
    

In [181]:
def test(model, device, test_loader):
    model.eval()
    
    with torch.no_grad():
        accurate_labels = 0
        all_labels = 0
        loss = 0
        h = model.init_hidden(batch_size)
        
        for (batch_idx, data) in enumerate(test_loader):
            h = tuple([e.data for e in h])
            data[0], data[1] = data[0].to(device), data[1].to(device)
        
            optimizer.zero_grad()
            
            Q, P, h = model(data, h)
            
            """Mine the hardest triplets. Get rid of N.""" 
            QxQ = process.pairwise_distances(Q)    # calculate distance matrix for spectra
            PxP = process.pairwise_distances(P)    # calculate distance matrix for peptides
            QxP_ = process.pairwise_distances(Q, P) # calculate distance matrix for spectra-peptides

            # Set the diagonal of all distance matrices to inf so we don't get self as the closest negative.
            QxQ.fill_diagonal_(float("inf"))
            PxP.fill_diagonal_(float("inf"))
            QxP = QxP_.clone()    # clone to measure accuracy. can be done in a better way.
            QxP.fill_diagonal_(float("inf"))

            #print(QP.argmin(1)[:100])

            pos = 4 * (torch.sum(l2_squared(Q, P), dim=1) + margin)

            QxQ_min = QxQ.min(1).values              # farthest spectrum for each spectrum
            PxP_min = PxP.min(1).values              # farthest peptide for each peptide
            QxP_min = QxP.min(1).values              # farthest peptide for each spectrum
            PxQ_min = QxP.min(0).values              # farthest spectrum for each peptide

            #neg = QxQ_min + PxP_min + QxP_min + PxQ_min
        
#             divider = torch.tensor(float(len(pos)))
#             loss = torch.sum(torch.max(pos - QxQ_min, zero_tensor)) / divider
#             loss += torch.sum(torch.max(pos - PxP_min, zero_tensor)) / divider
#             loss += torch.sum(torch.max(pos - QxP_min, zero_tensor)) / divider
#             loss += torch.sum(torch.max(pos - PxQ_min, zero_tensor)) / divider

            QxQ_min = Q[QxQ.min(1).indices]              # nearest spectrum for each spectrum
            PxP_min = P[PxP.min(1).indices]              # nearest peptide for each peptide
            QxP_min = P[QxP.min(1).indices]              # nearest peptide for each spectrum
            PxQ_min = Q[QxP.min(0).indices]              # nearest spectrum for each peptide
            loss = triplet_loss(Q, P, QxQ_min)     # spectrum-spectrum negatives
            loss += triplet_loss(Q, P, QxP_min)    # spectrum-peptide negatives
            loss += triplet_loss(P, Q, PxP_min)    # peptide-peptide negatives
            loss += triplet_loss(P, Q, PxQ_min)    # peptide-spectrum negatives
            
            loss = loss / 4

            #divider = torch.tensor(float(len(pos)))
            #divider = torch.sum(pos - neg > 0)
            #loss = torch.sum(torch.max(pos - neg, zero_tensor)) / divider
            
#             loss =  torch.sum(torch.max(pos - QxQ_min, zero_tensor)) / divider
#             loss += torch.sum(torch.max(pos - PxP_min, zero_tensor)) / divider
#             loss += torch.sum(torch.max(pos - QxP_min, zero_tensor)) / divider
#             loss += torch.sum(torch.max(pos - PxQ_min, zero_tensor)) / divider
            
            seq = torch.arange(0, len(Q), step=1, device=device, requires_grad=False)
            accurate_labels = accurate_labels + torch.sum(QxP_.argmin(1) == seq) # use QP_ since it doesn't have diag set to zero
            
            all_labels = all_labels + len(Q)
                
        accuracy = 100. * float(accurate_labels) / all_labels
        test_accuracy.append(accuracy)
        test_loss.append(loss)
        print('Test accuracy: {}/{} ({:.3f}%)\tLoss: {:.6f}'.format(accurate_labels, all_labels, accuracy, loss))

In [182]:
def oneshot(model, device, data):
    model.eval()
    
    with torch.no_grad():
        for i in range(len(data)):
            data[i] = data[i].to(device)
            
        output = model(data)
        return torch.squeeze(torch.argmax(output, dim=1)).cpu().item()

In [183]:
#with redirect_output("deepSNAP_redirect.txt"):
train_loss = []
test_loss = []
train_accuracy = []
test_accuracy = []
#drop_prob=0.5
print(vocab_size)
model = Net(vocab_size, output_size=512, embedding_dim=256, hidden_lstm_dim=1024, lstm_layers=1).to(device)
# model.linear1_1.weight.requires_grad = False
# model.linear1_1.bias.requires_grad = False
# model.linear1_2.weight.requires_grad = False
# model.linear1_2.bias.requires_grad = False

if do_learn: # training mode

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    #optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        train(model, device, train_loader, epoch, optimizer)
        test(model, device, test_loader)

187
Epoch: 0
Train accuracy: 12897/469504 (2.747%)	Loss: 52.419701
Test accuracy: 30348/117248 (25.884%)	Loss: 51.912865
Epoch: 1
Train accuracy: 72499/469504 (15.442%)	Loss: 51.678970
Test accuracy: 51587/117248 (43.998%)	Loss: 51.370903
Epoch: 2
Train accuracy: 204240/469504 (43.501%)	Loss: 49.222218
Test accuracy: 77877/117248 (66.421%)	Loss: 46.636387
Epoch: 3
Train accuracy: 314567/469504 (67.000%)	Loss: 34.650257
Test accuracy: 94243/117248 (80.379%)	Loss: 26.470903
Epoch: 4
Train accuracy: 365597/469504 (77.869%)	Loss: 29.054005
Test accuracy: 100180/117248 (85.443%)	Loss: 20.521399
Epoch: 5
Train accuracy: 387634/469504 (82.562%)	Loss: 25.808140
Test accuracy: 103421/117248 (88.207%)	Loss: 17.253811
Epoch: 6
Train accuracy: 401285/469504 (85.470%)	Loss: 21.894783
Test accuracy: 105527/117248 (90.003%)	Loss: 14.608198
Epoch: 7
Train accuracy: 411182/469504 (87.578%)	Loss: 19.986324
Test accuracy: 107205/117248 (91.434%)	Loss: 13.615899
Epoch: 8
Train accuracy: 418643/469504 (89.

Epoch: 69
Train accuracy: 466686/469504 (99.400%)	Loss: 2.830349
Test accuracy: 114556/117248 (97.704%)	Loss: 3.297306
Epoch: 70
Train accuracy: 466675/469504 (99.397%)	Loss: 1.839272
Test accuracy: 114619/117248 (97.758%)	Loss: 3.069195
Epoch: 71
Train accuracy: 466799/469504 (99.424%)	Loss: 3.193799
Test accuracy: 114601/117248 (97.742%)	Loss: 3.093598
Epoch: 72
Train accuracy: 466913/469504 (99.448%)	Loss: 1.718107
Test accuracy: 114581/117248 (97.725%)	Loss: 3.119102
Epoch: 73
Train accuracy: 466821/469504 (99.429%)	Loss: 2.940211
Test accuracy: 114607/117248 (97.748%)	Loss: 3.488417
Epoch: 74
Train accuracy: 466896/469504 (99.445%)	Loss: 3.328394
Test accuracy: 114586/117248 (97.730%)	Loss: 3.625930
Epoch: 75
Train accuracy: 467020/469504 (99.471%)	Loss: 2.340656
Test accuracy: 114585/117248 (97.729%)	Loss: 3.791187
Epoch: 76
Train accuracy: 466998/469504 (99.466%)	Loss: 3.920566
Test accuracy: 114584/117248 (97.728%)	Loss: 3.633781
Epoch: 77
Train accuracy: 467006/469504 (99.468%

KeyboardInterrupt: 

In [None]:
output.show()

In [184]:
torch.save(model, 'models/lstm_97.7%_v3.0.pt')

  "type " + obj.__name__ + ". It won't be checked "


remove modifications  
use one charge

In [None]:
T = torch.tensor([[1., 1.], [1., 2.], [3., 4.]])
S = torch.tensor([[2., 3.], [3., 2.], [4., 3.], [5, 3]])

mul = torch.mm(T, S.t())
print('mul: ')
print(mul)
adder = torch.tensor([1., 2., 3., 4.])
added = adder + mul
print('added: ')
print(added)

norm = T.pow(2).sum(1)
print(norm)
exp_norm = norm.expand(4, -1).t()
print(exp_norm)
# pdist = nn.PairwiseDistance(p=2)
# output = pdist(T, S)
# output
# print(T)
# print(T.t())
# test = torch.tensor([1, 2, 3])
# expanded_test = test.expand(3, -1)
# print(expanded_test)

In [None]:
def pairwise_distance(A, B):
    """Compute the 2D matrix of distances between all the embeddings.

    Args:
        embeddings: tensor of shape (batch_size, embed_dim)
        squared: Boolean. If true, output is the pairwise squared euclidean distance matrix.
                 If false, output is the pairwise euclidean distance matrix.

    Returns:
        pairwise_distances: tensor of shape (batch_size, batch_size)
    """
    # Get the dot product between all embeddings
    # shape (batch_size, batch_size)
    dot_product = torch.mm(A, B.t())

    # Get squared L2 norm for each embedding. We can just take the diagonal of `dot_product`.
    # This also provides more numerical stability (the diagonal of the result will be exactly 0).
    # shape (batch_size,)
    
    A_L2_norm = A.pow(2).sum(1)
    B_L2_norm = B.pow(2).sum(1)
    
    # Compute the pairwise distance matrix as we have:
    # ||a - b||^2 = ||a||^2  - 2 <a, b> + ||b||^2
    # shape (batch_size, batch_size)
    distances = A_L2_norm[:, None] - (2.0 * dot_product) + B_L2_norm
    
    # Because of computation errors, some distances might be negative so we put everything >= 0.0
    distances = torch.max(distances, torch.tensor(0.0))

#     if not squared:
#         # Because the gradient of sqrt is infinite when distances == 0.0 (ex: on the diagonal)
#         # we need to add a small epsilon where distances == 0.0
#         mask = tf.to_float(tf.equal(distances, 0.0))
#         distances = distances + mask * 1e-16

#         distances = tf.sqrt(distances)

#         # Correct the epsilon added: set the distances on the mask to be exactly 0.0
#         distances = distances * (1.0 - mask)

    return distances

In [None]:
dists = pairwise_distance(T, T)
print(T)
print(S)
print(dists)

In [None]:
T = torch.tensor([[1., 1.], [1., 2.], [3., 4.]])  
S = torch.tensor([[2., 3.], [3., 2.], [4., 3.], [5, 3]])

In [None]:
def pairwise_distances(x, y=None):
    '''
    Input: x is a Nxd matrix
           y is an optional Mxd matirx
    Output: dist is a NxM matrix where dist[i,j] is the square norm between x[i,:] and y[j,:]
            if y is not given then use 'y=x'.
    i.e. dist[i,j] = ||x[i,:]-y[j,:]||^2
    '''
    x_norm = (x**2).sum(1).view(-1, 1)
    if y is not None:
        y_t = torch.transpose(y, 0, 1)
        y_norm = (y**2).sum(1).view(1, -1)
    else:
        y_t = torch.transpose(x, 0, 1)
        y_norm = x_norm.view(1, -1)
    
    dist = x_norm + y_norm - 2.0 * torch.mm(x, y_t)
    # Ensure diagonal is zero if x=y
    if y is None:
        dist = dist - torch.diag(dist.diag())
    dist[dist != dist] = 0 # set all nan values to zero
    return torch.clamp(dist, 0.0, np.inf)

In [None]:
a = torch.tensor([[ 1.4335, -1.0990, -0.8586],
        [ 2.1553,  2.7028, -0.8020],
        [ 1.0524,  0.1599, -0.0374]])
b = torch.tensor([[ 3., 7., 2.],
                [ 6.,  8., 2.],
                [ 9.,  5., 7.]])
dists = process.pairwise_distances(x=a, y=None)
print(dists)
print(b.max(1).values)
print(b[b.max(1).indices])
#print(b.max(0))

In [None]:
a = torch.tensor([[ 3., 1., 4.],
                [ 0.,  2., 3.],
                [ 5.,  3., 6.]])
b = torch.tensor([[ 1., 2., 1.],
                [ 2.,  2., 1.],
                [ 1.,  1., 1.]])
dist = (a - b) ** 2
print(l2_squared(a, b))
print(torch.sum(l2_squared(a, b), 1))
print(a.min(1).values)

In [None]:
a0 = torch.tensor([1., -1., 3.])
max0 = torch.max(a0, torch.tensor(0.))
print(max0)

In [None]:
a = np.asarray([1, 2, 3, 4])
b = np.asarray([0, 1, 0])
a.append(b)
print(a)

In [None]:
def test_print(i):
    print(i)

In [None]:
pep_path = join(in_tensor_dir, 'peptides')
charge1 = 0
charge2 = 0
charge3 = 0
for file in listdir(pep_path):
    file_parts = re.search(r"(\d+)-(\d+)-(\d+.\d+)-(\d)-(0|1).pep", file)
    charge = int(file_parts[4])
    if charge == 1:
        charge1 += 1
    elif charge == 2:
        charge2 += 1
    elif charge == 3:
        charge3 += 1
print('charge 1 count: {}'.format(charge1))
print('charge 2 count: {}'.format(charge2))
print('charge 3 count: {}'.format(charge3))

unmod = 0
mod = 0
for file in listdir(pep_path):
    file_parts = re.search(r"(\d+)-(\d+)-(\d+.\d+)-(\d)-(0|1).pep", file)
    m = int(file_parts[5])
    if m == 0:
        unmod += 1
    elif m == 1:
        mod += 1
print('Unmodified count: {}'.format(unmod))
print('Modified count: {}'.format(mod))

In [None]:
line = "Name: AAAAAAAAAAAAAAAGAGAGAK/2_0"
name_groups = re.search(r"Name:\s(?P<pep>[a-zA-Z]+)/(?P<charge>\d+)"
                                    r"(?:_(?P<num_mods>\d+)(?P<mods>.*))?", line)
pep = name_groups['pep']
l_charge = int(name_groups['charge'])
modified = (name_groups['num_mods']) != '0'

print(pep)
print(l_charge)
print(modified)

In [None]:
print(type(round(1.1)))