## Preliminaries

Setting the classification tolerance (in ppm) and the intensity floor; peaks whose intensity does not exceed the floor are discarded (the intensity unit is not specified).

In [1]:
import os
import json
import pymzml
import numpy as np
import pandas as pd

# Peaks are kept only when their intensity is strictly above this floor.
INTENSITY_FLOOR = 10
# Grouping tolerance in parts per million of the preceding mass.
TOLERANCE_FACTOR = 7

# Location of the mzML input file under the user's home directory.
HOME_PATH = os.path.expanduser('~')
DATA_PATH = ''.join((HOME_PATH, '/Projects/mlinb/data/'))
DATA_FILENAME = "abcdefgh_1.mzML"

Downloading the data.

In [2]:
import urllib

# `urlretrieve` lives in `urllib.request` on Python 3 and directly in `urllib`
# on Python 2; resolve it once so this cell runs under either interpreter.
try:
    from urllib.request import urlretrieve
except ImportError:
    urlretrieve = urllib.urlretrieve

URL = "https://www.dropbox.com/s/h89znvb6okbfebb/abcdefgh_1.mzML?dl=1"

# Create the data directory on first run.
if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)

# Download the mzML file only when no local copy exists yet.
if not os.path.exists(DATA_PATH + DATA_FILENAME):
    urlretrieve(URL, DATA_PATH + DATA_FILENAME)

## Initialisation

Opening the downloaded mzML file with a pymzml reader.

In [3]:
# Open the mzML file; the next cell iterates over `run` to obtain its spectra.
run = pymzml.run.Reader(DATA_PATH + DATA_FILENAME)

Reading entities in the form of (mass, intensity, spectrum id) tuples, keeping only peaks above the intensity floor.

In [4]:
# Collect every peak whose intensity is above the floor, tagged with the id of
# the spectrum it came from.
preprocessed_mnis = [
    (mass, intensity, spectrum['id'])
    for spectrum in run
    for mass, intensity in spectrum.peaks
    if intensity > INTENSITY_FLOOR
]

Using numpy to sort the tuples in ascending mass order.

In [5]:
# Structured record layout: one (mass, intensity, spectrum id) row per peak.
mnis_dtype = [('mass', float), ('intensity', float), ('id', int)]
# np.sort returns a sorted copy, here ordered by ascending mass.
mnis = np.sort(np.array(preprocessed_mnis, dtype=mnis_dtype), order='mass')

## Classification

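Extracting the words: each word is a run of consecutive masses in which every gap to the preceding mass stays within the ppm tolerance, recorded as a start–end mass range.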

In [8]:
# Walk the mass-sorted peaks and close the current word whenever the gap to the
# previous mass exceeds TOLERANCE_FACTOR ppm of that previous mass.
masses = mnis['mass']
words = []
starting_class_mass = previous_mass = masses[0]
for mass in masses:
    tolerance = TOLERANCE_FACTOR * (previous_mass / 1000000)
    gap = mass - previous_mass
    if gap > tolerance:
        # Gap too wide: the word ends at the previous mass and a new one starts here.
        words.append({'start': starting_class_mass, 'end': previous_mass})
        starting_class_mass = mass
    previous_mass = mass
# The last word is still open when the loop finishes; close it.
words.append({'start': starting_class_mass, 'end': previous_mass})

#### Optional: Pickling the words.

In [10]:
# Persist the words as a pickled pandas Series so a later session can load them
# instead of repeating the extraction.
words_pickle_path = '../heavy_pickles/words.pickle'
words_pickle_dir = os.path.dirname(words_pickle_path)
# Create the target directory first, as the download cell does for DATA_PATH;
# to_pickle does not create missing directories.
if not os.path.exists(words_pickle_dir):
    os.makedirs(words_pickle_dir)

words_series = pd.Series(words)
words_series.to_pickle(words_pickle_path)

#### Shortcut: Loading the words pickle.

In [8]:
# Reads the file written by the optional pickling cell, so `words` is
# presumably a pandas Series of dicts here rather than the plain list built
# during extraction.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced locally by this notebook.
words = pd.read_pickle('../heavy_pickles/words.pickle')

An auxiliary function that maps a value to the first class whose [start, end] range contains it.

In [13]:
def classify(classes, feature):
    """Return the string form of the first class whose range contains `feature`.

    `classes` is an iterable of mappings with 'start' and 'end' keys; both
    bounds are inclusive. Returns None when no class contains `feature`.
    """
    matching = (
        str(candidate)
        for candidate in classes
        if candidate['start'] <= feature <= candidate['end']
    )
    return next(matching, None)

Building the corpus: for each spectrum id, the summed intensity of its peaks per word.

In [None]:
# Accumulate, per spectrum id, the total intensity falling into each word.
corpus = {}
for entity in mnis:
    # entity is a (mass, intensity, id) record; the id is used as a string key.
    spectrum_words = corpus.setdefault(str(entity[2]), {})
    class_ = classify(words, entity[0])
    spectrum_words[class_] = spectrum_words.get(class_, 0) + entity[1]

#### Optional: Pickling the corpus.

In [11]:
# Persist the corpus as a pickled pandas Series indexed by spectrum id.
corpus_pickle_path = '../pickles/corpus.pickle'
corpus_pickle_dir = os.path.dirname(corpus_pickle_path)
# Create the target directory first, as the download cell does for DATA_PATH;
# to_pickle does not create missing directories.
if not os.path.exists(corpus_pickle_dir):
    os.makedirs(corpus_pickle_dir)

corpus_series = pd.Series(corpus)
corpus_series.to_pickle(corpus_pickle_path)