# A demonstration of defining the vocabulary.

## Preliminaries

Setting the classification tolerance in ppm and the intensity floor; entities whose intensity does not exceed the floor are dismissed (TODO: state the unit of the intensity floor).

In [1]:
import json
import pymzml
import numpy as np

# Input file: relative directory and file name, concatenated when opening the reader.
DATA_PATH = "../data/"
DATA_FILENAME = "abcdefgh_1.mzML"

# Peaks are kept only when their intensity is strictly greater than this floor.
INTENSITY_FLOOR = 10
# Maximum gap between neighbouring masses, in parts per million of the lower mass.
TOLERANCE_FACTOR = 7

Auxiliary function that classifies a feature by the range (class) it falls into.

In [2]:
def classify(classes, feature):
    """Return the label of the first class whose closed range contains ``feature``.

    Args:
        classes: iterable of indexable items; item[0] is the lower bound and
            item[1] the upper bound of the range (both inclusive).
        feature: value to be placed into one of the ranges.

    Returns:
        ``str(item)`` for the first matching item, or ``None`` when no range
        contains ``feature``.
    """
    matching_labels = (
        str(bounds)
        for bounds in classes
        if feature >= bounds[0] and feature <= bounds[1]
    )
    # Only the first match is consumed; None signals "no class found".
    return next(matching_labels, None)

## Initialisation

Opening the data file with the pymzML reader.

In [3]:
# Build the full path to the mzML file and open it with the pymzML reader.
mzml_path = DATA_PATH + DATA_FILENAME
run = pymzml.run.Reader(mzml_path)

Reading entities in the form of (mass, intensity, spectrum id) tuples, keeping only those above the intensity floor.

In [4]:
# Field layout of one record: (mass, intensity, id of the originating spectrum).
mnis_dtype = [('mass', float), ('intensity', float), ('id', int)]

# Gather every peak whose intensity is strictly above the floor, spectrum by spectrum.
preprocessed_mnis = []
for spectrum in run:
    preprocessed_mnis.extend(
        (mass, intensity, spectrum['id'])
        for mass, intensity in spectrum.peaks
        if intensity > INTENSITY_FLOOR
    )

Using numpy to sort the tuples in ascending mass order.

In [5]:
# Convert the collected tuples into a structured array, ordered by ascending mass.
mnis_unsorted = np.array(preprocessed_mnis, dtype=mnis_dtype)
mnis = np.sort(mnis_unsorted, order='mass')

## Classification

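Extracting the words: each word is a (lowest mass, highest mass) range of consecutive masses whose neighbouring gaps stay within the ppm tolerance.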

In [6]:
# Group the mass-sorted peaks into "words": closed (first_mass, last_mass)
# ranges in which every mass lies within the tolerance of the mass before it.
words = []
if len(mnis) > 0:  # no surviving peaks -> no words, instead of an IndexError
    starting_class_mass = previous_mass = mnis[0][0]
    for entity in mnis:
        mass = entity[0]
        # Allowed gap: TOLERANCE_FACTOR parts per million of the previous mass.
        tolerance = (previous_mass / 1000000) * TOLERANCE_FACTOR
        if mass - previous_mass > tolerance:
            # Gap too wide: close the current word and open a new one at this mass.
            # Bounds are stored as plain floats so that str(word) does not depend
            # on how the installed NumPy version prints its scalar types.
            words.append((float(starting_class_mass), float(previous_mass)))
            starting_class_mass = mass
        previous_mass = mass
    # Close the word that was still open when the peaks ran out.
    words.append((float(starting_class_mass), float(previous_mass)))

Building the vocabulary: for each spectrum id, the total intensity observed in each word.

In [None]:
# vocabulary[spectrum id as str][word label] -> summed intensity of the peaks
# of that spectrum whose mass falls inside that word.
vocabulary = {}
for peak in mnis:
    word_totals = vocabulary.setdefault(str(peak['id']), {})
    word_label = classify(words, peak['mass'])
    word_totals[word_label] = word_totals.get(word_label, 0) + peak['intensity']