# A demonstration on defining the vocabulary.

## Preliminaries

Setting the classification tolerance in ppm; dismissing entities below the intensity floor in <?>.

In [1]:
import os
import json
import pymzml
import numpy as np

INTENSITY_FLOOR = 10
TOLERANCE_FACTOR = 7

HOME_PATH = os.path.expanduser('~')
DATA_PATH = HOME_PATH + '/Projects/mlinb/tasks/defining_the_vocabulary/data/'
DATA_FILENAME = "abcdefgh_1.mzML"

An auxiliary method for classification within a certain range.

In [2]:
def classify(classes, feature):
    for class_ in classes:
        if feature >= class_[0] and feature <= class_[1]:
            return str(class_)

Downloading the data.

In [None]:
import urllib.request

URL = "https://www.dropbox.com/s/h89znvb6okbfebb/abcdefgh_1.mzML?dl=1"

if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)

urllib.request.urlretrieve(URL, DATA_PATH + DATA_FILENAME)

## Initialisation

Running the data.

In [3]:
run = pymzml.run.Reader(DATA_PATH + DATA_FILENAME)

Reading entities in the form of (mass, intensity) tuples.

In [4]:
preprocessed_mnis = []  
for spectrum in run:
    for mass, intensity in spectrum.peaks:
        if intensity > INTENSITY_FLOOR:
            preprocessed_mnis.append((mass, intensity, spectrum['id']))

Using numpy to sort the tuples in ascending mass order.

In [5]:
mnis_dtype = [('mass', float), ('intensity', float), ('id', int)]
mnis = np.array(preprocessed_mnis, dtype=mnis_dtype)
mnis.sort(order='mass') 

## Classification

Extracting the words.

In [6]:
words = []
starting_class_mass = mnis[0][0]
previous_mass = mnis[0][0]
for entity in mnis:
    mass = entity[0]
    tolerance = (previous_mass / 1000000) * TOLERANCE_FACTOR
    if mass - previous_mass > tolerance:
        words.append((starting_class_mass, previous_mass))
        starting_class_mass = mass
    previous_mass = mass 
words.append((starting_class_mass, previous_mass))

print(len(words))

2442


Building the vocabulary.

In [7]:
corpus = {}
for entity in mnis:
    key = str(entity[2])
    if key not in corpus:
        corpus[key] = {}
    class_ = classify(words, entity[0])
    if class_ not in corpus[key]:
        corpus[key][class_] = 0
    corpus[key][class_] += entity[1]
    
print(len(corpus))

6327


## LDA

Importing the LDA.

In [8]:
import sys
sys.path.append(HOME_PATH + '/Projects/lda/code')
from lda import VariationalLDA

Initialising the LDA.

In [9]:
v_lda = VariationalLDA(corpus = corpus, K = 10) # corpus would be a better name than vocabulary

Found 2442 unique words
Object created with 6327 documents


Running the LDA.

In [10]:
v_lda.run_vb(n_its = 100)

Initialising
Starting iterations
Iteration 0 (change = 12.2630226207) (12.762605 seconds, I think I'll finish in 21.2710083333 minutes)
Iteration 1 (change = 0.158379570547) (11.695488 seconds, I think I'll finish in 19.2975552 minutes)
Iteration 2 (change = 0.291422242113) (11.56731 seconds, I think I'll finish in 18.893273 minutes)
Iteration 3 (change = 0.415132512274) (11.572065 seconds, I think I'll finish in 18.70817175 minutes)
Iteration 4 (change = 0.528016208687) (11.712827 seconds, I think I'll finish in 18.7405232 minutes)
Iteration 5 (change = 0.618927460205) (11.712385 seconds, I think I'll finish in 18.5446095833 minutes)
Iteration 6 (change = 0.673600430646) (11.53889 seconds, I think I'll finish in 18.0775943333 minutes)
Iteration 7 (change = 0.70309866495) (11.648023 seconds, I think I'll finish in 18.05443565 minutes)
Iteration 8 (change = 0.695463176039) (11.89893 seconds, I think I'll finish in 18.245026 minutes)
Iteration 9 (change = 0.658157969236) (11.942611 secon

In [13]:
topic_as_dict = v_lda.get_topic_as_dict(0)

print(topic_as_dict)

{'(151.09847872359833, 151.14774846579857)': 1.5393106839369105e-05, '(1172.055956896874, 1172.1804047037635)': 1.5969584910656512e-06, '(277.98797509867501, 278.06580782869469)': 2.5163647624259175e-05, '(1740.5028626970227, 1747.0751096451438)': 0.00017658605975382353, '(272.12429753288126, 272.16606638851204)': 3.1577590358407075e-09, '(198.06762349229547, 198.13858880026723)': 9.0920606733219903e-05, '(381.73894958401314, 382.77873954256165)': 7.1919521190346808e-05, '(968.06467060484977, 968.08721790753862)': 4.9227901765333674e-10, '(263.75916279237083, 263.77428016266151)': 1.2755438504644066e-09, '(441.03770890187093, 441.74079945490018)': 0.0050324103693776901, '(310.06312272346935, 310.59277614054275)': 0.0063041029493050883, '(497.75772377031421, 498.02028078441288)': 1.5554035219248046e-08, '(559.18461300940112, 559.97023876067703)': 4.5652014990580611e-05, '(540.50293221564391, 540.8740077319593)': 7.2189988294892034e-07, '(555.89736620968199, 556.07120554024232)': 1.94620