# Summary
Prepare data for data visualisation team:
- Save the processed spectra, keeping only those annotated with an InChIKey
- Create 2D and 3D t-SNE projections of these spectra after converting them to embedding vectors
- Save both tSNE outputs linked by spectrum_ids
- Save classifications file linked by spectrum_ids

In [1]:
import os
import gensim
import pickle
import time
import sys
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

### Load preprocessed data

In [2]:
# Directories with the dataset files and with the trained embedding model.
data_path = "/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/datasets/ALL_GNPS_210409_positive/"
embedding_path = "/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/embeddings/ALL_GNPS_210409_positive/"

# Common file-name stem shared by all files derived from this dataset.
base = "ALL_GNPS_210409_positive_cleaned"
spectrum_file = os.path.join(data_path, f"{base}.pickle")
# Cell output: whether the cleaned spectrum pickle is present on disk.
os.path.exists(spectrum_file)

True

In [9]:
# Load the peak-processed spectra from their pickle in the dataset directory.
processed_spectrums_file = os.path.join(data_path, f"{base}_peaks_processed_s2v.pickle")
with open(processed_spectrums_file, mode='rb') as pickle_in:
    spectrums_processed = pickle.load(pickle_in)
# Cell output: number of spectra loaded.
len(spectrums_processed)

199780

### Keep annotated spectra
Annotated with inchikey

In [8]:
# Keep only the spectra whose metadata holds a non-empty "inchikey" entry.
annot_spectrums_processed = [
    spectrum
    for spectrum in spectrums_processed
    if spectrum.metadata.get("inchikey")
]
# Cell output: number of annotated spectra kept.
len(annot_spectrums_processed)

187152

In [18]:
# Spectrum identifiers of the annotated spectra, in the same order as the spectra.
spectrums_ids = [
    spectrum.metadata.get("spectrum_id") for spectrum in annot_spectrums_processed
]

In [19]:
# Sanity check (cell output): True if any annotated spectrum lacks a spectrum_id.
# Identity comparison is the correct test for None, and a generator avoids
# building a throwaway list just to feed any().
any(s is None for s in spectrums_ids)

False

### Save as json

In [11]:
from matchms.exporting import save_as_json

# Export the annotated spectra as JSON into the dataset directory.
out_json = os.path.join(data_path, f"{base}_peaks_processed_s2v_only_annotated.json")
print(out_json)
save_as_json(annot_spectrums_processed, out_json)

/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/datasets/ALL_GNPS_210409_positive/ALL_GNPS_210409_positive_cleaned_peaks_processed_s2v_only_annotated.json


In [13]:
# Also store the annotated spectra as a pickle in the dataset directory.
out_annot_proc = os.path.join(data_path, f"{base}_peaks_processed_s2v_only_annotated.pickle")
with open(out_annot_proc, mode='wb') as pickle_out:
    pickle.dump(annot_spectrums_processed, pickle_out)

### Create spectral embedding vectors

In [12]:
# Load the trained Word2Vec model from the embeddings directory.
model_name = "ALL_GNPS_210409_positive_cleaned_spec2vec_embedding_iter_15.model"
model_file = os.path.join(embedding_path, model_name)
model = gensim.models.Word2Vec.load(model_file)

In [14]:
from spec2vec import SpectrumDocument

# Turn every annotated spectrum into a SpectrumDocument built with n_decimals=2.
documents_spectrums_processed = [
    SpectrumDocument(spectrum, n_decimals=2)
    for spectrum in annot_spectrums_processed
]
print(len(documents_spectrums_processed))

187152


In [20]:
from spec2vec.vector_operations import calc_vector

# Compute one embedding vector per spectrum document with the loaded model.
spectrum_vectors = [
    calc_vector(model, document) for document in documents_spectrums_processed
]

### Compute tSNE

In [23]:
# Stack the list of per-spectrum vectors into a single array, one row per spectrum.
spectrum_vectors = np.vstack(spectrum_vectors)

In [26]:
# Cell output: (number of spectra, embedding dimension).
spectrum_vectors.shape

(187152, 300)

In [30]:
from sklearn.manifold import TSNE

# Project the embedding vectors to 2 dimensions with t-SNE, using a fixed seed.
tsne_2d = TSNE(random_state=42, n_components=2)
tsne_2d_res = tsne_2d.fit_transform(spectrum_vectors)

In [31]:
from sklearn.manifold import TSNE

# Project the embedding vectors to 3 dimensions with t-SNE, using a fixed seed.
tsne_3d = TSNE(random_state=42, n_components=3)
tsne_3d_res = tsne_3d.fit_transform(spectrum_vectors)

In [32]:
# Cell output: shapes of the 2-D and 3-D projections (one row per spectrum).
tsne_2d_res.shape, tsne_3d_res.shape

((187152, 2), (187152, 3))

### Save tSNE results

In [40]:
# Preview the first 2-D point: the raw array first, then the comma-joined
# text form that is used when writing the CSV files.
print(tsne_2d_res[0])
for first_point in tsne_2d_res[:1]:
    print(','.join(str(coord) for coord in first_point))

[ -3.786377 -23.733685]
-3.786377,-23.733685


In [34]:
# Store the raw t-SNE coordinate arrays as pickles in the dataset directory.
tsne_2d_pickle = os.path.join(data_path, base+"_peaks_processed_s2v_only_annotated_tsne2D.pickle")
tsne_3d_pickle = os.path.join(data_path, base+"_peaks_processed_s2v_only_annotated_tsne3D.pickle")
# Context managers make sure each file is flushed and closed right after the
# dump instead of leaving the handle open until garbage collection.
with open(tsne_2d_pickle, 'wb') as outf:
    pickle.dump(tsne_2d_res, outf)
with open(tsne_3d_pickle, 'wb') as outf:
    pickle.dump(tsne_3d_res, outf)

In [42]:
# Write the 2-D t-SNE coordinates as headerless CSV rows: spectrum_id,x,y
tsne_2d_out = os.path.join(data_path, f"{base}_peaks_processed_s2v_only_annotated_tsne2D.csv")
with open(tsne_2d_out, 'w') as csv_out:
    for spectrum_id, coords in zip(spectrums_ids, tsne_2d_res):
        coords_text = ','.join(str(value) for value in coords)
        csv_out.write(f"{spectrum_id},{coords_text}\n")

In [43]:
# Write the 3-D t-SNE coordinates as headerless CSV rows: spectrum_id,x,y,z
tsne_3d_out = os.path.join(data_path, f"{base}_peaks_processed_s2v_only_annotated_tsne3D.csv")
with open(tsne_3d_out, 'w') as csv_out:
    for spectrum_id, coords in zip(spectrums_ids, tsne_3d_res):
        coords_text = ','.join(str(value) for value in coords)
        csv_out.write(f"{spectrum_id},{coords_text}\n")

### Link classifications to selected spectra

In [44]:
classifications_path = "/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/classifications/ALL_GNPS_210409_positive/"
classes_file = os.path.join(classifications_path, "ALL_GNPS_210409_positive_processed_annotated_CF_NPC_classes.txt")

# Read the tab-separated classifications table. The first line is the header;
# every following row maps its first column (the inchikey) to the columns from
# the third one onward, kept in header order. The second column is skipped.
with open(classes_file) as class_table:
    header = class_table.readline().strip().split('\t')
    rows = (raw_line.strip().split('\t') for raw_line in class_table)
    classes = {row[0]: row[2:] for row in rows}

print(f'read classes for {len(classes)} inchikeys')
print("\nclasses order:")
print(header)

read classes for 19824 inchikeys

classes order:
['inchi_key', 'smiles', 'cf_kingdom', 'cf_superclass', 'cf_class', 'cf_subclass', 'cf_direct_parent', 'npc_class_results', 'npc_superclass_results', 'npc_pathway_results', 'npc_isglycoside']


In [50]:
# Build one row per annotated spectrum: [spectrum_id, class columns...].
# Spectra whose inchikey has no entry in `classes` get empty strings instead.
n_class_columns = len(header) - 2
empty_class = [""] * n_class_columns
assert len(empty_class) == n_class_columns

spectra_classes = []
for spectrum_id, spectrum in zip(spectrums_ids, annot_spectrums_processed):
    spectrum_classes = classes.get(spectrum.metadata.get("inchikey"), empty_class)
    # Every row must provide exactly one value per class column.
    assert len(spectrum_classes) == n_class_columns
    spectra_classes.append([spectrum_id, *spectrum_classes])

print(len(spectra_classes))
print(spectra_classes[:2])

187152
[['CCMSLIB00000001547', 'Organic compounds', 'Organic acids and derivatives', 'Peptidomimetics', 'Hybrid peptides', 'Hybrid peptides', 'Cyclic peptides; Microcystins', 'Oligopeptides', 'Amino acids and Peptides', '0'], ['CCMSLIB00000001548', 'Organic compounds', 'Organic acids and derivatives', 'Peptidomimetics', 'Depsipeptides', 'Cyclic depsipeptides', 'Cyclic peptides', 'Oligopeptides', 'Amino acids and Peptides', '0']]


In [49]:
# Header for the per-spectrum table: spectrum_id replaces the first two
# columns of the classifications header.
new_header = ['spectrum_id', *header[2:]]
# The header must have as many columns as each data row.
assert len(spectra_classes[0]) == len(new_header)
print(new_header)

['spectrum_id', 'cf_kingdom', 'cf_superclass', 'cf_class', 'cf_subclass', 'cf_direct_parent', 'npc_class_results', 'npc_superclass_results', 'npc_pathway_results', 'npc_isglycoside']


In [51]:
import csv

spectrum_classes_file = os.path.join(classifications_path, "ALL_GNPS_210409_positive_cleaned_peaks_processed_s2v_only_annotated_classes.txt")
print(spectrum_classes_file)
# Write the header followed by one comma-separated row per spectrum.
# csv.writer quotes any field that itself contains a comma or a quote, so a
# class name with a comma in it can no longer shift the columns of its row.
# Fields without such characters are written exactly as before; newline=''
# together with lineterminator='\n' keeps plain "\n" line endings.
with open(spectrum_classes_file, 'w', newline='') as outf:
    writer = csv.writer(outf, lineterminator='\n')
    writer.writerow(new_header)
    writer.writerows(spectra_classes)

/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/classifications/ALL_GNPS_210409_positive/ALL_GNPS_210409_positive_cleaned_peaks_processed_s2v_only_annotated_classes.txt
