In [3]:
import os
import sys
import gensim
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# ROOT is the parent directory of the current working directory.
ROOT = os.path.dirname(os.getcwd())

# Machine-specific data directory of the original author. It is still used
# whenever it exists, so existing runs behave exactly as before; on any other
# machine fall back to ROOT/data instead of pointing at a non-existent folder.
local_path_data = 'C:\\Users\\joris\\Documents\\eScience_data\\data'
path_data = local_path_data if os.path.isdir(local_path_data) else os.path.join(ROOT, 'data')

# Put ROOT and the custom_functions folder at the front of the module search path.
sys.path.insert(0, ROOT)
sys.path.insert(0, "C:\\Users\\joris\\Documents\\eScience_data\\spec2vec_gnps_data_analysis\\custom_functions")

## Import pre-processed dataset "AllPositive" and post-process it

In [4]:
# Loading the data as a pickled object goes a lot quicker.
import time
import pickle

outfile = os.path.join(path_data, 'gnps_positive_ionmode_cleaned_by_matchms_and_lookups.pickle')
print(outfile)

# perf_counter() is a monotonic clock intended for measuring elapsed time, so
# the reported duration cannot be distorted by system clock adjustments.
start = time.perf_counter()
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open(outfile, 'rb') as inf:
    spectrums = pickle.load(inf)
end = time.perf_counter()
print('loading took {:.2f} s'.format(end-start))
print("number of spectra:", len(spectrums))

C:\Users\joris\Documents\eScience_data\data\gnps_positive_ionmode_cleaned_by_matchms_and_lookups.pickle
loading took 17.44 s
number of spectra: 112956


In [5]:
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

def post_process_s2v(s, n_required=10, mz_from=0, mz_to=1000, ratio_desired=0.5,
                     intensity_from=0.001, loss_mz_from=5.0, loss_mz_to=200.0):
    """Run one spectrum through the matchms filter chain used for this analysis.

    The filters are applied in this order: normalize_intensities, select_by_mz,
    require_minimum_number_of_peaks, reduce_to_number_of_peaks, an optional
    select_by_relative_intensity, and finally add_losses. The defaults
    reproduce the values that were previously hard-coded.

    Parameters
    ----------
    s
        Spectrum object to process.
    n_required
        Passed as ``n_required`` to require_minimum_number_of_peaks and
        reduce_to_number_of_peaks; also the minimum number of peaks that must
        remain for the low-intensity filter result to be kept.
    mz_from, mz_to
        Passed to select_by_mz.
    ratio_desired
        Passed to reduce_to_number_of_peaks.
    intensity_from
        Passed to select_by_relative_intensity.
    loss_mz_from, loss_mz_to
        Passed to add_losses.

    Returns
    -------
    The result of add_losses, or None when the filters up to and including
    reduce_to_number_of_peaks produced None (the spectrum did not qualify).
    """
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=mz_from, mz_to=mz_to)
    s = require_minimum_number_of_peaks(s, n_required=n_required)
    s = reduce_to_number_of_peaks(s, n_required=n_required, ratio_desired=ratio_desired)
    if s is None:
        return None

    # Only adopt the low-intensity-filtered spectrum if it still has at least
    # n_required peaks; otherwise keep the unfiltered peaks.
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=intensity_from)
    if len(s_remove_low_peaks.peaks) >= n_required:
        s = s_remove_low_peaks

    s = add_losses(s, loss_mz_from=loss_mz_from, loss_mz_to=loss_mz_to)
    return s

# Run every spectrum through post_process_s2v and keep only those for which
# it returned a spectrum (None marks spectrums that didn't qualify for analysis).
spectrums_s2v = [
    processed
    for processed in map(post_process_s2v, spectrums)
    if processed is not None
]

print("{} remaining spectra.".format(len(spectrums_s2v)))

95320 remaining spectra.


## Inspect annotations

In [7]:
# One "inchikey" metadata entry per processed spectrum, in the same order as
# spectrums_s2v.
Inchikeys = [spectrum.get("inchikey") for spectrum in spectrums_s2v]

In [8]:
# Keep only the non-empty inchikeys.
inchikeys_pd = pd.Series(list(filter(None, Inchikeys)))
# Show the 30 most frequent values of the first 14 characters of each inchikey.
inchikeys_pd.str[:14].value_counts().head(30)

NEGQHKSYEYVFTD    432
SULIDBRAXVDKBU    426
IQGPMZRCLCCXAG    308
WTJKGGKOPKCXLL    295
IIZPXYDJLKNOIY    235
RWKUXQNLWDTSLO    234
QIBZFHLFHCIUOT    225
ZAYXPDDGEIJGGW    212
QJWDAOSZZYVBJZ    210
KILNVBDSWZSGLL    205
RFVFQQWKPSOBED    202
IESVDEZGAHUQJU    197
LFUDDCMNKWEORN    195
JLPULHDHAOZNQI    182
LSOWKZULVQWMLY    180
XGGMHQYOVYWRLV    178
LLHISNQVRRYJGL    175
JFISYPWOVQNHLS    174
SRIGHEHXEGELQJ    166
ACTIUHUUMQJHFO    163
PZNPLUBHRSSFHT    162
YLWSJLLZUHSIEA    158
GPWHCUUIQMGELX    150
BLZVZPYMHLXLHG    148
QEDPUVGSSDPBMD    146
IGZPHNNYPPAPLA    142
CITHEXJVPOWHKC    141
YEJYLHKQOBOSCP    131
SXNXGNVZTLZDHE    131
AXZGUCXCTZMPTR    128
dtype: int64

In [9]:
# Count how often each 14-character inchikey prefix occurs (computed once and reused).
inchikey14_counts = inchikeys_pd.str[:14].value_counts()

# Keep the inchikeys that occur at least 5 times, as a two-column frame.
suitable_inchikeys = inchikey14_counts[inchikey14_counts >= 5].reset_index()
suitable_inchikeys.columns = ['inchikey14', 'occurences']

# Important: sort values to make it reproducible (same occurences have random order otherwise!)
suitable_inchikeys = suitable_inchikeys.sort_values(['occurences', 'inchikey14'], ascending=False)
suitable_inchikeys.head(15)

Unnamed: 0,inchikey14,occurences
0,NEGQHKSYEYVFTD,432
1,SULIDBRAXVDKBU,426
2,IQGPMZRCLCCXAG,308
3,WTJKGGKOPKCXLL,295
4,IIZPXYDJLKNOIY,235
5,RWKUXQNLWDTSLO,234
6,QIBZFHLFHCIUOT,225
7,ZAYXPDDGEIJGGW,212
8,QJWDAOSZZYVBJZ,210
9,KILNVBDSWZSGLL,205


In [10]:
# suitable_inchikeys has one row per distinct 14-character inchikey, so the
# number printed here counts inchikeys rather than individual spectra.
print("Number of spectra with inchikeys that exist >= 5 times:", len(suitable_inchikeys))

Number of spectra with inchikeys that exist >= 5 times: 4079


### Randomly select 1000 inchikeys that exist >= 5 times in the dataset

In [11]:
num_spectra = 1000

# Seed the global NumPy RNG so the draw below is reproducible.
np.random.seed(42)
# Draw num_spectra distinct row positions of suitable_inchikeys.
selection = np.random.choice(len(suitable_inchikeys), size=num_spectra, replace=False)
print(selection[:25])
# Translate the drawn positions into their 14-character inchikeys.
selected_inchikeys = suitable_inchikeys['inchikey14'].values[selection]
selected_inchikeys[:25]

[  33 3233 1556 2547  457 3598  857 1878  705  803 1642  495 2866 1034
 1517  564 2826  568 2575 3956  670 3317 1498 2971   96]


array(['SRRQPVVYXBTRQK', 'JUZYLCPPVHEVSV', 'UZXMLGUMBQQVME',
       'PDAKXMIQFUHWQC', 'JFVKWCYZKMUTLH', 'HQPCSDADVLFHHO',
       'HUPGTAGQEXENPN', 'YPKUMLKVFXFYOT', 'KWIUHFFTVRNATP',
       'QJVXKWHHAMZTBY', 'PGERTGWKXFAEFR', 'MIHLTJZHRJACQQ',
       'YFAGHNZHGGCZAX', 'LKWWJGGLULNRBP', 'XWTYSIMOBUGWOL',
       'BPEXJHGGARTCIR', 'FHHVIBPVBBRLOR', 'YFPYXTNSQOUHPS',
       'LTLYEAJONXGNFG', 'INOGLHRUEYDAHX', 'VGOJYSUPEJWUNN',
       'BPICBUSOMSTKRF', 'ZQHJXKYYELWEOK', 'MXHRCPNRJAMMIM',
       'QXMHHXQBBKDSSL'], dtype=object)

In [12]:
# Randomly select another 1000 inchikeys that exist >= 5 times in the dataset
# and don't occur in the first list (selected_inchikeys).
drop_selection = suitable_inchikeys.index[selection]  # index labels of the first selection
new_suitable_inchikeys = suitable_inchikeys.drop(drop_selection)  # copy without the first selection
print(new_suitable_inchikeys.shape[0])
print('Number of old selected inchikeys in new list:',
      len(set(new_suitable_inchikeys['inchikey14']).intersection(selected_inchikeys)))

# Re-seed so this second draw is reproducible on its own.
np.random.seed(42)
new_selection = np.random.choice(len(new_suitable_inchikeys), size=num_spectra, replace=False)
print(new_selection[:25], len(new_selection))
new_selected_inchikeys = new_suitable_inchikeys['inchikey14'].values[new_selection]
print('Overlap between new and old list:',
      len(set(new_selected_inchikeys).intersection(selected_inchikeys)))

3079
Number of old selected inchikeys in new list: 0
[  63 2775 2016  331 2739  594 2244  346 2312 1472 2785  194 2370 2032
 2538  246  256  486  978 1961  144  801  744  457  416] 1000
Overlap between new and old list: 0


### Randomly pick one spectrum for each of the chosen inchikeys

In [13]:
selected_spectra = []
# Include all entries, even empty ones, so that the Series index lines up with
# the positions in Inchikeys (and therefore gives the right spectrum IDs).
inchikeys_pd = pd.Series(Inchikeys)
# The 14-character prefixes do not change inside the loop, so compute them once
# instead of re-slicing every inchikey on each iteration.
inchikey14_all = inchikeys_pd.str[:14]

np.random.seed(42) # to make it reproducible
for inchikey in selected_inchikeys:
    # IDs of all spectra whose inchikey prefix equals the chosen inchikey
    matches = inchikeys_pd[inchikey14_all == inchikey].index.values
    # pick one of them at random
    selected_spectra.append(int(np.random.choice(matches,1)[0]))

In [14]:
# Pick one spectrum for each of the new chosen inchikeys.
new_selected_spectra = []
# The 14-character prefixes do not change inside the loop, so compute them once
# instead of re-slicing every inchikey on each iteration.
inchikey14_all = inchikeys_pd.str[:14]

np.random.seed(42) # to make it reproducible
for inchikey in new_selected_inchikeys:
    # IDs of all spectra whose inchikey prefix equals the chosen inchikey
    matches = inchikeys_pd[inchikey14_all == inchikey].index.values
    # pick one of them at random
    new_selected_spectra.append(int(np.random.choice(matches,1)[0]))

### Randomly pick 1000 inchikeys (spectra) that are unique in the dataset

In [15]:
# Count how often each 14-character inchikey prefix occurs (computed once and reused).
inchikey14_counts = inchikeys_pd.str[:14].value_counts()

# Keep the inchikeys that occur exactly once, as a two-column frame.
unique_inchikeys = inchikey14_counts[inchikey14_counts == 1].reset_index()
unique_inchikeys.columns = ['inchikey14', 'occurences']

# Important: sort values to make it reproducible (same occurences have random order otherwise!)
unique_inchikeys = unique_inchikeys.sort_values(['occurences', 'inchikey14'], ascending=False)
unique_inchikeys.head(15)

Unnamed: 0,inchikey14,occurences
2854,ZZYHCCDMBJTROG,1
4383,ZZTCNNZHOWDRPS,1
2907,ZZQHNBGRWRQWFI,1
1327,ZZPAWQYZQVUVHX,1
2651,ZZNVCZGRNCQHCQ,1
3826,ZZHLYYDVIOPZBE,1
3181,ZZAJQOPSWWVMBI,1
4934,ZYYNEJWFGGVJQZ,1
4729,ZYVYPNZFOCZLEM,1
3347,ZYUDDSCUGFXAKK,1


In [16]:
# Number of inchikeys that occur exactly once (one row per inchikey).
print(unique_inchikeys.shape[0])

5352


In [17]:
num_spectra = 1000

np.random.seed(42) # to make it reproducible
# Draw num_spectra distinct row positions of unique_inchikeys.
unique_selection = np.random.choice(unique_inchikeys.shape[0], num_spectra, replace=False)
print(unique_selection[:25])
selected_unique_inchikeys = unique_inchikeys['inchikey14'].values[unique_selection]
print(selected_unique_inchikeys[:25])

# Get the spectrum that belongs to each of the chosen unique inchikeys.
selected_unique_spectra = []
# Include all entries, even empty ones, so that the Series index lines up with
# the positions in Inchikeys (and therefore gives the right spectrum IDs).
inchikeys_pd = pd.Series(Inchikeys)
# The 14-character prefixes do not change inside the loop, so compute them once
# instead of re-slicing every inchikey on each iteration.
inchikey14_all = inchikeys_pd.str[:14]

# The loop below draws no random numbers; the seed call is kept only so the
# global RNG state after this cell is the same as before this rewrite.
np.random.seed(42)
for inchikey in selected_unique_inchikeys:
    matches = inchikeys_pd[inchikey14_all == inchikey].index.values
    # take the first spectrum ID matching this inchikey
    selected_unique_spectra.append(int(matches[0]))
print(len(selected_unique_spectra))

[4114 1650  401 4221 1684   33  290 3265 5324 3235 4213 3403 5004 3699
 1075  925 4074 4446 2642 2972  810 3204 3872 2707 2651]
['GQIJYUMTOUBHSH' 'RVWQXFWAUMMLKE' 'YGULWPYYGQCFMP' 'GDPHPXYFLPDZGH'
 'RRNSPXUFTKJIEZ' 'ZWNKGUOHTAKRBN' 'YTNXARXLPJGCHV' 'KMRZMOYMAAVHNA'
 'ADDCNOCQPWDJSR' 'KQFUXLQBMQGNRT' 'GEJNVNUFFYATKR' 'JVRVKOOOXHGJKI'
 'BOVRCQYBOHNUIF' 'INOKSQLHQGQUNF' 'VGGCLGHLLZKCSW' 'VYGQXRZAHIZHQV'
 'GUOQUXNJZHGPQF' 'FAZKBTFQSATIIN' 'NISZLIJBKRGIAU' 'LVUQCTGSDJLWCE'
 'WMAITHDYVBQITD' 'KSZJPQHIPKEEMF' 'HSLDNQNDHKZKNZ' 'NBLBCGUCPBXKOV'
 'NHNODHRSCRALBF']
1000


In [18]:
# Randomly select another 1000 inchikeys that are unique in the dataset and
# don't occur in the first list (selected_unique_inchikeys).
drop_selection = unique_inchikeys.index[unique_selection] # index labels of the first selection
new_unique_inchikeys = unique_inchikeys.drop(drop_selection) # copy without the first selection
print(new_unique_inchikeys.shape[0])
print('Number of old selected inchikeys in new list:',len(set(new_unique_inchikeys['inchikey14']) & set(selected_unique_inchikeys)))

np.random.seed(42) # to make it reproducible
new_unique_selection = np.random.choice(new_unique_inchikeys.shape[0], num_spectra, replace=False)
print(new_unique_selection[:25], len(new_unique_selection))
new_selected_unique_inchikeys = new_unique_inchikeys['inchikey14'].values[new_unique_selection]
print('Overlap between new and old list:', len(set(new_selected_unique_inchikeys) & set(selected_unique_inchikeys)))

# Get the spectrum that belongs to each of the chosen unique inchikeys.
new_selected_unique_spectra = []
# Include all entries, even empty ones, so that the Series index lines up with
# the positions in Inchikeys (and therefore gives the right spectrum IDs).
inchikeys_pd = pd.Series(Inchikeys)
# The 14-character prefixes do not change inside the loop, so compute them once
# instead of re-slicing every inchikey on each iteration.
inchikey14_all = inchikeys_pd.str[:14]

# The loop below draws no random numbers; the seed call is kept only so the
# global RNG state after this cell is the same as before this rewrite.
np.random.seed(42)
for inchikey in new_selected_unique_inchikeys:
    matches = inchikeys_pd[inchikey14_all == inchikey].index.values
    # take the first spectrum ID matching this inchikey
    new_selected_unique_spectra.append(int(matches[0]))
print(len(new_selected_unique_spectra))

4352
Number of old selected inchikeys in new list: 0
[ 505 1611 3107  511 1467  410 3678 2677 3370 4235 3853 3217  149 4332
 1295 2457 2589 1261  287 3038   17  755  305 3276 4084] 1000
Overlap between new and old list: 0
1000


### Add the two datasets together

In [19]:
# Combine the first test set (1000 spectra whose inchikey occurs >= 5 times)
# with the first set of spectra whose inchikey is unique.
old_and_unique_test_set = [*selected_spectra, *selected_unique_spectra]
len(old_and_unique_test_set)

2000

In [21]:
# Combine the second test set (1000 spectra whose inchikey occurs >= 5 times)
# with the second set of spectra whose inchikey is unique.
new_and_unique2_test_set = [*new_selected_spectra, *new_selected_unique_spectra]
len(new_and_unique2_test_set)

2000

## Split spectra into library and query set

In [None]:
# NOTE(review): SpectrumDocument is not imported in the cells visible here —
# presumably it needs `from spec2vec import SpectrumDocument`; confirm before running.

# Membership tests against a list are linear in its length; converting the
# test-set IDs to sets keeps the filtering below to one cheap lookup per spectrum.
old_and_unique_test_ids = set(old_and_unique_test_set)
new_and_unique2_test_ids = set(new_and_unique2_test_set)

# Library = every processed spectrum whose position is NOT part of the test set.
old_and_unique_documents_library_s2v = [
    SpectrumDocument(s, n_decimals=2)
    for ind, s in enumerate(spectrums_s2v)
    if ind not in old_and_unique_test_ids
]
new_and_unique2_documents_library_s2v = [
    SpectrumDocument(s, n_decimals=2)
    for ind, s in enumerate(spectrums_s2v)
    if ind not in new_and_unique2_test_ids
]

In [None]:
# Query documents: one SpectrumDocument per test-set spectrum ID, in test-set order.
old_and_unique_documents_query_s2v = [
    SpectrumDocument(spectrums_s2v[spectrum_id], n_decimals=2)
    for spectrum_id in old_and_unique_test_set
]
new_and_unique2_documents_query_s2v = [
    SpectrumDocument(spectrums_s2v[spectrum_id], n_decimals=2)
    for spectrum_id in new_and_unique2_test_set
]