# Get Annotation Off-sample Stats

In [175]:
from pathlib import Path
import pandas as pd

# Load the prediction table from a tab-separated file and preview it.
# NOTE(review): the absolute /tmp path means the notebook only re-runs on a
# machine where this file has been placed there beforehand.
data_path = Path('/tmp/gs_predictions.tsv')
df = pd.read_csv(data_path, sep='\t')
print(df.shape)
df.head()

(23238, 3)


Unnamed: 0,path,pred_label,off_sample_pred_prob
0,data/GS_predictions/Dataset 30/off/C18H30O+H.png,off,0.99531
1,data/GS_predictions/Dataset 30/off/C21H30O5+H.png,off,0.999994
2,data/GS_predictions/Dataset 30/off/C14H18O3+K.png,off,0.999977
3,data/GS_predictions/Dataset 30/off/C28H22O3+K.png,off,0.999999
4,data/GS_predictions/Dataset 30/off/C18H17NO6+K...,off,0.999997


In [176]:
import re

def split_path(path):
    """Split a prediction image path into ``(ds, formula, label)``.

    The last three '/'-separated components of ``path`` are taken as the
    dataset name, the label directory and the image file name. The file
    extension is dropped from the file name, and every ``+...`` / ``-...``
    run of word characters (the adduct part of the ion) is removed to get
    the bare formula.

    Raises ``ValueError`` when ``path`` has fewer than three components.
    """
    *_, ds, label, filename = path.split('/')
    ion_name = Path(filename).stem
    bare_formula = re.sub(r'[-+]\w+', '', ion_name)
    return ds, bare_formula, label

# Parse every prediction path into a (ds, formula, label) tuple.
rows = df.path.apply(split_path).values
len(rows)

23238

In [177]:
# Rebuild `df` from the parsed tuples. This rebinds the name that held the
# loaded prediction table, so the new frame has no `path` column and the
# path-parsing cell cannot be re-run without reloading the file first.
df = pd.DataFrame.from_records(rows, columns=['ds', 'formula', 'label'])
df.shape

(23238, 3)

In [178]:
# Distinct dataset names found in the parsed table, and how many there are.
ds_names = df.ds.unique()
N = ds_names.shape[0]
N

87

In [179]:
def calculate_stats(df):
    """Compute per-formula off-sample statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, with at least the columns ``ds``,
        ``formula`` and ``label``. Rows labelled ``'off'`` and ``'on'`` are
        counted; any other label only contributes to ``local_n``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``formula`` (sorted), with the columns:

        * ``off_sample_n``    - number of rows labelled ``'off'``
        * ``on_sample_n``     - number of rows labelled ``'on'``
        * ``local_n``         - number of rows for the formula
        * ``global_n``        - number of distinct ``ds`` values in ``df``
        * ``local_off_freq``  - ``off_sample_n / local_n``
        * ``global_off_freq`` - ``off_sample_n / global_n``

        An empty ``df`` yields an empty frame with these six columns.

    Notes
    -----
    The counts are computed with vectorised group-by aggregations rather than
    a Python callback per formula, so the cost does not grow with one
    interpreter call per group and no ``groupby.apply`` semantics are relied on.

    NOTE(review): ``local_n`` counts rows, not distinct datasets. If one
    formula can occur on several rows of the same dataset, then
    ``global_off_freq`` can exceed 1 - confirm this is intended.
    """
    stats_columns = [
        'off_sample_n', 'on_sample_n', 'local_n', 'global_n', 'local_off_freq', 'global_off_freq'
    ]
    # Same definition as before: distinct dataset names in the frame passed in.
    n_datasets = df['ds'].unique().shape[0]

    # 0/1 indicator columns so that a grouped sum yields the label counts.
    flagged = df.assign(
        off_sample_n=(df['label'] == 'off').astype('int64'),
        on_sample_n=(df['label'] == 'on').astype('int64'),
    )
    by_formula = flagged.groupby('formula')

    stats_df = by_formula[['off_sample_n', 'on_sample_n']].sum()
    stats_df['local_n'] = by_formula.size()
    stats_df['global_n'] = n_datasets
    stats_df['local_off_freq'] = stats_df['off_sample_n'] / stats_df['local_n']
    # With an empty input n_datasets is 0, but there are then no rows to divide.
    stats_df['global_off_freq'] = stats_df['off_sample_n'] / n_datasets
    return stats_df[stats_columns]

In [180]:
# Per-formula statistics computed over every dataset in `df`.
stats_df = calculate_stats(df)
stats_df

Unnamed: 0_level_0,off_sample_n,on_sample_n,local_n,global_n,local_off_freq,global_off_freq
formula,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C10H10N2O,2,0,2,87,1.0,0.022989
C10H10N2O2,1,1,2,87,0.5,0.011494
C10H10N2OS,0,2,2,87,0.0,0.000000
C10H10N4O,1,0,1,87,1.0,0.011494
C10H10N4O2S,2,0,2,87,1.0,0.022989
...,...,...,...,...,...,...
C9H9NS,1,0,1,87,1.0,0.011494
C9HF17O2,0,1,1,87,0.0,0.000000
CH3O5P,0,3,3,87,0.0,0.000000
CHBr3,1,0,1,87,1.0,0.011494


# Get Dataset Metadata

In [111]:
from metaspace.sm_annotation_utils import SMInstance

In [112]:
# Create the METASPACE client; no arguments are passed to the constructor.
sm = SMInstance()

In [61]:
# Fetch the dataset list and time the call.
# NOTE(review): `_gqclient` is an underscore-prefixed attribute of the client,
# not a public API; this may break when the library is upgraded.
%time all_dss = sm._gqclient.getDatasets()

CPU times: user 217 ms, sys: 43.6 ms, total: 260 ms
Wall time: 6.04 s


In [113]:
# Number of datasets returned by the query above.
len(all_dss)

3923

In [94]:
# Rewrite the dataset names taken from the prediction paths before matching
# them against the fetched metadata: '__' becomes '//' and 'm_z' becomes 'm/z'.
# NOTE(review): presumably this undoes a '/' -> '_' substitution applied when
# the names were used as directory names; only these two patterns are
# restored - confirm no other name contained '/'.
_ds_names = [name.replace('__', '//').replace('m_z', 'm/z') for name in ds_names]
len(_ds_names)

87

In [97]:
# Build the metadata table in one chain: keep only the datasets whose name
# appears in `_ds_names`, keep the first row for each name, and retain just
# the columns used for grouping.
ds_df = (
    pd.DataFrame(all_dss)
    .loc[lambda frame: frame['name'].isin(_ds_names)]
    .drop_duplicates(subset=['name'])
    .loc[:, ['id', 'name', 'ionisationSource', 'maldiMatrix']]
)
ds_df.shape

In [110]:
# Show the selected dataset metadata.
ds_df

Unnamed: 0,id,name,ionisationSource,maldiMatrix
269,2017-08-03_18h26m57s,Mouse_Wholebody_3,IRMALDESI,none
271,2017-05-03_17h59m31s,NCSU_M8_mouse_brain,IRMALDESI,none
785,2018-12-18_16h19m20s,Row001,DESI,none
848,2016-10-13_14h06m49s,MPIMM_021_QE_P_CB_0,MALDI,"2,5-dihydroxybenzoic acid (DHB)"
849,2016-10-04_14h16m23s,S649 WS21 155x105 15um E110,MALDI,"2,5-dihydroxybenzoic acid (DHB)"
...,...,...,...,...
2802,2017-09-05_16h22m52s,20170905_CGL0170817_MT-MB_ATP_N_81x101_135x135,MALDI,"1,5-diaminonaphthalene (DAN)"
3162,2017-10-26_14h25m14s,BRB04S-RECAL,DESI,
3377,2017-11-16_16h35m58s,LNTO31_17_1-RECAL,DESI,
3581,2018-01-17_13h29m23s,DESI porcine kidney interface region,DESI,none


In [114]:
# Distinct ionisation sources among the selected datasets.
ds_df.ionisationSource.unique()

array(['IRMALDESI', 'DESI', 'MALDI'], dtype=object)

In [115]:
# Distinct values of the maldiMatrix column among the selected datasets.
ds_df.maldiMatrix.unique()

array(['none', '2,5-dihydroxybenzoic acid (DHB)', 'Norharmane',
       'n-(1-naphthyl)ethylenediamine dihydrochloride (NEDC)', 'BPYN',
       'alpha-cyano-4-hydroxycinnamic acid (CHCA)',
       '1,5-diaminonaphthalene (DAN)', 'N/A', '9-aminoacridine (9AA)',
       '2,5-dihydroxyacetophenone (DHA)'], dtype=object)

In [120]:
def encode_ds_name(name):
    """Return ``name`` with every '/' replaced by '_'."""
    return '_'.join(name.split('/'))

In [123]:
# Encoded names of the datasets whose ionisation source is MALDI.
maldi_ds_names = (
    ds_df.loc[ds_df.ionisationSource == 'MALDI', 'name']
    .map(encode_ds_name)
    .tolist()
)
len(maldi_ds_names)

59

In [125]:
# Encoded names of the datasets whose ionisation source is DESI.
desi_ds_names = (
    ds_df.loc[ds_df.ionisationSource == 'DESI', 'name']
    .map(encode_ds_name)
    .tolist()
)
len(desi_ds_names)

24

In [126]:
# Encoded names of the MALDI datasets acquired with the DHB matrix.
_ds_name_mask = (
    ds_df.ionisationSource.eq('MALDI')
    & ds_df.maldiMatrix.eq('2,5-dihydroxybenzoic acid (DHB)')
)
maldi_dhb_ds_names = ds_df.loc[_ds_name_mask, 'name'].map(encode_ds_name).tolist()
len(maldi_dhb_ds_names)

31

# Get Annotation Off-sample Stats for Groups

In [181]:
# (group label, encoded dataset names) pairs; one stats file is written per pair.
ds_groups_mapping = [
    ('MALDI', maldi_ds_names),
    ('DESI', desi_ds_names),
    ('MALDI_DHB', maldi_dhb_ds_names),
]

for ds_group_name, ds_group in ds_groups_mapping:
    print(len(ds_group))

    # Annotation rows that belong to the datasets of this group.
    group_df = df[df.ds.isin(ds_group)]
    print(group_df.shape)

    # NOTE: this rebinds the notebook-level `stats_df`; after the loop it holds
    # the statistics of the last group, not those of the full table.
    stats_df = calculate_stats(group_df)
    print(stats_df.shape)

    # The file is tab-separated even though its extension is .csv.
    out_name = f'off-sample-formula-stats-{ds_group_name}.csv'
    stats_df.to_csv(out_name, sep='\t')

59
(16344, 3)
(5424, 6)
24
(5764, 3)
(2348, 6)
31
(8582, 3)
(3612, 6)
