In [14]:
import pandas as pd
import os
from os import path
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from numpy import percentile
import datetime
import gensim
from gensim import similarities
from gensim.similarities import Similarity
from gensim import models
import datetime

base_path = '/data/pixel-annot-export-v0.9/'
ann_path = path.join(base_path, 'pixel_df_list/'

# TMP DIRECTORY PATH FOR TMP GENSIM OUTPUT
opref = '/data/katya/tmp/ionsim'

ds_df_path = path.join(base_path, 'ds_df.msgpack')
ds_df = pd.read_msgpack(ds_df_path)

ion_df_path = path.join(base_path, 'ion_df.msgpack')
ion_df = pd.read_msgpack(ion_df_path)

formula_df_path = path.join(base_path, 'formula_df.msgpack')
formula_df = pd.read_msgpack(formula_df_path)

gs_path = '../GS.csv'
gs_df = pd.read_csv(gs_path)

# FDR theshold for filtering out ions
fdr = 50

sim_out_dir = 'ion_similarity_vectors'

In [19]:
def compute_ion_similarities(ds_ind):
    pixel_df_path = path.join(ann_path, '{}'.format(ds_ind))
    pixel_df = pd.read_msgpack(pixel_df_path)

    max_x = pixel_df['x'].max()
    max_y = pixel_df['y'].max() 

    pixel_df = pixel_df[pixel_df.fdr <= fdr]
    pixel_df.reset_index(drop=True, inplace=True)
    pixel_df = pixel_df.assign(p_ind = pixel_df.x.astype(np.int32) + (pixel_df.y.astype(np.int32) * (max_x + 1)))

    ion_corpus = []
    ions = []
    for ion_ind, ion_rows in pixel_df.groupby('ion_ind'):
        ions.append(ion_ind)
        
        # remove hot spots
        q = ion_rows.int.quantile(.99)
        ion_rows.loc[ion_rows['int'] > q, 'int'] = q 
        
        # rescale intensitiies from 0 to 1
        #ion_rows['int'] = ion_rows['int'] / ion_rows['int'].max()
        
        # build ion-pixel gensim corpus
        ion_doc = list(zip(ion_rows.p_ind, ion_rows.int))
        ion_corpus.append(ion_doc)
    
    print(datetime.datetime.now().strftime("%H:%M"), 'Ion corpus generated')
    sim_index = gensim.similarities.docsim.Similarity(opref, ion_corpus, num_features = pixel_df['p_ind'].max()+1)
    print(datetime.datetime.now().strftime("%H:%M"), 'Ion similarity computed.')
    
    pixel_df = None
    
    sim_df = pd.DataFrame(np.array(sim_index), columns = ions, index = ions)
    print(datetime.datetime.now().strftime("%H:%M"), 'Sim matrix converted into df.')
    return sim_df

In [20]:
# Get ion indices from dataframe
def get_ion_ind_df(ds_rows):
    result = []
    for _, row in ds_rows.iterrows():
        ion_ind_list = ion_df[ion_df.formula == row.sumFormula][ion_df.adduct == row.adduct].index.tolist()
        if len(ion_ind_list) == 0:
            print('No ion in the dictionary: ', sf, adduct)
        else:
            ion_ind = ion_ind_list[0]
            result.append(ion_ind)
    return result

In [None]:
for datasetId, ds_rows in gs_df.groupby('datasetId'):
    ds_row = ds_df[ds_df['id'] == datasetId]
        
    if len(ds_row.index)==0: 
        print('Dataset not found: ', datasetId)
        continue
        
    ds_name = ds_row['name'].iloc[0]
    ds_ind = ds_row.index.tolist()[0]
                
    print(datetime.datetime.now().strftime("%H:%M"), ds_name)
        
    sim_df = compute_ion_similarities(ds_ind)
    
    sim_df.to_msgpack(path.join(gs_out_dir, str(ds_ind)))
    print(datetime.datetime.now().strftime("%H:%M"), 'Df saved.')  
    
    sim_df = None


07:41 AstraZeneca//CT26_xenograft


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


08:26 Ion corpus generated
08:26 Ion similarity computed.
08:36 Sim matrix converted into df.
08:36 Df saved.
08:36 AstraZeneca//Rat_testis
08:43 Ion corpus generated
08:43 Ion similarity computed.
08:44 Sim matrix converted into df.
08:44 Df saved.
08:44 ICL//7TopRight,_17BottomRight,_27BottomLeft,_37TopLeft
08:45 Ion corpus generated
08:45 Ion similarity computed.
08:45 Sim matrix converted into df.
08:45 Df saved.
08:45 UoNotreDame//Untreated_6_574
08:46 Ion corpus generated
08:46 Ion similarity computed.
08:46 Sim matrix converted into df.
08:46 Df saved.
08:46 North Carolina State University//mouse body
10:43 Ion corpus generated
10:43 Ion similarity computed.
11:16 Sim matrix converted into df.
11:16 Df saved.
11:16 ICL//A51 CT S3-centroid
11:17 Ion corpus generated
11:17 Ion similarity computed.
11:17 Sim matrix converted into df.
11:17 Df saved.
11:17 S648 WS20 205x170 20um E110
11:22 Ion corpus generated
11:23 Ion similarity computed.
11:23 Sim matrix converted into df.
11:23 

19:46 Ion corpus generated
19:46 Ion similarity computed.
19:47 Sim matrix converted into df.
19:47 Df saved.
19:47 Servier_Ctrl_mouse_wb_median_plane_9aa
19:47 Ion corpus generated
19:47 Ion similarity computed.
19:47 Sim matrix converted into df.
19:48 Df saved.
19:48 Mouse_Wholebody_3
20:44 Ion corpus generated
20:44 Ion similarity computed.
20:52 Sim matrix converted into df.
20:52 Df saved.
20:52 Servier_Ctrl_mouse_wb_lateral_plane_DHB
20:53 Ion corpus generated
20:53 Ion similarity computed.
20:53 Sim matrix converted into df.
20:53 Df saved.
20:53 Servier_Ctrl_mouse_wb_median_plane_DHB
20:54 Ion corpus generated
20:54 Ion similarity computed.
20:55 Sim matrix converted into df.
20:55 Df saved.
20:55 20170815_CGL_MT-M.B_DAN012_NTM_42x30_100x100
20:56 Ion corpus generated
20:56 Ion similarity computed.
20:56 Sim matrix converted into df.
20:56 Df saved.
20:56 20170817_CGL_MT-M.B_ATP_DAN018_NTM_102x39_100x100
20:56 Ion corpus generated
20:56 Ion similarity computed.
20:56 Sim matri