In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

import sys
sys.path.append('/home/fcarli/CellHit')
from CellHit.data import DatasetLoader

In [2]:
# Folder given to compute_stats (and from there to DatasetLoader) as `data_path`.
data_path = Path('/home/fcarli/CellHit/data')

# Feather file given to compute_stats (and from there to DatasetLoader)
# as `celligner_output_path`.
celligner_output_path = Path(
    '/home/fcarli/WebCellHit/data/transcriptomics/celligner_CCLE_TCGA_optimized_revised.feather'
)

In [5]:
def compute_stats(dataset,data_path,celligner_output_path):
    """Build a per-drug summary table of the ``Y`` column of a dataset.

    A ``DatasetLoader`` is created for ``dataset`` and only its ``metadata``
    table is used. Rows are grouped by ``DrugID`` and ``Drug`` and the
    minimum, median, maximum, mean and standard deviation of ``Y`` are
    computed for each group (the table is used to contextualize predictions).

    Parameters
    ----------
    dataset :
        Dataset identifier forwarded to ``DatasetLoader`` (this notebook
        uses ``'gdsc'`` and ``'prism'``).
    data_path :
        Forwarded unchanged to ``DatasetLoader`` as ``data_path``.
    celligner_output_path :
        Forwarded unchanged to ``DatasetLoader`` as ``celligner_output_path``.

    Returns
    -------
    pandas.DataFrame
        One row per (``DrugID``, ``Drug``) pair with the flat columns
        ``DrugID, Drug, min, median, max, mean, std``.
    """
    group_keys = ['DrugID', 'Drug']
    summary_funcs = ['min', 'median', 'max', 'mean', 'std']

    # The remaining loader options are fixed for this notebook.
    # NOTE(review): presumably they do not influence `metadata` - confirm.
    loader = DatasetLoader(
        dataset=dataset,
        data_path=data_path,
        celligner_output_path=celligner_output_path,
        use_external_datasets=False,
        samp_x_tissue=2,
        random_state=0,
    )

    # Keep only the columns required for the summary.
    responses = loader.metadata[group_keys + ['Y']]

    # Aggregating the selected 'Y' series (rather than the whole frame) yields
    # flat column names, so no MultiIndex has to be flattened afterwards.
    per_drug = responses.groupby(group_keys)['Y'].agg(summary_funcs)

    # Turn the group keys back into regular columns.
    return per_drug.reset_index()

# GDSC

In [6]:
# Per-drug summary statistics for the GDSC dataset.
gdsc_stats = compute_stats(
    dataset='gdsc',
    data_path=data_path,
    celligner_output_path=celligner_output_path,
)
gdsc_stats

Unnamed: 0,DrugID,Drug,min,median,max,mean,std
0,1003,Camptothecin,-5.881433,-2.434225,3.712453,-2.172563,1.858905
1,1004,Vinblastine,-7.532931,-3.822838,3.888674,-3.359456,2.219831
2,1005,Cisplatin,-1.309506,3.249865,9.246956,3.348483,1.872344
3,1006,Cytarabine,-5.326410,1.903207,6.678187,1.709272,2.250273
4,1007,Docetaxel,-7.205430,-4.467618,0.857092,-4.294178,1.583646
...,...,...,...,...,...,...,...
281,2362,THR-103,1.237725,4.531228,8.188195,4.484064,1.165689
282,2438,ascorbate (vitamin C),6.096767,10.464971,13.847363,10.295309,1.312929
283,2439,glutathione,5.705185,9.169454,12.152814,9.195275,0.903988
284,2498,alpha-lipoic acid,4.180814,7.712732,11.011581,7.700388,0.972669


In [7]:
# Save the GDSC table as CSV; index=False leaves the row index out of the file.
gdsc_stats.to_csv(
    '/home/fcarli/WebCellHit/webserver_data/local_data/gdsc_drug_stats.csv',
    index=False,
)


# PRISM

In [8]:
# Per-drug summary statistics for the PRISM dataset.
prism_stats = compute_stats(
    dataset='prism',
    data_path=data_path,
    celligner_output_path=celligner_output_path,
)
prism_stats

Unnamed: 0,DrugID,Drug,min,median,max,mean,std
0,0,"1,12-BESM",-1.046619,0.116367,1.149649,0.101596,0.378502
1,1,"1,3-DIPROPYL-8-PHENYLXANTHINE",-1.605691,0.025481,1.262307,0.002045,0.413690
2,2,"1,4-BUTANEDIOL",-3.488905,0.115205,1.880249,0.048549,0.509941
3,3,"1,5-DICAFFEOYLQUINIC-ACID",-1.819211,-0.281788,1.109137,-0.300245,0.408162
4,4,"1-((Z)-3-CHLOROALLYL)-1,3,5,7-TETRAAZAADAMANTA...",-1.391957,0.022534,0.953556,-0.020905,0.366739
...,...,...,...,...,...,...,...
6332,6332,ZOXAZOLAMINE,-1.366746,0.060528,1.235654,0.079134,0.355978
6333,6333,ZSET1446,-2.421444,0.026618,1.566562,0.018832,0.423451
6334,6334,ZSTK-474,-3.793874,-0.776676,1.267076,-0.820790,0.849901
6335,6335,ZUCLOPENTHIXOL,-1.887046,-0.131111,1.201596,-0.143093,0.407829


In [9]:
# Save the PRISM table as CSV; index=False leaves the row index out of the file.
prism_stats.to_csv(
    '/home/fcarli/WebCellHit/webserver_data/local_data/prism_drug_stats.csv',
    index=False,
)