In [2]:
from __future__ import print_function
import pandas as pd
import os, sys, re
from intermine.webservice import Service

# Load data direct from source
class A(object):
    url = {}
    url['expression'] = 'https://ndownloader.figshare.com/files/16757690'
    url['sample_info'] = 'https://ndownloader.figshare.com/files/16757723'
    url['copy_number'] = 'https://ndownloader.figshare.com/files/17857886'
    url['mutations'] = 'https://ndownloader.figshare.com/files/16757702'
    url['achilles_crispr'] = 'https://ndownloader.figshare.com/files/16757666'
    url['achilles_rnai'] = 'https://ndownloader.figshare.com/files/11489669'
    url['sensitivity'] = 'https://ndownloader.figshare.com/files/17008628'
    url['mfr'] = 'http://bmbl.sdstate.edu/MFR/data/resource%20data/tr_dv_ts.dataset.zip'
    data = {}
    
def get_gene_descriptions():
    """Load gene names and descriptions from humanmine (http://www.humanmine.org),
    an integrated database of human genome information.  Use cached data if available"""
    
    archive = 'data/gene_info.p'
    if os.path.exists(archive):
        df = pd.read_pickle(archive)
        return df
    
    service = Service("https://www.humanmine.org/humanmine/service")
    query = service.new_query("Gene")
    cols = ["primaryIdentifier", "symbol", "briefDescription", "description","proteins.uniprotAccession"]
    query.add_view(*cols)
    query.add_constraint("organism.taxonId", "=", "9606", code = "A")    
    df_rows = []

    for row in query.rows():
        df_rows.append(
            [row["primaryIdentifier"], 
             row["symbol"], 
             row["briefDescription"], 
             row["description"],
             row["proteins.uniprotAccession"]
            ])

    df = pd.DataFrame(data=df_rows,columns=cols)
    df.to_pickle(archive)
    return df
    

def get_data(key):
    """Load input data.
    Arguments: key for the data source (eg: expression, sample_info...)
    1) If the data is in memory, return the dataframe
    2) If the data is cached on the filesystem, load and return the dataframe
    3) Otherwise, load the data from the source URL, cache, return the dataframe
    """

    if A.data.get(key) is not None:
        return A.data[key]
    
    data_cache = 'data/'+key+'.p'
    if os.path.exists(data_cache):
        A.data[key] = pd.read_pickle(data_cache)
        return A.data[key]
    df = pd.read_csv(A.url[key],index_col=0)
    df.to_pickle(data_cache)
    A.data[key] = df
    return A.data[key]

In [3]:
# Map ncbi IDs to reactome pathways
# File downloaded from https://reactome.org/download/current/NCBI2Reactome.txt
pathway_info = {}
with open('data/NCBI2Reactome.txt') as n2r:
    for line in n2r.readlines():
        ncbi, pathway_id, url, pathway_name, type, species = line.strip().split('\t')
        # only human pathways
        if species != 'Homo sapiens':
            continue
        # only curated pathways
        if type == 'IEA':
            continue
        pathway_info[ncbi] = [pathway_id, url, pathway_name]
        
print("Have pathway info for",len(pathway_info),"human NCBI IDs")   

FileNotFoundError: [Errno 2] No such file or directory: 'data/NCBI2Reactome.txt'

In [24]:
for data_source in A.url:
    print('Loading',data_source,'...')
    df = get_data(data_source)
    #display(df.head())
    
df = get_data('achilles_crispr')
df.head()

Loading expression ...
Loading sample_info ...
Loading copy_number ...
Loading mutations ...
Loading achilles_crispr ...
Loading achilles_rnai ...
Loading sensitivity ...
Loading mfr ...


Unnamed: 0,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),AADAC (13),...,ZWILCH (55055),ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009)
ACH-000004,0.168684,0.089128,-0.196966,-0.02126,0.038541,-0.175141,0.349346,-0.441008,0.291208,0.147993,...,-0.135665,-0.46181,,,0.253495,0.233904,-0.40614,0.283878,0.230978,-0.135112
ACH-000005,-0.068759,0.218792,0.178252,0.15839,-0.193862,-0.324566,0.24622,-0.576495,-0.081217,0.016182,...,-0.176432,-0.391199,-0.182117,-0.108978,0.186545,-0.075884,-0.095781,0.029269,0.000945,-0.242038
ACH-000007,0.053893,0.081444,-0.06017,0.153435,0.087362,0.150684,0.061146,-0.470462,-0.01221,0.277616,...,-0.101852,-0.276755,-0.030821,0.121126,0.214875,-0.0193,-0.342632,0.08361,-0.392722,-0.44338
ACH-000009,0.059874,-0.011153,-0.054367,0.060886,0.039767,0.043527,0.011845,-0.63029,0.161797,0.033587,...,-0.35588,-0.290047,-0.031825,0.115886,0.116784,0.035294,-0.575523,0.22894,-0.114559,-0.549906
ACH-000011,0.277165,0.085354,0.007972,0.445843,-0.036717,-0.261409,0.111173,-0.430867,0.138193,0.120785,...,-0.418769,-0.518908,-0.128187,-0.126336,0.269698,0.148516,-0.227106,0.120656,-0.252444,-0.401821


In [29]:
ncbi_cols = []
for c in df.columns:
    match = re.search('\((\d+)\)',c)
    if match:
        c = match.group(1)
    ncbi_cols.append(int(c))

df.columns = ncbi_cols
df.head()
    


Unnamed: 0,1,29974,2,144568,127550,53947,51146,8086,65985,13,...,55055,11130,7789,158586,79364,440590,79699,7791,23140,26009
ACH-000004,0.168684,0.089128,-0.196966,-0.02126,0.038541,-0.175141,0.349346,-0.441008,0.291208,0.147993,...,-0.135665,-0.46181,,,0.253495,0.233904,-0.40614,0.283878,0.230978,-0.135112
ACH-000005,-0.068759,0.218792,0.178252,0.15839,-0.193862,-0.324566,0.24622,-0.576495,-0.081217,0.016182,...,-0.176432,-0.391199,-0.182117,-0.108978,0.186545,-0.075884,-0.095781,0.029269,0.000945,-0.242038
ACH-000007,0.053893,0.081444,-0.06017,0.153435,0.087362,0.150684,0.061146,-0.470462,-0.01221,0.277616,...,-0.101852,-0.276755,-0.030821,0.121126,0.214875,-0.0193,-0.342632,0.08361,-0.392722,-0.44338
ACH-000009,0.059874,-0.011153,-0.054367,0.060886,0.039767,0.043527,0.011845,-0.63029,0.161797,0.033587,...,-0.35588,-0.290047,-0.031825,0.115886,0.116784,0.035294,-0.575523,0.22894,-0.114559,-0.549906
ACH-000011,0.277165,0.085354,0.007972,0.445843,-0.036717,-0.261409,0.111173,-0.430867,0.138193,0.120785,...,-0.418769,-0.518908,-0.128187,-0.126336,0.269698,0.148516,-0.227106,0.120656,-0.252444,-0.401821


In [20]:
mfr = get_data('mfr')
mfr.head()

Unnamed: 0,gene.pair,label,exp1,exp2,pcc,src,mi,ppc,cmi,go,lc,hg,rx,tr,MFR
1,P00740_P04217,1,0.237106,0.336767,0.633155,0.516063,0.018146,0.636449,0.341682,0.0,0.428571,0.806186,0.0,0,0.537186
2,Q9Y243_Q9Y4X4,1,0.462556,0.28065,0.538318,0.547997,0.011,0.489385,0.343751,0.254375,0.4,0.954148,0.0,0,0.481837
3,Q7Z449_Q9Y243,1,0.226468,0.462556,0.402204,0.399315,0.015249,0.455895,0.34594,0.0,0.0,0.816527,0.0,0,0.26948
4,P17030_Q9Y243,1,0.415233,0.462556,0.661247,0.647233,0.032679,0.601135,0.321106,0.0,0.2,0.908248,0.0,0,0.442733
5,P46821_Q9Y243,1,0.45421,0.462556,0.694198,0.692106,0.052969,0.685591,0.3266,0.182067,0.166667,0.865148,0.0,0,0.54895


In [17]:
# Use data from human mine to map NCBI gene IDs to name, summary, symbol, uniprot
gd = get_gene_descriptions()
ncbi2name = {}
ncbi2symbol = {}
ncbi2description = {}
ncbi2uniprot = {}
uniprot2ncbi = {}
for i, r in gd.iterrows():
    ncbi, symbol, name, description, uniprot = list(r)
    ncbi = int(ncbi)
    ncbi2name[ncbi] = name
    ncbi2symbol[ncbi] = symbol
    ncbi2description[ncbi] = description
    # ncbi <-> unioprot can be 1:many
    if ncbi2uniprot.get(ncbi) is None:
        ncbi2uniprot[ncbi] = set()
    ncbi2uniprot[ncbi].add(uniprot)
    uniprot2ncbi[uniprot] = ncbi
print("Done mappinmg gene info")

Done mappinmg gene info


In [6]:
cn = get_data('copy_number').T

In [20]:
sd = get_data('sample_info')
lin = set(sd.lineage.dropna())
lineage = {}
for l in lin:
    ldf = sd[sd.lineage == l]
    subtypes = set(ldf.lineage_subtype.dropna())
    if len(subtypes) > 1:
        for sub in subtypes:
            if l in sub:
                lname = sub
            else:
                lname = l + '_' + sub
            sub_df = ldf[ldf.lineage_subtype == sub]
            lineage[lname] = list(sub_df.index)
    else:
        lineage[l] = list(ldf.index)
        
for l in lineage:
    print(l,len(lineage[l]))

rhabdomyosarcoma 19
mesothelioma 17
urinary_tract 37
ovary_non_epithelial 2
ovary_immortalized 1
ovary_adenocarcinoma 57
breast_TNBC 29
breast_ERpos 12
breast_ERneg 1
breast_immortalized 2
breast_TPBC 5
breast_HER2Amp 13
rhabdoid 23
adrenal_cortex 1
gastric_small 2
gastric_adenosquamous 1
gastric_adenocarcinoma 33
other 1
multiple_myeloma 34
colorectal 73
kidney 39
lung_NSC 149
lung_small 66
lung_immortalized 1
soft_tissue_liposarcoma 5
soft_tissue_leiomyosarcoma 3
soft_tissue_fibrosarcoma 1
soft_tissue_pleomorphic_sarcoma 1
soft_tissue_sarcoma_undifferentiated 2
soft_tissue_synovial_sarcoma 8
soft_tissue_epitheliod_sarcoma 2
bone_Ewing_sarcoma 28
bone_osteosarcoma 12
bone_chordoma 4
bone_chondrosarcoma 4
fibroblast 43
embryo 3
upper_aerodigestive_squamous 44
upper_aerodigestive_buccal_mucosa 1
Hodgkin_lymphoma 9
lymphoma_DLBCL 17
lymphoma_ALCL 5
Burkitt_lymphoma 13
lymphoma_B-cell_ALL 2
lymphoblastic_lymphoma 1
B-cell_lymphoma_other 21
T-cell_lymphoma_other 13
central_nervous_system_m

In [31]:
df = get_data('mfr')

df.head()
genes = set()
for p in df['gene.pair']:
    my_genes = p.split('_')
    for g in my_genes:
        genes.add(g)
len(genes)
    

15155