In [38]:
from __future__ import print_function
import pandas as pd
import os, sys, re
from intermine.webservice import Service

# Load data direct from source
class A(object):
    """Shared, class-level registry of data sources and loaded frames.

    Used as a plain namespace: attributes are read and written directly on
    the class (``A.url``, ``A.data``).
    """
    # Download location of each dataset, looked up by key.
    url = {
        'expression': 'https://ndownloader.figshare.com/files/16757690',
        'sample_info': 'https://ndownloader.figshare.com/files/16757723',
        'copy_number': 'https://ndownloader.figshare.com/files/17857886',
        'mutations': 'https://ndownloader.figshare.com/files/16757702',
        'achilles_crispr': 'https://ndownloader.figshare.com/files/16757666',
        'achilles_rnai': 'https://ndownloader.figshare.com/files/11489669',
        'sensitivity': 'https://ndownloader.figshare.com/files/17008628',
        'mfr': 'http://bmbl.sdstate.edu/MFR/data/resource%20data/tr_dv_ts.dataset.zip',
    }
    # Frames already loaded in this kernel session; starts out empty.
    data = {}
    
def get_gene_descriptions():
    """Load gene names and descriptions from humanmine (http://www.humanmine.org),
    an integrated database of human genome information.

    The query result is cached as a pickle at ``data/gene_info.p``. When that
    file exists it is returned directly and the web service is not contacted.

    Returns:
        pandas.DataFrame with the columns primaryIdentifier, symbol,
        briefDescription, description and proteins.uniprotAccession, one row
        per row returned by the query.
    """
    archive = 'data/gene_info.p'
    if os.path.exists(archive):
        # NOTE: unpickling executes code from the file; this is only acceptable
        # because the cache is written locally by this same function.
        return pd.read_pickle(archive)

    service = Service("https://www.humanmine.org/humanmine/service")
    query = service.new_query("Gene")
    cols = ["primaryIdentifier", "symbol", "briefDescription", "description","proteins.uniprotAccession"]
    query.add_view(*cols)
    # Restrict to human genes (NCBI taxon 9606).
    query.add_constraint("organism.taxonId", "=", "9606", code = "A")

    df_rows = [[row[col] for col in cols] for row in query.rows()]
    df = pd.DataFrame(data=df_rows, columns=cols)

    # Make sure the cache directory exists so the (slow) query result is not
    # lost to a failed write on a fresh checkout.
    cache_dir = os.path.dirname(archive)
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    df.to_pickle(archive)
    return df
    

def get_data(key):
    """Load input data.

    Arguments:
        key: name of the data source (eg: expression, sample_info...); it
            selects the entry in ``A.url`` and names the cache file
            ``data/<key>.p``.

    Lookup order:
    1) If the data is in memory (``A.data``), return that dataframe
    2) If the data is cached on the filesystem, load, remember and return it
    3) Otherwise, read the CSV from the source URL (first column becomes the
       index), cache it on disk and in memory, and return it

    Returns:
        The dataframe for ``key``. The same object is handed out on every
        call, so in-place changes made by a caller are seen by later callers.

    Raises:
        KeyError: if ``key`` is neither cached nor present in ``A.url``.
    """
    cached = A.data.get(key)
    if cached is not None:
        return cached

    data_cache = 'data/'+key+'.p'
    if os.path.exists(data_cache):
        A.data[key] = pd.read_pickle(data_cache)
        return A.data[key]

    df = pd.read_csv(A.url[key],index_col=0)

    # Create the cache directory on demand; otherwise a fresh checkout fails
    # in to_pickle() only after the whole download has completed.
    cache_dir = os.path.dirname(data_cache)
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    df.to_pickle(data_cache)
    A.data[key] = df
    return df

def ncbi_gene_ids(df):
    """Convert column labels of the form "symbol (ncbi_id)" to integer NCBI IDs.

    A label containing a parenthesised run of digits is replaced by that
    number; any other label is converted with ``int()`` directly. Labels that
    are already integers are therefore left unchanged, so the function can be
    applied repeatedly to the same frame.

    Arguments:
        df: dataframe whose columns are relabelled. It is modified IN PLACE.

    Returns:
        The same ``df`` object, for convenience.

    Raises:
        ValueError: if a label has no "(digits)" part and is not itself an
            integer literal.
    """
    id_pattern = re.compile(r'\((\d+)\)')
    ncbi_cols = []
    for col in df.columns:
        # str() lets already-converted integer labels pass through the regex
        # instead of raising TypeError.
        label = str(col)
        match = id_pattern.search(label)
        ncbi_cols.append(int(match.group(1) if match else label))

    df.columns = ncbi_cols
    return df

In [4]:
# Map ncbi IDs to reactome pathways
# File downloaded from https://reactome.org/download/current/NCBI2Reactome.txt
# Every line is expected to hold exactly six tab-separated fields; a line with
# a different field count raises ValueError in the unpacking below.
#
# pathway_info maps an NCBI ID (kept as the *string* read from the file) to
# [pathway_id, url, pathway_name]. A later line for the same ID overwrites the
# earlier one, so at most one pathway is retained per ID.
pathway_info = {}
with open('data/NCBI2Reactome.txt') as n2r:
    for line in n2r:
        # Named `evidence` rather than `type` so the builtin type() is not
        # shadowed for every cell that runs after this one.
        ncbi, pathway_id, url, pathway_name, evidence, species = line.strip().split('\t')
        # only human pathways
        if species != 'Homo sapiens':
            continue
        # only curated pathways
        if evidence == 'IEA':
            continue
        pathway_info[ncbi] = [pathway_id, url, pathway_name]

print("Have pathway info for",len(pathway_info),"human NCBI IDs")

Have pathway info for 10241 human NCBI IDs


## Aggregate cell lines by lineage
* Group all cell lines by lineage
* If a lineage has more than one defined sublineage, also group its cell lines by sublineage (e.g. leukemia -> AML)

In [25]:
sample_info = get_data('sample_info')
lineages = set(sample_info.lineage.dropna())

# lineage maps a group name to the list of sample_info index labels in it.
# Group names are either a lineage ("<lineage>") or, for lineages with more
# than one subtype, additionally "<lineage>_<subtype>".
lineage = {}
for lineage_name in sorted(lineages):
    lineage_df = sample_info[sample_info.lineage == lineage_name]
    subtypes = set(lineage_df.lineage_subtype.dropna())

    # cell lines for sublineage
    if len(subtypes) > 1:
        for subtype in sorted(subtypes):
            # Always prefix with the lineage so a subtype name can never
            # collide with (and be overwritten by) a lineage-level key.
            subtype_df = lineage_df[lineage_df.lineage_subtype == subtype]
            lineage[lineage_name + '_' + subtype] = list(subtype_df.index)

    # all cell lines
    lineage[lineage_name] = list(lineage_df.index)
        
# for l in sorted(lineage.keys()):
#      print(l,len(lineage[l]))

In [39]:
print(A.url.keys())
df = get_data('achilles_crispr')
# Preview with the original "symbol (ncbi_id)" column labels.
display(df.head())
# Relabel the columns to integer NCBI IDs. This changes `df` in place, and
# `df` is the frame held by get_data(), so later get_data('achilles_crispr')
# calls see the integer labels too.
ncbi_gene_ids(df)
df.head()

dict_keys(['expression', 'sample_info', 'copy_number', 'mutations', 'achilles_crispr', 'achilles_rnai', 'sensitivity', 'mfr'])


Unnamed: 0,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),AADAC (13),...,ZWILCH (55055),ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009)
ACH-000004,0.168684,0.089128,-0.196966,-0.02126,0.038541,-0.175141,0.349346,-0.441008,0.291208,0.147993,...,-0.135665,-0.46181,,,0.253495,0.233904,-0.40614,0.283878,0.230978,-0.135112
ACH-000005,-0.068759,0.218792,0.178252,0.15839,-0.193862,-0.324566,0.24622,-0.576495,-0.081217,0.016182,...,-0.176432,-0.391199,-0.182117,-0.108978,0.186545,-0.075884,-0.095781,0.029269,0.000945,-0.242038
ACH-000007,0.053893,0.081444,-0.06017,0.153435,0.087362,0.150684,0.061146,-0.470462,-0.01221,0.277616,...,-0.101852,-0.276755,-0.030821,0.121126,0.214875,-0.0193,-0.342632,0.08361,-0.392722,-0.44338
ACH-000009,0.059874,-0.011153,-0.054367,0.060886,0.039767,0.043527,0.011845,-0.63029,0.161797,0.033587,...,-0.35588,-0.290047,-0.031825,0.115886,0.116784,0.035294,-0.575523,0.22894,-0.114559,-0.549906
ACH-000011,0.277165,0.085354,0.007972,0.445843,-0.036717,-0.261409,0.111173,-0.430867,0.138193,0.120785,...,-0.418769,-0.518908,-0.128187,-0.126336,0.269698,0.148516,-0.227106,0.120656,-0.252444,-0.401821


Unnamed: 0,1,29974,2,144568,127550,53947,51146,8086,65985,13,...,55055,11130,7789,158586,79364,440590,79699,7791,23140,26009
ACH-000004,0.168684,0.089128,-0.196966,-0.02126,0.038541,-0.175141,0.349346,-0.441008,0.291208,0.147993,...,-0.135665,-0.46181,,,0.253495,0.233904,-0.40614,0.283878,0.230978,-0.135112
ACH-000005,-0.068759,0.218792,0.178252,0.15839,-0.193862,-0.324566,0.24622,-0.576495,-0.081217,0.016182,...,-0.176432,-0.391199,-0.182117,-0.108978,0.186545,-0.075884,-0.095781,0.029269,0.000945,-0.242038
ACH-000007,0.053893,0.081444,-0.06017,0.153435,0.087362,0.150684,0.061146,-0.470462,-0.01221,0.277616,...,-0.101852,-0.276755,-0.030821,0.121126,0.214875,-0.0193,-0.342632,0.08361,-0.392722,-0.44338
ACH-000009,0.059874,-0.011153,-0.054367,0.060886,0.039767,0.043527,0.011845,-0.63029,0.161797,0.033587,...,-0.35588,-0.290047,-0.031825,0.115886,0.116784,0.035294,-0.575523,0.22894,-0.114559,-0.549906
ACH-000011,0.277165,0.085354,0.007972,0.445843,-0.036717,-0.261409,0.111173,-0.430867,0.138193,0.120785,...,-0.418769,-0.518908,-0.128187,-0.126336,0.269698,0.148516,-0.227106,0.120656,-0.252444,-0.401821


In [None]:
# Scratch: a fresh, empty module-level dict (separate from A.data), echoed
# as the cell result.
data = dict()
data

## Fix Achilles RNAi to use CCLE cell line IDs

In [45]:
sample_info = get_data('sample_info')

# Lookup from the 'CCLE Name' column to the sample_info index label.
# Rows without a CCLE name are skipped; if a name occurs more than once the
# last row wins.
ccle_names = sample_info['CCLE Name'].dropna()
ccle_name2id = dict(zip(ccle_names, ccle_names.index))

# Transpose so that the labels being translated are on the index.
achilles_rnai = get_data('achilles_rnai').T
achilles_index = list(achilles_rnai.index)
# Labels with no entry in the lookup are kept unchanged.
achilles_rnai.index = [ccle_name2id.get(name, name) for name in achilles_index]
achilles_rnai.describe()

Unnamed: 0,A1BG (1),NAT2 (10),ADA (100),CDH2 (1000),AKT3 (10000),MED6 (10001),NR2E3 (10002),NAALAD2 (10003),DUXB (100033411),PDZK1P1 (100034743),...,HNRNPDL (9987),DMTF1 (9988),PPP4R1 (9989),CDH1 (999),SLC12A6 (9990),PTBP3 (9991),KCNE2 (9992),DGCR2 (9993),CASP8AP2 (9994),SCO2 (9997)
count,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,...,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0
mean,-0.128559,-0.062949,0.05218,0.035571,0.073588,-0.010781,0.055337,0.072024,0.134287,0.086955,...,0.0491,0.044417,-0.161913,-0.018613,0.041153,-0.656251,-0.003405,0.050086,-0.524194,-0.076897
std,0.124823,0.135102,0.149959,0.134957,0.118782,0.146802,0.151149,0.114891,0.100591,0.09631,...,0.123658,0.134776,0.121253,0.144138,0.096894,0.18778,0.105688,0.150471,0.286075,0.173581
min,-0.591187,-0.615757,-0.404078,-0.504298,-0.713022,-0.518746,-0.40693,-0.386424,-0.195428,-0.233658,...,-0.288058,-0.334535,-0.560391,-0.371797,-0.251362,-1.150943,-0.335286,-0.556667,-1.443735,-0.639469
25%,-0.200764,-0.153099,-0.054149,-0.049472,0.018771,-0.099676,-0.048597,0.009815,0.076195,0.03593,...,-0.035309,-0.042693,-0.239758,-0.12244,-0.020465,-0.77729,-0.058195,-0.039912,-0.710553,-0.188873
50%,-0.123575,-0.069537,0.061277,0.033315,0.079908,-0.00201,0.052966,0.076944,0.137898,0.089408,...,0.047236,0.046242,-0.159157,-0.024791,0.038997,-0.660937,0.000486,0.056624,-0.46151,-0.074827
75%,-0.051885,0.021924,0.149807,0.116677,0.141142,0.089635,0.160081,0.142492,0.206585,0.141804,...,0.131704,0.131713,-0.082169,0.076697,0.099894,-0.525345,0.065278,0.142808,-0.315323,0.038054
max,0.219773,0.40725,0.583252,0.672208,0.827562,0.424368,0.547851,0.388047,0.431561,0.363576,...,0.397315,0.486055,0.280198,0.476642,0.352624,-0.186703,0.285523,0.485453,-0.030149,0.48607


In [17]:
# Build lookup tables from the humanmine gene table: NCBI gene ID -> name,
# symbol, description and uniprot accessions, plus the reverse uniprot -> NCBI.
gd = get_gene_descriptions()
ncbi2name, ncbi2symbol, ncbi2description = {}, {}, {}
ncbi2uniprot = {}
uniprot2ncbi = {}
for record in gd.itertuples(index=False, name=None):
    # Column order: identifier, symbol, brief description, description, uniprot
    ncbi, symbol, name, description, uniprot = record
    ncbi = int(ncbi)
    ncbi2symbol[ncbi] = symbol
    ncbi2name[ncbi] = name
    ncbi2description[ncbi] = description
    # ncbi <-> uniprot can be 1:many, so collect accessions in a set
    ncbi2uniprot.setdefault(ncbi, set()).add(uniprot)
    uniprot2ncbi[uniprot] = ncbi
print("Done mappinmg gene info")

Done mappinmg gene info


In [6]:
# Copy-number table, transposed (rows and columns of the loaded frame swapped).
cn = get_data('copy_number').transpose()

pancreas 55 ['ACH-000022', 'ACH-000023', 'ACH-000031', 'ACH-000042', 'ACH-000060', 'ACH-000085', 'ACH-000093', 'ACH-000094', 'ACH-000107', 'ACH-000108', 'ACH-000114', 'ACH-000118', 'ACH-000138', 'ACH-000139', 'ACH-000155', 'ACH-000164', 'ACH-000178', 'ACH-000205', 'ACH-000213', 'ACH-000222', 'ACH-000235', 'ACH-000243', 'ACH-000265', 'ACH-000266', 'ACH-000270', 'ACH-000281', 'ACH-000307', 'ACH-000320', 'ACH-000332', 'ACH-000347', 'ACH-000354', 'ACH-000417', 'ACH-000468', 'ACH-000502', 'ACH-000517', 'ACH-000535', 'ACH-000599', 'ACH-000601', 'ACH-000652', 'ACH-000685', 'ACH-000933', 'ACH-001098', 'ACH-001101', 'ACH-001107', 'ACH-001108', 'ACH-001171', 'ACH-001353', 'ACH-001375', 'ACH-001376', 'ACH-001377', 'ACH-001378', 'ACH-001379', 'ACH-001380', 'ACH-001382', 'ACH-001999']
liver 26 ['ACH-000217', 'ACH-000221', 'ACH-000316', 'ACH-000361', 'ACH-000393', 'ACH-000420', 'ACH-000422', 'ACH-000471', 'ACH-000475', 'ACH-000476', 'ACH-000478', 'ACH-000480', 'ACH-000483', 'ACH-000493', 'ACH-000537

In [31]:
df = get_data('mfr')

# Every distinct identifier that appears in any 'gene.pair' entry; entries
# are split on '_'.
genes = {gene for pair in df['gene.pair'] for gene in pair.split('_')}
len(genes)
    

15155