In [1]:
import os
import pandas as pd
import numpy as np
eu = os.path.expanduser
jn = os.path.join

In [2]:
assoc_dir = eu("~/vervet_project/data/gene_associations")

In [3]:
meta_dir = eu("~/vervet_project/metadata")
def load_gene_df():
    gene_df = pd.read_csv(jn(meta_dir,"annot_mike_genes_minimal.tsv"),sep="\t",index_col = [0,1])
    gene_df.index.names = [gene_df.index.names[0],'start']
    return gene_df

In [4]:
gene_df = load_gene_df()

In [5]:
pops = ["aet","cyn","pyn","pys","sab","tan"]

In [7]:
#gene_df.to_csv(jn(meta_dir,"annot_mike_genes_minimal.tsv"),sep="\t")

In [6]:
def index_rolling(s,window,func,overlap=0,*args,**kwargs):
    """
    Apply function in rolling windows, where the window
    size is defined with respect to the index values.
    This means that different windows can comprise different
    numbers of elements.
    
    s ... pandas Series
    window ... window size in units of the index values
    func ... function to apply to the series values within each
                window
    overlap ... oberlap size of windows
    args, kwarg ... additional arguments for func
    
    Example: index_rolling(pd.Series(range(20),index=np.random.rand(20))),0.1,max)
    """
    #note that basis must be sorted in order for this to work properly
    windows_min = s.index.min()
    windows_max = s.index.max()
    window_starts = np.arange(windows_min, windows_max, window-overlap)
    window_starts = pd.Series(window_starts, index = window_starts)
    def applyToWindow(val):
        # using slice_indexer rather that what.loc [val:val+window] allows
        # window limits that are not specifically in the index
        indexer = s.index.slice_indexer(val,val+window,1)
        chunk = s.iloc[indexer]
        try:
            return func(chunk,*args,**kwargs)
        except ValueError, e:
            if "empty sequence" in str(e):
                #print indexer, chunk
                return None
            else:
                raise           
    rolled = window_starts.apply(applyToWindow)
    return rolled

## ROD (reference ordered data) permutation test

tools that, given an analysis result, one can rotate (shift) the results randomly around the genome

In [7]:
def get_chrom_length():
    return pd.read_csv(eu("~/vervet_project/metadata/ref3500.tsv"),
                       sep="\t",index_col=0,squeeze=True,header=False)

In [8]:
def shift_rod(rod_df, rnd, mode = "grid"):
    """
    shift reference ordered data across the whole genome

    Input:
    rod_df ... pandas dataframe or series with mulitiindex (chrom, pos)
                
    modes ...
        'grid' ... just rotate the index of the rod data frame
                    this means that the positions stay the same only the
                    value for each position becomes different
                    Faster, but means that you only hit the same grid-point
                    this should make it conservative on large grids. Large
                    grids are problematic if the fraction of top windows 
                    considered becomes large 
        'continuous' ... add the random shift to each index value.
                         NOT IMPLEMENTED
    """
    if mode == "grid":
        new_start_i = int(len(rod_df)*rnd)
        rotate_data = np.concatenate((rod_df.iloc[new_start_i:].values,rod_df.iloc[:new_start_i].values))
        if  isinstance(rod_df,pd.core.series.Series):
            r = pd.Series(rotate_data,index=rod_df.index)
            return r
        elif isinstance(rod_df,pd.core.frame.DataFrame):
            r = pd.DataFrame(rotate_data,index=rod_df.index,columns=rod_df.columns)
            return r
        
    else:
        raise UserException("Only mode grid supported.")

In [900]:
%timeit shift_rod(tot_sweed_df["Likelihood"], 0.32424) 

100 loops, best of 3: 14.5 ms per loop


## get genes for peaks

In [9]:
def get_peaks(value_s,min_peak_dist):
    """
    ranked df of peaks
    assuming a minimum distance between peaks
    """
    assert len(value_s.index.droplevel(1).unique()) == 1, "Multiple chromsosomes not supported."
    chrom = value_s.index[0][0]
    x = index_rolling(value_s.ix[chrom],2*min_peak_dist,np.argmax,min_peak_dist)
    x.name = "pos"
    #only take peaks that are found in at least two windows 
    #this makes sure that we have a minimal distance between peaks
    pos_true = (x.reset_index().groupby("pos").apply(len)>1)
    pos = pos_true[pos_true.values].index
    s = value_s.ix[chrom].ix[pos]
    return s

In [10]:
def get_genes(peak_s, gene_df, max_dist):
    """
    take the input series and gets 
    names of genes nearby
    
    Input:
    peak_s ... pandas series with (chrom, pos) index and value of
                the statistic ('peak height'). Series should be named.
    gene_df ... data frame with gene info 
    """    
    all_genes = []
    if not gene_df.index.is_monotonic:
        gene_df = gene_df.sort_index()
    tot_hit_df = pd.DataFrame()
    for chrom in peak_s.index.droplevel(1).unique():
        loc_gene_df = gene_df.ix[chrom]
        #loc_gene_df = loc_gene_df.append(pd.DataFrame(np.nan,index=[np.inf],columns=loc_gene_df.columns))
        pos_rel_to_start = np.searchsorted(loc_gene_df.index.values-max_dist,peak_s.ix[chrom].index.values)
        pos_rel_to_end = np.searchsorted(loc_gene_df["end"].values+max_dist,peak_s.ix[chrom].index.values)
        genes = list(set(loc_gene_df["gene_id"].iloc[np.hstack([range(a,b) for a,b in zip(pos_rel_to_end,pos_rel_to_start)])]))
        #print chrom, genes
        all_genes += genes
    return all_genes

def get_gene_info(gene_ls,gene_df):
    """
    for a list of gene ids,
    get a data frame with their 
    position
    """
    gi = gene_df[gene_df["gene_id"].apply(lambda x: x in gene_ls)]
    return gi

def get_peaks(gene_info,top_s,max_dist):
    """
    For each gene in gene_info get the
    peaks within max_dist in top_s. This 
    is basically reverse engineering to get
    the peak info for each gene that was found 
    to be associated with a peak. 
    
    Input:
    gene_info ... data frame with index ('chrom','start')
                and columns 'gene_id' and 'end'
    top_s ... series of peak positions with index (chrom, pos)
                and values peak height
    max_dist ... maximum distance between gene and peak
    """
    def get_dist(df,gene_pos):
        s = pd.Series(df.index.droplevel(0).values - gene_pos.ix[df.index[0][0]],
                                                  index=df.index.droplevel(0).values)
        return s
    tot_gene_peaks_df = pd.DataFrame()
    if not top_s.index.is_monotonic:
        top_s = top_s.sort_index()
    for chrom in gene_info.index.droplevel(1).unique():
        loc_top_s = top_s.ix[chrom]
        start = np.searchsorted(loc_top_s.index.values+max_dist,gene_info.ix[chrom].index.values)
        end = np.searchsorted(loc_top_s.index.values-max_dist,gene_info.ix[chrom]["end"].values)
        
        x = pd.concat([loc_top_s.iloc[st:ed] for st,ed in zip(start,end)],
                      keys=gene_info.ix[chrom]["gene_id"].values)
        x.name = "peak_height"
        


        dist_start = x.groupby(lambda i: i[0]).\
                    apply(lambda df: get_dist(df,
                                              gene_info.ix[chrom].reset_index().set_index("gene_id")["pos"]))
        dist_start.name = "dist_start"
        dist_end = x.groupby(lambda i: i[0]).\
                    apply(lambda df: get_dist(df,
                                              gene_info.ix[chrom].set_index("gene_id")["end"]))
        dist_end.name = "dist_end"
        gene_peaks_df = pd.concat([x,dist_start,dist_end],axis=1)
        gene_peaks_df.index = pd.MultiIndex.from_arrays([gene_peaks_df.index.droplevel(1),
                                         [chrom]*len(x),
                                         gene_peaks_df.index.droplevel(0)])
        tot_gene_peaks_df = pd.concat([tot_gene_peaks_df, gene_peaks_df])
        tot_gene_peaks_df.index.names = ["gene_id","chrom","peak_pos"]
        if chrom == "CAE19":
            print gene_info.ix[chrom].reset_index().set_index("gene_id")["pos"]
            print loc_top_s
            print start
            print end
            print x
            print dist_start
    return tot_gene_peaks_df

In [11]:
chromosomes = ["CAE" + str(i) for i in range(1,30)+["X","Y"]]
def get_sweed_df(chrom,pop):
    sweed_res = pd.read_csv(eu("~/vervet_project/analyses/20140611_163_subpop_sweepfinder/"
                             "_data/SweeD_Report.163_ref3500_{}_{}".format(chrom,pop)),
                                                          sep="\t",skiprows=2,index_col=0)
    sweed_res.index = pd.MultiIndex.from_tuples(zip([chrom]*len(sweed_res.index),sweed_res.index))
    return sweed_res

'0.14.0'

In [1057]:
pop = "aet"
eu("~/vervet_project/analyses/20140611_163_subpop_sweepfinder/"
                             "_data/SweeD_Report.163_ref3500_{}_{}".format(chrom,pop))

'/home/GMI/hannes.svardal/vervet_project/analyses/20140611_163_subpop_sweepfinder/_data/SweeD_Report.163_ref3500_CAEY_aet'

In [372]:
gene_df = load_gene_df()
gene_df = gene_df.sortlevel(level=[0,1])

In [11]:
pop = "aet"
tot_sweed_df = pd.read_csv(eu("~/vervet_project/analyses/20140611_163_subpop_sweepfinder/"
                                 "_data/tot_sweed_result_{}.tsv".format(pop)),sep="\t",index_col=[0,1])

In [12]:
top_s = tot_sweed_df.sort("Likelihood",ascending=False)["Likelihood"].iloc[:2000]
#top_s = pd.concat([pd.Series([17],index=pd.MultiIndex.from_arrays([["CAE1"],[4922]])),top_s])

In [1225]:
max_dist = 10000
cand_genes = get_genes(top_s, gene_df, max_dist=max_dist)
gene_info = get_gene_info(cand_genes, gene_df)
peaks = get_peaks(gene_info,top_s,max_dist=max_dist)

gene_id
SAMM50     26467319
Name: pos, dtype: int64
pos
9483311.2373     21.93971
9483796.1723     23.32173
26477373.2917    20.04123
26478828.0967    19.82511
Name: Likelihood, dtype: float64
[2]
[4]
        pos          
SAMM50  26477373.2917    20.04123
        26478828.0967    19.82511
Name: peak_height, dtype: float64
SAMM50  26477373.2917    10054.2917
        26478828.0967    11509.0967
Name: dist_start, dtype: float64


In [1248]:
def genes_per_peak_s(df):
    s = df.iloc[0]#["peak_height"]
    s["genes"] = 3#list(df["gene_id"].unique())
    return s
p = peaks.reset_index([0])

In [1263]:
gene_list_peak_pos = p["gene_id"].groupby(lambda x: x).apply(list)
gene_list_peak_pos.name = "genes"
gene_list_peak_pos.index = pd.MultiIndex.from_tuples(gene_list_peak_pos.index)
peak_info = pd.concat([top_s,gene_list_peak_pos],axis=1)
peak_info.sort("Likelihood",ascending=False,inplace=True)

In [6]:
gene_to_cat2 = pd.read_csv(jn(assoc_dir,"gene_to_cat2.csv"),index_col=0)

In [167]:
gene_to_cat2.to_csv(jn(assoc_dir,"gene_to_cat2.tsv"),index=False,sep='\t')

In [7]:
cat2_to_name = pd.read_csv(jn(assoc_dir,"cat2_to_name.csv"),index_col=0)

In [177]:
cat2_to_name.to_csv(jn(assoc_dir,"cat2_to_name.tsv"),index=False,sep='\t')

In [184]:
try:
    gene_df.drop('feature',axis=1,inplace=True)
except ValueError:
    print 'Sorry'

In [10]:
cat2_to_name

Unnamed: 0,go_identifier,go_name
0,is_a,is_a
1,consider,consider
2,replaced_by,replaced_by
3,Cross_product_review,Involved_in
4,goantislim_grouping,Grouping classes that can be excluded
5,gocheck_do_not_annotate,Term not to be used for direct annotation
6,gocheck_do_not_manually_annotate,Term not to be used for direct manual annotation
7,goslim_aspergillus,Aspergillus GO slim
8,goslim_candida,Candida GO slim
9,goslim_generic,Generic GO slim


## get go associations for gene list

In [11]:
gene_to_go = pd.read_csv(jn(assoc_dir,"gene_to_cat.csv"),index_col=0)
gene_to_name = pd.read_csv(jn(assoc_dir,"gene_to_name.csv"),index_col=0)
go_to_name = pd.read_csv(jn(assoc_dir,"cat_to_name.csv"),index_col=0)

In [1077]:
rank_table.join(go_to_name.set_index("go_identifier",drop=True))

Unnamed: 0_level_0,n_genes,out_of,rank,go_name
go_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GO:0000014,1,0,0,single-stranded DNA endodeoxyribonuclease acti...
GO:0000060,1,0,0,"protein import into nucleus, translocation"
GO:0000075,2,0,0,cell cycle checkpoint
GO:0000082,2,0,0,G1/S transition of mitotic cell cycle
GO:0000087,2,0,0,mitotic M phase
GO:0000088,1,0,0,mitotic prophase
GO:0000090,1,0,0,mitotic anaphase
GO:0000096,1,0,0,sulfur amino acid metabolic process
GO:0000098,1,0,0,sulfur amino acid catabolic process
GO:0000122,2,0,0,negative regulation of transcription from RNA ...


In [1199]:
def permut_assoc(rod_s, rnd, gene_df, gene_to_go, top_n, max_dist):
    """
    This is the main function.
    Use with rnd = 0 to get the real assoc.
    """
    s = shift_rod(rod_s, rnd)
    s.sort(ascending=False, inplace=True)
    top_s = s.iloc[:top_n]
    cand_genes = get_genes(top_s, gene_df, max_dist=max_dist)
    assoc = get_go_assoc(cand_genes, gene_to_go)
    return assoc

def get_go_assoc(gene_ls, gene_to_go):
    """
    Get series with number of genes associated with each
    category in gene_to_go
    """
    s = gene_to_go.set_index("gene_symbol").ix[gene_ls].groupby("go_identifier").apply(len)
    return s

def multiple_permut_assoc(rod_s, gene_df, gene_to_go, top_n, max_dist, n_runs, rnd_seed=None):
    if rnd_seed is not None:
        np.random.seed(rnd_seed)
    assoc_table = pd.concat([permut_assoc(rod_s, rnd, gene_df, gene_to_go, top_n, max_dist) for rnd in np.random.rand(n_runs)],axis=1)
    assoc_table = assoc_table.fillna(0)
    return assoc_table

def save_permut_assoc_table(assoc_table,fn):
    assoc_table.to_csv(fn)

def get_initial_rank_table(real_assoc):
    return pd.DataFrame({"n_genes":real_assoc.values,"rank":0,"out_of":0},index=real_assoc.index)

def get_p_val(rank_table):
    """
    Input:
    
    """
    r =  1-rank_table["rank"]*1./(rank_table["out_of"]+1)
    r.sort()
    return r

def update_rank(rank_table,permut_assoc_fh):
    assoc_table = pd.read_csv(permut_assoc_fh,index_col=0)
    r = assoc_table.apply(lambda row: get_rank(row,rank_table),axis=1)
    return r
    
def total_rank(rank_table, permut_fns):
    rt = rank_table.copy()
    for f in permut_fns:
        rt = update_rank(rt,permut_assoc_fh)
    return rt
    
#def save_p_value_total(rank_table, permut_fns):
    
    
def empirical_rank(value,dist):
    """
    get empirical p value of
    value with respect to list of 
    values in dist
    """
    array = np.append(value,dist)
    temp = array.argsort()
    ranks = np.empty(len(array), int)
    ranks[temp] = np.arange(len(array))
    return ranks[0]

def get_rank(series,rank_df):
    go = series.name
    try:
        go_s = rank_df.ix[go]
    except KeyError:
        go_s = pd.Series({"n_genes":0,"rank":0,"out_of":0})
    real_val = go_s["n_genes"]
    old_rank = go_s["rank"]
    old_out_of = go_s["out_of"]
    rank = empirical_rank(real_val,series.values)
    new_rank = old_rank + rank
    new_out_of = old_out_of + len(series)
    return pd.Series({"n_genes":real_val,"rank":new_rank,"out_of":new_out_of})

def get_genes_per_go(gene_ls,gene_to_go):
    s = gene_to_go.set_index("gene_symbol").ix[gene_ls].groupby("go_identifier").apply(lambda x: list(x.index))
    s.name = "genes"
    return sqst

In [1195]:
r = get_go_assoc(cand_genes,gene_to_go)
rank_table = get_initial_rank_table(r)

In [1178]:
genes_per_go = pd.Series(r0.index.values,index=r0.index)

In [1203]:
pd.concat([rank_table,gg], axis =1)

Unnamed: 0_level_0,n_genes,out_of,rank,genes
go_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GO:0000075,1,0,0,[ANAPC1]
GO:0000079,1,0,0,[HERC5]
GO:0000087,1,0,0,[ANAPC1]
GO:0000090,1,0,0,[ANAPC1]
GO:0000122,2,0,0,"[TP63, TENM2]"
GO:0000139,2,0,0,"[HS3ST4, SCARA3]"
GO:0000166,24,0,0,"[SCN8A, RAPGEF3, SYK, RFK, EPHB1, RYK, ATP13A4..."
GO:0000185,1,0,0,[GRM5]
GO:0000187,2,0,0,"[SYK, NTRK3]"
GO:0000226,1,0,0,[PARD3]


In [1202]:
gg = get_genes_per_go(cand_genes,gene_to_go)

In [927]:
r = permut_assoc(tot_sweed_df["Likelihood"], 0.1, gene_df, gene_to_go, top_n = 500, max_dist=10000)

In [934]:
real_assoc = permut_assoc(tot_sweed_df["Likelihood"], 0, gene_df, gene_to_go, top_n=1000, max_dist=10000)

In [933]:
assoc_table = multiple_permut_assoc(tot_sweed_df["Likelihood"], gene_df, gene_to_go, top_n=1000, max_dist = 10000, n_runs=100, rnd_seed=None)

In [952]:
def f(r):
    print r
    raise

In [959]:
p_vals = assoc_table.apply(lambda r: get_p_val(r,real_assoc),axis=1)

In [960]:
p_vals.sort()

In [1206]:
sab_enrich = pd.read_csv(eu("~/vervet_project/analyses/20140611_163_subpop_sweepfinder/_data/enrichment_n2000_d10000_permut50000_sab.tsv"),
                            index_col=[0],sep="\t")

In [1218]:
len(gene_to_go.set_index("go_identifier").index.diff(sab_enrich.index))

432

In [1220]:
len(sab_enrich)

14566

In [1222]:
len(gene_to_go.set_index("go_identifier").index.unique())

14998

## Development for test_enrichment.py

In [4]:
from hs_vervet.scripts import test_enrichment as te

In [10]:
from scipy import stats

In [None]:
pop = "aet"
tot_sweed_df = pd.read_csv(eu("~/vervet_project/analyses/20140611_163_subpop_sweepfinder/"
                                 "_data/tot_sweed_result_{}.tsv".format(pop)),sep="\t",index_col=[0,1])

In [79]:
def gene_likelihood(outlier_locations,mode="max"):
    """
    get genes that overlap with the multi-index of the provided data frame
    """
    modes = ["max","sum"]
    assert mode in modes
    gene_hit_df = pd.DataFrame()
    for chrom in outlier_locations.index.droplevel(1).unique():
        pos_rel_to_start = gene_df.ix[chrom].index.searchsorted(outlier_locations.ix[chrom].index).values
        pos_rel_to_end = np.searchsorted(gene_df.ix[chrom]["end"].values,outlier_locations.ix[chrom].index.values)
        in_gene = (pos_rel_to_start - pos_rel_to_end) == 1
        gene_hits = gene_df.ix[chrom].iloc[pos_rel_to_end[in_gene]]
        gene_hits['chrom'] = chrom
        gene_hits.set_index('chrom', append=True, inplace=True)
        gene_hits = gene_hits.reorder_levels(['chrom', 'pos'])
        gene_hits["likelihood"] = outlier_locations.ix[chrom][in_gene]["Likelihood"].values
        gene_hit_df = gene_hit_df.append(gene_hits)
    #if mode == "max":    
    return gene_hit_df.groupby(lambda x:x).max()
    #elif mode == "sum":
    #    return gene_hit_df.groupby(lambda x:x).sum()

In [197]:
def gene_stats(rod_s,stats=None,stat_names=None):
    """
    cacluclate statistics on the ROD series across genes
    stats default: [np.max,np.nanmean,len]
    """
    if stats == None:
        stats = [np.max,np.nanmean,len]
    def stats_apply(df):
        #name = df.index.values[0]
        c_dic = {"gene_id":df.iloc[0]["gene_id"],"end":df.iloc[0]["end"]}
        if stat_names is None:
            c_dic.update({stat.__name__:stat(df["value"]) for stat in stats})
        else:
            c_dic.update({n:stat(df["value"]) for n,stat in zip(stat_name,stats)})
        s = pd.Series(c_dic)
        return s
    gene_hit_df = pd.DataFrame()
    for chrom in rod_s.index.droplevel(1).unique():
        pos_rel_to_start = gene_df.ix[chrom].index.searchsorted(rod_s.ix[chrom].index).values
        pos_rel_to_end = np.searchsorted(gene_df.ix[chrom]["end"].values,rod_s.ix[chrom].index.values)
        in_gene = (pos_rel_to_start - pos_rel_to_end) == 1
        gene_hits = gene_df.ix[chrom].iloc[pos_rel_to_end[in_gene]]
        gene_hits['chrom'] = chrom
        gene_hits.set_index('chrom', append=True, inplace=True)
        gene_hits = gene_hits.reorder_levels(['chrom', 'pos'])
        gene_hits["value"] = rod_s.ix[chrom][in_gene].values
        gene_hits = gene_hits.groupby(lambda x:x).apply(stats_apply)
        gene_hit_df = gene_hit_df.append(gene_hits)   
    gene_hit_df.index = pd.MultiIndex.from_tuples(gene_hit_df.index)
    return gene_hit_df

In [205]:
def gene_stats_gb(rod_s,stats=None,stat_names=None):
    """
    #implement avoiding loop
    cacluclate statistics on the ROD series across genes
    stats default: [np.max,np.nanmean,len]
    """
    if stats == None:
        stats = [np.max,np.nanmean,len]
    def stats_apply(df):
        #name = df.index.values[0]
        c_dic = {"gene_id":df.iloc[0]["gene_id"],"end":df.iloc[0]["end"]}
        if stat_names is None:
            c_dic.update({stat.__name__:stat(df["value"]) for stat in stats})
        else:
            c_dic.update({n:stat(df["value"]) for n,stat in zip(stat_name,stats)})
        s = pd.Series(c_dic)
        return s
    def get_gene_hits(rod_s):
        #print rod_s.shape
        chrom = rod_s.index.values[0][0]
        #print chrom
        pos_rel_to_start = gene_df.ix[chrom].index.searchsorted(rod_s.ix[chrom].index).values
        pos_rel_to_end = np.searchsorted(gene_df.ix[chrom]["end"].values,rod_s.ix[chrom].index.values)
        in_gene = (pos_rel_to_start - pos_rel_to_end) == 1
        gene_hits = gene_df.ix[chrom].iloc[pos_rel_to_end[in_gene]]
        gene_hits["value"] = rod_s.ix[chrom][in_gene].values
        gene_hits = gene_hits.groupby(lambda x:x).apply(stats_apply)
        return gene_hits   
    gene_hit_df = rod_s.groupby(lambda x: x[0]).apply(get_gene_hits)
    return gene_hit_df

In [210]:
tot_sweed_s = tot_sweed_df["Likelihood"]

In [17]:
fn0 = open(eu("~/vervet_project/analyses/20140611_163_subpop_sweepfinder/_data/sweed_go_enrichment_q0.001_d10000_0_pys.temp.tsv"))
fn1 = open(eu("~/vervet_project/analyses/20140611_163_subpop_sweepfinder/_data/sweed_go_enrichment_q0.001_d10000_1_pys.temp.tsv"))
tot_rank = te.read_table(fn0,index_col=0)
rank_table = te.read_table(fn1,index_col=0)

In [20]:
tot_rank.index.diff(rank_table.index)

Index([u'000'], dtype='object')

In [14]:
top_n = 1000
value_s = tot_sweed_df["Likelihood"].sort(ascending=False,inplace=False)
top_s = value_s.iloc[:top_n]

In [16]:
max_dist = 10000
cand_genes = te.get_genes(top_s, gene_df, max_dist=10000)

In [19]:
real_assoc = te.get_go_assoc(cand_genes, gene_to_go)

In [21]:
init_rank_table = te.get_initial_rank_table(real_assoc)

## Enrichment according to values across gene (mean, max)...

In [7]:
from hs_vervet.tools import hs_pandas as hp
from hs_vervet.scripts import test_enrichment as te
assoc_dir = eu("~/vervet_project/data/gene_associations")

In [8]:
gene_to_cat = pd.read_csv(jn(assoc_dir,"gene_to_cat.csv"),index_col=0)

In [28]:
chrom_len_df = pd.read_csv(eu("~/vervet_project/metadata/ref3500.tsv"),
                       sep="\t",index_col=0,squeeze=True,header=False)
gene_df.sortlevel(inplace=True)

In [9]:
pop = "aet"
tot_sweed_df = pd.read_csv(eu("~/vervet_project/analyses/20140611_163_subpop_sweepfinder/"
                                 "_data/tot_sweed_result_{}.tsv".format(pop)),sep="\t",index_col=[0,1])

tot_sweed_df.index = pd.MultiIndex.from_arrays([tot_sweed_df.index.droplevel(1),tot_sweed_df.index.droplevel(0).values.astype(int)])
tot_sweed_df.index.set_names('pos',level=1,inplace=True)

In [10]:
rod_s = tot_sweed_df['Likelihood'].copy().sortlevel()#.ix[['CAE28','CAE29']]

In [448]:
def data_per_feature3(rod,feature_df):
    """
    Get the entires in rod which lie within a feature
    (e.g. gene) in feature_df.
    Input:
    rod (reference ordered data)... pandas series or data frame with multiindex (chrom, pos)
                                    such as SNP genotypes
    feature_df (gene annotation data frame)... index must be (chrom,feature_name), must have columns 'start', 'end'
    """
    rod = pd.DataFrame(rod).copy()
    feature_df = feature_df.copy()
    index = rod.reset_index().groupby('chrom').apply(lambda df: df['pos']+chrom_len_df.loc[:df['chrom'].iloc[0]].iloc[:-1].sum()).values
    rod['index'] = index
    rod.set_index(index,inplace=True)

    feature_df.loc[:,['start','end']] = feature_df.reset_index().groupby('chrom').apply(lambda df: df[['start','end']]+chrom_len_df.loc[:df['chrom'].iloc[0]].iloc[:-1].sum()).values
    feature_df.index = feature_df.index.droplevel(0)
    feature_name = feature_df.index.name

    pos_rel_to_start = feature_df['start'].searchsorted(rod.index)
    pos_rel_to_end = np.searchsorted(feature_df["end"].values,rod.index.values)
    in_feature = (pos_rel_to_start - pos_rel_to_end) == 1
    feature_id = feature_df.iloc[pos_rel_to_end[in_feature]].index
    snp_df = rod[in_feature]
    snp_df[feature_name] = feature_id
    snp_df.set_index(['gene_id','index'],inplace=True)
    return snp_df

In [15]:
def rod_chrompos_to_1d_index(rod, chrom_len_s, drop=True):
    """
    Converts a (chrom, pos) multiindex to a 
    single numeric index that runs through chromosomes.
    Note that the (chrom, pos) index is sorted lexographically,
    i.e., if chrom names are strings, the entries in the resulting
    index are Chr1, Chr10, Chr2, Chr20, Chr3,...,ChrX, ChrY.
    Example: (Chr2, 1) is converted to len(Chr1)+len(Chr10)+1.
    The inverse operation is given by rod_1d_to_chrompos_index.
    
    Input:
    rod ... series or dataframe with reference ordered data 
            with multiindex (chrom,pos)
    chrom_len_s ... series with chromosome names as keys
                    and chromosome length as values
    drop ... If False, keep chrom, pos as columns
    """
    if not chrom_len_s.index.is_monotonic:
        chrom_len_s = chrom_len_s.sort_index()
    rod = pd.DataFrame(rod).copy()
    if not rod.index.is_monotonic:
        rod.sortlevel(inplace=True)
    columns = columns = [c for c in rod.columns if c not in ['index']]
    rod.reset_index(inplace=True)
    #return rod.groupby('chrom')
    index = rod.groupby('chrom').apply(lambda df: df['pos']+chrom_len_s.loc[:df['chrom'].iloc[0]].iloc[:-1].sum()).values
    rod['index'] = index
    rod.set_index('index', inplace=True, drop=True)
    if not drop:
        columns = list(columns) + ['chrom', 'pos']
    rod.index = rod.index.values.astype(int)
    if not rod.index.is_monotonic:
        rod.sort_index(inplace=True)
    return rod[columns] if len(columns)>1 else rod[columns[0]]

In [16]:
def rod_1d_to_chrompos_index(rod_1d, chrom_len_s, drop=True):
    """
    Reverts the action of rod_chrompos_to_1d_index.
    Converts a single numeric index that runs through 
    chromosomes to a (chrom, pos) multiindex.
    Note that the single index is expected to correspond
    to (chrom, pos) sorted lexographically,
    i.e., if chrom names are strings, the entries should be in
    in the order Chr1, Chr10, Chr2, Chr20, Chr3,...,ChrX, ChrY.
    Example:  len(Chr1)+len(Chr10)+1 is converted to (Chr2, 1).
    
    Input:
    rod ... series or dataframe of reference ordered data 
            with single index running through chromosomes.
            (as produced by rod_chrompos_to_1d_index())
    chrom_len_s ... series with chromosome names as keys
                    and chromosome length as values
    drop ... If False, keep numeric index as columns
    """
    if not chrom_len_s.index.is_monotonic:
        chrom_len_s = chrom_len_s.sort_index()
    rod = pd.DataFrame(rod_1d).copy()
    if not rod.index.is_monotonic:
        rod.sort_index(inplace=True)
    columns = [c for c in rod.columns if c not in ['chrom','pos']]
    cs = chrom_len_s.cumsum()
    rod['chrom'] = np.nan
    for chrom, (start, end) in  zip(cs.index,zip([0] + list(cs.values[:-1]),cs.values)):
        end = min(end,rod.index[-1])
        rod.loc[slice(start,end),'chrom'] = chrom
        #print chrom, rod.loc[slice(start,end),columns[0]]
        rod.loc[slice(start,end),'pos'] = rod.ix[slice(start,end)].index - start
    if drop:
        rod.set_index(['chrom','pos'], inplace=True, drop=True)
    else:
        rod = rod.reset_index().set_index(['chrom','pos'], drop=True)
        columns = list(columns) + ['index']
    #if not rod.index.is_monotonic:
    #    rod.sort_index(inplace=True)

    return rod[columns] if len(columns)>1 else rod[columns[0]]

In [764]:
"""
Specifications:

rod ... Series or DataFrame of reference ordered data
index: 
    (chrom, pos)

feature_df ... Data Frame of intervals along the genome,
                equivalent of a bed file, but 1-indexed
index: 
    (chrom, start)
columns 
    required: 
        end
    optional:
        name
        ...
"""

'\nSpecifications:\n\nrod ... Series or DataFrame of reference ordered data\nindex: \n    (chrom, pos)\n\nfeature_df ... Data Frame of intervals along the genome,\n                equivalent of a bed file, but 1-indexed\nindex: \n    (chrom, start)\ncolumns \n    required: \n        end\n    optional:\n        name\n        ...\n'

In [17]:
def feature_df_to_1d_index(feature_df, chrom_len_s):
    """
    Converts mulitindex feature_df (chrom,start)
    to single numeric index running trough all 
    chromosomes. The column 'end' is also converted.
    See rod_chrompos_to_1d_index for details.
    """
    feature_df = feature_df.copy()
    feature_df.index.names = (feature_df.index.names[0], 'pos') 
    feature_df_1d = rod_chrompos_to_1d_index(feature_df, chrom_len_s,drop=False)
    #print feature_df_1d
    end_df = feature_df_1d.reset_index().set_index(['chrom','end'])
    #print end_df
    end_df.drop('pos',axis=1,inplace=True)
    end_df.rename(columns={'index':'start'}, inplace=True)
    end_df.index.names = (end_df.index.names[0], 'pos') 
    end_1d = rod_chrompos_to_1d_index(end_df, chrom_len_s)
    end_1d.index.name = 'end'
    end_1d = end_1d.reset_index().set_index('start')
    end_1d.index.name = 'index'
    if not end_1d.index.is_monotonic:
        end_1d.sort_index(inplace=True)
    return end_1d

In [22]:
feature_df_1d = feature_df_to_1d_index(gene_df, chrom_len_df)

In [18]:
def feature_df_to_chrompos_index(feature_df_1d, chrom_len_s):
    """
    Converts feature_df with single numberic index
    running through all chromosomes to 
    multiindex (chrom,start)
    The column 'end' is also converted.
    This is the inverse function of 
    feature_df_to_1d_index().
    See rod_1d_to_chrompos_index for details.
    """
    feature_df_1d = feature_df_1d.copy()
    feature_df = rod_1d_to_chrompos_index(feature_df_1d, chrom_len_s)
    feature_df.index.names = (feature_df.index.names[0], 'start') 
    end_df_1d = feature_df.reset_index().set_index('end')
    
    end_df = rod_1d_to_chrompos_index(end_df_1d, chrom_len_s)
    end_df.index.names = (end_df.index.names[0],'end')
    end_df = end_df.reset_index().set_index(['chrom','start'])
    if not end_df.index.is_monotonic:
        end_df.sortlevel(inplace=True)
    return end_df

In [23]:
feature = feature_df_to_chrompos_index(feature_df_1d, chrom_len_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,end,gene_id
chrom,start,Unnamed: 2_level_1,Unnamed: 3_level_1
CAE1,2628,4911,SCGB1C1
CAE1,4916,10379,ODF3
CAE1,14423,18444,BET1L
CAE1,18597,25398,RIC8A
CAE1,27803,48975,SIRT3
CAE1,49510,67353,PSMD13
CAE1,92213,99919,NLRP6
CAE1,102388,109337,ATHL1
CAE1,112164,113988,IFITM5
CAE1,124603,127671,LOC103241759


In [31]:
print (feature == gene_df).all()
print feature.index.equals(gene_df.index)

end        True
gene_id    True
dtype: bool
True


In [35]:
rod_1d = rod_chrompos_to_1d_index(rod_s,chrom_len_df)

In [36]:
rod = rod_1d_to_chrompos_index(rod_1d,chrom_len_df)

In [37]:
print (rod == rod_s).all()
print rod.index.equals(rod_s.index)

True
True


In [33]:
def data_per_feature_SI_2(rod, feature_df, feature_name = 'name'):
    """
    Get the entires in rod which lie within a feature
    (e.g. gene) in feature_df.
    Input:
    rod (reference ordered data)... pandas series or data frame with multiindex (chrom, pos)
                                    such as SNP genotypes
    feature_df (gene annotation data frame)... index must be (chrom,feature_name), must have columns 'start', 'end'
    """
    rod = pd.DataFrame(rod)
    pos_rel_to_start = feature_df.index.searchsorted(rod.index)
    #feature_df["end"] is not necessarily sorted, but wouldn't sorting it here
    #lead to problems  as well?
    pos_rel_to_end = np.searchsorted(feature_df["end"].values,rod.index.values)
    in_feature = (pos_rel_to_start - pos_rel_to_end) == 1
    feature_id = feature_df.iloc[pos_rel_to_end[in_feature]][feature_name].values  
    rod = rod[in_feature]
    rod[feature_name] = feature_id
    return rod

In [86]:
dpf_SI = data_per_feature_SI_2(rod_1d, feature_df_1d, feature_name='gene_id')
dpf_SI = rod_1d_to_chrompos_index(dpf_SI, chrom_len_df)
#dpf_SI = dpf_SI.reset_index().set_index(['chrom','gene_id','pos'])

In [93]:
dpf_SI = rod_1d_to_chrompos_index(dpf_SI, chrom_len_df)

In [90]:
r.sortlevel(inplace=True)

In [94]:
(dpf_SI==r).all()

Likelihood    True
gene_id       True
dtype: bool

In [39]:
dpf_classic.sortlevel(level=[0,2],inplace=True)

In [40]:
dpf_SI.sortlevel(level=[0,2],inplace=True)

In [41]:
print (dpf_SI == dpf_classic).all()
print dpf_SI.index.equals(dpf_classic.index)

Likelihood    True
dtype: bool
True


In [42]:
%timeit data_per_feature_SI_2(rod_1d, feature_df_1d, feature_name='gene_id')

1 loops, best of 3: 1.46 s per loop


In [345]:
def data_per_feature2(rod,feature_df):
    rod = pd.DataFrame(rod)
    def gene_values(gene,sl):
        gene_values = data.ix[sl].copy()
        gene_values['gene'] = gene
        gene_values.set_index('gene',append=True,inplace=True)
        return gene_values
    for chrom in rod.index.droplevel(1).unique():
        data = rod.ix[chrom]
        genes = feature_df.ix[chrom]
        slices = genes.apply(lambda gene: slice(gene['start'], gene['end']), axis=1)
        return pd.concat([gene_values(*sl) for sl in slices.iteritems()])

In [253]:
def get_features(peak_s, feature_df, max_dist):
    """
    take the input series and gets.
    names of features nearby

    Input:
    peak_s ... pandas series with (chrom, pos) index and value of
                the statistic ('peak height'). Series should be named.
    feature_df ... data frame with feature info.
    """
    all_features = []
    if not feature_df.index.is_monotonic:
        feature_df = feature_df.sort_index()
    tot_hit_df = pd.DataFrame()
    for chrom in peak_s.index.droplevel(1).unique():
        loc_feature_df = feature_df.ix[chrom]
        #loc_feature_df = loc_feature_df.append(pd.DataFrame(np.nan,index=[np.inf],columns=loc_feature_df.columns))
        pos_rel_to_start = np.searchsorted(loc_feature_df['start']-max_dist,peak_s.ix[chrom].index.values)
        pos_rel_to_end = np.searchsorted(loc_feature_df["end"].values+max_dist,peak_s.ix[chrom].index.values)
        #return pos_rel_to_start,pos_rel_to_end
        #print [range(a,b) for a,b in zip(pos_rel_to_end,pos_rel_to_start)]
        #features = list(set(loc_feature_df.index[np.hstack([range(a,b) for a,b in zip(pos_rel_to_end,pos_rel_to_start)])]))
        features = list(set(loc_feature_df["feature_id"].iloc[np.hstack([range(a,b) for a,b in zip(pos_rel_to_end,pos_rel_to_start)])]))
        #features = list(set(loc_feature_df["feature_id"].iloc[np.vstack([pos_rel_to_start,pos_rel_to_end]).T]))
        all_features += features
    return all_features

In [57]:
def get_summary_per_feature(value_s,feature_df,summary=None,summary_func=None):
    """
    summary ... name of the function of the groupby object
                         to apply to the data (e.g. 'mean', 'max',...)
    summary_func ... alternatively: function object to apply
                     This is slower than applying a summary string.
    """
    values_per_feature = hp.data_per_feature(value_s,feature_df)
    if summary is not None:
        summary_per_feature = hp.apply_to_feature(values_per_feature,summary)
    elif summary_func is not None:
        summary_per_feature = hp.apply_to_feature(values_per_feature,function=summary_func)
    else:
        raise ValueError("Either summary or summary_func must be given.")
    return summary_per_feature

In [118]:
def get_summary_per_category(value_per_feature,feature_to_category,summary='mean'):
    """
    Calculates summary (e.g. mean) of values for the features
    in each of the given categories.
    """
    feature_to_category = feature_to_category.copy()
    values_per_feature_to_category = value_per_feature.ix[feature_to_category['gene_symbol'].values].values
    feature_to_category['value'] =  values_per_feature_to_category
    del feature_to_category['gene_symbol']
    summary_per_category = getattr(feature_to_category.groupby('go_identifier'),summary)()
    return summary_per_category['value'] 

In [59]:
spf = get_summary_per_feature(rod_s,gene_df,summary='max')

In [122]:
spc = get_summary_per_category(spf,gene_to_cat,'mean')

In [123]:
init_rank_table = te.get_initial_rank_table(spc)

In [14]:
from hs_vervet.scripts import test_enrichment_dev as ted

In [158]:
del gene_to_cat['value']

In [549]:
gene_to_cat.rename(columns={'gene_symbol':'symbol'}, inplace=True)

In [580]:
se = ted.SummaryEnrichment(rod_s, gene_df, gene_to_cat, feature_summary='max',category_summary='sum', feature_name='symbol',
                               category_name='go_identifier')

INFO:root:Removing categories for which there are less than 2 features in the features file.
INFO:root:4451 categories removed.


In [581]:
%timeit se.permute(10)

ValueError: cannot label index with a null key

In [556]:
from hs_vervet.scripts import test_enrichment_dev2 as ted2

In [579]:
%autoreload

In [571]:
se2 = ted2.SummaryEnrichment(rod_s, gene_df, gene_to_cat, feature_summary='max',category_summary='sum', feature_name='symbol',
                               category_name='go_identifier')

INFO:root:Removing categories for which there are less than 2 features in the features file.
INFO:root:4451 categories removed.


In [572]:
%timeit se2.permute(10)

1 loops, best of 3: 56.9 s per loop


In [37]:
gene_to_cat.rename(columns={'gene_symbol':'gene_id'},inplace=True)

In [44]:
gene_to_cat['gene_id'].unique().shape

(83596,)

In [218]:
def top_genes(values_per_feature,quantile=None,top_n=None):
    """
    reverse engineering to get top quantile per gene category
    """
    if top_n is None:
        top_n = int(len(means)*quantile)
    groups = values_per_feature.groupby(lambda idx: idx[1])
    means = groups.mean()
    sort_means = means.sort(ascending=False)
    sort_means.iloc[:top_n] = 1
    sort_means.iloc[top_n:] = 0
    return sort_means

In [219]:
se1 = ted.SummaryEnrichment(rod_s, gene_df, gene_to_cat, feature_summary_fun=feature_summary_fun,category_summary='sum')

In [109]:
s = te.shift_rod(rod_s, rnd)
spf0 = get_summary_per_feature(s,gene_df,summary='max')
spc0 = get_summary_per_category(spf0,gene_to_cat,'median')

In [110]:
spc0 = spc0['value']

In [115]:
(init_rank_table["n_genes"] > \
                spc0.reindex(init_rank_table.index).fillna(0))

category
GO:0000002    False
GO:0000003    False
GO:0000009    False
GO:0000010    False
GO:0000012    False
GO:0000014    False
GO:0000015    False
GO:0000016     True
GO:0000018    False
GO:0000019    False
GO:0000022    False
GO:0000023     True
GO:0000026    False
GO:0000027    False
GO:0000028     True
...
hiv155030.0       False
hiv155348.0       False
hiv155459.0       False
hiv155807.0       False
hiv155871.0       False
hiv155908.0       False
hiv155945.0       False
hiv155971.0       False
hiv156110.0       False
hiv19424028.0     False
hs_hivprot        False
hs_hivprotphen    False
hs_hivrep         False
hs_hivrepphen     False
hs_sivcand        False
Length: 15013, dtype: bool

In [148]:
n = 100000

In [149]:
%%timeit 
for i in np.random.rand(n):
    pass

100 loops, best of 3: 9.37 ms per loop


In [152]:
%%timeit 
for _ in range(n):
    pass
    #rnd = np.random.rand()

100 loops, best of 3: 3.36 ms per loop


In [125]:
def multiple_permut_summary(rod_s, init_rank_table, gene_df, gene_to_cat, n_runs):
    for rnd in np.random.rand(n_runs):
        s = te.shift_rod(rod_s, rnd)
        spf0 = get_summary_per_feature(s,gene_df,summary='max')
        spc0 = get_summary_per_category(spf0,gene_to_cat,'mean')
        init_rank_table["rank"] += (init_rank_table["n_genes"] > \
                                        spc0.reindex(init_rank_table.index).fillna(0))
        init_rank_table["out_of"] += 1
    return init_rank_table

In [None]:
rt = multiple_permut_summary(rod_s, init_rank_table, gene_df, gene_to_cat, 500)

In [131]:
init_rank_table.sort('rank',ascending=False)

Unnamed: 0_level_0,n_genes,out_of,rank
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GO:0071412,17.441990,500,500
GO:0008281,23.258225,500,500
GO:0005333,27.544410,500,500
GO:0090038,15.423555,500,500
GO:0009331,12.012225,500,500
GO:0005334,27.230980,500,500
GO:0032009,10.860521,500,500
GO:0008504,13.227751,500,500
GO:0061092,48.617600,500,500
GO:0015844,10.112849,500,500


In [68]:
s = gene_to_cat.groupby('go_identifier').apply(len)
s.name = 'n_genes'

In [138]:
pd.DataFrame(gene_to_cat.iloc[:,:2].values.values,columns=["hallo",'kuh'])

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [257]:
data = np.array(range(50))

In [281]:
data = pd.DataFrame(range(0,50),index=range(0,100,2))

In [293]:
genes = pd.DataFrame({'start':[2,21],'end':[9,26]},index=['gene1','gene2'])

In [283]:
data[slice(i,j)] for i,j in ngenes['start'].values,genes['end'].values

SyntaxError: invalid syntax (<ipython-input-283-a81fcb2143c2>, line 1)

In [296]:
[data[data.index.slice_indexer(j['start'],j['end'])] for _,j in genes.iterrows()]

[   0
 2  1
 4  2
 6  3
 8  4,      0
 22  11
 24  12
 26  13]

In [308]:
%timeit data[data.index.slice_indexer(2, 9)]

10000 loops, best of 3: 129 µs per loop


In [309]:
%timeit data.ix[slice(2, 9, None)]

10000 loops, best of 3: 134 µs per loop


In [343]:
def gene_values(gene,sl):
    gene_values = data.ix[sl].copy()
    gene_values['gene'] = gene
    gene_values.set_index('gene',append=True,inplace=True)
    return gene_values
slices = genes.apply(lambda gene:slice(gene['start'],gene['end']),axis=1)
pd.concat([gene_values(*sl) for sl in slices.iteritems()])

In [148]:
gene_df.rename(columns={'name':'feature'},inplace=True)

In [115]:
se = ted.SummaryEnrichment(rod_s, gene_df, gene_to_cat, feature_summary='max',category_summary='mean')

In [118]:
se.real_assoc()

In [120]:
se.permute(10)

In [138]:
%timeit rod_s.groupby(lambda x: x[0]).apply(len)

10 loops, best of 3: 24.2 ms per loop


In [139]:
%timeit rod_s.groupby(lambda x: x[0]).mean()

10 loops, best of 3: 21.9 ms per loop


In [125]:
se2 = ted.SummaryEnrichment(rod_s, gene_df, gene_to_cat, feature_summary='max',category_summary='mean',chrom_len_s = chrom_len_df)

In [126]:
se2.real_assoc()

In [130]:
(se2.rank_table['value'].dropna().sort(inplace=False) == se.rank_table['value'].dropna().sort(inplace=False)).all()

True

In [133]:
%timeit se2.permute(10)

1 loops, best of 3: 10.2 s per loop


In [24]:
gene_to_cat2.columns

Index([u'gene_id', u'go_identifier'], dtype='object')

In [22]:
gene_to_cat2.rename(columns={'gene_symbol':'gene_id'},inplace=True)

In [38]:
te = ted.TopScoresEnrichment(rod_s, gene_df, gene_to_cat2, top_type='count',top=1000,feature_name='gene_id',category_name='go_identifier',max_dist=20000)

INFO:root:Removing categories for which there are less than 2 features in the features file.
INFO:root:0 categories removed.


In [39]:
te.create_info()

In [44]:
dir(te)

['__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_bind_feature_to_category',
 'ascending',
 'candidate_features',
 'category_name',
 'create_info',
 'feature_df',
 'feature_name',
 'feature_to_category',
 'get_association',
 'get_permut_rank_table',
 'get_pvals',
 'get_summary_per_category',
 'get_summary_per_feature',
 'init_rank_table',
 'initital_rank_table',
 'max_dist',
 'ncpus',
 'peaks_per_feature',
 'permute',
 'permuter',
 'prune_feature_to_category',
 'rank_table',
 'summary_per_feature',
 'top_n',
 'top_peaks',
 'value_name',
 'value_s']

In [156]:
te.permute(10)

In [17]:
t = ted.get_peaks(gene_df,top_s,max_dist=0,feature_name='gene_id')

In [69]:
ted.peaks_per_gene['peak_height'].groupby(lambda i:i[0]).max()

AttributeError: 'SummaryEnrichment' object has no attribute 'peaks_per_gene'

In [59]:
candidates = t.index.droplevel([1,2]).values

In [62]:
x = ted.CandidateEnrichment(candidates,gene_to_cat,feature_name='gene_id', category_name='go_identifier', feature_df=gene_df)



In [64]:
x.create_info()

In [65]:
x.summary_per_feature

Unnamed: 0,candidate,zscore
HEPHL1,True,12.358675
GAS2,True,12.358675
GRIK4,True,12.358675
CCDC50,True,12.358675
TENM2,True,12.358675
UTP23,True,12.358675
HS3ST4,True,12.358675
RYK,True,12.358675
SLCO2A1,True,12.358675
USP13,True,12.358675


In [19]:
ppg = ted.get_peak_info(top_s,t)

In [26]:
gene_df.reset_index().set_index('gene_id').reset_index().set_index(['chrom','start'])

Unnamed: 0_level_0,Unnamed: 1_level_0,gene_id,end
chrom,start,Unnamed: 2_level_1,Unnamed: 3_level_1
CAE1,2628,SCGB1C1,4911
CAE1,4916,ODF3,10379
CAE1,14423,BET1L,18444
CAE1,18597,RIC8A,25398
CAE1,27803,SIRT3,48975
CAE1,49510,PSMD13,67353
CAE1,92213,NLRP6,99919
CAE1,102388,ATHL1,109337
CAE1,112164,IFITM5,113988
CAE1,124603,LOC103241759,127671


## Debug get info in TopScoresEnrichment

In [48]:
ballet_s = pd.read_csv('/home/GMI/hannes.svardal/vervet_project/analyses/20150429_ballet/_data/163_HC_ballet_pyn_all_chrom_T1_out.tsv',sep='\t',squeeze=True,index_col=[0,1])

In [82]:
feature_to_cat = pd.read_csv('/home/GMI/hannes.svardal/vervet_project/data/gene_associations/gene_to_cat_release100_gt2.tsv',sep='\t')

In [91]:
feature_to_cat.rename(columns={'gene_symbol':'symbol'},inplace=True)

In [96]:
gene_to_cat2.rename(columns={'gene_id':'symbol'},inplace=True)

In [115]:
gene_df = load_gene_df().rename(columns={'gene_id':'symbol'})

In [88]:
feature_df = pd.read_csv('/home/GMI/hannes.svardal/vervet_project/metadata/annot_release100_genes.tsv',sep='\t',index_col=[0,1],usecols=['chrom','start','end','symbol'])

In [214]:
te = ted.TopScoresEnrichment(ballet_s, feature_df, feature_to_cat, top_type='quantile',top=0.1,feature_name='symbol',category_name='go_identifier',max_dist=20000)

INFO:root:Removing categories for which there are less than 2 features in the features file.
INFO:root:0 categories removed.


In [62]:
%autoreload 0

In [558]:
%autoreload

In [132]:
from hs_vervet.tools import hs_pandas as hp

In [199]:
print len(gene_df['symbol']), len(gene_df['symbol'].unique())

20447 20447


In [200]:
print len(feature_df['symbol']), len(feature_df['symbol'].unique())

34451 34142


In [133]:
value_s = te.value_s.sort(ascending=te.ascending, inplace=False)
top_s = value_s.iloc[:te.top_n]
te.candidate_features = hp.get_features(top_s, te.feature_df, feature_name=te.feature_name,
                                                                             max_dist=te.max_dist)
sub_feature_df = te.feature_df.reset_index().set_index(te.feature_name).ix[te.candidate_features]\
                                                        .reset_index().set_index(['chrom','start'])

In [139]:
max_dist = te.max_dist
feature_name=te.feature_name
sub_gene_df = sub_feature_df

In [201]:
chrom = 'CAE17'
loc_top_s = top_s.ix[chrom].copy()
loc_top_s = top_s.ix[chrom]
gene_info = sub_gene_df

In [186]:
start = np.searchsorted(loc_top_s.index.values+max_dist,gene_info.ix[chrom].index.values)
end = np.searchsorted(loc_top_s.index.values-max_dist,gene_info.ix[chrom]["end"].values)

In [204]:
gene_info.ix[chrom][feature_name].unique().shape

(433,)

In [205]:
gene_info.ix[chrom][feature_name].shape

(677,)

In [213]:
feature_df.drop_duplicates(subset='symbol',inplace=True)

In [208]:
x = pd.concat([loc_top_s.iloc[st:ed] for st,ed in zip(start,end)],
                          keys=gene_info.ix[chrom][feature_name].unique())

In [168]:
ted.get_peaks(sub_gene_df,top_s,max_dist,feature_name)

chrom  pos     
CAE14  19513384    766.994361
       19512798    761.695477
       19513153    761.140567
       19504790    744.792114
CAE12  16800241    741.098915
       15749913    740.452204
       15750098    739.203645
       15750141    738.324430
       16800255    733.141917
       15749757    731.422516
CAE14  19498574    730.691665
CAE12  15749085    727.137981
CAE14  19496176    724.857386
CAE12  16856121    723.638600
CAE14  19495727    723.608426
...
CAE7   121589268   -114.272755
CAE8   16289750    -114.272787
CAE18  34187896    -114.272807
CAE12  71996467    -114.272830
CAE21  101512762   -114.272839
CAE1   70144833    -114.272876
CAE15  58845450    -114.272880
CAE3   87375087    -114.272895
CAE21  9246170     -114.272901
CAE3   11006858    -114.272911
CAE18  6991506     -114.272962
CAE3   90281382    -114.273000
CAE10  90734247    -114.273038
CAE21  57090405    -114.273103
CAE6   48647963    -114.273199
Name: score, Length: 1487592, dtype: float64
                    

ValueError: Wrong number of items passed 608816, placement implies 101634988

In [221]:
gene_to_cat2[gene_to_cat2["go_identifier"].apply(lambda x: "hiv" in x)]#.set_index("go_identifier").ix['hiv_protphen']

Unnamed: 0,symbol,go_identifier
1020483,TP53,hs_hivprot
1020484,APOE,hs_hivprot
1020485,TNF,hs_hivprot
1020486,EGFR,hs_hivprot
1020487,VEGFA,hs_hivprot
1020488,IL6,hs_hivprot
1020489,TGFB1,hs_hivprot
1020490,ACE,hs_hivprot
1020491,ESR1,hs_hivprot
1020492,APP,hs_hivprot


In [119]:
np.__version__

'1.9.1'

In [79]:
te.peaks_per_feature.xs('CAE16',level=1).index.droplevel(1).unique().shape

(470,)

In [128]:
((gene_df['end'].iloc[:-1].values - gene_df.reset_index()['start'].iloc[1:].values)>0).sum()

1162

In [127]:
((feature_df['end'].iloc[:-1].values - feature_df.reset_index()['start'].iloc[1:].values)>0).sum()

5769

In [235]:
d = {i:[gene_df.iloc[int(i % len(gene_df))]] for i in range(10000)}

In [230]:
import sys

In [236]:
sys.getsizeof(d)/1000000.

0.786712

In [245]:
m = np.random.randint(0,1,(20000,20000))

In [246]:
sys.getsizeof(m)

80

In [329]:
trod_s = pd.Series(0,index=pd.MultiIndex.from_arrays([['CAE1']*20,np.arange(20)]))

In [330]:
trod_s.index.names = ['chrom','pos']
trod_s.name = 'score'

chrom  pos
CAE1   0      0
       1      0
       2      0
       3      0
       4      0
       5      0
       6      0
       7      0
       8      0
       9      0
       10     0
       11     0
       12     0
       13     0
       14     0
       15     0
       16     0
       17     0
       18     0
       19     0
dtype: int64

In [331]:
tgene_df = gene_df.iloc[:3]
tgene_df.index = pd.MultiIndex.from_arrays([['CAE1']*3,[2,9,11]])
tgene_df.loc[:,'end'] = [7,18,17]
tgene_df.index.names = ['chrom','pos']

In [268]:
tgene_df

Unnamed: 0_level_0,Unnamed: 1_level_0,end,symbol
chrom,pos,Unnamed: 2_level_1,Unnamed: 3_level_1
CAE1,2,7,SCGB1C1
CAE1,9,18,ODF3
CAE1,11,17,BET1L


In [264]:
from hs_vervet.tools import hs_pandas as hpd

In [272]:
hpd.data_per_feature(trod_s, tgene_df, feature_name='symbol')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,symbol
chrom,pos,Unnamed: 2_level_1,Unnamed: 3_level_1
CAE1,3,0,SCGB1C1
CAE1,4,0,SCGB1C1
CAE1,5,0,SCGB1C1
CAE1,6,0,SCGB1C1
CAE1,7,0,SCGB1C1
CAE1,10,0,ODF3
CAE1,11,0,ODF3


In [18]:
def get_features_per_data(peak_s, feature_df, feature_name='feature', max_dist=0):
    """
    take the input data series and gets a similar series
    with one entry per pair data-point gene
    (i.e., there can be 0,1 or more entries per data point)
    
    """
    all_features = []
    if not feature_df.index.is_monotonic:
        feature_df = feature_df.sort_index()
    tot_hit_df = pd.DataFrame()
    for chrom in peak_s.index.droplevel(1).unique():
        loc_feature_df = feature_df.ix[chrom]
        #loc_feature_df = loc_feature_df.append(pd.DataFrame(np.nan,index=[np.inf],columns=loc_feature_df.columns))
        #print loc_feature_df.index-max_dist, peak_s.ix[chrom].index.values
        #try:
        pos_rel_to_start = np.searchsorted(loc_feature_df.index.values-max_dist,peak_s.ix[chrom].index.values)
        #except:
        #    print chrom, peak_s.ix[chrom]
        pos_rel_to_end = np.searchsorted(loc_feature_df["end"].values+max_dist,peak_s.ix[chrom].index.values)
        features_per_datapoint = (pos_rel_to_start - pos_rel_to_end)
        print chrom, features_per_datapoint.shape
        data_idx = [i for i in range(len(features_per_datapoint)) for j in range(features_per_datapoint[i])]
        features = loc_feature_df[feature_name].iloc[np.hstack([range(a,b) for a,b in zip(pos_rel_to_end,pos_rel_to_start)])].values
        data_df = pd.DataFrame(peak_s.ix[chrom].iloc[data_idx])
        data_df[feature_name] = features
        data_df['chrom'] = chrom
        all_features.append(data_df)
    fpd=pd.concat(all_features)    
    #fpd.set_index(['chrom'],append=True,inplace=True)
    #fpd = dpf.reorder_levels(['chrom','pos'])
    g = fpd.reset_index().groupby(['chrom','pos'])
    def get_series_of_features(gdf):
        features = gdf[feature_name].unique()
        r = pd.Series({i:s for i,s in enumerate(features)})
        df = pd.DataFrame({feature_name:r,peak_s.name:gdf[peak_s.name].values[0]})
        return df
    d = g.apply(get_series_of_features)
    d.index.names = ['chrom','pos','number']
    return  d

In [22]:
def get_features(peak_s, feature_df, feature_name='feature', max_dist=0):
    """
    take the input series and gets.
    names of features nearby

    Input:
    peak_s ... pandas series with (chrom, pos) index and value of
                the statistic ('peak height'). Series should be named.
    feature_df ... data frame with feature info.
    """
    all_features = []
    if not feature_df.index.is_monotonic:
        feature_df = feature_df.sort_index()
    tot_hit_df = pd.DataFrame()
    for chrom in peak_s.index.droplevel(1).unique():
        loc_feature_df = feature_df.ix[chrom]
        #loc_feature_df = loc_feature_df.append(pd.DataFrame(np.nan,index=[np.inf],columns=loc_feature_df.columns))
        #print loc_feature_df.index-max_dist, peak_s.ix[chrom].index.values
        #try:
        pos_rel_to_start = np.searchsorted(loc_feature_df.index.values-max_dist,peak_s.ix[chrom].index.values)
        #except:
        #    print chrom, peak_s.ix[chrom]
        pos_rel_to_end = np.searchsorted(loc_feature_df["end"].values+max_dist,peak_s.ix[chrom].index.values)
        features = list(set(loc_feature_df[feature_name].iloc[np.hstack([range(a,b) for a,b in zip(pos_rel_to_end,pos_rel_to_start)])]))
        all_features += features
    return all_features

In [502]:
def shift_rod(rod_df, rnd, mode = "grid"):
    """
    shift reference ordered data across the whole genome

    Input:
    rod_df ... pandas dataframe or series with mulitiindex (chrom, pos)

    modes ...
        'grid' ... just rotate the index of the rod data frame
                    this means that the positions stay the same only the
                    value for each position becomes different
                    Faster, but means that you only hit the same grid-point
                    this should make it conservative on large grids. Large
                    grids are problematic if the fraction of top windows.
                    considered becomes large.
        'continuous' ... add the random shift to each index value.
                         NOT IMPLEMENTED
    """
    if mode == "grid":
        new_start_i = int(len(rod_df)*rnd)
        rotate_data = np.concatenate((rod_df.iloc[new_start_i:].values,rod_df.iloc[:new_start_i].values))
        if  isinstance(rod_df,pd.core.series.Series):
            r = pd.Series(rotate_data,index=rod_df.index)
            return r
        elif isinstance(rod_df,pd.core.frame.DataFrame):
            r = pd.DataFrame(rotate_data,index=rod_df.index,columns=rod_df.columns)
            return r
    else:
        raise UserException("Only mode grid supported.")

In [None]:
def features_per_data_to_data_per_features(fpd, feature_name='features'):
    return fpd.reset_index().set_index(feature_name).sort_index()

In [17]:
rod_s0 = rod_s.iloc[:88230]

In [19]:
r = get_features_per_data(rod_s0, gene_df, feature_name='gene_id', max_dist=10000)

CAE1 (88230,)


In [508]:
r2.index = r2.index.droplevel(2)

In [540]:
srod_s = shift_rod(rod_s,rnd=np.random.rand())
srod_s.name = 'Likelihood'

In [541]:
shifted_r2 = r2.drop('Likelihood',axis=1).join(srod_s)

In [448]:
top500 = rod_s.sort(inplace=False,ascending=False).iloc[:3000]

In [423]:
r2 = r.copy()

In [424]:
r2 = r2.reset_index().set_index(['chrom','pos'])

In [449]:
in_peaks = r2.ix[top500.index].dropna()

In [450]:
in_peaks.index = pd.MultiIndex.from_tuples(in_peaks.index)

In [None]:
in_peaks.

In [453]:
in_peaks.sortlevel(inplace=True)

In [476]:
get_features_per_data(trod_s, tgene_df, feature_name='symbol', max_dist=0).reset_index().set_index('symbol').sort_index()

Unnamed: 0_level_0,chrom,pos,level_2,score
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BET1L,CAE1,12,1,0
BET1L,CAE1,13,1,0
BET1L,CAE1,14,1,0
BET1L,CAE1,15,1,0
BET1L,CAE1,16,1,0
BET1L,CAE1,17,1,0
ODF3,CAE1,10,0,0
ODF3,CAE1,11,0,0
ODF3,CAE1,12,0,0
ODF3,CAE1,13,0,0


In [414]:
in_peaks.reset_index().groupby(['chrom','pos'])

<pandas.core.groupby.DataFrameGroupBy object at 0x54c61ed0>

In [350]:
def get_series_of_features(gdf):
    features = gdf['symbol'].unique()
    r = [s for i,s in enumerate(features)]
    return r

In [351]:
d = g.apply(get_series_of_features)

In [352]:
d

(CAE1, 3)         [SCGB1C1]
(CAE1, 4)         [SCGB1C1]
(CAE1, 5)         [SCGB1C1]
(CAE1, 6)         [SCGB1C1]
(CAE1, 7)         [SCGB1C1]
(CAE1, 10)           [ODF3]
(CAE1, 11)           [ODF3]
(CAE1, 12)    [ODF3, BET1L]
(CAE1, 13)    [ODF3, BET1L]
(CAE1, 14)    [ODF3, BET1L]
(CAE1, 15)    [ODF3, BET1L]
(CAE1, 16)    [ODF3, BET1L]
(CAE1, 17)    [ODF3, BET1L]
dtype: object

In [None]:
def 

In [353]:
g2 = fpd.reset_index().groupby(['chrom','pos'])

In [362]:
def get_series_of_features2(gdf):
    features = gdf['symbol'].unique()
    r = pd.Series({i:s for i,s in enumerate(features)})
    return r

In [363]:
d = g2.apply(get_series_of_features2)
d

chrom  pos   
CAE1   3    0    SCGB1C1
       4    0    SCGB1C1
       5    0    SCGB1C1
       6    0    SCGB1C1
       7    0    SCGB1C1
       10   0       ODF3
       11   0       ODF3
       12   0       ODF3
            1      BET1L
       13   0       ODF3
            1      BET1L
       14   0       ODF3
            1      BET1L
       15   0       ODF3
            1      BET1L
       16   0       ODF3
            1      BET1L
       17   0       ODF3
            1      BET1L
dtype: object

In [375]:
d.index.names

FrozenList([u'chrom', u'pos', None])

In [477]:
hpd.data_per_feature(trod_s, tgene_df, feature_name='symbol')

Unnamed: 0_level_0,Unnamed: 1_level_0,score,symbol
chrom,pos,Unnamed: 2_level_1,Unnamed: 3_level_1
CAE1,3,0,SCGB1C1
CAE1,4,0,SCGB1C1
CAE1,5,0,SCGB1C1
CAE1,6,0,SCGB1C1
CAE1,7,0,SCGB1C1
CAE1,10,0,ODF3
CAE1,11,0,ODF3


In [295]:
trod_s.iloc[d]

chrom  pos
CAE1   0      0
       1      0
       2      0
       3      0
       0      0
       1      0
       2      0
       3      0
       0      0
       1      0
       2      0
       3      0
dtype: int64

In [46]:
fpdp = np.random.randint(0,10,10000)
d = [i for i in range(len(fpdp)) for j in range(fpdp[i]) ]

In [47]:
fpdp

array([2, 2, 1, 4, 5, 4, 8, 1, 4, 1, 0, 0, 1, 6, 4, 7, 6, 2, 4, 4])

In [31]:
for i in range(4):
    for j in range(fpdp[i]):
        print i

0
1
1
3
3
3


In [38]:
[[i]*j for i,j in zip(range(len(fpdp)),fpdp)]

[[0], [1, 1], [], [3, 3, 3]]

In [28]:
%timeit x = get_features(rod_s, gene_df, feature_name= 'gene_id')

1 loops, best of 3: 31.6 s per loop
