In [None]:
import pandas as pd
import numpy as np
from chrom_windows import window

In [2]:
# Toy interval table: a single interval on chr1 followed by ten on chr2.
data = pd.DataFrame(dict(
    chrom=['chr1'] + 10 * ['chr2'],
    start=[*range(10), 40],
    # The first ten intervals alternate between width 5 and width 1;
    # the last interval (40-45) sits apart from the rest.
    end=[begin + width for begin, width in zip(range(10), 5 * [5, 1])] + [45],
    species=5 * ['human'] + 6 * ['chimp'],
    run=range(0, 110, 10),
    analysis=np.linspace(3, 7, 11),
))

data

Unnamed: 0,chrom,start,end,species,run,analysis
0,chr1,0,5,human,0,3.0
1,chr2,1,2,human,10,3.4
2,chr2,2,7,human,20,3.8
3,chr2,3,4,human,30,4.2
4,chr2,4,9,human,40,4.6
5,chr2,5,6,chimp,50,5.0
6,chr2,6,11,chimp,60,5.4
7,chr2,7,8,chimp,70,5.8
8,chr2,8,13,chimp,80,6.2
9,chr2,9,10,chimp,90,6.6


Make a function `interval_count` that is called on the intervals in windows of size 5. Note that the `window` decorator only handles a single chromosome so you always need to group your data by chromosome:

In [3]:
@window(size=5)
def interval_count(df):
    """Return the number of rows in `df`.

    Wrapped with `window(size=5)`, so it is evaluated on windows of size 5.
    """
    return df.shape[0]

# `window` handles one chromosome at a time, so split by chromosome first.
df = (
    data
    .groupby('chrom')
    .apply(interval_count)
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,start,end,interval_count
chrom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chr1,0,0,5,1
chr2,0,0,5,4
chr2,1,5,10,7
chr2,2,10,15,2
chr2,3,15,20,0
chr2,4,20,25,0
chr2,5,25,30,0
chr2,6,30,35,0
chr2,7,35,40,0
chr2,8,40,45,1


You can get rid of the extra index like this:

In [4]:
# Discard the innermost index level and keep `chrom` as the index.
df.reset_index(level=-1, drop=True)

Unnamed: 0_level_0,start,end,interval_count
chrom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chr1,0,5,1
chr2,0,5,4
chr2,5,10,7
chr2,10,15,2
chr2,15,20,0
chr2,20,25,0
chr2,25,30,0
chr2,30,35,0
chr2,35,40,0
chr2,40,45,1


You can further convert the index to columns like this:

In [5]:
(
    df
    .reset_index(level=-1, drop=True)  # discard the innermost index level
    .reset_index()                     # turn the remaining index into a column
)

Unnamed: 0,chrom,start,end,interval_count
0,chr1,0,5,1
1,chr2,0,5,4
2,chr2,5,10,7
3,chr2,10,15,2
4,chr2,15,20,0
5,chr2,20,25,0
6,chr2,25,30,0
7,chr2,30,35,0
8,chr2,35,40,0
9,chr2,40,45,1


You can group by more than just the chromosome if you like:

In [6]:
# Same windowed count, but computed separately per (chrom, species) group.
(
    data
    .groupby(['chrom', 'species'])
    .apply(interval_count)
    .reset_index(level=-1, drop=True)
    .reset_index()
)

Unnamed: 0,chrom,species,start,end,interval_count
0,chr1,human,0,5,1
1,chr2,chimp,0,5,0
2,chr2,chimp,5,10,5
3,chr2,chimp,10,15,2
4,chr2,chimp,15,20,0
5,chr2,chimp,20,25,0
6,chr2,chimp,25,30,0
7,chr2,chimp,30,35,0
8,chr2,chimp,35,40,0
9,chr2,chimp,40,45,1


You can use the `even` keyword to put approximately the same number of intervals in each window (to the extent that this is possible):

In [7]:
# NOTE(review): the text introducing this cell talks about the `even` keyword,
# but only `size` is passed to the decorator here - confirm whether `even=`
# was meant to be used.
@window(size=10)
def interval_sum(df):
    """Return the summed length (`end - start`) over all rows of `df`."""
    lengths = df.end - df.start
    return lengths.sum()

(
    data
    .groupby('chrom')
    .apply(interval_sum)
    .reset_index(level=-1, drop=True)
    .reset_index()
)

Unnamed: 0,chrom,start,end,interval_sum
0,chr1,0,10,5
1,chr2,0,10,21
2,chr2,10,20,4
3,chr2,20,30,0
4,chr2,30,40,0
5,chr2,40,50,5


You can return any number of values from your function. Just do so as a Series or a dictionary:

In [8]:
@window(size=10)
def multiple_stats(df):
    """Return the column sums of `analysis` and `run` as a Series."""
    summed_columns = ['analysis', 'run']
    return df[summed_columns].sum()

(
    data
    .groupby(['chrom'])
    .apply(multiple_stats)
    .reset_index(level=-1, drop=True)
    .reset_index()
)

Unnamed: 0,chrom,start,end,analysis,run
0,chr1,0,10,3.0,0.0
1,chr2,0,10,45.0,450.0
2,chr2,10,20,11.6,140.0
3,chr2,20,30,0.0,0.0
4,chr2,30,40,0.0,0.0
5,chr2,40,50,7.0,100.0


In [9]:
# NOTE(review): this re-uses the name `multiple_stats` from the previous cell
# and therefore shadows that definition.
@window(size=10)
def multiple_stats(df):
    """Return several statistics for `df` as a dictionary.

    Keys: `tot_length` (sum of `end - start`), `interval_count` (number of
    rows) and `mean_length` (mean of `end - start`).
    """
    lengths = df.end - df.start
    return {
        'tot_length': lengths.sum(),
        'interval_count': len(df),
        'mean_length': lengths.mean(),
    }
    
(
    data
    .groupby(['chrom'])
    .apply(multiple_stats)
    .reset_index(level=-1, drop=True)
    .reset_index()
)

Unnamed: 0,chrom,start,end,tot_length,interval_count,mean_length
0,chr1,0,10,5,1,5.0
1,chr2,0,10,21,9,2.333333
2,chr2,10,20,4,2,2.0
3,chr2,20,30,0,0,
4,chr2,30,40,0,0,
5,chr2,40,50,5,1,5.0


In [10]:
# NOTE(review): the meaning of `empty=True` and `fill='hg19'` is not explained
# in this notebook; the displayed result has windows running up to 300000000
# on both chromosomes - confirm the semantics against the chrom_windows docs.
@window(size=100000000, empty=True, fill='hg19')
def count1(df):
    """Return the number of rows in `df`."""
    return df.shape[0]

(
    data
    .groupby('chrom')
    .apply(count1)
    .reset_index(level=-1, drop=True)
    .reset_index()
)

Unnamed: 0,chrom,start,end,count1
0,chr1,0,100000000,1
1,chr1,100000000,200000000,0
2,chr1,200000000,300000000,0
3,chr2,0,100000000,10
4,chr2,100000000,200000000,0
5,chr2,200000000,300000000,0


Use the `logbase` argument to make windows increase logarithmically with the specified base, starting from `size`. Useful if the density of intervals decreases with distance (e.g. relative to some annotation).

In [14]:
@window(size=2, logbase=2)
def count2(df):
    """Return the number of rows in `df`.

    Decorated with `size=2, logbase=2`: the first window has size 2 and the
    windows then grow logarithmically with base 2.
    """
    return df.shape[0]

(
    data
    .groupby('chrom')
    .apply(count2)
    .reset_index(level=-1, drop=True)
    .reset_index()
)

Unnamed: 0,chrom,start,end,count2
0,chr1,0.0,2.0,1
1,chr1,2.0,6.0,1
2,chr2,0.0,2.0,1
3,chr2,2.0,6.0,4
4,chr2,6.0,14.0,6
5,chr2,14.0,30.0,0
6,chr2,30.0,62.0,1


If you get fed up with adding `.reset_index(drop=True, level=-1).reset_index()`, you can make your own reset_index helper to pipe it through:

In [15]:
def reset_group_index(df):
    """Drop the innermost index level of `df` and move the rest into columns.

    Shorthand for `.reset_index(drop=True, level=-1).reset_index()`, meant to
    be used with `DataFrame.pipe`.
    """
    without_inner_level = df.reset_index(level=-1, drop=True)
    return without_inner_level.reset_index()

In [16]:
@window(size=10)
def count(df):
    """Return the number of rows in `df`."""
    return df.shape[0]
    
(
    data
    .groupby(['chrom'])
    .apply(count)
    .pipe(reset_group_index)
)

Unnamed: 0,chrom,start,end,count
0,chr1,0,10,1
1,chr2,0,10,9
2,chr2,10,20,2
3,chr2,20,30,0
4,chr2,30,40,0
5,chr2,40,50,1
