In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format='svg'

from genominterv.decorators import bootstrap
from genominterv.stats import proximity_stat, jaccard_stat
import geneinfo as gi
from geneinfo.utils import GeneList as glist

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
from collections import OrderedDict

def plot_intervals(kwargs):
    """Draw named interval tracks as horizontal bars, one panel per chromosome.

    Parameters
    ----------
    kwargs : dict
        Maps a track label to a DataFrame with 'chrom', 'start' and 'end'
        columns. Two optional keys are read as options rather than tracks:
        'vlines' (x positions of thin vertical guide lines drawn on every
        panel) and 'figsize' (forwarded to ``plt.subplots``; defaults to
        ``(8, 1.5*n_tracks - 1)``). Neither the dict nor the DataFrames are
        modified.

    Returns
    -------
    list or numpy.ndarray
        One matplotlib Axes per chromosome, ordered by first appearance in
        the stacked tracks (a one-element list when there is one chromosome).

    Raises
    ------
    ValueError
        If no interval track is supplied.
    """
    # Copy before popping the options so the caller's dict is left intact.
    tracks = dict(kwargs)
    vlines = tracks.pop('vlines', [])
    figsize = tracks.pop('figsize', (8, 1.5*len(tracks)-1))

    if not tracks:
        raise ValueError('plot_intervals needs at least one interval track')

    # Local import keeps the function usable when copied out of this notebook.
    import matplotlib.pyplot as plt

    # Reversed so that the first track given ends up at the top of each panel.
    labels = list(tracks)[::-1]

    # assign() returns a labelled copy instead of adding a column to the
    # caller's DataFrame.
    bigdf = pd.concat([tracks[label].assign(label=label) for label in labels])
    chroms = list(bigdf['chrom'].dropna().unique())

    _, axes = plt.subplots(len(chroms), 1, figsize=figsize, sharey=True)
    if not isinstance(axes, np.ndarray):
        # A single panel comes back as a bare Axes rather than an array.
        axes = [axes]

    for i, (ax, chrom) in enumerate(zip(axes, chroms)):
        chrom_df = bigdf.loc[bigdf['chrom'] == chrom]
        # y positions follow `labels`, so bars always line up with the tick
        # labels even when a track has no intervals on this chromosome.
        for y, label in enumerate(labels):
            track = chrom_df.loc[chrom_df['label'] == label]
            if track.empty:
                continue
            ax.hlines(np.repeat(y, track.index.size),
                      track.start.tolist(), track.end.tolist(),
                      lw=10, colors=f'C{y}', capstyle='butt')

        for side in ('top', 'left', 'right'):
            ax.spines[side].set_visible(False)

        ax.set_yticks(list(range(len(labels))), labels)
        ax.tick_params(axis='y', which='both', left=False)
        ax.set_ylim(-1, len(labels)-0.7)
        if i != len(chroms)-1:
            # Only the bottom panel keeps its x tick marks.
            ax.tick_params(axis='x', which='both', bottom=False)

        ax.set_title(chrom, loc='left', fontsize=10)

    if len(vlines):
        # Draw the guide lines on every panel, spanning its full y range.
        for ax in axes:
            ax.vlines(vlines, *ax.get_ylim(), lw=0.1, colors='black', zorder=0)

    plt.tight_layout()
    return axes

def stairs(df, start='start', end='end', pos='pos', endtrim=0):
    """Duplicate every interval into two rows so it can be plotted as a step.

    Each input row appears twice in the result: once with `pos` set to its
    `start` value and once with `pos` set to `end - endtrim`. The input frame
    is not modified. Rows are returned sorted by (`start`, `end`).
    """
    left_edges = df.copy(deep=True)
    left_edges[pos] = left_edges[start]

    right_edges = df.copy(deep=True)
    right_edges[pos] = right_edges[end] - endtrim

    stacked = pd.concat([left_edges, right_edges])
    return stacked.sort_values([start, end])

In [3]:
# Chromosome name -> length lookup. It is handed to the `bootstrap` decorator
# for the tests defined below, and its 'chrX' entry is the upper bound used
# when edge intervals are clamped later in the notebook.
# NOTE(review): values are presumably rheMac10 chromosome lengths in bp (the
# clamping line is commented "rheMac10 chrX length") — confirm the assembly.
chrom_sizes = {
'chr1': 223616942,
'chr2': 196197964,
'chr5': 187317192,
'chr3': 185288947,
'chr6': 179085566,
'chr4': 169963040,
'chr7': 169868564,
'chrX': 153388924,
'chr8': 145679320,
'chr9': 134124166,
'chr11': 133066086,
'chr12': 130043856,
'chr14': 128056306,
'chr15': 113283604,
'chr13': 108737130,
'chr10': 99517758,
'chr17': 95433459,
'chr16': 79627064,
'chr20': 77137495,
'chr18': 74474043,
'chr19': 58315233,
'chrY': 11753682,
}

@bootstrap(chrom_sizes)
def proximity_test(q, a):
    """Return ``proximity_stat(q, a)``; wrapped by ``bootstrap(chrom_sizes)``.

    Where this notebook calls the decorated function, the result is unpacked
    as ``stat, p``.
    NOTE(review): presumably `q` and `a` are query/annotation interval frames
    and the decorator supplies a resampling-based p-value — confirm against
    the genominterv documentation.
    """
    return proximity_stat(q, a)


@bootstrap(chrom_sizes)
def jaccard_test(q, a):
    """Return ``jaccard_stat(q, a)``; wrapped by ``bootstrap(chrom_sizes)``.

    NOTE(review): presumably `q` and `a` are query/annotation interval frames
    and the decorated call returns a (statistic, p-value) pair like
    `proximity_test` — confirm against the genominterv documentation.
    """
    return jaccard_stat(q, a)


def overlaps(df1, df2):
    """
    Establishes whether each query segment overlaps at least one
    annotation segment. Returns a boolean array with same length
    as df1.index.

    Only the `start` and `end` columns are compared (chromosome is not
    considered). Two segments overlap when each one ends strictly after the
    other starts, so segments that merely touch do not overlap.

    The result always has dtype bool, also for an empty `df1`, so it can be
    negated with `~` and used as a row mask. The comparison builds a
    len(df1) x len(df2) boolean matrix.
    """
    # Column vectors for the query, row vectors for the annotation: the
    # comparisons broadcast to one cell per (query, annotation) pair.
    q_start = np.asarray(df1.start)[:, None]
    q_end = np.asarray(df1.end)[:, None]
    a_start = np.asarray(df2.start)[None, :]
    a_end = np.asarray(df2.end)[None, :]

    pair_hits = (q_end > a_start) & (a_end > q_start)
    return pair_hits.any(axis=1)

In [4]:
# Headerless, tab-separated table: one labelled segment per row.
high_olive_rhemac10 = pd.read_csv(
    'lift/rheMac10/high_olive_rhemac10.bed',
    sep='\t',
    header=None,
    names=['label', 'chrom', 'start', 'end'],
)
high_olive_rhemac10.head()

Unnamed: 0,label,chrom,start,end
0,high_olive,chrX,0,2500000
1,high_olive,chrX,7342552,8077091
2,high_olive,chrX,9206099,12116298
3,high_olive,chrX,15613277,15711368
4,high_olive,chrX,16315102,16640934


In [14]:
# Width added to the right of every edge position (0 gives zero-width edges).
flank = 0

# Every segment boundary: all starts followed by all ends.
olive_edges = np.concatenate((high_olive_rhemac10.start, high_olive_rhemac10.end))

# One interval per boundary, clamped to the chromosome.
olive_edge_1bp = pd.DataFrame({'start': olive_edges, 'end': olive_edges + flank})
olive_edge_1bp['start'] = olive_edge_1bp['start'].clip(lower=0)
olive_edge_1bp['end'] = olive_edge_1bp['end'].clip(upper=chrom_sizes['chrX'])  # rheMac10 chrX length
olive_edge_1bp['chrom'] = 'chrX'
olive_edge_1bp = olive_edge_1bp.sort_values(by=['start', 'end'])
olive_edge_1bp.head()

Unnamed: 0,start,end,chrom
0,0,0,chrX
34,2500000,2500000,chrX
1,7342552,7342552,chrX
35,8077091,8077091,chrX
2,9206099,9206099,chrX


## Call compartments and define edges

In [9]:
def parse_compartment_data(file_name, chrom='chrX', binsize=100_000):
    """Call A/B compartments from a per-bin E1 table and list their edges.

    Parameters
    ----------
    file_name : str or path
        CSV with an `e1` column. Row k is treated as the bin covering
        [k*binsize, (k+1)*binsize), so rows must be consecutive bins.
    chrom : str, default 'chrX'
        Value written to the `chrom` column of the result.
    binsize : int, default 100_000
        Bin width used to derive the coordinates.

    Returns
    -------
    compartments : pandas.DataFrame
        One row per run of consecutive bins with the same E1 sign, excluding
        runs without an E1 value. Columns include `e1_mean`, `e1_sum`,
        `start`, `end`, `segment_id`, `sign`, `comp` ('A' when the sign is
        positive, otherwise 'B') and `chrom`.
    compartment_edges : numpy.ndarray of int
        Sorted unique start/end coordinates of the compartments. A boundary
        is left out when the neighbouring segment on that side has no E1
        value, because it marks missing data rather than a compartment
        switch. NaN is never included.
    """
    bins = pd.read_csv(file_name)
    bins['start'] = np.arange(bins.index.size) * binsize
    bins['end'] = bins.start + binsize
    bins['sign'] = np.sign(bins.e1)
    # A new segment starts wherever the sign changes. NaN never equals NaN,
    # so every bin without an E1 value becomes a segment of its own.
    bins['segment_id'] = (bins.sign.shift() != bins.sign).cumsum()

    comp = bins.groupby('segment_id', as_index=False).agg(dict(
         e1=['mean', 'sum'],
         start='min',
         end='max',
         segment_id='mean',
         sign='mean'
    ))
    # Flatten the (column, aggregation) MultiIndex into single names.
    comp.columns = ['_'.join(col).strip() for col in comp.columns.values]
    comp = comp.rename(
        columns={'start_min':'start',
                 'end_max':'end',
                 'segment_id_mean':'segment_id',
                 'sign_mean':'sign'}
    )
    comp['comp'] = ['A' if x > 0 else 'B' for x in comp.sign]
    comp = comp.reset_index()
    comp['chrom'] = chrom

    # Neighbour look-ups by shifting the mask. fill_value=False means the
    # outermost segments have no missing neighbour beyond the data, and it
    # lets the first and last segments be checked like all the others.
    missing = comp.e1_mean.isna()
    prev_missing = missing.shift(1, fill_value=False)
    next_missing = missing.shift(-1, fill_value=False)
    edge_starts = comp.loc[~missing & ~prev_missing, 'start'].to_numpy()
    edge_ends = comp.loc[~missing & ~next_missing, 'end'].to_numpy()
    # np.unique both sorts and de-duplicates the shared boundaries.
    compartment_edges = np.unique(np.concatenate([edge_starts, edge_ends])).astype(int)

    compartments = comp.loc[~missing].copy()
    compartments['start'] = compartments.start.astype(int)
    compartments['end'] = compartments.end.astype(int)

    return compartments, compartment_edges

def edge_segments(compartment_edges, flank, chrom='chrX'):
    """Turn edge positions into intervals [edge, edge + flank].

    Parameters
    ----------
    compartment_edges : array-like
        Edge coordinates. Missing values (NaN) are dropped, so they never
        become interval rows with undefined coordinates.
    flank : int
        Width added to each edge position to form the interval end.
    chrom : str, default 'chrX'
        Value written to the `chrom` column.

    Returns
    -------
    pandas.DataFrame
        Columns `start`, `end`, `chrom`, one row per non-missing edge.
    """
    # asarray lets plain lists through; list + int would otherwise fail.
    edges = np.asarray(compartment_edges)
    edges = edges[~pd.isna(edges)]
    compartment_edge_segm = pd.DataFrame({'start': edges, 'end': edges + flank})
    compartment_edge_segm['chrom'] = chrom
    return compartment_edge_segm

In [15]:
# Call compartments from the E1 table and turn each edge into an interval
# of width `flank`.
compartments, compartment_edges = parse_compartment_data(
    'rec_compartments/pachytene_spermatocyte_e1_100kb_10Mb_smoothed.csv'
)
compartment_edge_1bp = edge_segments(compartment_edges, flank=flank)

query, annot = olive_edge_1bp, compartment_edge_1bp

# Only query edges that do not overlap an annotation edge enter the test.
stat, p = proximity_test(
    query.loc[~overlaps(query, annot)],
    annot,
)

In [16]:
# Display the `stat, p` pair returned by `proximity_test` in the cell above.
stat, p

(0.2953089823359197, 0.05655)

In [13]:
# NOTE(review): same display as the previous cell, but with an earlier
# execution count (In [13]) and a slightly different output — presumably left
# over from an earlier run; consider removing it or re-running in order.
stat, p

(0.29531001381430666, 0.05638)