# Bug 

In [2]:
#This is a bug in the Python interface; _source and _target do not work properly
#on undirected graphs. As a workaround, use g.incident(x) to obtain the IDs of
#the edges incident on vertex x -- this can then be used to subset g.es:
    
#    Using g.es[g.incident(idx)] solved

##
# bin format chr1:x1-x2
# corresponding bed format chr1    x1    x2

In [1]:
import pandas as pd
import os
import numpy as np
#import glob
from pybedtools import BedTool

from igraph import *
#import matplotlib
#matplotlib.use('AGG')
import matplotlib.pyplot as plt
#import seaborn as sns
#sns.set_style("white") 

# display plots in this notebook
%matplotlib inline
from scipy import stats

from scipy.stats import wilcoxon

In [2]:
def convert_loops_to_graph_bp(df_loops):
    ## loop format: ['#chr1', 'x1', 'x2', 'chr2', 'y1', 'y2', 'GeneID']
    df_bins = Loops_Return_two_bins_no_dup(df_loops)
    df_bins['name'] = df_bins['#chr1'].astype(str)+':'+df_bins['x1'].astype(str)+'-'+df_bins['x2'].astype(str)
    Num_vs = len(df_bins.index)
    ## Initiation a graph from loops file 

    graph = Graph()
    graph.add_vertices(Num_vs)
    graph.vs["name"] = df_bins['name']

    df_edge = df_loops.merge(df_bins, on=['#chr1', 'x1', 'x2']).merge(
        df_bins, left_on=['chr2', 'y1', 'y2'], right_on=['#chr1', 'x1', 'x2']).loc[:,['index_x','index_y']]

    graph.add_edges(df_edge.values)
    return graph

def convert_loops_to_graph(df_loops, weight_col, _extra_edge_col):
    ## loop format: ['#chr1', 'x1', 'x2', 'chr2', 'y1', 'y2', 'GeneID', 'weight_cols']
    df_bins = Loops_Return_two_bins_no_dup(df_loops)
    df_bins['name'] = df_bins['#chr1'].astype(str)+':'+df_bins['x1'].astype(str)+'-'+df_bins['x2'].astype(str)
    Num_vs = len(df_bins.index)
    ## Initiation a graph from loops file 

    graph = Graph()
    graph.add_vertices(Num_vs)
    graph.vs["name"] = df_bins.loc[:,'name']
    

    df_edge = df_loops.merge(df_bins, on=['#chr1', 'x1', 'x2']).merge(
        df_bins, left_on=['chr2', 'y1', 'y2'], right_on=['#chr1', 'x1', 'x2'])
    graph.add_edges(df_edge.loc[:, ['index_x','index_y']].values)
    if weight_col:
        graph.es["weight"] = df_edge.loc[:,weight_col].values
    if _extra_edge_col:
        graph.es[_extra_edge_col] = df_edge.loc[:,_extra_edge_col].values
    return graph



def Loops_Return_two_bins_no_dup(df_hic):
    ## Associated by promoter
    second_bin_columns = [3,4,5,0,1,2]+list(range(6,len(df_hic.columns),1))
    df_hic=df_hic.append(pd.DataFrame(df_hic.iloc[:, second_bin_columns].values, columns=df_hic.columns),sort=False).sort_index()
    return df_hic.iloc[:,0:3].drop_duplicates().reset_index().drop('index',axis=1).reset_index()

def convert_cluster2bed(df_cluster, usecol):
    df_tem = df_cluster[usecol].str.split(r"\:|-",expand=True)
    df_tem = pd.concat( [df_tem, df_cluster], axis=1)
    if (df_tem.iloc[0,0].find('chr') == -1):
        df_tem[0] = 'chr'+df_tem[0]
    return df_tem

def convert_bin2bed(df_cluster, col_name):
    df_tem = df_cluster[col_name].str.split(r"\:|-",expand=True)
    df_tem = pd.concat( [df_tem, df_cluster], axis=1)
    if (df_tem.iloc[0,0].find('chr') == -1):
        df_tem[0] = 'chr'+df_tem[0]
    return df_tem

def convert_vs2bed(input_graph, col_name):
    ## output first 3 columns is standard bed format
    df_tem = pd.DataFrame(data={col_name:input_graph.vs[col_name]})
    df_tem = pd.concat( [df_tem[col_name].str.split(r"\:|-",expand=True),df_tem], axis=1)
    if (df_tem.iloc[0,0].find('chr') == -1):
        df_tem[0] = 'chr'+df_tem[0]
    return df_tem

def convert_graph_vs_to_df(_input_graph):
    df_vs = pd.DataFrame(data= {"degree":_input_graph.degree()})
    for col in _input_graph.vs.attributes():
        df_vs[col] = _input_graph.vs[col]

    return df_vs

def graph_community_multilevel_Blondel(input_graph, cutoff):
    ## input graph should have at least one attribute: name
    df_vs = convert_graph_vs_to_df(input_graph)
    _col_vs_name='name'
    if (input_graph.is_weighted()):
        print ("Weighted Graph Cluster")
        structure = input_graph.community_multilevel(weights=input_graph.es['weight'] ,return_levels=False)
    else:
        structure = input_graph.community_multilevel(return_levels=False)
    df_vs['membership'] = structure.membership
    df_vs_cluster_group = df_vs.groupby('membership')
    
    ## Rank each cluster by number of bins
    cluster_name=[]
    cluster_num_vertices=[]
    for df_vs_cluster in df_vs_cluster_group:
        df_vs_inside_cluster = Cluster_Filter_by_Denisty(df_vs_cluster[1], _col_vs_name, 'degree', cutoff)
        #df_vs_inside_cluster =df_vs_cluster[1]
        df_cluster_coordiante = df_vs_inside_cluster[_col_vs_name].str.split(r"\:|-",expand=True)
        cluster_coordinate = 'chr'+df_cluster_coordiante.iloc[0,0]+':'+str(df_cluster_coordiante.iloc[:,1].astype(int).min())+'-'+str(df_cluster_coordiante.iloc[:,2].astype(int).max())
        cluster_name.append(cluster_coordinate) ##0: cluster name
        cluster_num_vertices.append(len(df_vs_inside_cluster)) # 1: num_vertices
    
    df_cluster_output = pd.DataFrame(data={'hub_name':cluster_name,'Num_vertices':cluster_num_vertices}).sort_values('Num_vertices', ascending=False)
    return df_cluster_output, df_vs_cluster_group

def Graph_Pagerank(_input_graph):
    _input_graph.vs['pagerank'] = _input_graph.pagerank(weights=_input_graph.es['weight'])
    return _input_graph

def Cluster_Filter_by_Denisty(_df_vs_cluster, _col_name, _core_col, _cutoff):
    ## Linear Denisty Threshold, 1 edge at least 1 anchor
    cutoff=_cutoff#0.5
    df_tem = _df_vs_cluster
    col_name='name'
    _core_col='pagerank'
    resolution=10000
    df_tem[col_name].str.split(r"\:|-",expand=True)
    df_tem = pd.concat( [df_tem[col_name].str.split(r"\:|-",expand=True),df_tem], axis=1)

    ## Define highest degree as summit
    #num_core = int(len(df_tem)/3)+1
    num_core = 1
    core = df_tem.nlargest(int(num_core), _core_col).iloc[:,1].astype(int).mean()
    #core = df_tem.nlargest(5, _core_col).iloc[:,1].mean()
    df_tem['density'] = df_tem['degree'].astype(float)/(abs(df_tem.iloc[:,1].astype(float)-float(core))**2)*resolution**2

    return df_tem[df_tem['density']>cutoff]



def graph_community_multilevel_Blondel_diff_level(input_graph, cutoff):
    ## input graph should have at least one attribute: name
    df_vs = convert_graph_vs_to_df(input_graph)
    _col_vs_name='name'
    
    if (input_graph.is_weighted()):
        print ("Weighted Graph Cluster")
        structure = input_graph.community_multilevel(weights=input_graph.es['weight'], return_levels=True)
    else:
        structure = input_graph.community_multilevel(return_levels=True)
    
    for tem_level in structure:
        print (tem_level.summary())
    df_vs['membership'] = structure[0].membership
    df_vs_cluster_group = df_vs.groupby('membership')
    
    ## Rank each cluster by number of bins
    cluster_name=[]
    cluster_num_vertices=[]
    for df_vs_cluster in df_vs_cluster_group:
        df_vs_inside_cluster = Cluster_Filter_by_Denisty(df_vs_cluster[1], _col_vs_name, 'degree', cutoff)
        if (len(df_vs_inside_cluster)>0):
            df_cluster_coordiante = df_vs_inside_cluster[_col_vs_name].str.split(r"\:|-",expand=True)
            #print (df_cluster_coordiante)
            cluster_coordinate = 'chr'+df_cluster_coordiante.iloc[0,0]+':'+str(df_cluster_coordiante.iloc[:,1].astype(int).min())+'-'+str(df_cluster_coordiante.iloc[:,2].astype(int).max())
            cluster_name.append(cluster_coordinate) ##0: cluster name
            cluster_num_vertices.append(len(df_vs_inside_cluster)) # 1: num_vertices
    
    df_cluster_output = pd.DataFrame(data={'hub_name':cluster_name,'Num_vertices':cluster_num_vertices}).sort_values('Num_vertices', ascending=False)
    return df_cluster_output, df_vs_cluster_group


def graph_community_multilevel_Blondel_diff_level_promoter(input_graph, cutoff):
    ## input graph should have at least one attribute: name
    df_vs = convert_graph_vs_to_df(input_graph)
    _col_vs_name='name'
    
    if (input_graph.is_weighted()):
        print ("Weighted Graph Cluster")
        structure = input_graph.community_multilevel(weights=input_graph.es['weight'], return_levels=True)
    else:
        structure = input_graph.community_multilevel(return_levels=True)
    
    for tem_level in structure:
        print (tem_level.summary())
    df_vs['membership'] = structure[0].membership
    df_vs_cluster_group = df_vs.groupby('membership')
    
    ## Rank each cluster by number of bins
    cluster_summary = []
    for df_vs_cluster in df_vs_cluster_group:
        df_cluster = df_vs_cluster[1] 
        if( len(df_cluster[df_cluster['Promoter']!=0])>0):
            for promoter_id in df_cluster[df_cluster['Promoter']!=0]['Promoter_gene_id'].unique():#[0]
                #promoter_id = df_cluster[df_cluster['Promoter']!=0]['Promoter_gene_id'].unique()[0]
                #print(promoter_id)
                if (promoter_id=='Myb'):
                    df_test_out = df_cluster
                df_vs_inside_cluster, cluster_coordinate = Cluster_Filter_by_Denisty_Promoter(df_cluster, _col_vs_name, promoter_id, cutoff)            
                cluster_summary.append( [cluster_coordinate, len(df_vs_inside_cluster), promoter_id])
                
    
    df_cluster_output = pd.DataFrame(data=cluster_summary, columns=['hub_name','Num_vertices','Promoter']).sort_values('Num_vertices', ascending=False)
    return df_cluster_output, df_vs_cluster_group, df_test_out

def Cluster_Filter_by_Denisty_Promoter(_df_vs_cluster, _col_name, _promoter_id, _cutoff):
    ## Linear Denisty Threshold, 1 edge at least 1 anchor
    cutoff=_cutoff#0.5
    df_tem = _df_vs_cluster
    col_name='name'
    promoter_id = _promoter_id
    resolution=10000
    df_tem[col_name].str.split(r"\:|-",expand=True)
    df_tem = pd.concat( [df_tem[col_name].str.split(r"\:|-",expand=True),df_tem], axis=1)
    
    ## Define Target Promoter as core
    core = df_tem[df_tem['Promoter_gene_id']==promoter_id].iloc[:,1:3].astype(int).sum(axis=1)/2
    #print (core)

    df_tem['density'] = df_tem['degree'].astype(float)/(abs(df_tem.iloc[:,1].astype(float)-float(core))**2)*resolution**2
    df_filtered_cluster_elements = df_tem[df_tem['density']>cutoff]
    
    df_cluster_coordiante = df_filtered_cluster_elements[col_name].str.split(r"\:|-",expand=True)
    
    cluster_coordinate = 'chr'+df_cluster_coordiante.iloc[0,0]+':'+str(df_cluster_coordiante.iloc[:,1].astype(int).min())+'-'+str(df_cluster_coordiante.iloc[:,2].astype(int).max())
    
    
    return df_filtered_cluster_elements, cluster_coordinate

In [3]:
def display_graph_vertex(input_graph, vs_idx_set):
    # Input graph and Input vertex index set
    for vs in graph_processed.vs.select(vs_idx_set):
        print ( "vs_idx:"+ str(vs.index)+ ' '+ str(vs.attributes()))
    return None
def display_graph_edge(input_graph, vs_idx_set):
    # Input graph and Input vertex index set
    for vs_idx in vs_idx_set:
        edges_from_vs = input_graph.es[input_graph.incident(vs_idx)]
        for es in edges_from_vs:
            print ( "es_idx:"+ str(es.index)+ ' '+ str(es.tuple))
    return None

def annotate_graph_with_feature_values_new(_input_graph, graph_name_col2bed, path_feature, feature_name, _feature_score, norm_factor=1.0):
    input_graph = _input_graph
    name_col2bed = graph_name_col2bed ## Default "name"
    Vs_Attrs_Name = feature_name ## such as 'Tcf1'
    if ( Vs_Attrs_Name not in input_graph.vs.attributes()):
        ## Convert vs to bed format in order to annotate
        df_vs_bed = convert_vs2bed(input_graph, name_col2bed)
        ### df_vs_bed to be annoted
        df_vs_bed.iloc[:,0]='chr'+ df_vs_bed.iloc[:,0].astype(str)
        Feature_vs = BedTool.from_dataframe(df_vs_bed).sort()

        PATH_Feature_A = path_feature ##
        df_A = pd.read_csv(PATH_Feature_A, sep="\t")
        Feature_A = BedTool.from_dataframe(df_A).sort()

        ## annotate A in vs
        Feature_vs_with_A = Feature_vs.intersect(Feature_A, wb=True, F=0.3) ## 30% maybe enough

        if (len(Feature_vs_with_A)>0):
            df_vs_with_A=pd.read_csv(Feature_vs_with_A.fn, sep="\t", names=df_vs_bed.columns.append(df_A.columns).values, header=None)
        else:
            df_vs_with_A=pd.DataFrame(columns=df_vs_bed.columns.append(df_A.columns))
        
        
        vs_score = _feature_score  ## 'such as logFC'
        vs_attrs_score = feature_name+'_'+vs_score
        input_graph.vs[Vs_Attrs_Name]=0
        input_graph.vs[vs_attrs_score]=0

        for df_vs in df_vs_with_A.groupby(name_col2bed): ### Default Define vertex attribute "name"
            input_graph.vs.select(name=df_vs[0])[Vs_Attrs_Name] = df_vs[1].shape[0]
            ### max Tcf1 binding
            if ( type(df_vs[1].loc[:,vs_score].head(1).values[0]) == str):
                input_graph.vs.select(name=df_vs[0])[vs_attrs_score] = df_vs[1].loc[:,vs_score].max()
            else:
                #print(df_vs[1].loc[:,vs_score])
                if (df_vs[1].shape[0]==1):
                    input_graph.vs.select(name=df_vs[0])[vs_attrs_score] = df_vs[1].loc[:,vs_score].values[0]/norm_factor
                else:
                    List_Feature = list(df_vs[1].loc[:,vs_score].values)
                    input_graph.vs.select(name=df_vs[0])[vs_attrs_score] = List2Str(List_Feature, norm_factor)#multiple save as str

        print ("Annotate " + Vs_Attrs_Name + " is finished.")
    else: 
        print ("Feature of " + Vs_Attrs_Name + " is already annoated. Skip.")
    
    return input_graph

def annotate_graph_with_feature_values(_input_graph, graph_name_col2bed, path_feature, feature_name, _feature_score):
    input_graph = _input_graph
    name_col2bed = graph_name_col2bed ## Default "name"
    Vs_Attrs_Name = feature_name ## such as 'Tcf1'
    if (Vs_Attrs_Name not in _input_graph.vs.attributes()):
        ## Convert vs to bed format in order to annotate
        df_vs_bed = convert_vs2bed(input_graph, name_col2bed)
        ### df_vs_bed to be annoted
        Feature_vs = BedTool.from_dataframe(df_vs_bed).sort()

        PATH_Feature_A = path_feature ##
        df_A = pd.read_csv(PATH_Feature_A, sep="\t")
        Feature_A = BedTool.from_dataframe(df_A).sort()

        ## annotate A in vs
        Feature_vs_with_A = Feature_vs.intersect(Feature_A, wb=True, F=0.3)

        if (len(Feature_vs_with_A)>0):
            df_vs_with_A=pd.read_csv(Feature_vs_with_A.fn, sep="\t", names=df_vs_bed.columns.append(df_A.columns).values, header=None)
        else:
            df_vs_with_A=pd.DataFrame(columns=df_vs_bed.columns.append(df_A.columns))
        
        
        vs_score = _feature_score  ## 'such as logFC'
        vs_attrs_score = Vs_Attrs_Name+'_'+vs_score
        input_graph.vs[Vs_Attrs_Name]=0
        input_graph.vs[vs_attrs_score]=0
        for df_vs in df_vs_with_A.groupby(name_col2bed): ### Default Define vertex attribute "name"
            input_graph.vs.select(name=df_vs[0])[Vs_Attrs_Name] = df_vs[1].shape[0]
            ### max Tcf1 binding
            if ( type(df_vs[1].loc[:,vs_score].head(1).values[0]) == str):
                input_graph.vs.select(name=df_vs[0])[vs_attrs_score] = df_vs[1].loc[:,vs_score].max()
            else:
                #print(df_vs[1].loc[:,vs_score])
                input_graph.vs.select(name=df_vs[0])[vs_attrs_score] = df_vs[1].loc[:,vs_score].mean()
        print ("Annotate " + Vs_Attrs_Name + " is finished.")
    else: 
        print ("Feature of " + Vs_Attrs_Name + " is already annoated. Skip.")
    
    return input_graph

def Return_Graph_of_Gene(_input_graph, _gene, _search_depth):
    search_depth=_search_depth
    graph_input = _input_graph
    gene_request = _gene
    vertex_set=set()
    if ( len(graph_input.vs.select(Promoter_gene_id=gene_request))>0 ):
        vertex_set.add(graph_input.vs.select(Promoter_gene_id=gene_request)[0].index)
        final_subgrapph=None
        for i in range(search_depth):
            for vertex_index in list(vertex_set):
                graph_select_edges = graph_input.es[graph_input.incident(vertex_index)]
                for edge_in_graph in graph_select_edges:
                    vertex_set.add(edge_in_graph.tuple[0])
                    vertex_set.add(edge_in_graph.tuple[1])

            #print (vertex_set)
            final_subgrapph = graph_input.induced_subgraph(vertex_set)
    else:
        print ('Gene Not Included!')
        final_subgrapph=None
    
    return final_subgrapph

def Visualization_Graph_bp(_sub_graph_test, _plot_name):
    ### Mark Promoter in graph with gene_id
    sub_graph_test = _sub_graph_test
    #sub_graph_test.vs.select(Promoter_gt=0)['label'] = sub_graph_test.vs.select(Promoter_gt=0)['Promoter_gene_id']

    #color_dict = {"Tcf1":"red", "None": "gray"}
    #shape_dict ={'Promoter': 'circular', 'Enhancer':'arrow-up', 'Enhancer_Down':'arrow-down',
    #'Enhancer_Up':'arrow-up', 'None':'hidden'} 

## Default
    sub_graph_test.vs["color"] = "gray"
    #sub_graph_test.vs["shape"] = "hidden"

    # high priority will be put in the end
    if ( 'K27ac' in sub_graph_test.vs.attributes()):
        sub_graph_test.vs.select(K27ac_gt=0)["shape"] = 'diamond'
        #sub_graph_test.vs.select(K27ac_gt=0)['label'] = ["{:.2f}".format(x) for x in sub_graph_test.vs.select(K27ac_gt=0)['K27ac_logFC']]
    
    ## DNase Priority 2
    if ( 'DNase' in sub_graph_test.vs.attributes()):
        sub_graph_test.vs.select(DNase_FC_logFC_gt=0)["shape"] = 'arrow-up'
        sub_graph_test.vs.select(DNase_FC_logFC_lt=0)["shape"] = 'arrow-down'
        sub_graph_test.vs.select(DNase_gt=0)["size"] = [abs(x) * 20 for x in sub_graph_test.vs.select(DNase_gt=0)['DNase_FC_logFC']]
    
    ## Promoter Priority 1
    if ( 'Promoter' in sub_graph_test.vs.attributes()):
        sub_graph_test.vs.select(Promoter_gt=0)["shape"] = 'circular'
        sub_graph_test.vs.select(Promoter_gt=0)['label'] = sub_graph_test.vs.select(Promoter_gt=0)['Promoter_gene_id']
        sub_graph_test.vs.select(Promoter_gt=0)["size"] = 20
        ## convert Genomic Region to shape code 
    if ( 'Tcf1' in sub_graph_test.vs.attributes()):
        sub_graph_test.vs.select(Tcf1_gt=0)["color"] = "red"

    sub_graph_test.vs['name']=''
    graph_plot_saveas(sub_graph_test, _plot_name+'.PNG')
    
    return sub_graph_test
### End of Visulization

In [4]:
def return_ABC_in_given_gene_and_graph(_gene_id, _graph, _col1, _col2, _col3_interaction):
    ## Define activity name
    A_name1= _col1
    A_name2= _col2
    geneid=_gene_id
    return_graph = Return_Graph_of_Gene(_graph, geneid, 1)

    for vs_idx in return_graph.vs.select(Promoter_gene_id_eq=geneid):
        ## All edges has a connect with promoter
        edges_from_vs = return_graph.es[return_graph.incident(vs_idx)]
        promoter_index=vs_idx.index
        Sum_All_edges=list()
        for es in edges_from_vs:
            for index_anchor in es.tuple:
                if (index_anchor == vs_idx.index):
                    continue
                else:
                    #print (index_anchor)
                    Contact_interaction = es[_col3_interaction]
                    Activity_E = np.sqrt(return_graph.vs[index_anchor][A_name1]*return_graph.vs[index_anchor][A_name2])
                    Sum_All_edges.append( [geneid, return_graph.vs[index_anchor]['name'], Activity_E, Contact_interaction])

        ## Calculate ABC
        df_sum_all_edges=pd.DataFrame(data=Sum_All_edges, columns=['gene_id', 'loc', 'Activity_E', 'Contact_interaction'])
        df_sum_all_edges['AC_Score'] = (df_sum_all_edges['Activity_E']*df_sum_all_edges['Contact_interaction'])/sum(0.01+df_sum_all_edges['Activity_E']*df_sum_all_edges['Contact_interaction'])

    return df_sum_all_edges

In [5]:
def Generate_All_Genes(Input_Path, number):
#### READ FILE
    df = pd.read_csv(Input_Path+'/gene_exp.diff', sep='\t', header=0, usecols={'test_id',\
    'status','sample_1','sample_2','value_1','value_2','log2(fold_change)','p_value','q_value'})
#### Rename columns
    df=df.rename(columns={'test_id':'gene_id', 'value_1': df['sample_1'].unique()[0], 'value_2': df['sample_2'].unique()[0]})
#### Output   
    return df.loc[:,['gene_id',df['sample_1'].unique()[0],df['sample_2'].unique()[0],'log2(fold_change)',
                     'p_value','q_value', 'End_'+str(number), '||']].fillna('')

def Predict_expression_mode1(input_graph):
    ## Input Graph must have attributes: "region", "binding"
    vs_promoter = input_graph.vs.select(region="Promoter")
    output_gene_with_score=list()
    for vertex in vs_promoter:
        edges_from_vs = input_graph.es[input_graph.incident(vertex.index)]
        vs_subgraph=edges_from_vs.subgraph()
        test_score = 0 ## for a single promoter vertex define a expression score
        for vs_region, vs_binding in zip(vs_subgraph.vs['region'], vs_subgraph.vs['binding']):
            #print (vs_region,vs_binding)
            if (vs_region == 'Enhancer_Down'): test_score+=(-1)
            elif (vs_region == 'Enhancer_Up'): test_score+=1
            if ( vs_binding=='Tcf1'): test_score*=2
            ## end
        output_gene_with_score.append( [vertex['label'], test_score])
    return output_gene_with_score

# Convert Bedpe to graph
## Input format for interaction should be the simplest
## chr b1 b2 interaction

In [6]:
def Read_Interaction(_PATH_interaction, _resolution, _col_fore, _col_back):
    PATH_interaction=_PATH_interaction
    col_fore = _col_fore
    col_back  = _col_back
    resolution = _resolution
    
    df_interaction = pd.read_csv(PATH_interaction, sep="\t").fillna(0)
    df_interaction = df_interaction[df_interaction.iloc[:,1]!=df_interaction.iloc[:,2]] ### remove self interaction
    df_interaction.loc[:,'#chr']=df_interaction.loc[:,'#chr'].replace('chr','')
    df_interaction.loc[:,'#chr1']=df_interaction.iloc[:,0]
    df_interaction.loc[:,'x1']=df_interaction.iloc[:,1]
    df_interaction.loc[:,'x2']=df_interaction.iloc[:,1]+resolution
    df_interaction.loc[:,'chr2']=df_interaction.iloc[:,0]
    df_interaction.loc[:,'y1']=df_interaction.iloc[:,2]
    df_interaction.loc[:,'y2']=df_interaction.iloc[:,2]+resolution

    df_interaction.loc[:,'log_FC'] = np.log2(df_interaction.loc[:,col_fore].replace(0,0.1) / df_interaction.loc[:,col_back].replace(0,0.1) )
    #df_interaction.loc[:,'GeneID'] = "id_"+df_interaction.index.astype(str)
    df_interaction = df_interaction.loc[:,['#chr1','x1','x2','chr2','y1','y2','log_FC', col_fore, col_back]]
    return df_interaction
col_fore = 'DKO_na'
col_back  = 'WT_na'
resolution = 10000

PATH_FEATURES='/home/xli/Data/Shaoqi/Hub/'
PATH_interaction=PATH_FEATURES+'WT_na-DKO_na_2016.bed'

df_hic = Read_Interaction(PATH_FEATURES+'WT_na-DKO_na_2016.bed', resolution, col_fore, col_back)
df_hic.shape

  if (await self.run_code(code, result,  async_=asy)):


(3101646, 9)

In [7]:
df_WT_Specific_loops  = df_hic[(df_hic['log_FC']<0)]#&(df_interaction['DKO']!=0.1)] very critical for 19
df_WT_Specific_loops.loc[:, 'logFC'] = abs(df_WT_Specific_loops.loc[:,'log_FC'])

g_graph = convert_loops_to_graph(df_WT_Specific_loops, col_back, col_fore)
g_graph.summary()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


'IGRAPH UNW- 251764 1397594 -- \n+ attr: name (v), DKO_na (e), weight (e)'

In [8]:
graph_processed = g_graph
graph_processed = Graph_Pagerank(graph_processed)

df_WT_Hub, df_WT_Hub_Groups = graph_community_multilevel_Blondel_diff_level(graph_processed, 0.5)

df_WT_Hub_top = convert_cluster2bed(df_WT_Hub, 'hub_name').reset_index().drop('index', axis=1)

Weighted Graph Cluster
Clustering with 251764 elements and 56658 clusters
Clustering with 251764 elements and 7590 clusters
Clustering with 251764 elements and 1148 clusters
Clustering with 251764 elements and 365 clusters
Clustering with 251764 elements and 315 clusters


In [11]:
## Calculate pvalue for each hub

df_Hub_filter = convert_cluster2bed(df_WT_Hub, 'hub_name').reset_index().drop('index', axis=1)

## Associated each Hub with interaction and pvalue
col_name = df_hic.columns
df_inter = pd.read_csv(PATH_interaction, sep="\t").fillna(0)
df_inter = df_inter[df_inter.iloc[:, 1]!=df_inter.iloc[:, 2]]
df_inter.loc[:,'#chr']= 'chr'+df_inter.loc[:,'#chr'].astype(str)
Feature_interaction = BedTool.from_dataframe(df_inter).sort()
Feature_hub = BedTool.from_dataframe(df_Hub_filter).sort()
#
Feature_Hub_interaction = Feature_hub.intersect(Feature_interaction, wa=True, wb=True, F=1.0)

col_name = df_Hub_filter.columns.append(df_inter.columns)
df_Feature_Hub_interaction = pd.read_csv(Feature_Hub_interaction.fn, sep='\t', names=col_name)

df_Feature_Hub_interaction_group = df_Feature_Hub_interaction.groupby('hub_name')



### calculate a pvalue for each hub
hub_sum=[]
for hub in df_Feature_Hub_interaction_group:
    #print (hub[0])
    df_hub = hub[1]
    data_for_test = df_hub.loc[:, col_back]  - df_hub.loc[:, col_fore]
    w, pvalue_hub = wilcoxon(data_for_test)#, alternative='less')
    hub_sum.append([hub[0], df_hub.Num_vertices.unique()[0], pvalue_hub])
    #break

df_hub_summary = pd.DataFrame( data = hub_sum, columns=['hub_name', 'Num_vertices', 'pvalue'])
df_hub_summary = df_Hub_filter.merge(df_hub_summary, on=['hub_name','Num_vertices'], how='inner').sort_values(by='pvalue')

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
df_hub_summary.head(10)

Unnamed: 0,0,1,2,hub_name,Num_vertices,pvalue
56,chr18,56520000,56660000,chr18:56520000-56660000,14,1.077923e-13
26,chr15,36260000,36410000,chr15:36260000-36410000,14,1.614891e-12
51,chr10,20850000,20990000,chr10:20850000-20990000,14,1.869677e-11
195,chr11,6540000,6660000,chr11:6540000-6660000,12,9.463384e-09
111,chr13,44840000,44970000,chr13:44840000-44970000,13,1.550791e-08
19,chr11,53930000,54080000,chr11:53930000-54080000,15,1.672322e-07
79,chr10,84320000,84450000,chr10:84320000-84450000,13,2.90596e-07
1727,chr7,74570000,74660000,chr7:74570000-74660000,8,3.073152e-07
163,chr6,28210000,28330000,chr6:28210000-28330000,12,4.312522e-07
153,chr13,43460000,43580000,chr13:43460000-43580000,12,5.218285e-07
