In [28]:
import numpy as np
import pandas as pd
import networkx as nx
import scipy
from scipy import stats
import mygene
import math

import matplotlib as mpl
mpl.rc('text', usetex = False)
mpl.rc('font', family = 'serif')
% matplotlib inline

In [None]:
# `robjects` is referenced below, so it has to be imported explicitly;
# without this line the cell raises NameError on a fresh kernel.
from rpy2 import robjects
from rpy2.robjects import default_converter
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

# use the default conversion rules to which the pandas conversion
# is added
with localconverter(default_converter + pandas2ri.converter) as cv:
    # Look up the R object named "data" while the pandas-aware converter is active
    dataf = robjects.r["data"]

## Creating Background Network ###

#### Load Brin's transcription factors

In [2]:
# Load the mouse and human transcription regulator (TR) lists from AnimalTFDB
# (tab-separated gene lists, one file per species)
TR_db_m = pd.read_csv("Mus_musculus_transcription_factors_gene_list.txt", sep = "\t")
TR_db_h = pd.read_csv("Homo_sapiens_transcription_factors_gene_list.txt", sep = "\t")
# Stack both species into one table. pd.concat is used instead of
# DataFrame.append, which is deprecated and was removed in pandas 2.0;
# like append, it keeps each frame's original row labels.
TR_db = pd.concat([TR_db_m, TR_db_h])
# Entrez gene IDs of every TR (both species)
TR_list_entrez = TR_db.Entrez_ID

In [3]:
# Number of rows in the human TR table loaded from the AnimalTFDB gene list
len(TR_db_h)

1691

In [4]:
# Translate the TR list from Entrez IDs to gene symbols.
# set() removes duplicate IDs before the query is sent through the mygene client.
mg = mygene.MyGeneInfo()
translated_DF = mg.getgenes(set(TR_list_entrez), as_dataframe=True)
# Upper-case the symbols so they compare equal to the upper-cased STRING symbols.
# NOTE(review): IDs that mygene cannot resolve presumably yield NaN symbols here -- confirm.
TR_list_symbols = translated_DF["symbol"].str.upper()

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-2956...done.


In [5]:
# may need to check for NaN values, depending on the database

#TR_list_not_found = translated_string_nodes["notfound"]
#
#for i in range(len(TR_list_not_found)):
#    if TR_list_not_found[i] is 'NaN':
#        print 1

#### Cross reference Brin's TR list with background STRING db


In [6]:
# Load STRING database as background network.
# Later cells read the columns Symbol1, Symbol2, Weight and Edge_Sign from this frame.
STRING_DF = pd.read_excel("STRING_network.xlsx")

In [7]:
# Convert symbols to all caps and build the background edge list
sym1_list = STRING_DF.Symbol1.str.upper()
sym2_list = STRING_DF.Symbol2.str.upper()
weight_list = STRING_DF.Weight
sign_list = STRING_DF.Edge_Sign
# (source, target, weight) triples. Materialised as a list because later
# cells call len() on it and index into it, which a Python 3 zip iterator
# does not support (on Python 2 the extra list() is a no-op copy).
db_edges = list(zip(sym1_list, sym2_list, weight_list))

# Numeric encoding of the edge sign: '+' -> 1, '-' -> -1, anything else -> 0
SIGN_TO_NUM = {'+': 1, '-': -1}
sign_num_list = [SIGN_TO_NUM.get(str(sign), 0) for sign in sign_list]

# (source, target, numeric sign) triples, in the same order as db_edges
db_sign_att = list(zip(sym1_list, sym2_list, sign_num_list))

In [8]:
# Number of unique nodes in STRING: distinct upper-cased symbols that appear
# at either end (Symbol1 or Symbol2) of an edge
len(np.unique(list(sym1_list)+list(sym2_list)))

3580

In [9]:
# Extract the edges whose source node is a known TR from the background STRING db.
# Build the membership lookup once: rebuilding list(TR_list_symbols) for every
# edge made this cell O(edges * TRs).
TR_symbol_set = set(TR_list_symbols)

edge_list_filtered_TR = []    # (source, target, weight) for TR-sourced edges
sign_att_filtered_list = []   # (source, target, numeric sign) for the same edges
# db_edges and db_sign_att are built from the same columns in the same order,
# so they can be walked in lockstep.
for edge, sign_att in zip(db_edges, db_sign_att):
    if edge[0] in TR_symbol_set:
        edge_list_filtered_TR.append(edge)
        sign_att_filtered_list.append(sign_att)

In [10]:
# Build the directed TR -> target graph; each edge carries its STRING weight
DG = nx.DiGraph()
DG.add_weighted_edges_from(edge_list_filtered_TR)
# Store the numeric sign of every edge under the 'sign' edge attribute
for tr_symbol, target_symbol, edge_sign in sign_att_filtered_list:
    DG[tr_symbol][target_symbol]['sign'] = edge_sign

In [11]:
# Number of nodes (TRs and their targets) in the TR-filtered directed graph
len(DG.nodes())

37

In [12]:
# Number of TR symbols (from the AnimalTFDB-derived list) that appear as a
# source node (Symbol1) in the STRING background network
len(list(set(sym1_list) & set(TR_list_symbols)))
# Same count taken from the filtered edges (multiple ways to do it). A set
# comprehension is used because subscripting zip() fails on Python 3 and
# zip(*[])[0] raises IndexError when no edge survived the filter.
len({edge[0] for edge in edge_list_filtered_TR})

5

### p-value with differentially expressed genes

In [13]:
# Load differentially expressed genes (experimental results) from a tab-separated file
DEG_db = pd.read_csv("differencially_expressed_genes.txt", sep = "\t")
# Containers filled by the lfdr filtering cell:
DEG_list = []       # upper-cased symbols of genes passing the lfdr cutoff
DEG_to_updown = {}  # symbol -> direction of change derived from the log2 column

In [14]:
# Filter the differentially expressed genes by lfdr < 0.3
LFDR_CUTOFF = 0.3

# Start from empty containers so that re-running this cell does not append
# duplicate symbols to DEG_list (the cell is now idempotent).
DEG_list = []
DEG_to_updown = {}

# Walk the three columns in lockstep instead of three label lookups per row.
# With the default 0..n-1 index produced by read_csv this visits exactly the
# rows that DEG_db[col][i] visited.
for raw_symbol, lfdr, log2_fc in zip(DEG_db['symbol'], DEG_db['lfdr.89.12'], DEG_db['log2.89.12']):
    symbol = str(raw_symbol).upper()
    if symbol == 'NAN': # removing NaN values (rows without a symbol)
        continue
    if not (lfdr < LFDR_CUTOFF): # a NaN lfdr compares False and is dropped as well
        continue

    DEG_list.append(symbol)

    # Direction of change: +1 (up), -1 (down), or 0 when log2 is exactly 0
    if log2_fc != 0:
        DEG_to_updown[symbol] = log2_fc / abs(log2_fc)
    else:
        DEG_to_updown[symbol] = 0
                

In [16]:
# Annotate every node of DG with an 'updown' attribute: the DEG direction
# (+1 / -1 / 0) for nodes that are in the DEG list, 0 for every other node.
DEG_in_DG = set(DG.nodes()) & set(DEG_list)
zero_dict = dict.fromkeys(DG.nodes(), 0)
for gene in DEG_in_DG:
    zero_dict[gene] = DEG_to_updown[gene]
# Keyword arguments on purpose: networkx 1.x declares (G, name, values) while
# 2.x declares (G, values, name), so the positional call only works on 1.x.
nx.set_node_attributes(DG, name='updown', values=zero_dict)

In [None]:
# Unique source nodes: every node with at least one outgoing edge.
# A set comprehension replaces zip(*DG.edges())[0], which cannot be
# subscripted on Python 3 and raises IndexError when the graph has no edges.
source_nodes = list({src for src, dst in DG.edges()})
source_nodes

In [None]:
#DEG_list = [7,9,10]
#edge_list = [(2,5), (2,4), (1,5), (3,5), (3,4), (6,4), (6,7), (6,9), (6,10), (8,9), (8,10)]
#DG = nx.DiGraph()
#DG.add_edges_from(edge_list)
#sym1_list = [2, 2, 1, 3, 3, 6, 6, 6, 6, 8, 8]
#sym2_list = [5, 4, 5, 5, 4, 4, 7, 9, 10, 9, 10]
#source_nodes = list(set(zip(*DG.edges())[0]))
#print 'source_nodes: ' + str(source_nodes)

In [None]:
# calculating all the p-scores

def tr_pvalues(DG, background_list, DEG_list):
    """Score every source TR in DG for enrichment of DEGs among its targets.

    For each node with at least one outgoing edge, a one-sided hypergeometric
    test asks how likely it is to see at least the observed number of DEG
    targets if the DEGs were drawn at random from the background.

    Parameters
    ----------
    DG : directed graph
        Must provide ``edges()`` (iterable of (source, target) pairs) and
        ``neighbors(node)`` (iterable of that node's targets).
    background_list : sequence
        Unique nodes of the universe (the background network).
    DEG_list : iterable
        Differentially expressed gene symbols.

    Returns
    -------
    dict
        Maps each source TR to ``-ln(p)`` with ``p = P(X >= x)``. Larger
        values are more significant; 0 means p = 1. An empty graph gives ``{}``.
    """
    DEG_set = set(DEG_list)

    # Unique source nodes. A set comprehension works on Python 2 and 3 and on
    # graphs without edges (zip(*edges)[0] does neither of the latter two).
    source_nodes = {src for src, dst in DG.edges()}

    # Loop-invariant quantities, computed once instead of once per TR
    M = len(background_list)                    # size of the universe
    N = len(set(background_list) & DEG_set)     # DEGs present in the universe

    TR_to_pvalue = {}
    for TR in source_nodes:
        # neighbors() returns a list in networkx 1.x and an iterator in 2.x;
        # a set supports both the overlap and the count.
        targets = set(DG.neighbors(TR))
        x = len(targets & DEG_set)   # observed overlap between targets and DEGs
        n = len(targets)             # number of targets of this TR

        # sf(k) is P(X > k), so P(X >= x) needs k = x - 1. Passing x itself
        # dropped the observed count from the tail and overstated significance.
        TR_to_pvalue[TR] = -(scipy.stats.hypergeom.logsf(x - 1, M, n, N, loc=0))

    return TR_to_pvalue
    
# Background universe: unique upper-cased symbols found at either end of a STRING edge
un = np.unique(list(sym1_list)+list(sym2_list))
# Score every source TR of DG against the DEG list (shown as the cell output)
tr_pvalues(DG, un, DEG_list)

### z-score with DEG

In [24]:
# NOTE! This is only counting N+ and N- for nodes in our DEG list.

def tr_zscore(DG, DEG_list):
    """Compute an activation z-score for every source TR in DG.

    Only targets that are in DEG_list are counted. For each such target the
    product of the edge 'sign' attribute and the target's 'updown' node
    attribute is a vote: +1 predicts the TR is activated, -1 predicts it is
    inhibited, anything else is reported and ignored.

    Parameters
    ----------
    DG : directed graph
        Must provide ``edges()``, ``neighbors(node)``, ``DG[u][v]['sign']``
        and node attributes with an 'updown' entry.
    DEG_list : iterable
        Differentially expressed gene symbols.

    Returns
    -------
    dict
        Maps each source TR to ``(N_plus - N_minus) / sqrt(N_plus + N_minus)``.
        Positive means predicted activated, negative predicted inhibited,
        and 0.0 means no target cast a usable vote.
    """
    DEG_set = set(DEG_list)

    # networkx 1.x exposes node attributes as DG.node (DG.nodes is a method
    # there); networkx >= 2.4 only has DG.nodes. Use whichever exists.
    node_attrs = DG.node if hasattr(DG, 'node') else DG.nodes

    # Unique source nodes; also works on Python 3 and on graphs without edges
    source_nodes = {src for src, dst in DG.edges()}

    TR_to_zscore = {}
    for TR in source_nodes:
        N_minus = 0 # number of DEG edges predicting inhibition
        N_plus = 0  # number of DEG edges predicting activation
        N_zero = 0  # number of DEG edges that give no prediction

        for n in set(DG.neighbors(TR)) & DEG_set:
            # predict whether this neighbor thinks the TR is Act. or Inhib.
            prediction = DG[TR][n]['sign'] * node_attrs[n]['updown']
            if prediction == 1:
                N_plus += 1
            elif prediction == -1:
                N_minus += 1
            else:
                N_zero += 1 # unsigned edge or no direction: cannot predict
                print("Issue with edge (" + str(TR) + ',' + str(n) + ')')

        if N_zero != 0:
            print("Could not attribute activated or inhibiting trait to " + str(N_zero) + ' nodes')

        # prevent a divide-by-zero calculation
        N = N_plus + N_minus
        if N == 0:
            z_score = 0.0
        else:
            z_score = (N_plus - N_minus) / math.sqrt(N)

        TR_to_zscore[TR] = z_score

    return TR_to_zscore

# Compute the z-score of every source TR in DG from the DEG list (shown as the cell output)
tr_zscore(DG, DEG_list)

{u'ATF6': 0, u'MAX': 0, u'MYB': 0, u'REL': -1.0, u'TBP': 0}