In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import scipy
from scipy import stats
import mygene
import math

import matplotlib as mpl
mpl.rc('text', usetex = False)
mpl.rc('font', family = 'serif')
% matplotlib inline

## Creating Background Network ###

#### Load Brin's transcription factor's

In [2]:
# load TR list from AnimalTFDB 
TR_db_m = pd.read_csv("Mus_musculus_transcription_factors_gene_list.txt", sep = "\t")
TR_db_h = pd.read_csv("Homo_sapiens_transcription_factors_gene_list.txt", sep = "\t")
TR_db = TR_db_m.append(TR_db_h)
TR_list_entrez = TR_db.Entrez_ID

In [3]:
len(TR_db_h)

1691

In [42]:
#translate TR list entrez to symbol
mg = mygene.MyGeneInfo()
translated_DF = mg.getgenes(set(TR_list_entrez), as_dataframe=True)
animal_TF = translated_DF["symbol"].str.upper()
len(animal_TF)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-2956...done.


2956

In [5]:
# may need to check for Nan values, depending on the database

#TR_list_not_found = translated_string_nodes["notfound"]
#
#for i in range(len(TR_list_not_found)):
#    if TR_list_not_found[i] is 'NaN':
#        print 1

In [6]:
slowkow_TF_list = []
for file_name in ['./slowkow_databases/TRED_TF.txt',
                 './slowkow_databases/ITFP_TF.txt',
                 './slowkow_databases/ENCODE_TF.txt',
                 #'/slowkow_databases/Neph2012_TF.txt',
                 './slowkow_databases/TRRUST_TF.txt',
                  './slowkow_databases/Marbach2016_TF.txt']:
    with open(file_name) as f:
        lines = f.read().splitlines()
        slowkow_TF_list.extend(lines)
        
[x.upper() for x in slowkow_TF_list]
len(set(slowkow_TF_list)) 

2682

In [29]:
jasp_df = pd.read_csv("jaspar_genereg_matrix.txt", sep = "\t", header= None, names = ['col1', 'col2', 'col3', 'col4', 'tf_genes'])
jasp_tf = jasp_df['tf_genes'].str.upper()
len(jasp_tf)

2049

In [43]:
TR_list_symbols = list(set(animal_TF) | set(slowkow_TF_list))
len(TR_list_symbols)

3694

In [46]:
TR_list_symbols = list(set(animal_TF) | set(slowkow_TF_list) | set(jasp_tf))
len(TR_list_symbols)

4962

#### Cross reference Brin's TR list with background STRING db


In [47]:
# Load STRING database as background network
STRING_DF = pd.read_excel("STRING_network.xlsx")

In [48]:
# convert symbols to all caps, make into edge list
sym1_list = STRING_DF.Symbol1.str.upper()
sym2_list = STRING_DF.Symbol2.str.upper()
weight_list = STRING_DF.Weight
sign_list = STRING_DF.Edge_Sign
db_edges = zip(sym1_list, sym2_list, weight_list)

sign_num_list = []
for sign in sign_list:
    if str(sign) == '+':
        sign_num_list.append(1)
    elif str(sign) == '-':
        sign_num_list.append(-1)
    else:
        sign_num_list.append(0)

db_sign_att = zip(sym1_list, sym2_list, sign_num_list)

In [52]:
#num unique nodes STRING
len(np.unique(list(sym1_list)+list(sym2_list)))

3580

In [53]:
# extracting TR edge information from background string db
edge_list_filtered_TR = []
sign_att_filtered_list = []
for i in range(len(db_edges)):
    if db_edges[i][0] in list(TR_list_symbols):
        edge_list_filtered_TR.append(db_edges[i])
        sign_att_filtered_list.append(db_sign_att[i])

In [54]:
# make and process directed graph from these two tables
DG = nx.DiGraph()
DG.add_weighted_edges_from(edge_list_filtered_TR)
for i in range(len(sign_att_filtered_list)):
    DG[sign_att_filtered_list[i][0]][sign_att_filtered_list[i][1]]['sign'] = sign_att_filtered_list[i][2]

In [55]:
len(DG.nodes())

687

In [56]:
# number of TR's from Brin's list that are in the AnimalTFDB database
len(list(set(sym1_list) & set(TR_list_symbols))) 
len(set(zip(*edge_list_filtered_TR)[0])) # multiple ways to do it

102

### p-value with differencially expressed genes

In [57]:
# load differencially expressed genes (experimental results)
DEG_db = pd.read_csv("differencially_expressed_genes.txt", sep = "\t")
DEG_list = []
DEG_to_updown = {}

In [58]:
# filtering for lfdr < 0.3
for i in range(len(DEG_db)):
    if str(DEG_db.symbol[i]).upper() != 'NAN': # removing Nan values
        
        if (DEG_db['lfdr.89.12'][i] < 0.3):
            DEG_list.append(str(DEG_db.symbol[i]).upper()) # filtering DEG list by lfdr < 0.3
            
            # creating dictionary between DEG symbols and their up/down value
            if DEG_db['log2.89.12'][i] != 0: 
                DEG_to_updown[str(DEG_db.symbol[i]).upper()] = (DEG_db['log2.89.12'][i])/abs(DEG_db['log2.89.12'][i])
            else:
                DEG_to_updown[str(DEG_db.symbol[i]).upper()] = 0
                

In [59]:
DEG_in_DG = set(DG.nodes()) & set(DEG_list)
zero_dict = dict(zip(DG.nodes(), [0]*len(DG.nodes())))
for gene in DEG_in_DG:
    zero_dict[gene] = DEG_to_updown[gene]
nx.set_node_attributes(DG, 'updown', zero_dict) 

In [60]:
# unique source nodes
source_nodes = list(set(zip(*DG.edges())[0]))
source_nodes

[u'HBN',
 u'PAX',
 u'SF1',
 u'AKT1',
 u'TIN',
 u'PAN',
 u'TOPORS',
 u'TAZ',
 u'SU(H)',
 u'SU(HW)',
 u'LIG3',
 u'HKB',
 u'MYB',
 u'PXN',
 u'H',
 u'NUP50',
 u'RAE1',
 u'STAT92E',
 u'SVP',
 u'SNA',
 u'MAX',
 u'ONECUT',
 u'RBBP5',
 u'SPEN',
 u'PTEN',
 u'MAD',
 u'MRPL44',
 u'SIN3A',
 u'TSG101',
 u'MRPL24',
 u'RPL8',
 u'EVE',
 u'MED1',
 u'SLBO',
 u'TRL',
 u'CDC16',
 u'TTK',
 u'NF1',
 u'GATA',
 u'SCR',
 u'RNPS1',
 u'BUB3',
 u'GCM',
 u'ING3',
 u'TAF7',
 u'TAF6',
 u'TAF5',
 u'GSTO1',
 u'TAF2',
 u'TAF1',
 u'CDC27',
 u'TBP',
 u'INR',
 u'HDAC3',
 u'NUP133',
 u'HDAC6',
 u'RFC4',
 u'ILK',
 u'REPO',
 u'TAF12',
 u'TAF11',
 u'RPS3',
 u'REL',
 u'CDC6',
 u'ECD',
 u'MRPL23',
 u'NUB',
 u'OPTIX',
 u'RPS24',
 u'RPS23',
 u'RPL23A',
 u'KLHL18',
 u'TAF4',
 u'ACHI',
 u'DFD',
 u'Z',
 u'TLL',
 u'ZEN2',
 u'KR',
 u'MARS',
 u'DL',
 u'BAP',
 u'MSH6',
 u'VHL',
 u'MED15',
 u'CNOT4',
 u'FKH',
 u'INTS6',
 u'INTS4',
 u'INTS8',
 u'ANTP',
 u'MCM7',
 u'MCM6',
 u'MCM5',
 u'MCM3',
 u'MCM2',
 u'ARR1',
 u'UTX',
 u'CG11294',
 u'PN

In [61]:
#DEG_list = [7,9,10]
#edge_list = [(2,5), (2,4), (1,5), (3,5), (3,4), (6,4), (6,7), (6,9), (6,10), (8,9), (8,10)]
#DG = nx.DiGraph()
#DG.add_edges_from(edge_list)
#sym1_list = [2, 2, 1, 3, 3, 6, 6, 6, 6, 8, 8]
#sym2_list = [5, 4, 5, 5, 4, 4, 7, 9, 10, 9, 10]
#source_nodes = list(set(zip(*DG.edges())[0]))
#print 'source_nodes: ' + str(source_nodes)

In [62]:
# calculating all the p-scores

def tr_pvalues(DG, background_list, DEG_list):
    
    source_nodes = list(set(zip(*DG.edges())[0]))  #identifying unique source nodes in graph
    
    TR_to_pvalue = {}
    for TR in source_nodes:
        x = len(list(set(DG.neighbors(TR)) & set(DEG_list))) # per TR, observed overlap between TR neighbors and DEG_list
        M = len(background_list)  # num unique nodes in universe, aka background network (STRING)
        n = len(DG.neighbors(TR)) # per TR, number of targets for that TR
        N = len(list(set(background_list) & set(DEG_list))) # number of DEG, picked from universe "at random" (TODO: intersection)
    
        TR_to_pvalue[TR] = -(scipy.stats.hypergeom.logsf(x, M, n, N, loc=0)) # remove unnecessary negative sign
        
    return TR_to_pvalue 
    
un = np.unique(list(sym1_list)+list(sym2_list))
tr_pvalues(DG, un, DEG_list)

{u'ABD-B': 2.5619299458469871,
 u'ACHI': 4.1400668115618942,
 u'AKT1': 2.0051353444087323,
 u'ANTP': 4.1400668115618942,
 u'ARR1': 3.4547738088253737,
 u'ATF6': 1.6680928067230729,
 u'BAP': 2.3873854072043095,
 u'BUB3': 1.2902562879815764,
 u'CDC16': 3.1441987530558704,
 u'CDC27': 2.0214533281293834,
 u'CDC6': 3.909551880959754,
 u'CG11294': 3.4547738088253737,
 u'CNOT4': 2.0051353444087323,
 u'DFD': 4.1400668115618942,
 u'DL': 5.3485051248405746,
 u'ECD': 2.5619299458469871,
 u'EVE': 4.1400668115618942,
 u'FKH': 4.1400668115618942,
 u'GATA': 4.1400668115618942,
 u'GCM': 4.1400668115618942,
 u'GSTO1': 4.1400668115618942,
 u'H': 2.3873854072043095,
 u'HBN': 4.1400668115618942,
 u'HDAC3': 2.3873854072043095,
 u'HDAC6': 2.7772771668834308,
 u'HKB': 3.4547738088253737,
 u'ILK': 3.4547738088253737,
 u'ING3': 2.3873854072043095,
 u'INR': 1.9074744647345609,
 u'INTS4': 5.6306387511600979,
 u'INTS6': 6.0258370197540474,
 u'INTS8': 6.5263844837145308,
 u'KLHL18': 1.8198445447027656,
 u'KR': 4.1

### z-score with DEG

In [63]:
# NOTE! This is only counting N+ and N- for nodes in our DEG list.

def tr_zscore(DG, DEG_list):
    
    source_nodes = list(set(zip(*DG.edges())[0])) #identifying unique source nodes in graph
    
    TR_to_zscore = {}
    for TR in source_nodes:
        N_minus = 0 # number of inhibiting predicting DEG edges
        N_plus = 0 # number of activating predicting DEG edges
        N_zero = 0 # number of edges with errorous calculations
    
        TRs_DEG_neighbors = set(DG.neighbors(TR)) & set(DEG_list)
        for n in TRs_DEG_neighbors:
                sign_of_edge = DG[TR][n]['sign']
                up_down_of_n = DG.node[n]['updown']
                
                # predict whether this neighbor thinks the TR is Act. or Inhib.
                if ((sign_of_edge * up_down_of_n) == 1):
                    N_plus += 1
                elif ((sign_of_edge * up_down_of_n) == -1):
                    N_minus += 1
                else:
                    N_zero += 1 # mark an error if could not predict
                    print "Issue with edge (" + str(TR) + ',' + str(n) + ')'
                
        if N_zero != 0:
            print "Could not attribute activated or inhibiting trait to " + str(N_zero) + 'nodes'
      
        # prevent a divide-by-zero calculation
        N = N_plus + N_minus
        if N == 0:
            z_score = 0
        else:
            z_score = (N_plus - N_minus)/float(math.sqrt(N))
                
        TR_to_zscore[TR] = z_score #create zscore dict where 1 means activating
                                                            # -1 means inhibiting
                                                            # 0 means could not be calculated
    
    return TR_to_zscore

tr_zscore(DG, DEG_list)

{u'ABD-B': 0,
 u'ACHI': 0,
 u'AKT1': 0,
 u'ANTP': 0,
 u'ARR1': 0,
 u'ATF6': 0,
 u'BAP': 0,
 u'BUB3': 0,
 u'CDC16': -1.0,
 u'CDC27': -1.0,
 u'CDC6': -1.0,
 u'CG11294': 0,
 u'CNOT4': 0,
 u'DFD': 0,
 u'DL': 0.0,
 u'ECD': 0,
 u'EVE': 0,
 u'FKH': 0,
 u'GATA': 0,
 u'GCM': 0,
 u'GSTO1': 0,
 u'H': 0,
 u'HBN': 0,
 u'HDAC3': 0,
 u'HDAC6': 0,
 u'HKB': 0,
 u'ILK': 0,
 u'ING3': 0,
 u'INR': 0,
 u'INTS4': -1.0,
 u'INTS6': -1.0,
 u'INTS8': -1.0,
 u'KLHL18': 0,
 u'KR': 0,
 u'LIG3': 0,
 u'MAD': 0,
 u'MARS': 0,
 u'MAX': 0,
 u'MCM2': -1.414213562373095,
 u'MCM3': -1.414213562373095,
 u'MCM5': -1.0,
 u'MCM6': -0.5773502691896258,
 u'MCM7': -1.0,
 u'MED1': -1.0,
 u'MED15': -1.414213562373095,
 u'MRPL23': 0,
 u'MRPL24': -1.7320508075688774,
 u'MRPL44': 0,
 u'MSH6': 0,
 u'MYB': 0,
 u'NF1': 0,
 u'NUB': 0,
 u'NUP133': 0,
 u'NUP50': 0,
 u'ONECUT': 0,
 u'OPTIX': 0,
 u'PAN': 0,
 u'PAX': 0.0,
 u'PNR': 0,
 u'PTEN': -1.0,
 u'PXN': 0,
 u'RAE1': 0,
 u'RBBP5': 0,
 u'REL': -1.0,
 u'REPO': 0,
 u'RFC4': -1.0,
 u'RNPS1': -1