In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import community
import umap 

In [22]:
%load_ext blackcellmagic

In [2]:
seed = 42
np.random.seed(seed)

In [24]:
def get_gene_data(data, gene_name_column, test_gene_list):
    
    """Extract data from specific genes given a larger dataframe.
    
    Inputs
    
    * data: large dataframe from where to filter
    * gene_name_column: column to filter from
    * test_gene_list : a list of genes you want to get
    
    Output
    * dataframe with the genes you want
    """
    
    gene_profiles = pd.DataFrame()

    for gene in data[gene_name_column].values:

        if gene in test_gene_list: 

            df_ = data[(data[gene_name_column] == gene)]

            gene_profiles = pd.concat([gene_profiles, df_])
    
    gene_profiles.drop_duplicates(inplace = True)
    
    return gene_profiles

In [26]:
def get_network_clusters(network_lcc, n_clusters):

    """
    input = an empyty list
    
    output = a list with the networks clusters
    
    """
    cluster_list = []

    for i in range(n_clusters):

        cluster_lcc = [
            n for n in network_lcc.nodes() if network_lcc.node[n]["modularity"] == i
        ]

        cluster_list.append(cluster_lcc)

    return cluster_list

In [27]:
path =  '~/Documents/uni/bioinfo/data/coli/'

In [28]:
tf_trn = pd.read_csv(path + "network_tf_tf.txt",
                     delimiter= '\t',
                     comment= '#', 
                     names = ['TF', 'TG', 'regType', 'ev', 'confidence', 'unnamed'], 
                     usecols = np.arange(5))
tf_trn.head()

Unnamed: 0,TF,TG,regType,ev,confidence
0,AcrR,acrR,-,"[AIBSCS, BCE, BPP, GEA, HIBSCS]",Weak
1,AcrR,flhC,-,"[GEA, HIBSCS]",Weak
2,AcrR,flhD,-,"[GEA, HIBSCS]",Weak
3,AcrR,marA,-,"[BPP, GEA, HIBSCS]",Strong
4,AcrR,marR,-,"[BPP, GEA, HIBSCS]",Strong


In [29]:
tf_trn.TF = tf_trn.TF.apply(lambda x: x.lower())
tf_trn.TG = tf_trn.TG.apply(lambda x: x.lower())

In [30]:
tf_trn.head()

Unnamed: 0,TF,TG,regType,ev,confidence
0,acrr,acrr,-,"[AIBSCS, BCE, BPP, GEA, HIBSCS]",Weak
1,acrr,flhc,-,"[GEA, HIBSCS]",Weak
2,acrr,flhd,-,"[GEA, HIBSCS]",Weak
3,acrr,mara,-,"[BPP, GEA, HIBSCS]",Strong
4,acrr,marr,-,"[BPP, GEA, HIBSCS]",Strong


In [31]:
#Let's turn the TF TRN dataframe into a graph object
net = nx.from_pandas_edgelist(df= tf_trn, source= 'TF', target='TG',
                             edge_attr='regType')

In [32]:
##Computing the LCC
net= max(nx.connected_component_subgraphs(net), key=len)

In [33]:
##Cluster the TF-TF network LCC
communities = community.best_partition(net)

In [34]:
n_clusters_tf = max(communities.values())

n_clusters_tf

10

In [35]:
nx.set_node_attributes(net, values= communities, name='modularity')

In [36]:
cluster_list = np.array(get_network_clusters(net, n_clusters_tf))

In [37]:
cluster_sizes = np.array([len(clus) for clus in cluster_list])

In [38]:
# Check if all clusters contain more than 5 TFs
# Still have to make the condition to handle when this is False
np.all(np.array(cluster_sizes) > 5)

True

In [39]:
cluster_sizes

array([10, 26, 10, 23, 11, 16,  6, 27,  8, 12])

In [40]:
trn_df = pd.read_csv(path + 'tf_tg.csv')

In [41]:
trn_df.TF = trn_df.TF.apply(lambda x: x.lower())
trn_df.TG = trn_df.TG.apply(lambda x: x.lower())

In [42]:
cluster_list[0]

['acrr', 'mara', 'marr', 'soxr', 'soxs', 'baer', 'puta', 'cpxr', 'decr', 'rob']

In [43]:
tgs_ = pd.DataFrame()

for ix, clus in enumerate(cluster_list):
    
    clus_trn = get_gene_data(trn_df, 'TF', clus)
    clus_tgs = list(set(clus_trn['TG'].values))
    
    tgs_df = pd.DataFrame({'TGs': clus_tgs})
    
    tgs_df['cluster'] = ix + 1
    
    tgs_ = pd.concat([tgs_, tgs_df])

In [44]:
trn_df.shape

(4490, 3)

In [45]:
tgs_.head()

Unnamed: 0,TGs,cluster
0,cpxq,1
1,csgf,1
2,ung,1
3,baer,1
4,inaa,1


In [48]:
denoised = pd.read_csv('~/Downloads/denoised_ecoli.csv')

In [49]:
denoised.tail()

Unnamed: 0,locustag,gene name,geneid/contrast_id,1,2,3,4,5,6,7,...,4441,4442,4443,4444,4445,4446,4447,4448,4449,4450
4316,b4702,mgtl,4701,-0.887851,-0.657262,-0.328599,-1.181665,-0.512146,-0.735307,0.852066,...,-0.412118,-0.746951,0.125378,-0.273457,-0.321287,0.083352,-0.381717,0.167324,0.05409,-0.004498
4317,b4703,pmrr,4699,0.242388,-0.039028,0.305533,0.027358,0.035909,-0.011634,0.343456,...,0.270389,-0.709234,0.257792,0.271602,-0.314563,0.216571,-0.215941,0.033267,-0.127817,0.155563
4318,b4705,mnts,4670,0.06453,0.557074,1.152848,1.205386,1.341889,1.373536,-0.899024,...,-0.763608,-1.165085,0.322687,-0.65444,-1.012207,0.006137,-0.291866,0.031349,-0.739343,0.334742
4319,b4706,irok,4687,0.039226,0.033221,0.06554,0.041758,0.074894,0.042845,0.070408,...,-0.046231,-0.039813,-0.044918,-0.063925,-0.030761,-0.072713,-0.129308,-0.058543,0.132002,-0.078313
4320,b4708,insi1,4665,0.008469,0.015506,-0.023608,-0.015918,0.015629,-0.01857,0.076349,...,-0.214242,-0.383351,0.061606,-0.240414,-0.343364,-0.295685,-0.519321,-0.4368,0.030401,-0.303851


In [50]:
nrows_data = denoised.shape[0]

In [51]:
n_clusters_tf

10

In [52]:
one_hot_mat = np.zeros((nrows_data, n_clusters_tf))

In [53]:
one_hot_mat.shape

(4321, 10)

In [54]:
for ix, gene in enumerate(denoised['gene name'].values[:5]):
    print(gene)

thrl
thra
thrb
thrc
yaax


In [55]:
tgs_[tgs_['TGs'] == 'laci'].cluster.values

array([8])

In [56]:
# populate one hot matrix
for ix, gene in enumerate(denoised['gene name']):
    
    gene_clus = tgs_[tgs_['TGs'] == gene]
    
    if gene_clus.shape[0] > 0:
        
        clusters = gene_clus.cluster.values
        clus_ix = [clus - 1 for clus in clusters]
        
        one_hot_mat[ix, clus_ix] = 1
        
    else: 
        pass

In [57]:
n_clusters_tf

10

In [58]:
one_hot_mat.shape

(4321, 10)

In [59]:
clus_samples = one_hot_mat.sum(axis = 0)

min_clus_samples = min(clus_samples)

In [60]:
min_clus_samples

88.0

In [61]:
one_hot_df = pd.DataFrame(one_hot_mat, 
                          columns = ['cluster ' + str(i) for i in np.arange(1, n_clusters_tf + 1 )])

In [62]:
denoised_data = denoised.iloc[:, 3:].values

In [64]:
denoised_reduced = umap.UMAP(n_components = int(min_clus_samples),
                             n_neighbors = 10,
                             random_state = seed).fit_transform(denoised_data)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../anaconda3/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  self.func_ir.loc))


In [188]:
denoised_umap = pd.DataFrame(denoised_reduced,
    columns = ['UMAP ' + str(int(x)) for x in np.arange(1, min_clus_samples+ 1)]
)

In [189]:
denoised_umap.head()

Unnamed: 0,UMAP 1,UMAP 2,UMAP 3,UMAP 4,UMAP 5,UMAP 6,UMAP 7,UMAP 8,UMAP 9,UMAP 10,...,UMAP 79,UMAP 80,UMAP 81,UMAP 82,UMAP 83,UMAP 84,UMAP 85,UMAP 86,UMAP 87,UMAP 88
0,0.135129,-0.106396,-0.236401,0.579665,0.023305,0.794392,-0.698494,0.120427,-0.230624,0.030787,...,0.076304,-0.067969,0.117545,-0.215664,0.02135,-0.140627,-0.246135,0.224606,0.114066,-0.363086
1,-0.013774,-0.073943,-0.23827,0.235667,-0.503754,0.491617,-0.658205,0.139508,0.044617,0.177231,...,-0.036748,0.043142,0.033953,-0.134553,-0.272892,-0.025192,-0.22419,0.071627,0.006678,-0.050241
2,-0.013519,-0.072294,-0.235005,0.22584,-0.498133,0.483273,-0.65677,0.139417,0.042824,0.174838,...,-0.043559,0.04451,0.034935,-0.125814,-0.285917,-0.02547,-0.215198,0.069223,0.000581,-0.049122
3,-0.010197,-0.075357,-0.231054,0.239673,-0.489322,0.489555,-0.658887,0.138714,0.033513,0.170951,...,-0.039889,0.039689,0.038464,-0.128308,-0.278498,-0.030545,-0.21195,0.073253,0.005572,-0.057031
4,0.162379,-0.13322,-0.201547,0.561994,-0.077095,0.72595,-0.657978,0.120503,-0.394231,0.052438,...,0.088574,-0.166591,0.21241,-0.187977,-0.081776,-0.280927,-0.091361,0.190255,0.194472,-0.363989


In [208]:
annot = denoised.iloc[:, :3]

In [209]:
denoised_hot = pd.concat([annot, denoised_umap, one_hot_df], axis = 1)

In [210]:
denoised_hot.tail()

Unnamed: 0,locustag,gene name,geneid/contrast_id,UMAP 1,UMAP 2,UMAP 3,UMAP 4,UMAP 5,UMAP 6,UMAP 7,...,UMAP 88,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8,cluster 9
4316,b4702,mgtl,4701,0.106328,-0.11553,-0.150013,0.783358,-0.235184,0.717094,-0.465359,...,-0.226276,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4317,b4703,pmrr,4699,0.158505,-0.07192,-0.181408,0.559423,-0.123821,0.616541,-0.615364,...,-0.254717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4318,b4705,mnts,4670,0.130617,-0.099335,-0.192092,0.43319,-0.048799,0.654106,-0.614379,...,-0.304926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4319,b4706,irok,4687,0.099228,-0.077835,-0.20938,0.273548,-0.077179,0.651653,-0.618855,...,-0.218458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4320,b4708,insi1,4665,0.074955,-0.085406,-0.18035,0.330875,-0.098675,0.634548,-0.602201,...,-0.20669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [214]:
one_hot_sum = one_hot_mat.sum(axis = 1)

In [215]:
one_hot_sum.shape

(4321,)

In [211]:
# add a column corresponding to genes that are TGs 
denoised_hot['TG'] = [1 if val > 0 else 0 for i, val in enumerate(one_hot_sum)]

In [212]:
tgs_data = denoised_hot[denoised_hot['TG'] == 1]

In [213]:
tgs_data.shape

(1390, 101)

In [216]:
denoised_hot.to_csv('~/Downloads/denoised_umap_hot.csv', index = False)

Make everything into a wrapper function. 

In [20]:
def annot_data_trn(tf_tf_net_path = None,
                   trn_path = None,
                   denoised_data_path = None,
                   org = 'ecoli',
                   output_path = '~/Downloads/'):

    # Load TF-TF net and TRN
    
    if tf_tf_net_path is None: 
        !wget http://regulondb.ccg.unam.mx/menu/download/datasets/files/network_tf_tf.txt


        tf_trn = pd.read_csv('network_tf_tf.txt',
                         delimiter= '\t',
                         comment= '#', 
                         names = ['TF', 'TG', 'regType', 'ev', 'confidence', 'unnamed'], 
                         usecols = np.arange(5))

    else: 
        tf_trn = pd.read_csv(tf_tf_net_path,
                         delimiter= '\t',
                         comment= '#', 
                         names = ['TF', 'TG', 'regType', 'ev', 'confidence', 'unnamed'], 
                         usecols = np.arange(5))

    if trn_path is None: 
        !wget http://regulondb.ccg.unam.mx/menu/download/datasets/files/network_tf_gene.txt

        trn = pd.read_csv('network_tf_gene.txt',
                          delimiter= '\t',
                          comment= '#', 
                          names = ['TF', 'TG', 'regType', 'ev', 'confidence', 'unnamed'], 
                          usecols = np.arange(5))

    else:
        try:
            trn = pd.read_csv(trn_path,
                          delimiter= '\t',
                          comment= '#', 
                          names = ['TF', 'TG', 'regType', 'ev', 'confidence', 'unnamed'], 
                          usecols = np.arange(5))
        except : 
            trn = pd.read_csv(trn_path)
            


    # Lowercase gene names for both datasets
    tf_trn.TF = tf_trn.TF.apply(lambda x: x.lower())
    tf_trn.TG = tf_trn.TG.apply(lambda x: x.lower())

    trn.TF = trn.TF.apply(lambda x: x.lower())
    trn.TG = trn.TG.apply(lambda x: x.lower())

    

    # Turn the TF TRN dataframe into a graph object
    net = nx.from_pandas_edgelist(df= tf_trn, source= 'TF', target='TG',
                             edge_attr='regType')

    # Compute the LCC
    net= max(nx.connected_component_subgraphs(net), key=len)

    #Cluster TF net 

    communities = community.best_partition(net)

    # Get number of clusters
    n_clusters_tf = max(communities.values())

    # Embed cluster annotation in net 
    nx.set_node_attributes(net, values= communities, name='modularity')

    # Get np.array of TF clusters
    cluster_list = np.array(get_network_clusters(net, n_clusters_tf))

    # Get cluster sizes 

    cluster_sizes = np.array([len(clus) for clus in cluster_list])

    # Select only the clusters with more than 5 TFs

    clus_list = cluster_list[cluster_sizes > 5]
    
    print('We have 'len(clus_list), 'clusters')

    # Get a DataFrame of the TGs in each cluster

    tgs_ = pd.DataFrame()

    for ix, clus in enumerate(clus_list):
        
        clus_trn = get_gene_data(trn, 'TF', clus)
        clus_tgs = list(set(clus_trn['TG'].values))
        
        tgs_df = pd.DataFrame({'TGs': clus_tgs})
        
        tgs_df['cluster'] = ix + 1
        
        tgs_ = pd.concat([tgs_, tgs_df])


    # -----Start constructing the annotated dataset ------

    if denoised_data_path is None: 
        try:
            denoised = pd.read_csv('denoised_coli.csv')
        except: 
            import download_and_preprocess_data as d

            d.download_and_preprocess_data(org)

    else: 
        denoised = pd.read_csv(denoised_data_path)


    # Get nrows of denoised data
    nrows_data = denoised.shape[0]


    # Initialize one-hot-matrix

    one_hot_mat = np.zeros((nrows_data, n_clusters_tf))

    # Populate one-hot-matrix


    for ix, gene in enumerate(denoised['gene name']):
        
        gene_clus = tgs_[tgs_['TGs'] == gene]
        
        if gene_clus.shape[0] > 0:
            
            clusters = gene_clus.cluster.values
            clus_ix = [clus - 1 for clus in clusters]
            
            one_hot_mat[ix, clus_ix] = 1
            
        else: 
            pass

    # Make one-hot-matrix into a dataframe

    one_hot_df = pd.DataFrame(one_hot_mat, 
                    columns = ['cluster ' + str(i) for i in np.arange(1, n_clusters_tf + 1 )])


    # Get the n_samples of smallest cluster
    clus_samples = one_hot_mat.sum(axis = 0)

    min_clus_samples = min(clus_samples)

    # Separate denoised and annotated data 
    annot = denoised.iloc[:, :3]#.values
    denoised_data = denoised.iloc[:, 3:].values

    # Apply UMAP to denoised data 

    denoised_reduced = umap.UMAP(n_components = int(min_clus_samples), 
                                 n_neighbors = 10,
                             random_state = seed).fit_transform(denoised_data)

    # Turn UMAP data into a dataframe

    denoised_umap = pd.DataFrame(denoised_reduced,
        columns = ['UMAP ' + str(int(x)) for x in np.arange(1, min_clus_samples+ 1)]
    )

    # Denoised UMAP data plus annotation and one hot matrix 
    denoised_hot = pd.concat([annot, denoised_umap, one_hot_df], axis = 1)

    # add a column corresponding to genes that are TGs 
    one_hot_sum = one_hot_mat.sum(axis = 1)# helper indicator array
    denoised_hot['TG'] = [1 if val > 0 else 0 for i, val in enumerate(one_hot_sum)]
    
    if output_path is not '~/Downloads/': 
        denoised_hot.to_csv( output_path + 'denoised_umap_hot.csv', index = False)
    else: 
        denoised_hot.to_csv( '~/Downloads/' + 'denoised_umap_hot.csv', index = False)


In [None]:
tf_tf_net_path = path + "network_tf_tf.txt"

trn_path = path + 'tf_tg.csv'

denoised_data_path = '~/Downloads/denoised_ecoli.csv'

In [21]:
annot_data_trn(tf_tf_net_path, trn_path, denoised_data_path)

ValueError: n_components must be greater than 0