# Motif enrichment analysis

In [3]:
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, "./scripts")
import functions as f
from matplotlib import pyplot as plt
import seaborn as sns
from copy import copy
import multiprocessing as mp
import joblib
from tqdm import tqdm
from scipy import stats

In [2]:
# Read the run parameters from ./config.json into `cfg`; the call also echoes them
# (see the cell output). NOTE(review): the exact return type is defined in the
# project's `functions` module — confirm there.
cfg = f.get_actual_parametrization("./config.json")

RANDOM_SEED: 19
TEST_NETWORK_SIZE: 500
TEST_NETWORK_LINK_PROB: 0.1
N_CORES_TO_USE: -1
NETWORK_TO_SEARCH_IN: gs0.1


### Motifs library

The conventional notation for all possible triads is shown below.

<img src="./pics/triads.png" width=2000 height=20/>

# Yeast Tnet

In [47]:
# Point NETWORK_TO_SEARCH_IN at the yeast network and reload `cfg`.
# NOTE(review): presumably this also rewrites ./config.json on disk, so the
# setting would persist for later runs — confirm in the `functions` module.
cfg = f.update_cfg("./config.json", "NETWORK_TO_SEARCH_IN", "yeast")

RANDOM_SEED: 19
TEST_NETWORK_SIZE: 500
TEST_NETWORK_LINK_PROB: 0.1
N_CORES_TO_USE: -1
NETWORK_TO_SEARCH_IN: yeast


In [48]:
# Interaction matrix for the network selected in `cfg`; its first dimension is
# used further down as the total number of genes.
interaction_matrix = f.get_interaction_matrix(cfg)

In [49]:
%%time
# Search the network for triad motifs. `motifs_orig` maps each motif name to its
# triads (strings of the form "i_j_k"), `counter_orig` to the number of triads
# found per motif.
# NOTE(review): the meaning of batch_size is defined in the project's `functions`
# module — confirm before tuning.
motifs_orig, counter_orig = f.motif_search(cfg, interaction_matrix, batch_size=10000)
counter_orig

CPU times: user 2.46 s, sys: 151 ms, total: 2.61 s
Wall time: 7.68 s


{'021C': 37631, '021D': 1059856, '021U': 26042, '030C': 8, '030T': 3370}

In [50]:
# For every motif type, count how many triads of that type each node takes part in.
# Result: {motif name: {node index: number of triads containing the node}}.
node_motif_distr = {}
for motif in motifs_orig.keys():
    per_node = {}
    for triad in motifs_orig[motif]:
        # Triads are encoded as "i_j_k"; each member node gets one count.
        i, j, k = map(int, triad.split("_"))
        for node in (i, j, k):
            per_node[node] = per_node.get(node, 0) + 1
    node_motif_distr[motif] = per_node

In [51]:
# Turn the per-node counters into one DataFrame per motif type:
# index = node index, single column "n_motifs" = number of triads containing the node.
distr_dict = {}
for motif in motifs_orig.keys():
    per_node = node_motif_distr[motif]
    distr_dict[motif] = pd.DataFrame(
        data=list(per_node.values()),
        index=list(per_node.keys()),
        columns=["n_motifs"],
    )

In [59]:
# Report, per motif type, how many genes appear in at least one triad of that
# type and what share of all genes in the network that is.
print("\nYeast nodes participation:\n")
n_genes_total = interaction_matrix.shape[0]
for motif in distr_dict:
    n_involved = len(distr_dict[motif])
    print(f"{motif}:\ttotal {n_involved} \
genes involved ({100*n_involved/n_genes_total:.2f}%)")
print()


Yeast nodes participation:

021C:	total 4252 genes involved (95.74%)
021D:	total 4439 genes involved (99.95%)
021U:	total 2750 genes involved (61.92%)
030C:	total 17 genes involved (0.38%)
030T:	total 1201 genes involved (27.04%)



# Ecoli Tnet

In [60]:
# Point NETWORK_TO_SEARCH_IN at the E. coli network and reload `cfg`.
# NOTE(review): presumably this also rewrites ./config.json on disk, so the
# setting would persist for later runs — confirm in the `functions` module.
cfg = f.update_cfg("./config.json", "NETWORK_TO_SEARCH_IN", "ecoli")

RANDOM_SEED: 19
TEST_NETWORK_SIZE: 500
TEST_NETWORK_LINK_PROB: 0.1
N_CORES_TO_USE: -1
NETWORK_TO_SEARCH_IN: ecoli


In [61]:
# Interaction matrix for the network selected in `cfg`; its first dimension is
# used further down as the total number of genes.
interaction_matrix = f.get_interaction_matrix(cfg)

In [62]:
%%time
# Search the network for triad motifs. `motifs_orig` maps each motif name to its
# triads (strings of the form "i_j_k"), `counter_orig` to the number of triads
# found per motif.
# NOTE(review): the meaning of batch_size is defined in the project's `functions`
# module — confirm before tuning.
motifs_orig, counter_orig = f.motif_search(cfg, interaction_matrix, batch_size=10000)
counter_orig

CPU times: user 939 ms, sys: 109 ms, total: 1.05 s
Wall time: 3.34 s


{'021C': 3938, '021D': 329287, '021U': 4365, '030C': 0, '030T': 1392}

In [63]:
# For every motif type, count how many triads of that type each node takes part in.
# Result: {motif name: {node index: number of triads containing the node}}.
node_motif_distr = {}
for motif in motifs_orig.keys():
    per_node = {}
    for triad in motifs_orig[motif]:
        # Triads are encoded as "i_j_k"; each member node gets one count.
        i, j, k = map(int, triad.split("_"))
        for node in (i, j, k):
            per_node[node] = per_node.get(node, 0) + 1
    node_motif_distr[motif] = per_node

In [64]:
# Turn the per-node counters into one DataFrame per motif type:
# index = node index, single column "n_motifs" = number of triads containing the node.
distr_dict = {}
for motif in motifs_orig.keys():
    per_node = node_motif_distr[motif]
    distr_dict[motif] = pd.DataFrame(
        data=list(per_node.values()),
        index=list(per_node.keys()),
        columns=["n_motifs"],
    )

In [66]:
# Report, per motif type, how many genes appear in at least one triad of that
# type and what share of all genes in the network that is.
print("\nEcoli nodes participation:\n")
n_genes_total = interaction_matrix.shape[0]
for motif in distr_dict:
    n_involved = len(distr_dict[motif])
    print(f"{motif}:\ttotal {n_involved} \
genes involved ({100*n_involved/n_genes_total:.2f}%)")
print()


Ecoli nodes participation:

021C:	total 1361 genes involved (71.00%)
021D:	total 1896 genes involved (98.90%)
021U:	total 968 genes involved (50.50%)
030C:	total 0 genes involved (0.00%)
030T:	total 717 genes involved (37.40%)



In [9]:
import joblib
from itertools import combinations

In [10]:
def connected_triads_generator(interaction_matrix):
    """Lazily yield the connected node triples of a directed network.

    ``interaction_matrix[tg, tf] != 0`` is read as a link from regulator
    ``tf`` to target ``tg`` (row index = target, column index = regulator).
    Diagonal entries (self-loops) are ignored.

    Triples are produced in three consecutive passes:

    1. cascades ``(tf, tg, tg_final)`` for every pair of links
       ``tf -> tg -> tg_final`` with ``tf != tg_final``;
    2. shared targets ``(tf_1, tf_2, tg)`` for every pair of regulators
       of the same target ``tg``;
    3. shared regulators ``(tf, tg_1, tg_2)`` for every pair of targets
       of the same regulator ``tf``.

    The passes are not deduplicated against each other, so a node set that
    matches more than one pattern (e.g. a feed-forward loop) is yielded
    more than once.

    Parameters
    ----------
    interaction_matrix : 2-D array-like
        Numeric or boolean adjacency matrix. It is not modified.

    Yields
    ------
    tuple
        Three integer node indices, ordered as described above.
    """
    # Boolean link mask with self-loops cleared. Building the mask first
    # (rather than subtracting the diagonal) also works for boolean matrices,
    # where `-` is not defined, and avoids a second dense copy of the input.
    has_link = np.asarray(interaction_matrix) != 0
    np.fill_diagonal(has_link, False)
    tg_idxs, tf_idxs = np.where(has_link)
    links = pd.DataFrame({"tf": tf_idxs, "tg": tg_idxs})

    # Pass 1: cascades. Match the target of each link against the regulator of
    # every other link, then drop 2-cycles (tf -> tg -> tf), which are not triads.
    links_by_tf = links.set_index("tf")[["tg"]]
    cascades = links.join(links_by_tf, on="tg", how="inner", rsuffix="_final")
    cascades = cascades[cascades.tf != cascades.tg_final]
    for cascade in cascades.values:
        yield tuple(cascade)

    # Pass 2: pairs of regulators sharing a target. Iterating the groupby
    # directly avoids rescanning the whole link table once per target;
    # `combinations` yields nothing for single-member groups.
    for tg, tfs in links.groupby("tg")["tf"]:
        for tf_1, tf_2 in combinations(tfs.values, 2):
            yield tf_1, tf_2, tg

    # Pass 3: pairs of targets sharing a regulator.
    for tf, tgs in links.groupby("tf")["tg"]:
        for tg_1, tg_2 in combinations(tgs.values, 2):
            yield tf, tg_1, tg_2

In [4]:
# Load the stored interaction matrix from the networks/mouse/regnet directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load files from a trusted source.
matrix = joblib.load("./networks/mouse/regnet/interaction_matrix.gz")

In [11]:
# Exhaust the generator without using the triads: the tqdm counter reports how
# many triples are yielded and at what rate.
for triad in tqdm(connected_triads_generator(matrix)):
    pass

144817125it [01:29, 1625178.87it/s]


In [12]:
# Load the stored interaction matrix from the networks/human/regnet directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load files from a trusted source.
matrix = joblib.load("./networks/human/regnet/interaction_matrix.gz")

In [13]:
# Exhaust the generator without using the triads: the tqdm counter reports how
# many triples are yielded and at what rate.
for triad in tqdm(connected_triads_generator(matrix)):
    pass

239275889it [01:31, 2617227.53it/s]


In [14]:
!which python

/opt/anaconda3/bin/python
