# Network construction

The network structure is formulated in terms of a binary interaction matrix.

In [1]:
import os
import sys
sys.path.insert(0, "./scripts")
import numpy as np
import pandas as pd
from itertools import combinations, product
import joblib
import functions as f

Current parametrization (loaded from `config.json`)

In [2]:
# Load the run configuration from ./config.json via the project helper.
# NOTE(review): the helper appears to echo the loaded key/value pairs (see the
# cell output) — confirm in scripts/functions.py.
cfg = f.get_actual_parametrization("./config.json")

RANDOM_SEED: 19
TEST_NETWORK_SIZE: 300
TEST_NETWORK_LINK_PROB: 0.01
N_CORES_TO_USE: -1
NETWORK_TO_SEARCH_IN: gs0.1


## Random network for testing

In [3]:
# Override the two test-network settings that are read further down
# (TEST_NETWORK_LINK_PROB and TEST_NETWORK_SIZE). The value returned by each
# call replaces `cfg`; the first call is silenced with verbose=False.
# NOTE(review): update_cfg receives the config path, so it presumably rewrites
# ./config.json on disk — confirm in scripts/functions.py before re-running.
cfg = f.update_cfg("./config.json", "TEST_NETWORK_LINK_PROB", 0.1, verbose=False)
cfg = f.update_cfg("./config.json", "TEST_NETWORK_SIZE", 500)

RANDOM_SEED: 19
TEST_NETWORK_SIZE: 500
TEST_NETWORK_LINK_PROB: 0.1
N_CORES_TO_USE: -1
NETWORK_TO_SEARCH_IN: gs0.1


In [4]:
# Output directory for the random test network's interaction matrix.
TEST_NETWORK_PATH = "./networks/test"

In [5]:
# Seed NumPy's global generator so the sampled test network is reproducible.
np.random.seed(cfg["RANDOM_SEED"])
network_size, link_probability = cfg["TEST_NETWORK_SIZE"], cfg["TEST_NETWORK_LINK_PROB"]
# One Bernoulli draw per ordered node pair: an entry is 1 with probability
# `link_probability` and 0 otherwise.
interaction_matrix = np.random.binomial(
    n=1, p=link_probability, size=(network_size,) * 2
)

In [6]:
# Create the output directory together with any missing parents (e.g. ./networks
# on a fresh checkout); exist_ok=True makes this a no-op when it already exists.
os.makedirs(TEST_NETWORK_PATH, exist_ok=True)

# Persist the test network; the trailing ";" suppresses the cell's output.
joblib.dump(interaction_matrix, os.path.join(TEST_NETWORK_PATH, "interaction_matrix.gz"));

## Yeast Tnet

In [7]:
# Output directory for the yeast network artefacts (meta.pkl, interaction_matrix.gz).
YEAST_NETWORK_PATH = "./networks/yeast"

In [8]:
# Load the yeast edge list: a tab-separated table with a regulator column (Tf)
# and a target column (Tg).
edges = pd.read_csv("./raw_data/tfcomb/tnet.txt", sep="\t")

# Every gene that occurs as a regulator or as a target, in sorted order; a
# gene's position in that ordering becomes its integer index `idx`.
nodes = sorted(np.unique(np.concatenate([edges["Tf"].unique(), edges["Tg"].unique()])))
nodes = pd.DataFrame({"idx": range(len(nodes))}, index=nodes)

print(f"Total genes number:\t{len(nodes)}")
print(f"Interactions number:\t{len(edges)}")
edges.head()

Total genes number:	4441
Interactions number:	12873


Unnamed: 0,Tf,Tg
0,YAL051W,YAL016W
1,YAL051W,YAL034WA
2,YAL051W,YAL035CA
3,YAL051W,YAL035W
4,YAL051W,YAL036C


In [9]:
# Attach integer node indices to every edge: idx_tf for the regulator (Tf) and
# idx_tg for the target (Tg).
edges_ = edges.join(nodes, on="Tf").join(nodes, on="Tg", lsuffix="_tf", rsuffix="_tg")
tf_nodes = edges_["idx_tf"].unique()
print(f"Total TF:\t{len(tf_nodes)}")
tg_nodes = edges_["idx_tg"].unique()
print(f"Total TG:\t{len(tg_nodes)}")
# Nodes that act both as a regulator and as a target.
tf_x_tg_nodes = np.array(sorted(set(tf_nodes) & set(tg_nodes)))
print(f"TF and TG:\t{len(tf_x_tg_nodes)}")
# Nodes that are only ever regulated.
tg_only_nodes = np.array(sorted(set(tg_nodes) - set(tf_nodes)))
print(f"TG only:\t{len(tg_only_nodes)}")
# Nodes that only ever regulate.
tf_only_nodes = np.array(sorted(set(tf_nodes) - set(tg_nodes)))
print(f"TF only:\t{len(tf_only_nodes)}")

# The output directory is otherwise only created in a later cell, so make sure
# it exists here; without this the dump fails on a clean checkout.
os.makedirs(YEAST_NETWORK_PATH, exist_ok=True)

# Store the node-role index arrays as one tuple, in this fixed order.
meta = tf_nodes, tf_only_nodes, tg_nodes, tg_only_nodes, tf_x_tg_nodes
joblib.dump(meta, os.path.join(YEAST_NETWORK_PATH, "meta.pkl"));

Total TF:	157
Total TG:	4410
TF and TG:	126
TG only:	4284
TF only:	31


In [10]:
def build_Tnet(edges, n):
    """Build a dense ``n`` x ``n`` adjacency matrix from an edge list.

    Parameters
    ----------
    edges : 2-D integer array
        Column 0 holds the row index and column 1 the column index of each
        entry to set.
    n : int
        Number of nodes, i.e. the side length of the square matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, n)`` with 1 at every listed position and
        0 elsewhere. Repeated pairs simply set the same entry again.
    """
    rows, cols = edges[:, 0], edges[:, 1]
    adjacency = np.zeros(shape=(n, n))
    adjacency[rows, cols] = 1
    return adjacency

# Edge list as (target index, regulator index) pairs, so rows of the matrix
# correspond to targets and columns to regulators.
np_edges = edges_[["idx_tg", "idx_tf"]].values
interaction_matrix = build_Tnet(np_edges, len(nodes))

# Create the output directory together with any missing parents; exist_ok=True
# makes this a no-op when it already exists.
os.makedirs(YEAST_NETWORK_PATH, exist_ok=True)

joblib.dump(interaction_matrix, os.path.join(YEAST_NETWORK_PATH, "interaction_matrix.gz"));

## E. coli network

In [11]:
# Output directory for the E. coli interaction matrix.
ECOLI_NETWORK_PATH = "./networks/ecoli"

In [12]:
# Parse the raw E. coli TF-gene table with the project helper. Columns 1 and 3
# of the result are used as regulator and target names in the next cell.
# NOTE(review): the meaning of the remaining columns is not visible here — see
# scripts/functions.py.
df = f.read_ecoli_network("./raw_data/ecoli/network_tf_gene.txt")

In [13]:
# Work on an explicit copy of the two name columns so that the assignments
# below modify `edges` only and do not raise SettingWithCopyWarning.
edges = df[[1, 3]].copy()
edges.columns = ["tf", "tg"]
# Lower-case the names so that differently-cased spellings of the same gene
# collapse to a single node.
edges["tf"] = edges["tf"].str.lower()
edges["tg"] = edges["tg"].str.lower()

# Every gene that occurs as a regulator or as a target, in sorted order; a
# gene's position in that ordering becomes its integer index `idx`.
nodes = sorted(np.unique(np.concatenate((edges.tf.unique(), edges.tg.unique()))))
nodes = pd.DataFrame(data=range(len(nodes)), index=nodes, columns=["idx"])
print(f"Total genes number:\t{len(nodes)}")
print(f"Interactions number:\t{len(edges)}")
edges.head()

Total genes number:	1917
Interactions number:	4693


Unnamed: 0,tf,tg
0,accb,accb
1,accb,accc
2,acrr,acra
3,acrr,acrb
4,acrr,acrr


In [14]:
# Attach integer node indices to every edge: idx_tf for the regulator and
# idx_tg for the target.
edges_ = (
    edges
    .join(nodes, on="tf")
    .join(nodes, on="tg", lsuffix="_tf", rsuffix="_tg")
)

# Distinct regulators and targets, in order of first appearance.
tf_nodes = edges_["idx_tf"].unique()
tg_nodes = edges_["idx_tg"].unique()

# Role-based partitions, each as a sorted index array.
tf_x_tg_nodes = np.array(sorted(set(tf_nodes).intersection(tg_nodes)))
tg_only_nodes = np.array(sorted(set(tg_nodes).difference(tf_nodes)))
tf_only_nodes = np.array(sorted(set(tf_nodes).difference(tg_nodes)))

print(f"Total TF:\t{len(tf_nodes)}")
print(f"Total TG:\t{len(tg_nodes)}")
print(f"TF and TG:\t{len(tf_x_tg_nodes)}")
print(f"TG only:\t{len(tg_only_nodes)}")
print(f"TF only:\t{len(tf_only_nodes)}")

Total TF:	211
Total TG:	1854
TF and TG:	148
TG only:	1706
TF only:	63


In [15]:
def build_Tnet(edges, n):
    """Return the dense ``n`` x ``n`` adjacency matrix for an edge list.

    ``edges`` is a 2-D integer array whose column 0 gives the row index and
    whose column 1 gives the column index of each entry to mark. The result
    is a float array holding 1 at every listed position and 0 elsewhere.
    """
    row_idx = edges[:, 0]
    col_idx = edges[:, 1]
    matrix = np.zeros(shape=(n, n))
    matrix[row_idx, col_idx] = 1
    return matrix

# Edge list as (target index, regulator index) pairs, so rows of the matrix
# correspond to targets and columns to regulators.
np_edges = edges_[["idx_tg", "idx_tf"]].values
interaction_matrix = build_Tnet(np_edges, len(nodes))

# Create the output directory together with any missing parents; exist_ok=True
# makes this a no-op when it already exists.
os.makedirs(ECOLI_NETWORK_PATH, exist_ok=True)

joblib.dump(interaction_matrix, os.path.join(ECOLI_NETWORK_PATH, "interaction_matrix.gz"));

## Gene Spyder

### SNR 0.01

In [16]:
GS_NETWORK_PATH = "./networks/gs0.01"

# Binarise the raw matrix: every non-zero entry becomes 1.
interaction_matrix = pd.read_csv("./raw_data/GS_N800/N800_SNR_0.01.csv", header=None).astype(bool).astype(int).values
print(f"Total genes number:\t{interaction_matrix.shape[0]}")
# Zero the diagonal so that self-loops are left out of the statistics below.
interaction_matrix_adj = interaction_matrix - np.diag(np.diag(interaction_matrix))
print(f"Interactions number:\t{interaction_matrix_adj.sum()}")
print()
# Row index = target, column index = regulator.
tg_idxs, tf_idxs = np.where(interaction_matrix_adj != 0)
print(f"Total TF:\t{np.unique(tf_idxs).shape[0]}")
print(f"Total TG:\t{np.unique(tg_idxs).shape[0]}")
print(f"TF and TG:\t{len(set(tg_idxs) & set(tf_idxs))}")
print(f"TG only:\t{len(set(tg_idxs) - set(tf_idxs))}")
print(f"TF only:\t{len(set(tf_idxs) - set(tg_idxs))}")

# Create the output directory together with any missing parents; exist_ok=True
# makes this a no-op when it already exists.
os.makedirs(GS_NETWORK_PATH, exist_ok=True)

# NOTE: the stored matrix keeps its diagonal; only the statistics above
# exclude self-loops.
joblib.dump(interaction_matrix, os.path.join(GS_NETWORK_PATH, "interaction_matrix.gz"));

Total genes number:	800
Interactions number:	1925

Total TF:	693
Total TG:	706
TF and TG:	604
TG only:	102
TF only:	89


### SNR 0.1

In [17]:
GS_NETWORK_PATH = "./networks/gs0.1"

# Binarise the raw matrix: every non-zero entry becomes 1.
interaction_matrix = pd.read_csv("./raw_data/GS_N800/N800_SNR_0.1.csv", header=None).astype(bool).astype(int).values
print(f"Total genes number:\t{interaction_matrix.shape[0]}")
# Zero the diagonal so that self-loops are left out of the statistics below.
interaction_matrix_adj = interaction_matrix - np.diag(np.diag(interaction_matrix))
print(f"Interactions number:\t{interaction_matrix_adj.sum()}")
print()
# Row index = target, column index = regulator.
tg_idxs, tf_idxs = np.where(interaction_matrix_adj != 0)
print(f"Total TF:\t{np.unique(tf_idxs).shape[0]}")
print(f"Total TG:\t{np.unique(tg_idxs).shape[0]}")
print(f"TF and TG:\t{len(set(tg_idxs) & set(tf_idxs))}")
print(f"TG only:\t{len(set(tg_idxs) - set(tf_idxs))}")
print(f"TF only:\t{len(set(tf_idxs) - set(tg_idxs))}")

# Create the output directory together with any missing parents; exist_ok=True
# makes this a no-op when it already exists.
os.makedirs(GS_NETWORK_PATH, exist_ok=True)

# NOTE: the stored matrix keeps its diagonal; only the statistics above
# exclude self-loops.
joblib.dump(interaction_matrix, os.path.join(GS_NETWORK_PATH, "interaction_matrix.gz"));

Total genes number:	800
Interactions number:	1913

Total TF:	706
Total TG:	683
TF and TG:	594
TG only:	89
TF only:	112


### SNR 1

In [18]:
GS_NETWORK_PATH = "./networks/gs1"

# Binarise the raw matrix: every non-zero entry becomes 1.
interaction_matrix = pd.read_csv("./raw_data/GS_N800/N800_SNR_1.csv", header=None).astype(bool).astype(int).values
print(f"Total genes number:\t{interaction_matrix.shape[0]}")
# Zero the diagonal so that self-loops are left out of the statistics below.
interaction_matrix_adj = interaction_matrix - np.diag(np.diag(interaction_matrix))
print(f"Interactions number:\t{interaction_matrix_adj.sum()}")
print()
# Row index = target, column index = regulator.
tg_idxs, tf_idxs = np.where(interaction_matrix_adj != 0)
print(f"Total TF:\t{np.unique(tf_idxs).shape[0]}")
print(f"Total TG:\t{np.unique(tg_idxs).shape[0]}")
print(f"TF and TG:\t{len(set(tg_idxs) & set(tf_idxs))}")
print(f"TG only:\t{len(set(tg_idxs) - set(tf_idxs))}")
print(f"TF only:\t{len(set(tf_idxs) - set(tg_idxs))}")

# Create the output directory together with any missing parents; exist_ok=True
# makes this a no-op when it already exists.
os.makedirs(GS_NETWORK_PATH, exist_ok=True)

# NOTE: the stored matrix keeps its diagonal; only the statistics above
# exclude self-loops.
joblib.dump(interaction_matrix, os.path.join(GS_NETWORK_PATH, "interaction_matrix.gz"));

Total genes number:	800
Interactions number:	1929

Total TF:	692
Total TG:	716
TF and TG:	614
TG only:	102
TF only:	78
