"""
Compute the average distance
from the first k genes in progeni to the drug targets.
"""

In [1]:
import sys
import os
import pandas as pd
import scipy.stats as ss

import networkx as nx
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and 3.8 removed the old names, so prefer the new name when this matplotlib
# ships it and fall back to the legacy name otherwise.
paper_style = (
    'seaborn-v0_8-paper'
    if 'seaborn-v0_8-paper' in plt.style.available
    else 'seaborn-paper'
)
plt.style.use(paper_style)

import dist_utils

# Base directory for all inputs and results: the current working directory,
# with a trailing "/" so relative paths can simply be appended to it.
netphix_dir = "/".join((os.getcwd(), ""))

In [10]:
# ---- load the input files ----

# Network: an edge list whose extra column is read as a float "weight".
# The radius of the largest connected component is 6; max_dist is set to 7.
net_file = netphix_dir + "data/HumanStringNet.txt"
all_net = nx.read_edgelist(net_file, data=[("weight", float)])
nodes = list(all_net)

# Drug targets: both tables are tab-separated with their first column as index.
tsv_options = dict(sep="\t", index_col=0)
drug_id_df = pd.read_csv(netphix_dir + "data/drug_target_id.txt", **tsv_options)
gdsc_target_df = pd.read_csv(netphix_dir + "data/gdsc_drug_targets.tsv", **tsv_options)

# Attach the GDSC target annotation to every drug id by matching the "drug"
# column against the index of the GDSC table.
drug_target_df = pd.merge(
    drug_id_df,
    gdsc_target_df,
    left_on="drug",
    right_index=True,
)

# Run every "Target" entry through dist_utils.check_target_nodes together
# with the list of network nodes.
drug_target_df["checked_targets"] = drug_target_df["Target"].apply(
    lambda target: dist_utils.check_target_nodes(target, nodes)
)

# The same checked targets, keyed two ways: by the merged frame's index
# (id_targets) and by the value of the "drug" column (drug_targets and its
# plain-dict form).
id_targets = drug_target_df["checked_targets"]
drug_targets = drug_target_df.set_index("drug")["checked_targets"]
drug_targets_dic = drug_targets.to_dict()


In [11]:
# Build random gene modules and compute their mean distance to drug targets.
# Each column of random_df is one random module of MODULE_SIZE genes drawn
# from the network nodes; columns are labelled 1..N_RANDOM_MODULES.
# NOTE(review): 265 presumably matches the drug ids that index id_targets --
# confirm against data/drug_target_id.txt.
N_RANDOM_MODULES = 265
MODULE_SIZE = 20

all_genes = list(all_net.nodes())

# Assemble every column first and create the frame in one go: inserting 265
# columns one at a time into an empty DataFrame fragments it and makes pandas
# emit a PerformanceWarning. The draws happen in the same order as before.
random_df = pd.DataFrame({
    i: dist_utils.choose_random_genes(all_genes, MODULE_SIZE)
    for i in range(1, N_RANDOM_MODULES + 1)
})

random_mean_dist_df = dist_utils.comp_mean_dist2(random_df, all_net, id_targets)

# For each cutoff k, average the first k rows of the distance table
# column-wise (NaNs skipped), giving one value per random module.
random_dist_results_df = pd.DataFrame()
for top_k in (5, 10, 20):
    random_dist_results_df["random_top{}".format(top_k)] = (
        random_mean_dist_df.iloc[:top_k].mean(skipna=True)
    )

In [12]:
# NetPhix modules: load the tab-separated module table and compute the
# distance of each module to the targets listed in drug_targets_dic.
netphix_file = "".join([netphix_dir, "results/max_sig_combined_modules_0.05.tsv"])
netphix_modules_df = pd.read_csv(netphix_file, delimiter="\t")

netphix_dist_results_df = dist_utils.comp_mean_dist(
    netphix_modules_df,
    all_net,
    drug_targets_dic,
)

In [29]:
# ---- statistics ----
# Column means of the NetPhix distance table.
print(netphix_dist_results_df.mean())

# Independent two-sample t-tests (NaNs omitted), printed in this order:
#   1. NetPhix "both" vs. random top-5
#   2. NetPhix "both" vs. random top-20
#   3. NetPhix "inc"  vs. NetPhix "dec"
comparisons = (
    (netphix_dist_results_df, "both", random_dist_results_df, "random_top5"),
    (netphix_dist_results_df, "both", random_dist_results_df, "random_top20"),
    (netphix_dist_results_df, "inc", netphix_dist_results_df, "dec"),
)
for left_df, left_col, right_df, right_col in comparisons:
    print(ss.ttest_ind(left_df[left_col].values, right_df[right_col].values, nan_policy='omit'))


dec          2.370366
inc          1.968476
both         2.123344
no_target    2.152620
dtype: float64
dec          2.669540
inc          2.228588
both         2.500609
no_target    2.544827
dtype: float64
progeni_top5     3.020476
progeni_top10    2.991471
progeni_top20    2.944085
dtype: float64
random_top5     2.978014
random_top10    2.973667
random_top20    2.977317
dtype: float64
Ttest_indResult(statistic=-17.90892400679556, pvalue=2.8659937420400063e-58)
Ttest_indResult(statistic=-7.31698561270371, pvalue=2.147050781701636e-12)
Ttest_indResult(statistic=-18.655685158774396, pvalue=3.7523470178959193e-62)
Ttest_indResult(statistic=-8.074806879762898, pvalue=1.4722648848021077e-14)
Ttest_indResult(statistic=-7.776916946174541, pvalue=2.2292608503051995e-14)
Ttest_indResult(statistic=-3.439854094278539, pvalue=0.0008032551312499787)
