# Imports

In [16]:
import numpy as np
import pandas as pd

# Functions

In [8]:
def get_jak_clus(rmse, jak_dep, per, large_clus):
    """Greedily find runs of consecutive entries whose median RMSE passes a percentile threshold.

    Starting from index 0, the widest slice ``rmse[start:stop]`` (``stop``
    scanned from the end of the array downwards) whose median passes the
    threshold is taken. The search then resumes at ``stop``, so the returned
    runs never overlap. If no slice starting at ``start`` passes, the search
    moves on to ``start + 1``.

    Parameters
    ----------
    rmse : numpy array of RMSEs; slicing is along the first axis.
    jak_dep : bool
        True  -> test for a JAK-dependent cluster (median >= threshold).
        False -> test for a JAK-independent cluster (median <= threshold).
        Any other value results in an empty list.
    per : percentile of ``rmse`` (as accepted by ``np.percentile``) used as
        the threshold.
    large_clus : int, minimum run length for a run to be saved. Shorter runs
        that pass the threshold are not saved, but the search still skips
        past them.

    Returns
    -------
    clus_bounds : list of ``(start, stop)`` tuples, half-open index bounds of
        each saved cluster.

    Side effects
    ------------
    Prints ``'Done'`` before returning.
    """
    clus_bounds = []
    n_entries = rmse.shape[0]

    # `==` rather than `is` is kept on purpose so that 1/0 and numpy booleans
    # select a branch exactly as they always have.
    if jak_dep == True:  # noqa: E712
        want_high = True
    elif jak_dep == False:  # noqa: E712
        want_high = False
    else:
        want_high = None

    # Nothing to scan for an unrecognised mode or an empty array; skipping here
    # also avoids calling np.percentile on empty input.
    if want_high is not None and n_entries > 0:
        # The threshold depends only on the full array, so compute it once
        # instead of once per candidate slice.
        threshold = np.percentile(rmse, per)

        start = 0
        while start < n_entries:
            next_start = start + 1
            # Try the widest slice first so the first hit is the largest run.
            for stop in range(n_entries, start, -1):
                med = np.median(rmse[start:stop])
                passes = med >= threshold if want_high else med <= threshold
                if passes:
                    # Only save the run if it is large enough ...
                    if stop - start >= large_clus:
                        clus_bounds.append((start, stop))
                    # ... but always resume after it, saved or not.
                    next_start = stop
                    break
            start = next_start

    print('Done')

    return clus_bounds

# Main Script

In [19]:
# Read the IL-6 1 ng gene-prediction RMSEs (RMSEs from the paper predictions)
# and arrange the values as rows of three columns.
il6_low_stat1_rmse = np.reshape(
    np.loadtxt('Data/test_jak2i_rmse_pSTAT1.txt'), (-1, 3)
)

# Per-row mean over the first two columns only; the remaining column
# (presumably 4H) is excluded because there is no RNAseq validation data for 4H.
il6_low_stat1_avg_rmse = il6_low_stat1_rmse[:, :2].mean(axis=1)

# Gene list (same order as the RMSEs); the first column becomes the index.
genes_df = pd.read_csv('Data/genes.txt', header=None, index_col=0)

In [9]:
# Bounds of the model-predicted clusters:
#   dependent   -> median RMSE at or above the 90th percentile
#   independent -> median RMSE at or below the 10th percentile
# Only clusters with at least 10 entries are kept.
il6_dep_clus_bounds_90th = get_jak_clus(
    il6_low_stat1_avg_rmse, jak_dep=True, per=90, large_clus=10
)
il6_ind_clus_bounds_10th = get_jak_clus(
    il6_low_stat1_avg_rmse, jak_dep=False, per=10, large_clus=10
)


Done
Done


In [22]:
# Collect every model-predicted IL-6 JAK2-dependent gene by walking the
# (start, stop) bounds of each dependent cluster.
save_dep_genes = []
for clus_start, clus_stop in il6_dep_clus_bounds_90th:
    clus_ind = np.arange(clus_start, clus_stop)
    save_dep_genes.extend(list(genes_df.index[clus_ind]))

# Same collection for the model-predicted IL-6 JAK2-independent genes.
save_ind_genes = []
for clus_start, clus_stop in il6_ind_clus_bounds_10th:
    clus_ind = np.arange(clus_start, clus_stop)
    save_ind_genes.extend(list(genes_df.index[clus_ind]))
