In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
import community
import matplotlib.pyplot as plt
# Gene table that carries the cluster assignments used further down
# (columns such as "external_gene_name", "cluster", "hierachical_cluster").
supercluster_mrna = pd.read_csv("Supercluster_iPSC_network_mRNA_all_zelllines.csv")

# Load the preprocessed STRING-DB interaction table (needs to be downloaded).
string_interaction = pd.read_csv("string_hsa_preprocess.csv")

# Rescale combined_score by 1/1000 (e.g. 418 -> 0.418). A vectorised division
# gives the same values as a per-element lambda without the Python-level loop.
string_interaction["score"] = string_interaction["combined_score"] / 1000

# Keep only interactions whose rescaled score reaches the confidence cutoff.
MIN_STRING_SCORE = 0.4
string_interaction_high = string_interaction.loc[
    string_interaction["score"] >= MIN_STRING_SCORE
]

supercluster_mrna.head()  # data exploration

Unnamed: 0.1,Unnamed: 0,external_gene_name,cluster,0,1,2,3,4,5,6,...,9,10,11,12,13,14,15,16,17,hierachical_cluster
0,0,A1BG,bisque4,-1.009933,-0.926393,-0.90005,-0.778902,-0.638063,-1.096838,-0.639447,...,0.649008,0.363506,0.441897,0.636158,0.515317,0.194895,1.236746,1.212811,1.764265,3
1,1,ABHD2,bisque4,-0.792052,-0.788651,-0.791197,-1.252975,-0.995066,-0.994503,-0.581671,...,0.389112,0.976313,0.422731,0.88536,0.83576,1.016799,1.21574,1.281514,1.049289,3
2,2,AC003681.1,bisque4,0.621397,0.473684,-0.07203,-0.607909,0.841732,0.002464,-1.190315,...,-0.258653,0.310259,0.504362,0.190671,-0.788582,0.167896,1.060799,0.956414,0.371635,3
3,3,AC067852.3,bisque4,-1.395012,-0.737278,-1.249986,-1.344368,-0.221268,-0.578643,0.241014,...,0.5036,1.174592,0.399291,0.318573,-0.400759,-0.518624,1.286129,1.419799,1.097633,3
4,4,AC087163.2,bisque4,-0.494312,-0.663351,-0.380047,-0.292224,-0.513602,0.35078,0.512961,...,0.281627,0.715792,0.375079,-0.002726,-1.943826,-0.343309,0.483261,1.013327,0.952143,3


In [2]:
# Helper that filters the preprocessed STRING-DB interaction table down to a given gene set
def string_from_file(genes_liste, string):
    """Return the interactions whose two partners both belong to a gene set.

    Parameters
    ----------
    genes_liste : list-like of str
        Gene names to keep.
    string : pandas.DataFrame
        Interaction table with at least the columns ``preferred_name_x`` and
        ``preferred_name_y``. In this notebook the caller passes the table
        already filtered to ``score >= 0.4``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``string`` where both names are in ``genes_liste``. A
        leftover ``"Unnamed: 0"`` index column is removed when present, and
        every unordered gene pair is kept only once (first occurrence wins,
        so A-B and B-A collapse into a single edge). Original row labels are
        preserved.
    """
    both_in_set = (
        string["preferred_name_x"].isin(genes_liste)
        & string["preferred_name_y"].isin(genes_liste)
    )
    edges = string[both_in_set]

    # The column only exists when the CSV was written with its index; do not
    # fail on tables that were saved or loaded without it.
    edges = edges.drop(columns=["Unnamed: 0"], errors="ignore")

    # Mark the first occurrence of every unordered pair in one pass instead of
    # a row-wise DataFrame.apply, which is very slow on large tables.
    seen_pairs = set()
    is_first = []
    for pair in zip(edges["preferred_name_x"], edges["preferred_name_y"]):
        key = frozenset(pair)
        is_first.append(key not in seen_pairs)
        seen_pairs.add(key)

    # An index-aligned boolean Series keeps the columns even when nothing matched.
    keep_mask = pd.Series(is_first, index=edges.index, dtype=bool)
    return edges[keep_mask]

In [3]:
# Network helper: computes eigenvector centrality and betweenness centrality from STRING-DB interactions


def draw_network_get_degree(df1_diana):
    """Compute eigenvector and betweenness centrality of an interaction network.

    Parameters
    ----------
    df1_diana : pandas.DataFrame
        Edge list with the columns ``preferred_name_x``, ``preferred_name_y``
        and ``combined_score``.

    Returns
    -------
    tuple of (dict, dict)
        ``(eigenvector_centrality, betweenness_centrality)``, each mapping a
        node name to its value. Both dicts are empty when the edge list
        contains no edges.
    """
    G = nx.from_pandas_edgelist(
        df1_diana, "preferred_name_x", "preferred_name_y", edge_attr="combined_score")

    # Nothing to rank in an empty network; the centrality routines would raise.
    if G.number_of_nodes() == 0:
        return {}, {}

    # Eigenvector centrality uses the weight as connection strength, so the
    # raw combined_score is the right attribute here.
    centrality_genes = nx.eigenvector_centrality_numpy(G, weight="combined_score")

    # Betweenness is based on weighted shortest paths, where networkx reads the
    # weight as a distance. Passing combined_score directly would make
    # high-scoring edges the "longest" ones, so derive a distance that shrinks
    # as the score grows.
    for _, _, attrs in G.edges(data=True):
        score = attrs["combined_score"]
        attrs["distance"] = 1.0 / score if score > 0 else float("inf")
    betweeness_genes = nx.betweenness_centrality(G, weight="distance")

    return centrality_genes, betweeness_genes

In [4]:
"""detect hub-genes by eigenvector centrality and in-betweenens centrality
 here just change ther hierachical cluster to cluster for the module specific detection"""
# Number of top-ranked genes kept per cluster and per centrality measure.
TOP_N_HUB_GENES = 5

eigenvector_frames = []   # per-cluster top genes by eigenvector centrality
betweeness_frames = []    # per-cluster top genes by betweenness centrality

# Go through each hierarchical cluster in a fixed (sorted) order so the
# concatenated result does not depend on set iteration order.
for i in sorted(supercluster_mrna["hierachical_cluster"].dropna().unique()):

    # Retrieve the genes that belong to this cluster.
    genes_liste = supercluster_mrna.loc[
        supercluster_mrna["hierachical_cluster"] == i, "external_gene_name"
    ].tolist()

    # Get the interaction network (edge list) for this cluster.
    genes = string_from_file(genes_liste, string_interaction_high)
    if genes.empty:
        # No interactions among these genes: there is no network to rank.
        continue

    centrality_eig, betweeness_by_gene = draw_network_get_degree(genes)

    centrality = pd.DataFrame({
        "genes": list(centrality_eig.keys()),
        "eigenvector": list(centrality_eig.values()),
    })
    # Look betweenness up by gene name rather than relying on both dicts
    # happening to share the same ordering.
    centrality["betweeness"] = centrality["genes"].map(betweeness_by_gene)
    centrality["cluster"] = str(i)

    eigenvector_frames.append(
        centrality.sort_values("eigenvector", ascending=False)
        [["genes", "eigenvector", "cluster"]]
        .head(TOP_N_HUB_GENES)
    )
    betweeness_frames.append(
        centrality.sort_values("betweeness", ascending=False)
        [["genes", "betweeness", "cluster"]]
        .head(TOP_N_HUB_GENES)
    )

# DataFrame.append no longer exists in pandas 2.x; concatenate once at the end
# (also avoids re-copying the accumulated frame on every iteration).
eigenvector = (
    pd.concat(eigenvector_frames)
    if eigenvector_frames
    else pd.DataFrame(columns=["genes", "eigenvector", "cluster"])
)
betweeness_cent = (
    pd.concat(betweeness_frames)
    if betweeness_frames
    else pd.DataFrame(columns=["genes", "betweeness", "cluster"])
)

         preferred_name_x preferred_name_y  combined_score  score
18850                ARF5            GORAB             418  0.418
18909              TRIP11            GORAB             597  0.597
18937               PAIP1            GORAB             423  0.423
18938                MTBP            GORAB             400  0.400
18995               SCYL3            GORAB             543  0.543
...                   ...              ...             ...    ...
11621512          TMEM237           CEP162             545  0.545
11651673           CCDC66            CEP63             477  0.477
11651723           CCDC14            CEP63             844  0.844
11653674            TTC23           CCDC66             447  0.447
11653700           CCDC14           CCDC66             606  0.606

[7435 rows x 4 columns]
         preferred_name_x preferred_name_y  combined_score  score
10968              CX3CL1             GNB4             900  0.900
10970               HEBP1             GNB4         

In [10]:
# Write each hub-gene table to its own CSV file in the working directory.
for hub_table, csv_name in (
    (eigenvector, "eigenvector_centrality_superstages.csv"),
    (betweeness_cent, "betweeneess_centrality_superstages.csv"),
):
    hub_table.to_csv(csv_name)