### Overview:

In this notebook, we show how we compute summary information about each community:
- Hubness:  For each community, we compute the hubness of each of its member pathways. This gives us a sense of which pathways may have the most relevance to the others
- KMer labels: Here, we perform the algorithm described in our paper to annotate each community with top-occuring k-mers across the pathways' names. 

Information computed here is displayed in figures in the paper, as well as on the website. 

In [10]:
import networkx as nx
import numpy as np
import os
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import string

import utils

### Helper functions:

In [2]:
def text_to_words(text, added_stop_words, removed_stop_words):
    # tokens = word_tokenize(text)
    tokens = re.split('-|_| ',text)
    # convert to upper case
    tokens = [w.upper() for w in tokens]

    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalnum()]
    
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    for w in added_stop_words:
        stop_words.add(w.lower())
    for w in removed_stop_words:
        stop_words.remove(w.lower())
    words = [w for w in words if not w.lower() in stop_words]
    
    return words

def get_kmers(string, k, sep='-|_| ', return_short=True, 
              added_stop_words=["KEGG", "REACTOME", "GO"], removed_stop_words=["up", "down"]):
    
    stringslist = text_to_words(string, added_stop_words, removed_stop_words)
    kmers = []
    for i in range(len(stringslist)-k+1):
        kmers.append(" ".join(stringslist[i:i+k]))
    if (len(kmers)>0) or (not return_short):
        return kmers
    else:
        return([" ".join(stringslist)])
    
    


## Load gene sets

In [3]:
acronym_to_folder = {"KEGG": "c2.all.v7.0.symbols_JustK", "REACTOME":"c2.all.v7.0.symbols_JustR",
                  "GO_BP": "c5.bp.v7.0.symbols_SHORT", "GO_CC": "c5.cc.v7.0.symbols", "GO_MF": "c5.mf.v7.0.symbols"}
folder_to_acronym = utils.reverse_dict(acronym_to_folder, assume_unique = True)

pway_subfolders =  'c2.all.v7.0.symbols_JustK-c2.all.v7.0.symbols_JustR-c5.bp.v7.0.symbols_SHORT-c5.mf.v7.0.symbols'

gsets_folders = pway_subfolders.split("-")
gsets_acronyms = [folder_to_acronym[x] for x in gsets_folders]

In [4]:
pathways = {}
pway_indices = {}

cur_idx = 0
for foldername in gsets_folders:
    name = folder_to_acronym[foldername]
    pathways[name] = np.loadtxt('../adj_matrices/%s/pathway_names.txt'%foldername, dtype=str)
    pway_indices[name] = np.arange(cur_idx, cur_idx + len(pathways[name])).astype(int)
    cur_idx += len(pathways[name])
    
pathway_names = np.hstack([pathways[name] for name in gsets_acronyms])

coms_labels_file = "../Full_graph_louvain_with_weights_community_labels/0.4/labels.tsv"
new_com_df = pd.read_csv(coms_labels_file, delimiter="\t", names=["pathways", "com"])
new_com_df = new_com_df.merge(pd.DataFrame(pathway_names.reshape(len(pathway_names),1), columns=["pathways"]), how="right", on="pathways")
coms = new_com_df["com"].values

### Load curated labels

In [5]:
gmts, true_labels_unique, true_labels, true_labels_names= utils.load_curated_labels(pway_dbs=gsets_acronyms)
pathway_names_curated_categories =  np.hstack([true_labels_names[name] for name in gsets_acronyms])

Number of true categories:  38
[ 1.  3.  4.  5.  6.  7.  8.  9. 16. 17. 21. 22. 23. 24. 25. 26. 27. 28.
 29. 30. 31. 32. 34. 35. 36. 37. 38. 39. 40. 41. 42. 43. 44. 45. 54. 55.
 56. 57.]
Number of true categories:  27
[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20. 21. 22. 23. 24. 25. 26.]
Number of true categories:  64
[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35.
 36. 37. 38. 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52. 53.
 54. 55. 56. 57. 58. 59. 60. 61. 62. 63.]
Number of true categories:  69
[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35.
 36. 37. 38. 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52. 53.
 54. 55. 56. 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. 67. 68.]


### Load graph & get within-community hubness for members

In [6]:
weights = utils.load_graph_edges(gsets_folders, "../adj_matrices", pathways)

c2.all.v7.0.symbols_JustK
	 c2.all.v7.0.symbols_JustK 186
	 c2.all.v7.0.symbols_JustK c2.all.v7.0.symbols_JustR
	 c2.all.v7.0.symbols_JustK c5.bp.v7.0.symbols_SHORT
	 c2.all.v7.0.symbols_JustK c5.mf.v7.0.symbols
c2.all.v7.0.symbols_JustR
	transposed! c2.all.v7.0.symbols_JustR c2.all.v7.0.symbols_JustK
	 c2.all.v7.0.symbols_JustR 1499
	 c2.all.v7.0.symbols_JustR c5.bp.v7.0.symbols_SHORT
	 c2.all.v7.0.symbols_JustR c5.mf.v7.0.symbols
c5.bp.v7.0.symbols_SHORT
	transposed! c5.bp.v7.0.symbols_SHORT c2.all.v7.0.symbols_JustK
	transposed! c5.bp.v7.0.symbols_SHORT c2.all.v7.0.symbols_JustR
	 c5.bp.v7.0.symbols_SHORT 1517
	 c5.bp.v7.0.symbols_SHORT c5.mf.v7.0.symbols
c5.mf.v7.0.symbols
	transposed! c5.mf.v7.0.symbols c2.all.v7.0.symbols_JustK
	transposed! c5.mf.v7.0.symbols c2.all.v7.0.symbols_JustR
	transposed! c5.mf.v7.0.symbols c5.bp.v7.0.symbols_SHORT
	 c5.mf.v7.0.symbols 1645


  weights =  -1 * np.log10(pvals)


In [7]:
# FOR EACH COMMUNITY, CALCULATE HUBNESS OF MEMBERS IN THEIR SUBGRAPH
hubs_dfs = []
for com in np.unique(coms):
    if com%5==0:print(com)
    pway_idx = np.where(coms==com)
    pways_in_com = pathway_names[pway_idx]

    internal_weights = weights[pway_idx][:, pway_idx]
    internal_weights = internal_weights.reshape((len(internal_weights), len(internal_weights)))
    internal_G = nx.from_numpy_matrix(internal_weights)
    weighted_g_hubs = nx.hits(internal_G)[0]
    hubs_df = pd.DataFrame(weighted_g_hubs.values(), index=pways_in_com, columns=["hubness"])
    hubs_df.index.set_names(['pathway'], inplace=True)
    hubs_dfs.append(hubs_df.reset_index())

0
5
10
15
20
25
30


In [8]:
hubs_dfs_for_app = []
for c,df in enumerate(hubs_dfs):
    df["community"]=c+1
    hubs_dfs_for_app.append(df)
    
com_members_hubness_app = pd.concat(hubs_dfs).reset_index(drop=True)
com_members_hubness_app.to_csv("appendix_tables/com_members_and_hubness_in_com.csv", index=False)

com_members_hubness_app["curated category"] = pathway_names_curated_categories
com_members_hubness_app.to_csv("appendix_tables/com_members_and_hubness_in_com_w_curated.csv", index=False)

In [9]:
com_members_hubness_app.head()

Unnamed: 0,pathway,hubness,community,curated category
0,KEGG_GLYCOLYSIS_GLUCONEOGENESIS,0.001784,1,Carbohydrate metabolism
1,KEGG_CITRATE_CYCLE_TCA_CYCLE,0.002442,1,Carbohydrate metabolism
2,KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS,0.001103,1,Carbohydrate metabolism
3,KEGG_ASCORBATE_AND_ALDARATE_METABOLISM,0.001244,1,Carbohydrate metabolism
4,KEGG_FATTY_ACID_METABOLISM,0.002164,1,Carbohydrate metabolism


### Identifying top k-mer labels for each community:

In [10]:
def sorted_kmers(k, com):
    pway_idx = np.where(coms==com)
    pways_in_com = pathway_names[pway_idx]
    
    pathway_kmers = []
    pway_kmer_df_names = []
    for i,name in enumerate(pways_in_com):
        kmers = get_kmers(name, k)
        pathway_kmers += kmers
        pway_kmer_df_names += len(kmers)*[name]

    df = pd.DataFrame(np.vstack([pway_kmer_df_names, pathway_kmers]).T, columns=["pathway", "kmer"])
    df = df.merge(hubs_dfs[com], on="pathway")

    grouped_counts=df.groupby(['kmer']).size().to_frame('count').reset_index()
    grouped_hubness=df[["kmer", "hubness"]].groupby("kmer").mean().reset_index()

    kmers_df = grouped_hubness.merge(grouped_counts, on='kmer').sort_values(["count", "hubness"], ascending=False)
    kmers_df = kmers_df[kmers_df["count"] >= kmers_min_appearances]
    kmers_df["k"] = k
    return kmers_df


In [11]:
starting_k = 3
kmers_min_appearances = 3
num_rows_appendix = 10    

kmer_pathways_dfs = []
for com in np.unique(coms):
    k = starting_k
    kmers_df = sorted_kmers(k, com)
    while len(kmers_df) < num_rows_appendix: 
        k -= 1
        if k==0:
            break
        print("Community %i: k=%i"%(com+1, k))
        kmers_df = pd.concat([kmers_df,sorted_kmers(k, com)])
    
    kmers_df["community"] = com+1
    kmer_pathways_dfs.append(kmers_df[:num_rows_appendix]) 


Community 2: k=2
Community 2: k=1
Community 5: k=2
Community 6: k=2
Community 7: k=2
Community 8: k=2
Community 8: k=1
Community 10: k=2
Community 13: k=2
Community 14: k=2
Community 15: k=2
Community 16: k=2
Community 17: k=2
Community 19: k=2
Community 19: k=1
Community 21: k=2
Community 22: k=2
Community 23: k=2
Community 24: k=2
Community 24: k=1
Community 25: k=2
Community 27: k=2
Community 27: k=1
Community 28: k=2
Community 28: k=1
Community 30: k=2
Community 30: k=1
Community 31: k=2
Community 32: k=2
Community 32: k=1
Community 33: k=2
Community 33: k=1
Community 34: k=2
Community 35: k=2
Community 35: k=1


In [12]:
kmers_for_web = pd.concat([x for x in kmer_pathways_dfs]).reset_index(drop=True)

In [13]:
kmers_for_web.head()

Unnamed: 0,kmer,hubness,count,k,community
0,OXIDOREDUCTASE ACTIVITY ACTING,0.003523,33,3,1
1,ACTIVITY ACTING CH,0.002293,11,3,1
2,NAD P H,0.006186,7,3,1
3,ACTING PAIRED DONORS,0.004708,7,3,1
4,ACTIVITY ACTING PAIRED,0.004708,7,3,1


In [14]:
# Supplementary table containing the top (up to) 10 kmers for each community
kmers_for_web.to_csv("appendix_tables/all_kmers.csv", index=False)

### All kmers associated with any community
We include this as a supplementary table; this is also used in our webpage for when users want to query a specific process

In [15]:
starting_k = 3
kmers_min_appearances = 3
num_rows_appendix = 10    

kmer_pathways_dfs_allk = []
for com in np.unique(coms):
    comsize = (coms==0).sum()

    for k in range(1,starting_k+1):
        kmers_df = sorted_kmers(k, com)
        kmers_df["community"] = com+1
        kmers_df["count_fraction"] = kmers_df["count"]/comsize
        kmer_pathways_dfs_allk.append(kmers_df) 

In [18]:
kmers_query_for_web = pd.concat([x for x in kmer_pathways_dfs_allk]).reset_index(drop=True)
kmers_query_for_web.to_csv("appendix_tables/query_all_kmers.csv", index=False)

In [19]:
kmers_query_for_web.head()

Unnamed: 0,kmer,hubness,count,k,community,count_fraction
0,ACTIVITY,0.001924,158,1,1,0.321138
1,PROCESS,0.001121,69,1,1,0.140244
2,BINDING,0.002139,62,1,1,0.126016
3,METABOLISM,0.001676,60,1,1,0.121951
4,OXIDOREDUCTASE,0.003545,45,1,1,0.091463


### Example of querying for a specific process:

In [20]:
search_term = "CANCER"

In [21]:
kmers_query_for_web[kmers_query_for_web["kmer"]==search_term].sort_values(["count_fraction", "hubness"], ascending=[False, False])

Unnamed: 0,kmer,hubness,count,k,community,count_fraction
959,CANCER,0.005943,14,1,10,0.028455
1970,CANCER,0.00109,4,1,20,0.00813
