In [13]:
import pandas as pd
import igraph as ig
import math
import random
import numpy as np

In [7]:
score_file = '../ppi_ml/results/ppi_predict/20_feats/scored_interactions_fdr10_ExtraTreesClassifier.csv'
annotations = '../ppi_ml/annotations/leca_eunog_annots_complete.030721.csv'
outdir = '../ppi_ml/results/test_new_walktrap/'
seed = 13
steps = 5

In [3]:
def make_graph(score_file):
    # read in data
    scores = pd.read_csv(score_file)
    # format graph dataframe
    gdf = pd.DataFrame()
    gdf[['ID1','ID2']] = scores['ID'].str.split(' ', expand=True)
    gdf['weight'] = scores['ppi_score']
    graph = ig.Graph.TupleList(gdf.itertuples(index=False), directed=False, weights=True)
    return(graph)

def walktrap(graph, n_steps=4, n_clusters=None):
    # run walktrap & get clusters
    clusters = graph.community_walktrap(weights='weight', steps=n_steps).as_clustering(n_clusters)
    # write clusters & IDs to dict
    clst_dict = dict()
    for cluster, id_list in enumerate(clusters):
        clst_dict.update({cluster: id_list})
    # get node ids & number of complexes
    nodes = graph.get_vertex_dataframe()
    n_cmplx = len(clusters)
    # format & return data frame
    clst_df = (pd.DataFrame.from_dict(clst_dict, orient='index').T
           .melt(var_name='id', value_name='value')
           .dropna(subset=['value'])
           .astype(int)
           .rename(columns={'id':f'cut_{n_cmplx}', 'value':'ID'}))
    clst_df['ID'].replace(nodes['name'], inplace=True)
    out_df = clst_df[['ID', f'cut_{n_cmplx}']].reset_index(drop=True)
    return(out_df)

In [4]:
# set seed if specified
random.seed(seed)

# format data into igraph object
ppi_graph = make_graph(score_file)
total_prots = ppi_graph.vcount()

# get dendrogram w/ optimal number of clusters
df_opt = walktrap(ppi_graph)
n_opt = len(df_opt.iloc[:,1].drop_duplicates())

# get range of cuts 
cuts = np.linspace(n_opt, total_prots, 8, endpoint=False)
cuts = np.floor(intervals)
cuts = np.delete(intervals, 0)

df_list = []
for i in cuts:
    clst = walktrap(ppi_graph, n_steps=steps, n_clusters=int(i))
    df_list.append(clst)
    
# merge all cuts
for df in df_list:
    df_opt = df_opt.merge(df, how='left', on='ID')

# sort clusters
sort_cols = df_opt.columns.values[1:].tolist()
df_out = df_opt.sort_values(sort_cols)

# join annotations if specified
annot_df = pd.read_csv(annotations)
df_out = df_out.merge(annot_df, how='left', on=['ID'])

# write results
df_out.to_csv(outdir+f'walktrap_test_{steps}steps_even-cuts.csv', index=False)

In [27]:
total_prots = ppi_graph.vcount()
intervals = np.linspace(n_opt, total_prots, 8, endpoint=False)
intervals = np.floor(intervals)
intervals = np.delete(intervals, 0)
for i in intervals:
    print(i)

581.0
969.0
1356.0
1744.0
2132.0
2519.0
2907.0


#### accessing graph attributes

In [5]:
# nodes = g.get_vertex_dataframe()
# edges = g.get_edge_dataframe()
# edges['source'].replace(nodes['name'], inplace=True)
# edges['target'].replace(nodes['name'], inplace=True)
# edges

#### TODO: edge betweenness

In [6]:
# # calculate dendrogram
# dendrogram = g.community_edge_betweenness()
# # convert it into a flat clustering
# clusters = dendrogram.as_clustering()
# # get the membership vector
# membership = clusters.membership
# # write results
# import csv
# writer = csv.writer(open(outdir+'ebc_test.csv', 'wb'))
# for name, membership in zip(g.vs["name"], membership):
#     writer.writerow([name, membership])