<a href="https://colab.research.google.com/github/perlatomdpi/Graph-algorithms/blob/main/GPU_Accelerated_Centrality_Measures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Accelerated Centrality Measures**
Computing graph statistics such as centrality (cugraph) <br>
Filtering to interesting subgraphs using the statistics (cudf) <br>

In [None]:
# **Accelerated Centrality Measures**
# Computing graph statistics such as centrality (cugraph)
#
# All third-party libraries used by the cells below are imported here so the
# notebook runs top to bottom on a fresh kernel.
import cudf
import cugraph
import networkx as nx
import pandas as pd

# **Load data**

In [None]:
# Load the CSV on the GPU as a cuDF DataFrame
raw_gdf = cudf.read_csv('../Data/my_data.csv')

# Create the pandas DataFrame that NetworkX consumes in the next section.
# Use the explicit device-to-host copy: wrapping a cuDF frame in
# pd.DataFrame(...) relies on implicit conversion, which cuDF does not support.
df = raw_gdf.to_pandas()

# **Largest Component via Networkx**

In [None]:
# Configure the Graph: one edge per row between the 'unique' and 'article_id'
# columns; edge_attr=True stores the remaining columns as edge attributes.
G = nx.from_pandas_edgelist(df, 'unique', 'article_id', edge_attr=True)

# Calculate giant component via networkx.
# The component list gets its own name so G stays bound to the graph and the
# subgraph can be taken from it. max() picks the same component that sorting
# by size in descending order and taking the first element would.
largest_component = max(nx.connected_components(G), key=len)
G0 = G.subgraph(largest_component)

# **Convert Giant Component to Pandas**

In [None]:
# Flatten the giant component G0 into three parallel columns.
# Each entry of the edge list is a (source, target, attribute-dict) triple;
# walk it once and pull the 'weight' attribute out of every dict.
edgelist = nx.to_edgelist(G0)
source, target, weight = [], [], []
for u, v, attrs in edgelist:
    source.append(u)
    target.append(v)
    weight.append(attrs['weight'])

# Assemble the pandas edge table used to build the GPU graph
edges = pd.DataFrame({
    'source': source,
    'target': target,
    'weight': weight,
})

# **Create cuGraph and List of Nodes**

In [None]:
# cuGraph depends on cuDF for data loading:
# copy the pandas edge table `edges` into a cuDF DataFrame so it can be
# handed to the graph-building helper below.
gdf = cudf.from_pandas(edges)

In [None]:
def edges_to_cugraph(df, src_col, dst_col, drop_self_loops=False):
    """Build a node table and a cuGraph graph from an edge list.

    Every distinct value found in ``src_col`` or ``dst_col`` becomes one row
    of the node table and is given a contiguous int32 index (``idx``). The
    graph is built on those indices, not on the original identifiers.

    Parameters
    ----------
    df : cudf.DataFrame
        Edge list containing at least ``src_col`` and ``dst_col``. Only these
        two columns are used; other columns (e.g. weights) are not passed to
        the graph.
    src_col, dst_col : str
        Names of the source and destination columns.
    drop_self_loops : bool, default False
        When True, rows whose source equals their destination are removed
        before nodes and edges are derived.

    Returns
    -------
    tuple
        ``(nodes_gdf, G)``: the node table with columns ``id`` and ``idx``,
        and the ``cugraph.Graph`` built from the indexed edges.
    """
    # Function-scope import: `arange` was referenced here as a bare name that
    # nothing in the notebook defines.
    import numpy as np

    # Drop self loops. Boolean filtering already returns a new frame, so the
    # caller's frame is untouched without an explicit copy.
    if drop_self_loops:
        df = df[df[src_col] != df[dst_col]]

    # Create list of nodes: distinct endpoint values, each with an integer idx
    endpoints = cudf.concat([df[src_col], df[dst_col]], ignore_index=True, sort=False)
    nodes_gdf = cudf.DataFrame({'id': endpoints.unique()})
    nodes_gdf['idx'] = np.arange(len(nodes_gdf), dtype='int32')

    # Translate both endpoints of every edge to their integer index by
    # joining against the node table twice.
    edges_gdf = df[[src_col, dst_col]]\
        .merge(
            nodes_gdf.rename(columns={'idx': 'src_idx'}, copy=False),
            left_on=src_col, right_on='id')\
        .merge(
            nodes_gdf.rename(columns={'idx': 'dst_idx'}, copy=False),
            left_on=dst_col, right_on='id')

    # Create cuGraph on the integer indices
    G = cugraph.Graph()
    G.from_cudf_edgelist(edges_gdf, source='src_idx', destination='dst_idx')

    return nodes_gdf, G

In [None]:
# Build the node table and the GPU graph from the cuDF edge list
nodes_gdf, G = edges_to_cugraph(
    gdf,
    src_col='source',
    dst_col='target',
    drop_self_loops=False,
)

# **Centrality Measures via cuGraph**

In [None]:
# Create new column stats
def with_vertex_calc(nodes_gdf, g_out, node_col='idx', computed_idx='vertex', computed_col='label', new_col=None):
    """Attach one computed per-vertex column to the node table.

    Parameters
    ----------
    nodes_gdf : DataFrame
        Node table; must contain ``node_col``.
    g_out : DataFrame
        Algorithm output holding ``computed_idx`` (vertex key) and
        ``computed_col`` (value). Any other columns are ignored.
    node_col : str
        Join key in ``nodes_gdf``.
    computed_idx : str
        Vertex-key column in ``g_out``; renamed to ``node_col`` for the join.
    computed_col : str
        Value column to bring over from ``g_out``.
    new_col : str, optional
        Name for the value column in the result; defaults to ``computed_col``.

    Returns
    -------
    DataFrame
        ``nodes_gdf`` left-joined with the selected value column, so every
        node row is kept even when ``g_out`` has no entry for it.
    """
    result_name = computed_col if new_col is None else new_col

    # Keep only the key and the value, renamed to line up with the node table
    picked = g_out[[computed_idx, computed_col]]
    aligned = picked.rename(
        columns={computed_idx: node_col, computed_col: result_name},
        copy=False,
    )

    return nodes_gdf.merge(aligned, how='left', on=node_col)

In [None]:
# Calculate the size
def size_by_col(nodes_gdf, col):
    """Add a ``<col>_size`` column giving how many nodes share each ``col`` value.

    Parameters
    ----------
    nodes_gdf : DataFrame
        Node table containing ``idx`` and ``col``.
    col : str
        Grouping column (e.g. a community label).

    Returns
    -------
    DataFrame
        ``nodes_gdf`` left-joined with the per-group row count, stored in a
        column named ``f'{col}_size'``.
    """
    size_name = f'{col}_size'

    # Count rows per group via the 'idx' column, then expose the count under
    # the size name with the group key restored as a regular column.
    counts = nodes_gdf[['idx', col]].groupby(col).count()
    counts = counts.reset_index().rename(columns={'idx': size_name})

    return nodes_gdf.merge(counts, how='left', on=col)

In [None]:
# Create stats
def decorate_graph(G, nodes_gdf):
    """Compute graph statistics with cuGraph and attach them to the node table.

    Each statistic is joined onto ``nodes_gdf`` through its ``idx`` column.
    Columns are added in this order: ``pagerank``, ``hubs``,
    ``katz_centrality``, ``betweenness_centrality``, ``louvain`` (+
    ``louvain_size``), ``community_weak`` (+ ``community_weak_size``),
    ``core_number`` (+ ``core_number_size``), ``degree``, ``in_degree``,
    ``out_degree``. A progress label is printed before each step.

    Parameters
    ----------
    G : cugraph.Graph
        Graph whose vertex ids match ``nodes_gdf['idx']``.
    nodes_gdf : cudf.DataFrame
        Node table containing an ``idx`` column.

    Returns
    -------
    cudf.DataFrame
        A new frame: the node table plus the columns listed above. The input
        frame is not modified; every step rebinds the local name to the
        result of a merge.
    """
    print('pagerank')
    nodes_gdf = with_vertex_calc(nodes_gdf, cugraph.pagerank(G), computed_col='pagerank')

    print('hit')
    nodes_gdf = with_vertex_calc(nodes_gdf, cugraph.hits(G), computed_col='hubs')

    print('katz')
    nodes_gdf = with_vertex_calc(nodes_gdf, cugraph.katz_centrality(G, alpha=0.01), computed_col='katz_centrality')

    print('bc')
    # Betweenness is estimated from sampled source vertices. Cap the sample
    # count at the graph size so a graph with fewer than 5000 vertices does
    # not request more samples than it has vertices.
    bc_samples = min(5000, G.number_of_vertices())
    nodes_gdf = with_vertex_calc(nodes_gdf, cugraph.betweenness_centrality(G, k=bc_samples, seed=123), computed_col='betweenness_centrality')

    print('louvain')
    # louvain returns a pair; the first element is the per-vertex partition table
    nodes_gdf = with_vertex_calc(nodes_gdf, cugraph.louvain(G)[0], computed_col='partition', new_col='louvain')

    print('...with size')
    nodes_gdf = size_by_col(nodes_gdf, 'louvain')

    print('weakcc')
    wcc_gdf = cugraph.weakly_connected_components(G)
    # NOTE(review): the vertex column is documented as 'vertex' in current
    # cuGraph releases, while this notebook was written against 'vertices';
    # accept whichever one the installed version produces.
    wcc_vertex_col = 'vertex' if 'vertex' in wcc_gdf.columns else 'vertices'
    nodes_gdf = with_vertex_calc(nodes_gdf, wcc_gdf,
                                 computed_idx=wcc_vertex_col,
                                 computed_col='labels', new_col='community_weak')

    print('...with size')
    nodes_gdf = size_by_col(nodes_gdf, 'community_weak')

    print('core_number')
    nodes_gdf = with_vertex_calc(nodes_gdf, cugraph.core_number(G), computed_col='core_number')

    print('...with size')
    nodes_gdf = size_by_col(nodes_gdf, 'core_number')

    print('degree')
    # Join on the 'vertex' column that G.degree() reports. Replacing it with
    # nodes_gdf['idx'] would pair each degree with whichever node shares its
    # row label in the already-merged node table, not with its own vertex.
    nodes_gdf = with_vertex_calc(nodes_gdf, G.degree(), computed_col='degree')

    print('degrees')
    # One call yields both directional degree columns; reuse it for both joins
    degrees = G.degrees()

    print('in_degree')
    nodes_gdf = with_vertex_calc(nodes_gdf, degrees, computed_col='in_degree')

    print('out_degree')
    nodes_gdf = with_vertex_calc(nodes_gdf, degrees, computed_col='out_degree')

    return nodes_gdf

In [None]:
# Run every statistic in decorate_graph on the GPU graph and collect the
# results as extra columns on the node table.
nodes_decorated_gdf = decorate_graph(G, nodes_gdf)

In [None]:
# Range of stats: smallest and largest hub score across all nodes.
# The label names the column that is actually summarised ('hubs').
hub_scores = nodes_decorated_gdf['hubs']
'min/max node hubs', hub_scores.min(), hub_scores.max()

In [None]:
# Top 15 rows by 'hubs': order the decorated node table from highest to
# lowest hub score, then display the first 15.
ranked_by_hubs = nodes_decorated_gdf.sort_values(by='hubs', ascending=False)
ranked_by_hubs.head(15)