In [1]:
import pandas as pd
import numpy as np
import graph_tool.all as gt

# 1. Data Simulation

np.random.seed(42)  # for reproducibility

num_nodes = 100
num_records = 500

nodes = [f'company_{i}' for i in range(num_nodes)]

df = pd.DataFrame({
    'yearmonth': np.random.choice(['202301', '202302', '202303'], num_records),
    'node_origin': np.random.choice(nodes, num_records),
    'node_destiny': np.random.choice(nodes, num_records),
    'total_value_transactions': np.random.rand(num_records) * 1000,
    'quantity_transactions': np.random.randint(1, 5, num_records)
})

# 2. Graph Construction

G = gt.Graph(directed=True)
vertex_map = {}
weight_prop = G.new_edge_property("double")

for _, row in df.iterrows():
    if row['node_origin'] not in vertex_map:
        v1 = G.add_vertex()
        vertex_map[row['node_origin']] = v1
    else:
        v1 = vertex_map[row['node_origin']]

    if row['node_destiny'] not in vertex_map:
        v2 = G.add_vertex()
        vertex_map[row['node_destiny']] = v2
    else:
        v2 = vertex_map[row['node_destiny']]

    e = G.add_edge(v1, v2)
    weight_prop[e] = row['total_value_transactions']

G.edge_properties["weight"] = weight_prop

# 3. Graph Features Calculation

in_degree = G.get_in_degrees(G.get_vertices())
out_degree = G.get_out_degrees(G.get_vertices())

strength_in = G.get_in_degrees(G.get_vertices(), eweight=weight_prop)
strength_out = G.get_out_degrees(G.get_vertices(), eweight=weight_prop)

eigenvector_centrality, _ = gt.eigenvector(G, weight=weight_prop)
clustering_coefficient = gt.local_clustering(G, weight=weight_prop)

# Displaying some values for demonstration
print(in_degree[:5])
print(out_degree[:5])
print(strength_in[:5])
print(strength_out[:5])
print(eigenvector_centrality.a[:5])
print(clustering_coefficient.a[:5])

ModuleNotFoundError: No module named 'graph_tool'

In [2]:
import sys
print(sys.path)

['/Users/giansantoro/ViralDashboard', '/Users/giansantoro/opt/anaconda3/lib/python38.zip', '/Users/giansantoro/opt/anaconda3/lib/python3.8', '/Users/giansantoro/opt/anaconda3/lib/python3.8/lib-dynload', '', '/Users/giansantoro/.local/lib/python3.8/site-packages', '/Users/giansantoro/opt/anaconda3/lib/python3.8/site-packages', '/Users/giansantoro/opt/anaconda3/lib/python3.8/site-packages/aeosa', '/Users/giansantoro/opt/anaconda3/lib/python3.8/site-packages/d3py-0.2.3-py3.8.egg', '/Users/giansantoro/opt/anaconda3/lib/python3.8/site-packages/IPython/extensions', '/Users/giansantoro/.ipython']


In [3]:
!pip install snap-stanford

[0mCollecting snap-stanford
  Downloading snap_stanford-6.0.0-cp38-cp38-macosx_10_14_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0mInstalling collected packages: snap-stanford
Successfully installed snap-stanford-6.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import pandas as pd
import numpy as np
import igraph as ig

# 1. Data Simulation
np.random.seed(42)  # for reproducibility

num_nodes = 100
num_records = 500

nodes = [f'company_{i}' for i in range(num_nodes)]

df = pd.DataFrame({
    'yearmonth': np.random.choice(['202301', '202302', '202303'], num_records),
    'node_origin': np.random.choice(nodes, num_records),
    'node_destiny': np.random.choice(nodes, num_records),
    'total_value_transactions': np.random.rand(num_records) * 1000,
    'quantity_transactions': np.random.randint(1, 5, num_records)
})

# 2. Graph Construction
unique_nodes = pd.concat([df['node_origin'], df['node_destiny']]).unique()
g = ig.Graph(directed=True)
g.add_vertices(unique_nodes)
g.add_edges(list(zip(df['node_origin'], df['node_destiny'])))
g.es['weight'] = df['total_value_transactions'].tolist()

# 3. Graph Features Calculation
in_degree = g.indegree()
out_degree = g.outdegree()

strength_in = g.strength(weights='weight', mode='in')
strength_out = g.strength(weights='weight', mode='out')

eigenvector_centrality = g.eigenvector_centrality(weights='weight', scale=True)
clustering_coefficient = g.transitivity_local_undirected(vertices=None, mode="zero")

# 4. Constructing the final output dataframe
result = pd.DataFrame({
    'company_id': unique_nodes,
    'in_degree': in_degree,
    'out_degree': out_degree,
    'strength_in': strength_in,
    'strength_out': strength_out,
    'eigenvector_centrality': eigenvector_centrality,
    'clustering_coefficient': clustering_coefficient
})

print(result.head())

   company_id  in_degree  out_degree  strength_in  strength_out  \
0   company_9          5           5  1722.155708   3376.039564   
1  company_18          4           7  2987.742820   5233.583908   
2  company_57          1           7   663.804528   2523.189640   
3  company_95          3           6   346.886492   3819.540565   
4   company_0          9           7  5095.014788   2597.197126   

   eigenvector_centrality  clustering_coefficient  
0                0.345615                0.044444  
1                0.165132                0.054545  
2                0.069706                0.190476  
3                0.047107                0.107143  
4                0.500072                0.058333  


  eigenvector_centrality = g.eigenvector_centrality(weights='weight', scale=True)


In [13]:
import pandas as pd
import numpy as np
import igraph as ig

# 1. Data Simulation
np.random.seed(42)  # for reproducibility

num_nodes = 100
num_records = 500

nodes = [f'company_{i}' for i in range(num_nodes)]

df = pd.DataFrame({
    'yearmonth': np.random.choice(['202301', '202302', '202303'], num_records),
    'node_origin': np.random.choice(nodes, num_records),
    'node_destiny': np.random.choice(nodes, num_records),
    'total_value_transactions': np.random.rand(num_records) * 1000,
    'quantity_transactions': np.random.randint(1, 5, num_records)
})

# 2. Graph Construction
unique_nodes = pd.concat([df['node_origin'], df['node_destiny']]).unique()
g = ig.Graph(directed=True)
g.add_vertices(unique_nodes)
g.add_edges(list(zip(df['node_origin'], df['node_destiny'])))
g.es['weight'] = df['total_value_transactions'].tolist()

# 3. Graph Features Calculation
in_degree = g.indegree()
out_degree = g.outdegree()
strength_in = g.strength(weights='weight', mode='in')
strength_out = g.strength(weights='weight', mode='out')
eigenvector_centrality = g.eigenvector_centrality(scale=True, weights='weight')
clustering_coefficient = g.transitivity_local_undirected(vertices=None, mode="zero")

# New Metrics
closeness_centrality = g.closeness()
betweenness_centrality = g.betweenness()
pagerank = g.pagerank()
hub_score, authority_score = g.hub_score(), g.authority_score()
eccentricity = g.eccentricity()
in_closeness_centrality = g.closeness(mode="in")
out_closeness_centrality = g.closeness(mode="out")
#katz_centrality = g.katz_centrality()
feedback_centrality = g.feedback_arc_set()
dyad_census = g.dyad_census()
triad_census = g.triad_census()
personalized_pagerank = g.personalized_pagerank()
assortativity_coefficient = g.assortativity_degree(directed=True)
#subgraph_centrality = g.subgraph_centrality()
feedback_centrality = g.feedback_arc_set()
#percolation_centrality = g.community_fastgreedy(weights='weight').as_clustering().membership

# 4. Constructing the final output dataframe
result = pd.DataFrame({
    'company_id': unique_nodes,
    'in_degree': in_degree,
    'out_degree': out_degree,
    'strength_in': strength_in,
    'strength_out': strength_out,
    'eigenvector_centrality': eigenvector_centrality,
    'clustering_coefficient': clustering_coefficient,
    'closeness_centrality': closeness_centrality,
    'betweenness_centrality': betweenness_centrality,
    'pagerank': pagerank,
    'hub_score': hub_score,
    'authority_score': authority_score,
    'eccentricity': eccentricity,
    'in_closeness_centrality': in_closeness_centrality,
    'out_closeness_centrality': out_closeness_centrality,
#    'katz_centrality': katz_centrality,
#    'subgraph_centrality': subgraph_centrality,
    # 'feedback_centrality': feedback_centrality,  # This returns a list of edges. Needs special handling.
#    'percolation_centrality': percolation_centrality,
    # 'bridging_centrality': bridging_centrality,  # Derived metric, needs computation
    # 'load_centrality': load_centrality  # Derived metric, needs computation
})

# Adjusting the final dataframe
#result['katz_centrality'] = katz_centrality
result['feedback_centrality'] = [1 if node in feedback_centrality else 0 for node in range(len(unique_nodes))]  # 1 if node is in feedback set, 0 otherwise
result['mutual_dyads'] = dyad_census[0]
result['asymmetric_dyads'] = dyad_census[1]
result['null_dyads'] = dyad_census[2]
result['personalized_pagerank'] = personalized_pagerank
result['assortativity_coefficient'] = [assortativity_coefficient] * len(unique_nodes)  # constant for all nodes

# Note: Triad Census returns a tuple of 16 values for different configurations.
# Extracting some of the critical configurations (you can include others as needed)
result['003_triads'] = triad_census[0]  # All three nodes are unconnected
result['102_triads'] = triad_census[5]  # Two of the nodes are connected

print(result.head())



   company_id  in_degree  out_degree  strength_in  strength_out  \
0   company_9          5           5  1722.155708   3376.039564   
1  company_18          4           7  2987.742820   5233.583908   
2  company_57          1           7   663.804528   2523.189640   
3  company_95          3           6   346.886492   3819.540565   
4   company_0          9           7  5095.014788   2597.197126   

   eigenvector_centrality  clustering_coefficient  closeness_centrality  \
0                0.345615                0.044444              0.450000   
1                0.165132                0.054545              0.445946   
2                0.069706                0.190476              0.443946   
3                0.047107                0.107143              0.423077   
4                0.500072                0.058333              0.495000   

   betweenness_centrality  pagerank  ...  in_closeness_centrality  \
0              388.612113  0.015325  ...                 0.357664   
1       

  eigenvector_centrality = g.eigenvector_centrality(scale=True, weights='weight')


In [14]:
import dask

In [None]:
import pandas as pd
import numpy as np
import igraph as ig
import dask
from dask import delayed, compute

# Simulating Data
np.random.seed(0)
n = 1000
node_origin = ['company_' + str(i) for i in np.random.choice(range(n), 10000)]
node_destiny = ['company_' + str(i) for i in np.random.choice(range(n), 10000)]
total_value_transactions = np.random.rand(10000)
quantity_transactions = np.random.randint(1, 100, size=10000)

data = pd.DataFrame({
    'node_origin': node_origin,
    'node_destiny': node_destiny,
    'total_value_transactions': total_value_transactions,
    'quantity_transactions': quantity_transactions
})

unique_nodes = pd.concat([data.node_origin, data.node_destiny]).unique()
g = ig.Graph.TupleList(edges=data[['node_origin', 'node_destiny', 'total_value_transactions']].itertuples(index=False), directed=True, weights=True)

# Using Dask to Parallelize Computations
@dask.delayed
def compute_metrics(graph):
    # Metrics computation
    degree_in = graph.degree(mode='in')
    degree_out = graph.degree(mode='out')
    strength_in = graph.strength(weights='weight', mode='in')
    strength_out = graph.strength(weights='weight', mode='out')
    eigenvector_centrality = graph.eigenvector_centrality(weights='weight', scale=True)
    clustering_coefficient = graph.transitivity_local_undirected(vertices=None, mode='zero', weights='weight')
    betweenness_centrality = graph.betweenness(vertices=None, directed=True, weights='weight')
    closeness_centrality = graph.closeness(vertices=None, directed=True, weights='weight')
    personalized_pagerank = graph.personalized_pagerank(personalization=None, weights='weight')
    assortativity_coefficient = graph.assortativity_degree(directed=True)
    #percolation_centrality = graph.community_fastgreedy(weights='weight').as_clustering().membership
    
    return degree_in, degree_out, strength_in, strength_out, eigenvector_centrality, clustering_coefficient, betweenness_centrality, closeness_centrality, personalized_pagerank, assortativity_coefficient, percolation_centrality

# Parallelize computation across cores:
results = compute_metrics(g)  
computed_results = dask.compute(results)  # Here's the correction.

degree_in, degree_out, strength_in, strength_out, eigenvector_centrality, clustering_coefficient, betweenness_centrality, closeness_centrality, personalized_pagerank, assortativity_coefficient, percolation_centrality = computed_results[0]
# Constructing the final output dataframe
result = pd.DataFrame({
    'company_id': unique_nodes,
    'degree_in': degree_in,
    'degree_out': degree_out,
    'strength_in': strength_in,
    'strength_out': strength_out,
    'eigenvector_centrality': eigenvector_centrality,
    'clustering_coefficient': clustering_coefficient,
    'betweenness_centrality': betweenness_centrality,
    'closeness_centrality': closeness_centrality,
    'personalized_pagerank': personalized_pagerank,
    'assortativity_coefficient': [assortativity_coefficient] * len(unique_nodes),  # Graph level metric
    '#percolation_centrality': percolation_centrality
})

print(result.head())

In [1]:
import pandas as pd
import numpy as np
import igraph as ig
import dask
from dask import delayed, compute

# Simulating Data
np.random.seed(0)
n = 1000
node_origin = ['company_' + str(i) for i in np.random.choice(range(n), 10000)]
node_destiny = ['company_' + str(i) for i in np.random.choice(range(n), 10000)]
total_value_transactions = np.random.rand(10000)
quantity_transactions = np.random.randint(1, 100, size=10000)

data = pd.DataFrame({
    'node_origin': node_origin,
    'node_destiny': node_destiny,
    'total_value_transactions': total_value_transactions,
    'quantity_transactions': quantity_transactions
})

unique_nodes = pd.concat([data.node_origin, data.node_destiny]).unique()
g = ig.Graph.TupleList(edges=data[['node_origin', 'node_destiny', 'total_value_transactions']].itertuples(index=False), directed=True, weights=True)

# Using Dask to Parallelize Computations
@dask.delayed
def compute_metrics(graph):
    # Metrics computation
    degree_in = graph.degree(mode='in')
    degree_out = graph.degree(mode='out')
    #strength_in = graph.strength(weights='weight', mode='in')
    #strength_out = graph.strength(weights='weight', mode='out')
    #eigenvector_centrality = graph.eigenvector_centrality(weights='weight', scale=True)
    #clustering_coefficient = graph.transitivity_local_undirected(vertices=None, mode='zero', weights='weight')
    #betweenness_centrality = graph.betweenness(vertices=None, directed=True, weights='weight')
    #closeness_centrality = graph.closeness(vertices=None, directed=True, weights='weight')
    #personalized_pagerank = graph.personalized_pagerank(personalization=None, weights='weight')
    #assortativity_coefficient = graph.assortativity_degree(directed=True)
    #percolation_centrality = graph.community_fastgreedy(weights='weight').as_clustering().membership
    
    #return degree_in, degree_out, strength_in, strength_out, eigenvector_centrality, clustering_coefficient, betweenness_centrality, closeness_centrality, personalized_pagerank, assortativity_coefficient, percolation_centrality
    return degree_in, degree_out
# Parallelize computation across cores:
results = compute_metrics(g)  
computed_results = dask.compute(results)  # Here's the correction.

#degree_in, degree_out, strength_in, strength_out, eigenvector_centrality, clustering_coefficient, betweenness_centrality, closeness_centrality, personalized_pagerank, assortativity_coefficient, percolation_centrality = computed_results[0]
degree_in, degree_out = computed_results[0]
# Constructing the final output dataframe
result = pd.DataFrame({
    'company_id': unique_nodes,
    'degree_in': degree_in,
    'degree_out': degree_out,
    #'strength_in': strength_in,
    #'strength_out': strength_out,
    #'eigenvector_centrality': eigenvector_centrality,
    #'clustering_coefficient': clustering_coefficient,
    #'betweenness_centrality': betweenness_centrality,
    #'closeness_centrality': closeness_centrality,
    #'personalized_pagerank': personalized_pagerank,
    #'assortativity_coefficient': [assortativity_coefficient] * len(unique_nodes),  # Graph level metric
    #'#percolation_centrality': percolation_centrality
})

print(result.head())

    company_id  degree_in  degree_out
0  company_684         12           7
1  company_559         11           9
2  company_629          8          13
3  company_192         15           8
4  company_835          8           8
