In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from  MAGNN_preprocess_utils.preprocess import (
    generate_paths,
    evaluate_metapath_adjacency_matrix,
    validate_expected_metapaths,
    get_symmetric_metapath_neighbor_pairs,
    get_edge_metapath_idx_array
)

## Check for non-zero entries in the adjacency matrix

In [10]:
# Open the preprocessed adjacency-matrix archive. For a .npz file np.load
# returns an archive object rather than the array itself; the matrix is
# pulled out of it by key in the following cell.
adjM = np.load(file='data/preprocessed/raw_adjM.npz')

(32923, 32923)

In [14]:
# Example of non-zero entries in the adjacency matrix.
# The loading cell binds `adjM` to the .npz archive, so the stored array is
# unwrapped here via its "arr_0" key. The guard makes this cell idempotent:
# re-running it when `adjM` is already an ndarray used to fail, because an
# ndarray cannot be indexed with the string "arr_0".
if not isinstance(adjM, np.ndarray):
    adjM = adjM["arr_0"]
# Show a 10x10 window of the matrix (rows 8202-8211, columns 0-9).
adjM[8202:8212, 0:10]

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 1, 1, 1, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 0, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 0, 1, 1, 1, 1, 0],
       [0, 0, 0, 0, 1, 1, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 2, 1, 1, 1, 1, 1, 1, 2, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 1]])

## Metapath Analysis

In [12]:
# TODO: Any other ways to summarize and compare the non-zero metapath adjacency matrix?

### Path count analysis
- sum of all values of each metapath type adjM (total path count)
- max value of node pair 
- average path count per node pair (mean_node_pair) <br> `metapath_adjM.sum() / (metapath_adjM.shape[0] * metapath_adjM.shape[1])`
- min value of node pair in non-zero metapath adjM
- mean value of node pair in non-zero metapath adjM
- **density** is used for plots (the proportion of node pairs in the graph network that have at least one path along the metapath) <br> `np.count_nonzero(metapath_adjM) / (metapath_adjM.shape[0] * metapath_adjM.shape[1])`

In [18]:
# Node-type array that is passed alongside adjM to the metapath helpers.
# NOTE(review): presumably one type id per node (file is node_types.npy) - confirm.
type_mask = np.load(file='data/preprocessed/node_types.npy')

In [19]:
# Enumerate the candidate metapaths produced by generate_paths(3) and
# evaluate the metapath adjacency matrix for each of them.
expected_metapath_l3 = generate_paths(3)
metapath_len3 = evaluate_metapath_adjacency_matrix(
    adjM,
    type_mask,
    expected_metapath_l3,
)

# Tabulate the per-metapath results; the frame is the cell's displayed output.
metapath_len3_df = pd.DataFrame(data=metapath_len3)
metapath_len3_df

Unnamed: 0,metapath,sum,max,min,mean,density,non_zero_mean
0,"(0, 1, 0)",3083925342,177,1,45.842081,0.729859,62.809481
1,"(0, 1, 2)",4465846,12,1,0.022855,0.01565,1.460403
2,"(0, 2, 0)",384891158,1011,1,5.721349,0.01541,371.274302
3,"(0, 2, 1)",1497403,159,1,0.203302,0.042238,4.813284
4,"(1, 0, 1)",41568096,6493,1,51.547482,0.094224,547.071003
5,"(1, 0, 2)",2961493,65,1,0.138432,0.007306,18.947734
6,"(1, 2, 0)",1497403,159,1,0.203302,0.042238,4.813284
7,"(1, 2, 1)",92218,20017,1,0.114357,0.035901,3.185313
8,"(2, 0, 1)",2961493,65,1,0.138432,0.007306,18.947734
9,"(2, 0, 2)",437895926,906,1,0.771575,0.00386,199.866325


In [None]:
# Rank the length-3 metapaths from densest to sparsest.
sorted_metapath_len3_df = metapath_len3_df.sort_values(by='density', ascending=False)

# Use explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(sorted_metapath_len3_df['metapath'].astype(str), sorted_metapath_len3_df['density'], color='skyblue')
ax.set_title('Density by metapath type (length 3)')
ax.set_xlabel('Density')
ax.set_ylabel('Metapath Types')
# barh places the first category at the bottom; flip so the densest bar is on top.
ax.invert_yaxis()
# Label each bar with its density. Three decimals are used because several
# densities are below 0.01 and would be indistinguishable ("0.00", "0.01")
# when rounded to two decimals.
for index, value in enumerate(sorted_metapath_len3_df['density']):
    ax.text(value, index, f'{value:.3f}', va='center')
plt.show()

In [None]:
# Enumerate the candidate metapaths produced by generate_paths(4).
possible_metapath_l4 = generate_paths(4)
# Candidates that never occur in the graph are still evaluated,
# e.g. [1, 0, 0, 1]: this metapath does not exist, so its output is 0.
metapath_len4 = evaluate_metapath_adjacency_matrix(
    adjM,
    type_mask,
    possible_metapath_l4,
)

# Tabulate the per-metapath results; the frame is the cell's displayed output.
metapath_len4_df = pd.DataFrame(data=metapath_len4)
metapath_len4_df

In [None]:
# Rank the length-4 metapaths from densest to sparsest.
sorted_metapath_len4_df = metapath_len4_df.sort_values(by='density', ascending=False)

# Use explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(sorted_metapath_len4_df['metapath'].astype(str), sorted_metapath_len4_df['density'], color='skyblue')
ax.set_title('Density by metapath type (length 4)')
ax.set_xlabel('Density')
ax.set_ylabel('Metapath Types')
# barh places the first category at the bottom; flip so the densest bar is on top.
ax.invert_yaxis()
# Label each bar with its density. Three decimals keep densities close to
# zero from all being printed as "0.00".
for index, value in enumerate(sorted_metapath_len4_df['density']):
    ax.text(value, index, f'{value:.3f}', va='center')
plt.show()

In [None]:
# Enumerate the candidate metapaths produced by generate_paths(5) and
# evaluate the metapath adjacency matrix for each of them.
possible_metapath_l5 = generate_paths(5)
metapath_len5 = evaluate_metapath_adjacency_matrix(
    adjM,
    type_mask,
    possible_metapath_l5,
)

# Tabulate the per-metapath results; the frame is the cell's displayed output.
metapath_len5_df = pd.DataFrame(data=metapath_len5)
metapath_len5_df

In [None]:
# Rank the length-5 metapaths from densest to sparsest.
sorted_metapath_len5_df = metapath_len5_df.sort_values(by='density', ascending=False)

# Use explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(sorted_metapath_len5_df['metapath'].astype(str), sorted_metapath_len5_df['density'], color='skyblue')
ax.set_title('Density by metapath type (length 5)')
ax.set_xlabel('Density')
ax.set_ylabel('Metapath Types')
# barh places the first category at the bottom; flip so the densest bar is on top.
ax.invert_yaxis()
# Label each bar with its density. Three decimals keep densities close to
# zero from all being printed as "0.00".
for index, value in enumerate(sorted_metapath_len5_df['density']):
    ax.text(value, index, f'{value:.3f}', va='center')
plt.show()

In [None]:
# Enumerate the candidate metapaths produced by generate_paths(6) and
# evaluate the metapath adjacency matrix for each of them.
possible_metapath_l6 = generate_paths(6)
metapath_len6 = evaluate_metapath_adjacency_matrix(
    adjM,
    type_mask,
    possible_metapath_l6,
)

# Tabulate the per-metapath results; the frame is the cell's displayed output.
metapath_len6_df = pd.DataFrame(data=metapath_len6)
metapath_len6_df

In [None]:
# Rank the length-6 metapaths from densest to sparsest.
sorted_metapath_len6_df = metapath_len6_df.sort_values(by='density', ascending=False)

# Use explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(20, 12))
ax.barh(sorted_metapath_len6_df['metapath'].astype(str), sorted_metapath_len6_df['density'], color='skyblue')
ax.set_title('Density by metapath type (length 6)')
ax.set_xlabel('Density')
ax.set_ylabel('Metapath Types')
# barh places the first category at the bottom; flip so the densest bar is on top.
ax.invert_yaxis()
# Label each bar with its density. Three decimals keep densities close to
# zero from all being printed as "0.00".
for index, value in enumerate(sorted_metapath_len6_df['density']):
    ax.text(value, index, f'{value:.3f}', va='center')
plt.show()

In [None]:
# Metapath types we expect to use: six 3-node and four 5-node symmetric paths.
expected_metapaths = [
    (0, 1, 0), (0, 2, 0), (1, 0, 1), (2, 0, 2), (1, 2, 1), (2, 1, 2),
    (0, 1, 2, 1, 0), (1, 0, 2, 0, 1), (0, 2, 1, 2, 0), (1, 2, 0, 2, 1),
]

# Check the expected list against the generated candidate lists.
# NOTE(review): the expected list holds 3- and 5-node metapaths, yet it is
# checked against the generate_paths(4) and generate_paths(5) candidates; the
# generate_paths(3) candidates (expected_metapath_l3) are never checked -
# confirm this is intended.
# NOTE(review): only the last call's return value (if any) is displayed by
# the cell - verify how validate_expected_metapaths reports its result.
validate_expected_metapaths(possible_metapath_l4, expected_metapaths)
validate_expected_metapaths(possible_metapath_l5, expected_metapaths)

In [None]:
# Reduced candidate list: two 3-node paths plus the four 5-node paths.
expected_metapaths_simplified = [
    (0, 1, 0), (1, 0, 1),
    (0, 1, 2, 1, 0), (1, 0, 2, 0, 1), (0, 2, 1, 2, 0), (1, 2, 0, 2, 1),
]

# Check the reduced list against the generated candidate lists.
# NOTE(review): the generate_paths(3) candidates are not checked here even
# though the list contains 3-node metapaths - confirm this is intended.
validate_expected_metapaths(possible_metapath_l4, expected_metapaths_simplified)
validate_expected_metapaths(possible_metapath_l5, expected_metapaths_simplified)

## Final expected metapath types:
`[(0, 1, 0), (0, 2, 0), (1, 0, 1), (2, 0, 2), (1, 2, 1), (2, 1, 2),  (0, 1, 2, 1, 0), (1, 0, 2, 0, 1), (0, 2, 1, 2, 0), (1, 2, 0, 2, 1)]`

In [None]:
# Final metapath selection (the same ten types listed in the heading above).
expected_metapaths = [
    (0, 1, 0), (0, 2, 0), (1, 0, 1), (2, 0, 2), (1, 2, 1), (2, 1, 2),
    (0, 1, 2, 1, 0), (1, 0, 2, 0, 1), (0, 2, 1, 2, 0), (1, 2, 0, 2, 1),
]

# Collect the metapath neighbor pairs for the selected metapaths, then
# convert them into the edge/metapath index array.
neighbor_pairs = get_symmetric_metapath_neighbor_pairs(
    adjM,
    type_mask,
    expected_metapaths,
)
edge_metapath_idx_array = get_edge_metapath_idx_array(neighbor_pairs)

In [None]:
# Sanity check: display the first entry of the edge/metapath index array.
edge_metapath_idx_array[0]

In [None]:
# Persist the index array. "arr_0" is the key np.savez assigns to the first
# positional array, so naming it explicitly writes the same archive.
# NOTE(review): if edge_metapath_idx_array is a list of differently shaped
# arrays, np.savez has to coerce it into a single array - confirm it saves
# and reloads as expected.
np.savez("data/edge_metapath_idx_array.npz", arr_0=edge_metapath_idx_array)