In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import networkx as nx
import seaborn as sns

from tqdm import tqdm_notebook as tqdm

from joblib import Parallel, delayed

In [None]:
df = pd.read_pickle('./j1c-features.pkl', compression='gzip')

for col in df.columns:
    if np.issubdtype(df[col].dtype, np.floating):
        df[col] = df[col].astype(np.float32)

df = df.drop('diameter', 1)
df = df.drop('degree_assortativity', 1)
df = df.dropna()

In [2]:
df = pd.read_pickle('./data/j1c-features-cleaned.pkl', compression='gzip')

In [3]:
graphs = nx.read_graph6('./data/graph10.g6')

In [4]:
graph_array = np.load('./data/vectorized_graphs.npz')['arr_0']

In [14]:
def hexbin(
    df,
    ordering,
    x_col="modularity",
    gridsize=40,
    cmap="Blues",
    bins="log",
    title=None,
    savefig=None,
):
    xlabel = ordering[x_col]
    ordering = {key:val for key, val in ordering.items() if key != x_col}

    ncols = 3
    nrows = int(np.ceil(len(ordering) / ncols))
    figsize = (4*nrows, 3*ncols)

    fig, ax = plt.subplots(ncols=ncols, nrows=nrows, figsize=figsize, sharex=True)
    ax = ax.ravel()

    for idx, (col, y_label) in enumerate(ordering.items()):
        ax[idx].hexbin(
            x=df[x_col], y=df[col], cmap=cmap, gridsize=gridsize, bins=bins
        )
        sns.despine(ax=ax[idx])
        # ax[idx].set_xlabel('Modularity', fontsize=20)
        ax[idx].set_ylabel(y_label, fontsize=20)

    for i in range(1, 4):
        ax[-i].set_xlabel(xlabel, fontsize=20)

    if len(ordering) != (ncols * nrows):
        for i in range(1, len(ax) - len(ordering) + 1):
            fig.delaxes(ax[-i])

    fig.tight_layout()
    
    if title is not None:
        fig.suptitle(title, y=1.02, fontsize=30)

    if savefig is not None:
        fig.savefig(f"{savefig}.pdf", dpi=300, bbox_inches='tight')
    
    plt.close()

In [19]:
ORDERING = dict(
    # num_edges = "# Edges",
    density="Density",
    # total_triangles = '# Triangles',
    triangle_ratio="Triangle Ratio",
    # is_planar="Is Planar Graph?",
    avg_shortest_path_length="Avg Shortest Path",
    global_clustering_coefficient="Global Clustering",
    avg_clustering_coefficient="Avg Clustering",
    # square_clustering="Square Clustering",
    global_efficiency="Global Efficiency",
    local_efficiency="Local Efficiency",
    # degree_assortativity = "Degree Assortativity",
    # diameter = 'Diameter',
    node_connectivity="Node Connectivity",
    modularity="Modularity",
)

In [15]:
hexbin(df, ORDERING, savefig="./figures/j1c-all-graphs-hexbin")

In [20]:
np.random.seed(1)

#df_ = df.reset_index(drop=True)

for num_edge in np.unique(df.num_edges):
    tmp = df[df.num_edges == num_edge]
    
    random_idx = np.random.choice(tmp.index)
    norms = np.linalg.norm(graph_array - graph_array[random_idx], axis=1)
    
    mask = np.arange(1, graph_array.shape[0]+1)[norms <= 3]
    tmp = df.loc[mask]

    hexbin(
        tmp, 
        ORDERING, 
        title=f"Base Graph Edges={num_edge}, Threshold=3, n={tmp.shape[0]}", 
        savefig=f"./figures/norm_threshold/j1c_hexbin_{num_edge:02d}_base"
    )

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  if sys.path[0] == '':


In [16]:
graph_array.shape

(12005168, 45)

In [17]:
df.shape

(12005167, 13)

In [53]:
np.arange(-3,)

array([], dtype=int64)

In [61]:
def histogram(X, ax, bins=20, log=True, title = None):    
    ax.hist(X, bins=bins, log=log)

    if title is not None:
        ax.set_title(title, fontsize=14)

def run_hist(num_edge):
    tmp = df[df.num_edges == num_edge]

    random_idx = np.random.choice(tmp.index)
    norms = np.linalg.norm(graph_array - graph_array[random_idx], axis=1)
    
    # start fig
    ncols = 3
    nrows = int(np.ceil((len(ORDERING) + 1) / ncols))
    figsize = (4*nrows, 3*ncols)
    fig, ax = plt.subplots(ncols=ncols, nrows=nrows, figsize=figsize, sharex=True)
    ax = ax.ravel()
    
    fig.suptitle(f"Base Graph Edges={num_edge}", y=1.03, fontsize=25)
    
    histogram(norms, ax[0], title="No Thresholding")
    ax[0].set_ylabel("Frequency", fontsize=15)
    
    for idx, (col, title) in enumerate(ORDERING.items()):
        idx += 1
        
        uniques, counts = np.unique(df[col], return_counts=True)
        mode = uniques[np.argmax(counts)]
        
        subset = df[df[col] == mode].index
        norms_tmp = norms[subset]
        
        histogram(
            norms_tmp, 
            ax[idx],
            title=f"{title}={mode:.2f}, n={len(norms_tmp)}"
        )
        
        if idx % 3 == 0:
            ax[idx].set_ylabel("Frequency", fontsize=12)
    
    for i in np.arange(-3, 0):
        ax[i].set_xlabel("Euclidean Distance", fontsize=12)

    fig.tight_layout()
    
    fname=f"./figures/feature_threshold/j1c_histogram_{num_edge:02d}_base"
    fig.savefig(f"{fname}.pdf", dpi=300, bbox_inches='tight')
    fig.savefig(f"{fname}.png", dpi=100, bbox_inches='tight')

    
    plt.close('all')

In [62]:
np.random.seed(1)

ORDERING = dict(
    # num_edges = "# Edges",
    density="Density",
    # total_triangles = '# Triangles',
    triangle_ratio="Triangle Ratio",
    # is_planar="Is Planar Graph?",
    avg_shortest_path_length="Avg Shortest Path",
    global_clustering_coefficient="Global Clustering",
    avg_clustering_coefficient="Avg Clustering",
    # square_clustering="Square Clustering",
    global_efficiency="Global Efficiency",
    local_efficiency="Local Efficiency",
    # degree_assortativity = "Degree Assortativity",
    # diameter = 'Diameter',
    # node_connectivity="Node Connectivity",
    modularity="Modularity",
)

_ = Parallel(n_jobs=12, verbose=1)(delayed(run_hist)(n) for n in np.unique(df.num_edges))

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  45 out of  45 | elapsed:  1.4min finished
