In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import entropy
from wl_kernel.kernel import WLkernel

In [2]:
def load_graphs_from_dir(directory):
    """Load every ``*.gpickle`` file found directly inside ``directory``.

    Parameters
    ----------
    directory : str or path-like
        Folder to scan. Sub-folders are not searched.

    Returns
    -------
    list of (str, object)
        One ``(filename, unpickled_object)`` pair per matching file, ordered
        by filename so repeated runs (and different machines) produce the
        same ordering.

    Raises
    ------
    OSError
        If ``directory`` cannot be listed or a matching file cannot be opened.
    """
    graphs = []
    # os.listdir() returns entries in arbitrary order; sort so that every
    # downstream result (CSV row order, plot legend order) is reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".gpickle"):
            continue
        path = os.path.join(directory, filename)
        with open(path, "rb") as f:
            # SECURITY: pickle.load can execute arbitrary code. Only point
            # this function at directories whose contents you trust.
            G = pickle.load(f)
        graphs.append((filename, G))
    return graphs

In [3]:
def score_vector(vector, target_sparsity=0.2, eps=1e-5):
    """Compute descriptive statistics and a heuristic quality score for a vector.

    Parameters
    ----------
    vector : array-like
        Numeric values. The input is flattened, so nested / multi-dimensional
        input is treated as one long vector.
        NOTE(review): the entropy step only makes sense for non-negative
        entries (e.g. counts) -- confirm against the producer of the vector.
    target_sparsity : float, optional
        Fraction of non-zero entries regarded as ideal; the score is
        penalised by the absolute distance from this value. Default ``0.2``.
    eps : float, optional
        Small constant added to both denominators to avoid division by zero.
        Default ``1e-5``.

    Returns
    -------
    tuple
        ``(entropy, variance, sparsity, norm, score)`` where

        * ``entropy``  -- Shannon entropy (base 2) of the vector normalised
          to sum to one, computed over its strictly positive entries;
        * ``variance`` -- ``np.var`` of the raw values;
        * ``sparsity`` -- fraction of entries that are non-zero (despite the
          name, a larger value means a denser vector);
        * ``norm``     -- Euclidean norm of the raw values;
        * ``score``    -- ``(entropy / (variance + eps)) /
          (norm + |sparsity - target_sparsity| + eps)``.

        All five values are ``0.0`` when the vector is empty or sums to zero.
    """
    # Flatten so that the element count used for sparsity matches the
    # element count np.count_nonzero() sees, whatever the input shape.
    values = np.asarray(vector, dtype=float).ravel()
    total = values.sum()
    if total == 0:
        return 0.0, 0.0, 0.0, 0.0, 0.0

    prob = values / total
    prob = prob[prob > 0]  # zero-probability entries contribute nothing
    ent = entropy(prob, base=2)
    var = np.var(values)
    sparsity = np.count_nonzero(values) / values.size
    norm = np.linalg.norm(values)

    sparsity_penalty = abs(sparsity - target_sparsity)
    score = (ent / (var + eps)) / (norm + sparsity_penalty + eps)
    return ent, var, sparsity, norm, score

In [4]:
def analyze_graphs(graphs, vector_sizes=(10, 20, 50, 100, 200, 300, 500, 1000, 1500, 2000, 2500), k=3):
    """Score the WL degree vector of every graph at every requested vector size.

    Parameters
    ----------
    graphs : iterable of (str, object)
        ``(name, graph)`` pairs, e.g. the output of ``load_graphs_from_dir``.
    vector_sizes : iterable of int, optional
        Values passed as ``size`` to ``WLkernel``. The default is an immutable
        tuple so it cannot be altered accidentally between calls.
    k : int, optional
        Value passed as ``k`` to ``WLkernel``.

    Returns
    -------
    pandas.DataFrame
        One row per ``(graph, vector_size)`` combination with the columns
        ``graph, vector_size, entropy, variance, sparsity, norm, score``.
        The columns are present even when ``graphs`` is empty.
    """
    columns = ["graph", "vector_size", "entropy", "variance", "sparsity", "norm", "score"]
    rows = []
    for name, G in graphs:
        for size in vector_sizes:
            kernel = WLkernel(G, k=k, size=size)
            # NOTE(review): the argument 3 is hard-coded and happens to equal
            # the default of ``k``; if it is meant to follow ``k`` this should
            # be ``degree_vector(k)`` -- confirm against the WLkernel API.
            vec = kernel.degree_vector(3)
            ent, var, sparsity, norm, score = score_vector(vec)
            rows.append({
                "graph": name,
                "vector_size": size,
                "entropy": ent,
                "variance": var,
                "sparsity": sparsity,
                "norm": norm,
                "score": score,
            })

    # Passing ``columns`` keeps the schema stable for an empty result, so
    # downstream column lookups do not fail when no graphs were found.
    return pd.DataFrame(rows, columns=columns)

In [None]:
import re
from collections import defaultdict

def visualize_results(df, output_dir):
    """Plot the mean of each metric against vector size, one line per graph size.

    Graphs are grouped by the node count encoded in their name: the digits
    captured by the pattern ``G_(\\d+)_`` at the start of the ``graph`` column.
    Rows whose name does not match the pattern are left out of the plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``graph``, ``vector_size`` and every plotted metric
        (currently only ``entropy``). An empty frame yields an empty figure.
    output_dir : str or path-like
        Folder that receives ``<metric>_vs_vector_size.png``; created if it
        does not exist.

    Returns
    -------
    None
    """
    os.makedirs(output_dir, exist_ok=True)
    metrics = ["entropy"]

    # The node-count grouping does not depend on the metric, so derive it
    # once, vectorised, instead of re-scanning every row for every metric.
    if df.empty:
        labeled = pd.DataFrame(columns=["n_nodes", "vector_size", *metrics])
    else:
        node_counts = df["graph"].astype(str).str.extract(r"^G_(\d+)_", expand=False)
        labeled = (
            df.assign(n_nodes=node_counts)
            .dropna(subset=["n_nodes"])
            .astype({"n_nodes": int})
        )

    for metric in metrics:
        fig, ax = plt.subplots()
        # groupby() iterates in ascending key order, which gives a legend
        # sorted by node count regardless of the row order in ``df``.
        for n_nodes, group in labeled.groupby("n_nodes"):
            mean_values = group.groupby("vector_size")[metric].mean()
            ax.plot(mean_values.index, mean_values.to_numpy(), label=f"{n_nodes} nodes")
        ax.set_xlabel("Vector size")
        ax.set_ylabel(metric.capitalize())
        ax.set_title(f"{metric.capitalize()} vs Vector Size")
        # Only draw a legend when something was plotted; legend() on an
        # empty axes emits a warning.
        if ax.get_legend_handles_labels()[0]:
            ax.legend(fontsize="small", loc="best")
        fig.tight_layout()
        fig.savefig(os.path.join(output_dir, f"{metric}_vs_vector_size.png"))
        # Close explicitly so figures do not accumulate in kernel memory.
        plt.close(fig)

In [6]:
# Where the figures go and where the pickled input graphs live.
output_dir = "vector_quality_plots"
graph_dir = "data/small_graph_dataset"

# Load the graphs, then score each one at every configured vector size.
graphs = load_graphs_from_dir(directory=graph_dir)
df = analyze_graphs(graphs=graphs)

# Persist the raw numbers next to the notebook and render the plots.
df.to_csv(path_or_buf="vector_quality_analysis.csv", index=False)
visualize_results(df, output_dir)
print("✅ Analiza završena. CSV i slike su sačuvane.")

✅ Analiza završena. CSV i slike su sačuvane.
