In [1]:
import os
import json
import glob
import hashlib
import pandas as pd
import networkx as nx
from tqdm import tqdm
from joblib import Parallel, delayed
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

class WeisfeilerLehmanMachine:
    """
    Weisfeiler Lehman feature extractor class.
    """
    def __init__(self, graph, features, iterations):
        """
        Initialization method which also executes feature extraction.
        :param graph: The Nx graph object.
        :param features: Feature hash table.
        :param iterations: Number of WL iterations.
        """
        self.iterations = iterations
        self.graph = graph
        self.features = features
        self.nodes = self.graph.nodes()
        self.extracted_features = [str(v) for k, v in features.items()]
        self.do_recursions()

    def do_a_recursion(self):
        """
        The method does a single WL recursion.
        :return new_features: The hash table with extracted WL features.
        """
        new_features = {}
        for node in self.nodes:
            nebs = self.graph.neighbors(str(node))
            degs = [self.features[neb] for neb in nebs]
            features = [str(self.features[node])]+sorted([str(deg) for deg in degs])
            features = "_".join(features)
            hash_object = hashlib.md5(features.encode())
            hashing = hash_object.hexdigest()
            new_features[node] = hashing
        self.extracted_features = self.extracted_features + list(new_features.values())
        return new_features

    def do_recursions(self):
        """
        The method does a series of WL recursions.
        """
        for _ in range(self.iterations):
            self.features = self.do_a_recursion()

def path2name(path):
    base = os.path.basename(path)
    return os.path.splitext(base)[0]

def dataset_reader(path):
    """
    Function to read the graph and features from a json file.
    :param path: The path to the graph json.
    :return graph: The graph object.
    :return features: Features hash table.
    :return name: Name of the graph.
    """
    name = path2name(path)
    data = json.load(open(path))
    graph = nx.from_edgelist(data["edges"])

    if "features" in data.keys():
        features = data["features"]
        features = {int(k): v for k, v in features.items()}
    else:
        features = nx.degree(graph)
        features = {k: v for k, v in features}
       
    return graph, features, name

def feature_extractor(path, rounds):
    """
    Function to extract WL features from a graph.
    :param path: The path to the graph json.
    :param rounds: Number of WL iterations.
    :return doc: Document collection object.
    """
    graph, features, name = dataset_reader(path)
    machine = WeisfeilerLehmanMachine(graph, features, rounds)
    doc = TaggedDocument(words=machine.extracted_features, tags=["g_" + name])
    return doc

def save_embedding(output_path, model, files, dimensions):
    """
    Function to save the embedding.
    :param output_path: Path to the embedding csv.
    :param model: The embedding model object.
    :param files: The list of files.
    :param dimensions: The embedding dimension parameter.
    """
    out = []
    for f in files:
        identifier = path2name(f)
        out.append([identifier] + list(model.docvecs["g_"+identifier]))
    column_names = ["type"]+["x_"+str(dim) for dim in range(dimensions)]
    out = pd.DataFrame(out, columns=column_names)
    out = out.sort_values(["type"])
    out['pv'] = out['type']
    out = out.drop(columns=['type'])

    out.to_csv(output_path, index=None)
    return out


In [2]:
graphs = glob.glob(os.path.join('/sise/home/noaradi/meta_model/GraphEmbedding/graphs_new/batch1_manually_defined', "*.json"))
graphs1 = glob.glob(os.path.join('/sise/home/noaradi/meta_model/GraphEmbedding/graphs_new/batch2_manually_defined', "*.json"))

In [3]:
graphs3 = graphs1 + graphs

In [5]:
# graphs = glob.glob(os.path.join('/sise/home/noaradi/GraphEmbedding-master/graph_jsons', "*.json"))
# print("\nFeature extraction started.\n")
# # document_collections = Parallel(n_jobs=1)(delayed(feature_extractor)(g, 2) for g in tqdm(graphs))
document_collections = Parallel(n_jobs=1)(delayed(feature_extractor)(g, 2) for g in tqdm(graphs3))

print("\nOptimization started.\n")

model = Doc2Vec(document_collections,
                vector_size=100,
                window=5, # doesn't affect
                min_count=3,
                sample=1,
                workers=1,
                epochs=100,
                alpha=0.05)

df2 = save_embedding('graph_embedding_manually_defined_3_100.csv', model, graphs3,100)

100%|██████████| 1757/1757 [03:03<00:00,  9.57it/s]



Optimization started.



  out.append([identifier] + list(model.docvecs["g_"+identifier]))
