In [1]:
# Third-party dependencies of this notebook. Uncomment and run once if they
# are missing; `%pip` (rather than `! pip`) installs into the kernel's environment.
# %pip install igraph
# %pip install graphomaly

In [2]:
import igraph as ig
import pandas as pd
import numpy as np
from graphomaly.estimator import GraphomalyEstimator
from sklearn.ensemble import IsolationForest

In [3]:
# Load the edge list: one row per edge, read from a CSV in the working directory.
graph_file_name = "data.csv"
data = pd.read_csv(graph_file_name)

# Plain NumPy view of the table. The first two columns are used as the edge
# endpoints and column index 4 as the edge label.
# NOTE(review): the original comment calls index 4 "the last column" -- confirm
# that the file really has exactly five columns.
G = data.to_numpy()
y_edges = G[:, 4]

# Edge statistics: total number of edges and how many carry a non-zero label.
n_edges = len(G)
n_abnormal_edges = np.count_nonzero(y_edges)

# Node statistics: distinct endpoint ids overall, and distinct endpoint ids
# among the edges labelled 1.
n_nodes = np.unique(G[:, :2]).size
n_abnormal_nodes = np.unique(G[y_edges == 1, :2]).size

print("Number of abnormal/total edges: ", n_abnormal_edges, '/', n_edges, sep='')
print("Number of abnormal/total nodes: ", n_abnormal_nodes, '/', n_nodes, sep='')

Number of abnormal/total edges: 444/597165
Number of abnormal/total nodes: 561/385100


In [4]:
# Node table: one row per distinct id that appears as a source or a
# destination of any edge, in ascending id order.
nodes_data = pd.DataFrame(
    sorted(
        set(data["id_source"].tolist()).union(data["id_destination"].tolist())
    ),
    columns=["id"],
)

In [5]:
# Share of nodes that touch at least one abnormal edge; passed to the detector
# below as its `contamination` setting.
contamination_rate = n_abnormal_nodes / n_nodes

# Name of the graph-to-feature transformation requested from `preprocess`.
type_feats = "graph_to_features"

# Egonet settings: constructor arguments and the features to compute.
egonet_args = dict(
    ctor_args=dict(
        verbose=True,  # show progress while the features are computed
    ),
    fit_args=dict(
        feature_list=[  # an example selection, not necessarily the best one
            "f_degree_in",
            "f_degree_out",
            "f_amount_in",
            "f_amount_out",
            "f_ego_edge_density",
            "f_egored_average_amount_in_rel",
            "f_egored_average_amount_out_rel",
            "f_egored_edge_density",
        ],
    ),
)

# Bundle of keyword arguments forwarded to `preprocess`: which graph
# algorithms to run and, position by position, their settings.
to_feature_args = dict(
    graph_algorithms=["egonet"],
    graph_algorithms_args=[egonet_args],
)

# Per-model fit settings, keyed by model name; each entry lists candidate values.
fit_kwargs = dict(
    PyodIForest=dict(
        n_estimators=[50, 100, 200],
        bootstrap=[True, False],
        contamination=[contamination_rate],
    ),
)

In [6]:
%%time

clf = GraphomalyEstimator(models_subset=["PyodIForest"],
                          models_ctor_kwargs={},
                          models_fit_kwargs=fit_kwargs,
                          results_path="results",
                          voting="hard",
                         )

# Compute features and node labels
Xf, y_nodes, _ = clf.preprocess(G, y_edges, type_feats, **to_feature_args)

Graph info: DiGraph with 385100 nodes and 597165 edges
Nodes processed: 385100
CPU times: user 10min 52s, sys: 3.34 s, total: 10min 55s
Wall time: 10min 55s


In [7]:
# Attach the node labels produced by `clf.preprocess` to the node table.
# NOTE(review): this presumes `y_nodes` follows the same (ascending id) order
# as `nodes_data` -- confirm against the estimator's documentation.
nodes_data.loc[:, "label"] = y_nodes

# Undirected neighbourhood of every node: list each edge once per endpoint
# (source -> destination and destination -> source) and collect, per node id,
# the set of ids found at the other end.
edge_ends = data.loc[:, ["id_source", "id_destination"]]
neighbour_pairs = pd.concat(
    [
        edge_ends.rename(columns={"id_source": "id", "id_destination": "neighbour"}),
        edge_ends.rename(columns={"id_destination": "id", "id_source": "neighbour"}),
    ],
    ignore_index=True,
)

# `ego` has a single column "ego" holding the neighbour set and is indexed by
# the real node id (index name "id", ascending). The index is kept rather than
# replaced by row positions so that the per-node loop that iterates over `ego`
# sees actual node ids even when the ids are not exactly 0..N-1.
ego = neighbour_pairs.groupby("id")["neighbour"].agg(set).to_frame("ego")

In [8]:
# Directed igraph graph built from the (source, destination) columns.
# `use_vids=True` asks igraph to treat the two columns as integer vertex
# indices rather than vertex names.
graph = ig.Graph.DataFrame(
    data[["id_source", "id_destination"]],
    directed=True,
    use_vids=True,
)

In [9]:
%%time

ego_red_sizes = {}
for group, ego_data in ego[["ego"]].iterrows():
    sg = graph.induced_subgraph(set([group]).union(ego_data["ego"]))
    degrees = [sum(x) for x in zip(sg.degree(mode="in"), sg.degree(mode="out"))]
    ego_red_sizes[group] = len([x for x in degrees if x > 1]) or 1

nodes_data.loc[:, "ego_red_size"] = nodes_data["id"].apply(lambda x: ego_red_sizes[x])

CPU times: user 22.3 s, sys: 29.3 s, total: 51.6 s
Wall time: 51.6 s


In [10]:
# Score every node with an Isolation Forest fitted on the feature matrix.
# The forest is randomised, so a fixed `random_state` is set to make the
# scores -- and every statistic derived from the ranking -- repeatable across
# runs of the notebook.
model = IsolationForest(n_estimators=200, random_state=0)

# `decision_function` returns lower values for more abnormal samples.
# NOTE(review): row i of `Xf` is presumed to describe row i of `nodes_data` -- confirm.
node_scores = model.fit(Xf).decision_function(Xf)

# Work on a deep copy so `nodes_data` itself is left untouched, and sort
# ascending so the most anomalous nodes come first.
anomalies = (
    nodes_data.copy(deep=True)
    .assign(anomaly_score=node_scores)
    .sort_values("anomaly_score", ascending=True)
)

In [11]:
# Average reduced-egonet size over the nodes whose label is 1.
anomalies.loc[anomalies["label"] == 1, "ego_red_size"].mean()

np.float64(12.405)

In [12]:
# Average reduced-egonet size among the `top` lowest-scored (most anomalous)
# nodes, for several cut-offs. The cut-offs are roughly 0.1%, 0.2%, 0.5% and
# 1% of the 385100 nodes reported when the graph was loaded.
for top in (385, 770, 1926, 3851):
    top_mean = anomalies["ego_red_size"].iloc[:top].mean()
    print(top, round(top_mean, 3))

385 102.839
770 67.245
1926 37.618
3851 23.593
