### Importing Libraries

In [46]:
import pandas as pd
import networkx as nx
from neo4j import GraphDatabase
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

### Connecting to the database

In [53]:
uri = "bolt://localhost:7687"
username = "neo4j"
password = "OLIV00%%"

driver = GraphDatabase.driver(uri, auth=(username, password))

## Generating graph embeddings 

In [58]:
graph = nx.Graph()

with driver.session() as session:
    nodes_query = "MATCH (n) RETURN ID(n) AS id, n.name AS name, n.type AS type"
    nodes_result = session.run(nodes_query)

    for record in nodes_result:
        node_id = record["id"]
        node_name = record["name"]
        node_type = record["type"]
        if node_id is not None:
            graph.add_node(node_id, name=node_name, type=node_type)

    relations_query = "MATCH ()-[r]->() RETURN ID(startNode(r)) AS source_id, ID(endNode(r)) AS target_id"
    relations_result = session.run(relations_query)

    for record in relations_result:
        source_node_id = record["source_id"]
        target_node_id = record["target_id"]
        if source_node_id is not None and target_node_id is not None:
            graph.add_edge(source_node_id, target_node_id)
            
            
df = pd.DataFrame(graph.nodes(data=True), columns=["node_id", "attributes"])
df["node_type"] = df["attributes"].apply(lambda x: x.get("type"))
df["node_name"] = df["attributes"].apply(lambda x: x.get("name"))

df.drop(columns=["attributes"], inplace=True)

if df.empty:
    raise ValueError("The dataset is empty. Please check your Neo4j query and data.")
    
    
    
# Embeddings ##########################################################################
node_embeddings = []

with driver.session() as session:
    result = session.run("""
        CALL gds.beta.node2vec.stream('link_prediction_graph', {
            embeddingDimension: 64,
            iterations: 10,
            walkLength: 20,
            inOutFactor: 1.0,
            returnFactor: 1.0
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS node_name, gds.util.asNode(nodeId).type AS node_type, embedding
    """)

    for record in result:
        node_id = record["nodeId"]
        node_name = record["node_name"]
        node_type = record["node_type"]
        embedding = record["embedding"]
        node_embeddings.append((node_id, node_name, node_type, embedding))

columns = ["node_id", "node_name", "node_type", "embedding"]
node_embeddings_df = pd.DataFrame(node_embeddings, columns=columns)
node_embeddings_df.head()

Unnamed: 0,node_id,node_name,node_type,embedding
0,0,Couple unbalance,Explanation,"[0.0002608242502901703, -0.0016390503151342273..."
1,1,Dynamic unbalance,Explanation,"[0.00020098022650927305, 0.002685168059542775,..."
2,2,Overhung unbalance,Explanation,"[0.00021156920411158353, 0.005176438018679619,..."
3,3,Structural looseness,Explanation,"[0.0002050298935500905, -0.0027629134710878134..."
4,4,Angular misalignment,Explanation,"[0.00021222594659775496, -0.000109208696812856..."


## Fetch the graph embeddings and create the ML dataset

In [74]:
def fetch_graph_data():
    with driver.session() as session:
        result = session.run(
            "MATCH (n1)-[r]->(n2) RETURN id(n1) AS start, id(n2) AS end, type(r) AS relationship"
        )
        data = [(record["start"], record["end"], record["relationship"]) for record in result]
    return data
graph_data = fetch_graph_data()


G = nx.DiGraph()
for start, end, relationship in graph_data:
    G.add_node(start)
    G.add_node(end)
    G.add_edge(start, end, relationship=relationship)

adj_matrix = nx.to_numpy_matrix(G, dtype=int)
node_order = sorted(G.nodes())
node_index_map = {node_id: index for index, node_id in enumerate(node_order)}
adj_matrix_reordered = np.array([[adj_matrix[node_index_map[start], node_index_map[end]] for end in node_order] for start in node_order])
print(adj_matrix_reordered)



# Node features and labels ############################################################################
def generate_positive_examples(adj_matrix):
    pos_edges = np.array(np.where(adj_matrix == 1)).T
    return pos_edges

def generate_negative_examples(adj_matrix):
    neg_edges = np.array(np.where(adj_matrix == 0)).T
    return neg_edges

pos_edges = generate_positive_examples(adj_matrix_reordered)
neg_edges = generate_negative_examples(adj_matrix_reordered)

print("\nPositive edges: {}".format(pos_edges.shape))
print("Negative edges: {}".format(neg_edges.shape))

pos_labels = np.ones(pos_edges.shape[0])
neg_labels = np.zeros(neg_edges.shape[0])

edges = np.vstack((pos_edges, neg_edges))
labels = np.concatenate((pos_labels, neg_labels))

labels = labels.reshape(576, 1)

print("\nEdges: {}".format(edges.shape))
print("Labels: {}".format(labels.shape))

[[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [76]:
# Combine positive and negative examples and labels
edges = np.vstack((pos_edges, neg_edges))
labels = np.concatenate((pos_labels, neg_labels))

# Split the data into training and testing sets
X_train_edges, X_test_edges, y_train, y_test = train_test_split(edges, labels, test_size=0.2, random_state=42)

# Extract embeddings for the edges in the training and testing sets
X_train_embeddings = np.array([node_embeddings_df.loc[node_id]["embedding"] for edge in X_train_edges])
X_test_embeddings = np.array([node_embeddings_df.loc[node_id]["embedding"] for edge in X_test_edges])

# Train a logistic regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train_embeddings, y_train)

# Predict link existence for the testing set
y_pred = logreg_model.predict(X_test_embeddings)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.9568965517241379
Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      1.00      0.98       111
         1.0       0.00      0.00      0.00         5

    accuracy                           0.96       116
   macro avg       0.48      0.50      0.49       116
weighted avg       0.92      0.96      0.94       116



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
X_train_embeddings.shape

(920, 64)

In [72]:
y_train.shape

(460,)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Combine positive and negative examples and labels
edges = np.vstack((pos_edges, neg_edges))
labels = np.concatenate((pos_labels, neg_labels))

# Split the data into training and testing sets
X_train_edges, X_test_edges, y_train, y_test = train_test_split(edges, labels, test_size=0.2, random_state=42)

# Extract embeddings for the edges in the training and testing sets
X_train_embeddings = np.array([node_embeddings_df.loc[node_id]["embedding"] for edge in X_train_edges for node_id in edge])
X_test_embeddings = np.array([node_embeddings_df.loc[node_id]["embedding"] for edge in X_test_edges for node_id in edge])

# Train a logistic regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train_embeddings, y_train)

# Predict link existence for the testing set
y_pred = logreg_model.predict(X_test_embeddings)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

In [50]:
with driver.session() as session:
    session.run("CALL gds.graph.project('link_prediction_graph', '*', '*')")
    
with driver.session() as session:
    session.run("""
    CALL gds.beta.node2vec.stream('link_prediction_graph', {
        embeddingDimension: 64,
        iterations: 10,
        walkLength: 20,
        inOutFactor: 1.0,
        returnFactor: 1.0
    })
    """)