### Importing Libraries

In [6]:
import getpass
import os

import networkx as nx
import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

### Connecting to the database

In [3]:
# Connection settings are read from the environment so that no secret is
# stored in the notebook. The URI and user fall back to the local defaults.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
# Prompt interactively when NEO4J_PASSWORD is not set, instead of embedding
# the password in the source.
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

driver = GraphDatabase.driver(uri, auth=(username, password))
# Fail fast here on a wrong URI or wrong credentials, rather than in the
# first query cell further down.
driver.verify_connectivity()

In [4]:
# Mirror the Neo4j database as an undirected NetworkX graph keyed by the
# internal node id.
graph = nx.Graph()

with driver.session() as session:
    # Fetch every node together with its name and type properties
    nodes_query = "MATCH (n) RETURN ID(n) AS id, n.name AS name, n.type AS type"
    graph.add_nodes_from(
        (row["id"], {"name": row["name"], "type": row["type"]})
        for row in session.run(nodes_query)
        if row["id"] is not None
    )

    # Fetch every relationship as a (source id, target id) pair
    relations_query = "MATCH ()-[r]->() RETURN ID(startNode(r)) AS source_id, ID(endNode(r)) AS target_id"
    graph.add_edges_from(
        (row["source_id"], row["target_id"])
        for row in session.run(relations_query)
        if row["source_id"] is not None and row["target_id"] is not None
    )

In [7]:
# Flatten the graph's nodes into a table: one row per node, with the type and
# name pulled out of the attribute dict and the dict column itself discarded.
df = (
    pd.DataFrame(graph.nodes(data=True), columns=["node_id", "attributes"])
    .assign(
        node_type=lambda frame: frame["attributes"].apply(lambda attrs: attrs.get("type")),
        node_name=lambda frame: frame["attributes"].apply(lambda attrs: attrs.get("name")),
    )
    .drop(columns=["attributes"])
)

# An empty table means the Neo4j queries returned nothing; stop here
if df.empty:
    raise ValueError("The dataset is empty. Please check your Neo4j query and data.")

# Target column: every row is labelled 1
df = df.assign(link=1)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
with driver.session() as session:
    # Drop any projection left over from an earlier run. The second argument
    # (failIfMissing=false) makes this a no-op when the graph does not exist,
    # so the cell can be re-run without "graph already exists" errors.
    session.run("CALL gds.graph.drop('link_prediction_graph', false)").consume()

    # Create a graph projection for link prediction using GDS.
    # consume() waits for the procedure to finish and raises any server error
    # in this cell.
    session.run("CALL gds.graph.project('link_prediction_graph', '*', '*')").consume()

# The Node2Vec stream call that used to follow was removed: its result was
# never read, so it only repeated an expensive computation whose output was
# thrown away.

In [10]:
# Create an empty list to store the node embeddings
node_embeddings = []

# Compute graph embeddings using Node2Vec algorithm and fetch the embeddings
with driver.session() as session:
    result = session.run("""
        CALL gds.beta.node2vec.stream('link_prediction_graph', {
            embeddingDimension: 64,
            iterations: 10,
            walkLength: 20,
            inOutFactor: 1.0,
            returnFactor: 1.0
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS node_name, gds.util.asNode(nodeId).type AS node_type, embedding
    """)

    # Iterate through the result and append embeddings to the list
    for record in result:
        node_id = record["nodeId"]
        node_name = record["node_name"]
        node_type = record["node_type"]
        embedding = record["embedding"]
        node_embeddings.append((node_id, node_name, node_type, embedding))

# Create a DataFrame to store the node embeddings
columns = ["node_id", "node_name", "node_type", "embedding"]
node_embeddings_df = pd.DataFrame(node_embeddings, columns=columns)

In [11]:
# Preview the first five rows of the embeddings table
node_embeddings_df.head(n=5)

Unnamed: 0,node_id,node_name,node_type,embedding
0,0,Couple unbalance,Explanation,"[0.0015262243105098605, 0.004842257592827082, ..."
1,1,Dynamic unbalance,Explanation,"[0.0015249720308929682, 0.00751839391887188, -..."
2,2,Overhung unbalance,Explanation,"[0.00121641019359231, -0.000427287450293079, -..."
3,3,Structural looseness,Explanation,"[0.0015882368898019195, 0.002190080238506198, ..."
4,4,Angular misalignment,Explanation,"[0.0060586994513869286, 0.0023688809014856815,..."


In [None]:
# Link prediction on node-pair features: edges present in `graph` are the
# positive examples, and sampled node pairs with no edge are the negatives.

# Look up each node's embedding by node id.
# NOTE(review): this assumes the `nodeId` values streamed by GDS are the same
# ids used as node keys in `graph` (both come from Neo4j) - confirm.
embedding_by_node = {
    node_id: np.asarray(vector, dtype=float)
    for node_id, vector in zip(node_embeddings_df["node_id"], node_embeddings_df["embedding"])
}

# Step 1: Build the positive and negative edge lists
positive_edges = [
    (u, v)
    for u, v in graph.edges()
    if u in embedding_by_node and v in embedding_by_node
]

# nx.non_edges enumerates every unconnected pair. That is quadratic in the
# number of nodes, so replace it with rejection sampling for large graphs.
# Sorting keeps the candidate order, and therefore the sample, reproducible.
candidate_negatives = sorted(
    (u, v)
    for u, v in nx.non_edges(graph)
    if u in embedding_by_node and v in embedding_by_node
)

if not positive_edges or not candidate_negatives:
    raise ValueError(
        "Need at least one existing edge and one non-edge between embedded nodes "
        "to build a link prediction dataset."
    )

# Draw as many negatives as positives (or all of them if fewer exist) so the
# two classes stay balanced; the fixed seed makes the draw repeatable.
rng = np.random.default_rng(42)
n_negatives = min(len(positive_edges), len(candidate_negatives))
chosen = rng.choice(len(candidate_negatives), size=n_negatives, replace=False)
negative_edges = [candidate_negatives[i] for i in chosen]

# Combine labels and edges (positives first, then negatives)
edges = positive_edges + negative_edges
labels = np.concatenate((np.ones(len(positive_edges)), np.zeros(len(negative_edges))))

# Step 2: Generate feature vectors for candidate links by concatenating the
# embeddings of the two endpoints
candidate_features = np.array(
    [np.concatenate((embedding_by_node[u], embedding_by_node[v])) for u, v in edges]
)

# Step 3: Split the data into training and test sets; stratify so both splits
# contain positives and negatives
X_train, X_test, y_train, y_test = train_test_split(
    candidate_features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Step 4: Train a classifier
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 5: Predict links
predictions = model.predict(X_test)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)