## Importing Libraries

In [14]:
import pandas as pd
import networkx as nx
from neo4j import GraphDatabase
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

## Connecting to the neo4j database

In [16]:
uri = "bolt://localhost:7687"
username = "neo4j"
password = "OLIV00%%"

driver = GraphDatabase.driver(uri, auth=(username, password))

## Retrieving the graph data from Neo4j and dataset preparation

In [4]:
graph = nx.Graph()

with driver.session() as session:
    # Récupérer les nœuds avec leur nom et type
    nodes_query = "MATCH (n) RETURN ID(n) AS id, n.name AS name, n.type AS type"
    nodes_result = session.run(nodes_query)

    # Parcourir les résultats et ajouter les nœuds au graphe
    for record in nodes_result:
        node_id = record["id"]
        node_name = record["name"]
        node_type = record["type"]
        if node_id is not None:
            graph.add_node(node_id, name=node_name, type=node_type)

    # Récupérer les relations
    relations_query = "MATCH ()-[r]->() RETURN ID(startNode(r)) AS source_id, ID(endNode(r)) AS target_id"
    relations_result = session.run(relations_query)

    # Ajouter les relations entre les nœuds
    for record in relations_result:
        source_node_id = record["source_id"]
        target_node_id = record["target_id"]
        if source_node_id is not None and target_node_id is not None:
            graph.add_edge(source_node_id, target_node_id)

In [5]:
# Convert the graph to a pandas DataFrame
df = pd.DataFrame(graph.nodes(data=True), columns=["node_id", "attributes"])
df["node_type"] = df["attributes"].apply(lambda x: x.get("type"))
df["node_name"] = df["attributes"].apply(lambda x: x.get("name"))

# Drop the unnecessary 'attributes' column
df.drop(columns=["attributes"], inplace=True)

# Check if the DataFrame is empty
if df.empty:
    raise ValueError("The dataset is empty. Please check your Neo4j query and data.")

# Prepare the target variable ('link' column)
df["link"] = 1

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [6]:
df.head()

Unnamed: 0,node_id,node_type,node_name,link
0,0,Explanation,Couple unbalance,1
1,1,Explanation,Dynamic unbalance,1
2,2,Explanation,Overhung unbalance,1
3,3,Explanation,Structural looseness,1
4,4,Explanation,Angular misalignment,1


## Generating graph embeddings using Neo4j's GDS library

In [7]:
with driver.session() as session:
    # Create a graph projection for link prediction using GDS
    session.run("CALL gds.graph.project('link_prediction_graph', '*', '*')")

In [8]:
with driver.session() as session:
    # Compute graph embeddings using Node2Vec algorithm
    session.run("""
    CALL gds.beta.node2vec.stream('link_prediction_graph', {
        embeddingDimension: 64,
        iterations: 10,
        walkLength: 20,
        inOutFactor: 1.0,
        returnFactor: 1.0
    })
    """)

## Fetch the graph embeddings and create the machine learning dataset

In [9]:
# Create an empty list to store the node embeddings
node_embeddings = []

# Compute graph embeddings using Node2Vec algorithm and fetch the embeddings
with driver.session() as session:
    result = session.run("""
        CALL gds.beta.node2vec.stream('link_prediction_graph', {
            embeddingDimension: 64,
            iterations: 10,
            walkLength: 20,
            inOutFactor: 1.0,
            returnFactor: 1.0
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS node_name, gds.util.asNode(nodeId).type AS node_type, embedding
    """)

    # Iterate through the result and append embeddings to the list
    for record in result:
        node_id = record["nodeId"]
        node_name = record["node_name"]
        node_type = record["node_type"]
        embedding = record["embedding"]
        node_embeddings.append((node_id, node_name, node_type, embedding))

# Create a DataFrame to store the node embeddings
columns = ["node_id", "node_name", "node_type", "embedding"]
node_embeddings_df = pd.DataFrame(node_embeddings, columns=columns)

In [10]:
node_embeddings_df.head(5)

Unnamed: 0,node_id,node_name,node_type,embedding
0,0,Couple unbalance,Explanation,"[0.007438237778842449, -0.005690622143447399, ..."
1,1,Dynamic unbalance,Explanation,"[0.007427668664604425, -0.002915834542363882, ..."
2,2,Overhung unbalance,Explanation,"[0.00774760264903307, 0.00453847274184227, 0.0..."
3,3,Structural looseness,Explanation,"[0.007449103053659201, 0.007329504005610943, 0..."
4,4,Angular misalignment,Explanation,"[0.007547266781330109, -0.0016168525908142328,..."


In [11]:
print("Column names in train_df:", train_df.columns)
print("Column names in node_embeddings_df:", node_embeddings_df.columns)

Column names in train_df: Index(['node_id', 'node_type', 'node_name', 'link'], dtype='object')
Column names in node_embeddings_df: Index(['node_id', 'node_name', 'node_type', 'embedding'], dtype='object')


## Training a machine learning model 

In [12]:
def link_prediction_classifier(max_iter=2000):
    lr_clf = LogisticRegressionCV(Cs=10, cv=10, scoring="roc_auc", max_iter=max_iter)
    return Pipeline(steps=[("sc", StandardScaler()), ("clf", lr_clf)])


## Model evaluation

In [13]:
model = link_prediction_classifier(max_iter=2000)
model.fit()

TypeError: fit() missing 1 required positional argument: 'X'

In [84]:
from sklearn.metrics.pairwise import cosine_similarity

# Separate the nodes of type "component" and nodes of types "failure" and "ambalance"
component_nodes = node_embeddings_df[node_embeddings_df["node_type"] == "Component"]
failure_ambalance_nodes = node_embeddings_df[node_embeddings_df["node_type"].isin(["Failure", "Imbalance"])]

# Extract embeddings for each set of nodes
component_embeddings = component_nodes["embedding"].tolist()
failure_ambalance_embeddings = failure_ambalance_nodes["embedding"].tolist()

# Check if embeddings are not empty
if not component_embeddings or not failure_ambalance_embeddings:
    print("Some embeddings are empty. Check if the Node2Vec computation was successful and if nodes of required types exist.")
else:
    # Compute the similarity between the embeddings using cosine similarity
    similarity_matrix = cosine_similarity(component_embeddings, failure_ambalance_embeddings)

    # Create a DataFrame to store the similarity scores with node names as index and columns
    similarity_df = pd.DataFrame(similarity_matrix, columns=failure_ambalance_nodes["node_name"].tolist(), index=component_nodes["node_name"].tolist())

    # Display the similarity DataFrame with node names
similarity_df

Unnamed: 0,Imbalance,Structural fault,Misalignment,looseness,Bearing Fault,Gear Fault
Pump,0.038418,-0.008609,-0.148771,0.080452,-0.076055,-0.00743
Fan,0.212369,-0.147346,0.155298,-0.112698,-0.039756,-0.118019
Engine,0.390948,0.184992,-0.168545,-0.168042,-0.218257,-0.117252


## Use of the predicted links to create the relationships in the database

In [None]:
# Assuming we have the predicted links as a DataFrame 'predicted_links_df'
with driver.session() as session:
    for _, row in predicted_links_df.iterrows():
        start_node_id = row['component_id']
        end_node_id = row['machine_state_id']
        
        query = f"""
        MATCH (c:Component), (ms:Machine_State)
        WHERE id(c) = {start_node_id} AND id(ms) = {end_node_id}
        MERGE (c)-[:leads_to]->(ms)
        """
        session.run(query)


In [6]:
with driver.session() as session:
    # Retrieve all components and machine states
    query = """
    MATCH (c:Component), (ms:Machine_State)
    RETURN id(c) AS component_id, id(ms) AS machine_state_id
    """
    result = session.run(query)

    # Convert the query result to a pandas DataFrame
    df = pd.DataFrame(result.data())

# Check if the DataFrame is empty
if df.empty:
    raise ValueError("The dataset is empty. Please check your Neo4j query and data.")

# Prepare the target variable ('link' column)
df['link'] = 1

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

ValueError: The dataset is empty. Please check your Neo4j query and data.