## Importing Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
import networkx as nx
from neo4j import GraphDatabase
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

## Connecting to the neo4j database

In [3]:
uri = "bolt://localhost:7687"
username = "neo4j"
password = "OLIV00%%"

driver = GraphDatabase.driver(uri, auth=(username, password))

## Retrieving the graph data from Neo4j and dataset preparation

In [4]:
graph = nx.Graph()

with driver.session() as session:
    # Récupérer les nœuds avec leur nom et type
    nodes_query = "MATCH (n) RETURN ID(n) AS id, n.name AS name, n.type AS type"
    nodes_result = session.run(nodes_query)

    # Parcourir les résultats et ajouter les nœuds au graphe
    for record in nodes_result:
        node_id = record["id"]
        node_name = record["name"]
        node_type = record["type"]
        if node_id is not None:
            graph.add_node(node_id, name=node_name, type=node_type)

    # Récupérer les relations
    relations_query = "MATCH ()-[r]->() RETURN ID(startNode(r)) AS source_id, ID(endNode(r)) AS target_id"
    relations_result = session.run(relations_query)

    # Ajouter les relations entre les nœuds
    for record in relations_result:
        source_node_id = record["source_id"]
        target_node_id = record["target_id"]
        if source_node_id is not None and target_node_id is not None:
            graph.add_edge(source_node_id, target_node_id)

In [5]:
# Convert the graph to a pandas DataFrame
df = pd.DataFrame(graph.nodes(data=True), columns=["node_id", "attributes"])
df["node_type"] = df["attributes"].apply(lambda x: x.get("type"))
df["node_name"] = df["attributes"].apply(lambda x: x.get("name"))

# Drop the unnecessary 'attributes' column
df.drop(columns=["attributes"], inplace=True)

# Check if the DataFrame is empty
if df.empty:
    raise ValueError("The dataset is empty. Please check your Neo4j query and data.")

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [6]:
df.head()

Unnamed: 0,node_id,node_type,node_name
0,0,Explanation,Couple unbalance
1,1,Explanation,Dynamic unbalance
2,2,Explanation,Overhung unbalance
3,3,Explanation,Structural looseness
4,4,Explanation,Angular misalignment


## Generating graph embeddings using Neo4j's GDS library

In [7]:
with driver.session() as session:
    # Create a graph projection for link prediction using GDS
    session.run("CALL gds.graph.project('link_prediction_graph', '*', '*')")

In [8]:
with driver.session() as session:
    # Compute graph embeddings using Node2Vec algorithm
    session.run("""
        CALL gds.beta.node2vec.stream('link_prediction_graph', {
            embeddingDimension: 64,
            iterations: 10,
            walkLength: 20,
            inOutFactor: 1.0,
            returnFactor: 1.0
        })
    """)

## Fetch the graph embeddings 

In [9]:
# Create an empty list to store the node embeddings
node_embeddings = []

# Compute graph embeddings using Node2Vec algorithm and fetch the embeddings
with driver.session() as session:
    result = session.run("""
        CALL gds.beta.node2vec.stream('link_prediction_graph', {
            embeddingDimension: 64,
            iterations: 10,
            walkLength: 20,
            inOutFactor: 1.0,
            returnFactor: 1.0
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS node_name, gds.util.asNode(nodeId).type AS node_type, embedding
    """)

    # Iterate through the result and append embeddings to the list
    for record in result:
        node_id = record["nodeId"]
        node_name = record["node_name"]
        node_type = record["node_type"]
        embedding = record["embedding"]
        node_embeddings.append((node_id, node_name, node_type, embedding))

# Create a DataFrame to store the node embeddings
columns = ["node_id", "node_name", "node_type", "embedding"]
node_embeddings_df = pd.DataFrame(node_embeddings, columns=columns)

In [10]:
node_embeddings_df.head()

Unnamed: 0,node_id,node_name,node_type,embedding
0,0,Couple unbalance,Explanation,"[-0.00667918287217617, 0.0028545167297124863, ..."
1,1,Dynamic unbalance,Explanation,"[-0.006680582650005817, 0.0055036479607224464,..."
2,2,Overhung unbalance,Explanation,"[-0.006549644749611616, -0.002505231648683548,..."
3,3,Structural looseness,Explanation,"[-0.006379413418471813, 0.0002732594439294189,..."
4,4,Angular misalignment,Explanation,"[-0.006261258386075497, -0.0021973459515720606..."


In [11]:
print("Column names in train_df:", train_df.columns)
print("Column names in node_embeddings_df:", node_embeddings_df.columns)

Column names in train_df: Index(['node_id', 'node_type', 'node_name'], dtype='object')
Column names in node_embeddings_df: Index(['node_id', 'node_name', 'node_type', 'embedding'], dtype='object')


## Similarities

In [12]:
# Separate the nodes of type "component" and nodes of types "failure" and "ambalance"
component_nodes = node_embeddings_df[node_embeddings_df["node_type"] == "Component"]
failure_ambalance_nodes = node_embeddings_df[node_embeddings_df["node_type"].isin(["Failure", "Imbalance"])]

# Extract embeddings for each set of nodes
component_embeddings = component_nodes["embedding"].tolist()
failure_ambalance_embeddings = failure_ambalance_nodes["embedding"].tolist()

# Check if embeddings are not empty
if not component_embeddings or not failure_ambalance_embeddings:
    print("Some embeddings are empty. Check if the Node2Vec computation was successful and if nodes of required types exist.")
else:
    # Compute the similarity between the embeddings using cosine similarity
    similarity_matrix = cosine_similarity(component_embeddings, failure_ambalance_embeddings)

    # Create a DataFrame to store the similarity scores with node names as index and columns
    similarity_df = pd.DataFrame(similarity_matrix, columns=failure_ambalance_nodes["node_name"].tolist(), index=component_nodes["node_name"].tolist())

    # Display the similarity DataFrame with node names
    
similarity_df

Unnamed: 0,Imbalance,Structural fault,Misalignment,looseness,Bearing Fault,Gear Fault
Pump,-0.153682,0.069227,-0.063285,0.263582,-0.040137,-0.060397
Fan,-0.181578,0.055777,0.117058,-0.046323,-0.07977,-0.128679
Engine,0.096125,0.054971,-0.054945,0.155273,0.065945,0.00488


In [13]:
# Find the maximum similarity value for each column
max_similarities = similarity_df.max(axis=0)

# Iterate through each column and find the corresponding row(s) with the maximum value
for column in max_similarities.index:
    max_similarity = max_similarities[column]
    max_rows = similarity_df[similarity_df[column] == max_similarity].index.tolist()
    
    # Print the result
    if len(max_rows) == 1:
        print(f"{max_rows[0]} ===== leads to =====> {column}")
    else:
        rows_str = ", ".join(max_rows)
        print(f"{rows_str} ===== leads to =====>{column}")
    print()

Engine ===== leads to =====> Imbalance

Pump ===== leads to =====> Structural fault

Fan ===== leads to =====> Misalignment

Pump ===== leads to =====> looseness

Engine ===== leads to =====> Bearing Fault

Engine ===== leads to =====> Gear Fault



In [14]:
# Separate the nodes of type "component" and nodes of types "failure" and "ambalance"
component_nodes = node_embeddings_df[node_embeddings_df["node_type"] == "Component"]
failure_ambalance_nodes = node_embeddings_df[node_embeddings_df["node_type"].isin(["Failure", "Imbalance"])]

# Extract embeddings for each set of nodes
component_embeddings = component_nodes["embedding"].tolist()
failure_ambalance_embeddings = failure_ambalance_nodes["embedding"].tolist()

# Check if embeddings are not empty
if not component_embeddings or not failure_ambalance_embeddings:
    print("Some embeddings are empty. Check if the Node2Vec computation was successful and if nodes of required types exist.")
else:
    # Compute the similarity between the embeddings using euclidean distance
    similarity_matrix = euclidean_distances(component_embeddings, failure_ambalance_embeddings)

    # Create a DataFrame to store the similarity scores with node names as index and columns
    similarity_df = pd.DataFrame(similarity_matrix, columns=failure_ambalance_nodes["node_name"].tolist(), index=component_nodes["node_name"].tolist())

    # Display the similarity DataFrame with node names
similarity_df

Unnamed: 0,Imbalance,Structural fault,Misalignment,looseness,Bearing Fault,Gear Fault
Pump,0.052303,0.045747,0.050717,0.043687,0.049941,0.051463
Fan,0.054672,0.047695,0.047726,0.053639,0.052547,0.054765
Engine,0.047006,0.046841,0.051274,0.04743,0.048042,0.050577


In [15]:
# Find the maximum similarity value for each column
max_similarities = similarity_df.max(axis=0)

# Iterate through each column and find the corresponding row(s) with the maximum value
for column in max_similarities.index:
    max_similarity = max_similarities[column]
    max_rows = similarity_df[similarity_df[column] == max_similarity].index.tolist()
    
    # Print the result
    if len(max_rows) == 1:
        print(f"{max_rows[0]} ===== leads to =====> {column}")
    else:
        rows_str = ", ".join(max_rows)
        print(f"{rows_str} ===== leads to =====> {column}")
    print()

Fan ===== leads to =====> Imbalance

Fan ===== leads to =====> Structural fault

Engine ===== leads to =====> Misalignment

Fan ===== leads to =====> looseness

Fan ===== leads to =====> Bearing Fault

Fan ===== leads to =====> Gear Fault



In [33]:
session.close()
driver.close()