## Importing Libraries

In [60]:
import torch
import torch.nn as nn
import torch.optim as optim

In [61]:
import pandas as pd
import numpy as np
import networkx as nx
from neo4j import GraphDatabase
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

## Connecting to the neo4j database

In [63]:
uri = "bolt://localhost:7687"
username = "neo4j"
password = "OLIV00%%"

driver = GraphDatabase.driver(uri, auth=(username, password))

## Retrieving the graph data from Neo4j and dataset preparation

In [64]:
graph = nx.Graph()

with driver.session() as session:
    # Récupérer les nœuds avec leur nom et type
    nodes_query = "MATCH (n) RETURN ID(n) AS id, n.name AS name, n.type AS type"
    nodes_result = session.run(nodes_query)

    # Parcourir les résultats et ajouter les nœuds au graphe
    for record in nodes_result:
        node_id = record["id"]
        node_name = record["name"]
        node_type = record["type"]
        if node_id is not None:
            graph.add_node(node_id, name=node_name, type=node_type)

    # Récupérer les relations
    relations_query = "MATCH ()-[r]->() RETURN ID(startNode(r)) AS source_id, ID(endNode(r)) AS target_id"
    relations_result = session.run(relations_query)

    # Ajouter les relations entre les nœuds
    for record in relations_result:
        source_node_id = record["source_id"]
        target_node_id = record["target_id"]
        if source_node_id is not None and target_node_id is not None:
            graph.add_edge(source_node_id, target_node_id)

In [65]:
# Convert the graph to a pandas DataFrame
df = pd.DataFrame(graph.nodes(data=True), columns=["node_id", "attributes"])
df["node_type"] = df["attributes"].apply(lambda x: x.get("type"))
df["node_name"] = df["attributes"].apply(lambda x: x.get("name"))

# Drop the unnecessary 'attributes' column
df.drop(columns=["attributes"], inplace=True)

# Check if the DataFrame is empty
if df.empty:
    raise ValueError("The dataset is empty. Please check your Neo4j query and data.")

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [66]:
df.head()

Unnamed: 0,node_id,node_type,node_name
0,0,fftvInterval,Iv1
1,1,fftvInterval,Iv2
2,2,fftvInterval,Iv3
3,3,fftvInterval,Iv4
4,4,fftvInterval,Iv5


## Generating graph embeddings using Neo4j's GDS library

In [67]:
with driver.session() as session:
    # Create a graph projection for link prediction using GDS
    session.run("CALL gds.graph.project('link_prediction_graph', '*', '*')")

In [68]:
with driver.session() as session:
    # Compute graph embeddings using Node2Vec algorithm
    session.run("""
        CALL gds.beta.node2vec.stream('link_prediction_graph', {
            embeddingDimension: 64,
            iterations: 10,
            walkLength: 20,
            inOutFactor: 1.0,
            returnFactor: 1.0
        })
    """)

## Fetch the graph embeddings 

In [69]:
# Create an empty list to store the node embeddings
node_embeddings = []

# Compute graph embeddings using Node2Vec algorithm and fetch the embeddings
with driver.session() as session:
    result = session.run("""
        CALL gds.beta.node2vec.stream('link_prediction_graph', {
            embeddingDimension: 64,
            iterations: 10,
            walkLength: 20,
            inOutFactor: 1.0,
            returnFactor: 1.0
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS node_name, gds.util.asNode(nodeId).type AS node_type, embedding
    """)

    # Iterate through the result and append embeddings to the list
    for record in result:
        node_id = record["nodeId"]
        node_name = record["node_name"]
        node_type = record["node_type"]
        embedding = record["embedding"]
        node_embeddings.append((node_id, node_name, node_type, embedding))

# Create a DataFrame to store the node embeddings
columns = ["node_id", "node_name", "node_type", "embedding"]
node_embeddings_df = pd.DataFrame(node_embeddings, columns=columns)

In [70]:
node_embeddings_df.head()

Unnamed: 0,node_id,node_name,node_type,embedding
0,0,Iv1,fftvInterval,"[-0.0012790472246706486, 0.0038480537477880716..."
1,1,Iv2,fftvInterval,"[-0.0007773544639348984, -0.006434565410017967..."
2,2,Iv3,fftvInterval,"[-0.0006931481184437871, -0.002473310101777315..."
3,3,Iv4,fftvInterval,"[-0.0011693515116348863, 0.002924727974459529,..."
4,4,Iv5,fftvInterval,"[-0.0010751865338534117, -0.010924862697720528..."


In [71]:
print("Column names in train_df:", train_df.columns)
print("Column names in node_embeddings_df:", node_embeddings_df.columns)

Column names in train_df: Index(['node_id', 'node_type', 'node_name'], dtype='object')
Column names in node_embeddings_df: Index(['node_id', 'node_name', 'node_type', 'embedding'], dtype='object')


## Similarities

In [72]:
# Separate the nodes of type "component" and nodes of types "failure" and "ambalance"
component_nodes = node_embeddings_df[node_embeddings_df["node_type"] == "Component"]
failure_ambalance_nodes = node_embeddings_df[node_embeddings_df["node_type"].isin(["Failure", "Imbalance"])]

# Extract embeddings for each set of nodes
component_embeddings = component_nodes["embedding"].tolist()
failure_ambalance_embeddings = failure_ambalance_nodes["embedding"].tolist()

# Check if embeddings are not empty
if not component_embeddings or not failure_ambalance_embeddings:
    print("Some embeddings are empty. Check if the Node2Vec computation was successful and if nodes of required types exist.")
else:
    # Compute the similarity between the embeddings using cosine similarity
    similarity_matrix = cosine_similarity(component_embeddings, failure_ambalance_embeddings)

    # Create a DataFrame to store the similarity scores with node names as index and columns
    similarity_df = pd.DataFrame(similarity_matrix, columns=failure_ambalance_nodes["node_name"].tolist(), index=component_nodes["node_name"].tolist())

    # Display the similarity DataFrame with node names
    
similarity_df

Unnamed: 0,Imbalance,Structural fault,Misalignment,looseness,Bearing Fault,Gear Fault
Pump,0.280931,0.243133,0.351113,0.348986,0.155556,0.371646
Fan,0.290364,0.386825,0.480342,0.568085,0.573036,0.536192
Engine,0.396605,0.393788,0.324915,0.639001,0.460977,0.383681


In [73]:
# Find the maximum similarity value for each column
max_similarities = similarity_df.max(axis=0)

# Iterate through each column and find the corresponding row(s) with the maximum value
for column in max_similarities.index:
    max_similarity = max_similarities[column]
    max_rows = similarity_df[similarity_df[column] == max_similarity].index.tolist()
    
    # Print the result
    if len(max_rows) == 1:
        print(f"{max_rows[0]} ===== leads to =====> {column}")
    else:
        rows_str = ", ".join(max_rows)
        print(f"{rows_str} ===== leads to =====>{column}")
    print()

Engine ===== leads to =====> Imbalance

Engine ===== leads to =====> Structural fault

Fan ===== leads to =====> Misalignment

Engine ===== leads to =====> looseness

Fan ===== leads to =====> Bearing Fault

Fan ===== leads to =====> Gear Fault



In [74]:
# Separate the nodes of type "component" and nodes of types "failure" and "ambalance"
component_nodes = node_embeddings_df[node_embeddings_df["node_type"] == "Component"]
failure_ambalance_nodes = node_embeddings_df[node_embeddings_df["node_type"].isin(["Failure", "Imbalance"])]

# Extract embeddings for each set of nodes
component_embeddings = component_nodes["embedding"].tolist()
failure_ambalance_embeddings = failure_ambalance_nodes["embedding"].tolist()

# Check if embeddings are not empty
if not component_embeddings or not failure_ambalance_embeddings:
    print("Some embeddings are empty. Check if the Node2Vec computation was successful and if nodes of required types exist.")
else:
    # Compute the similarity between the embeddings using euclidean distance
    similarity_matrix = euclidean_distances(component_embeddings, failure_ambalance_embeddings)

    # Create a DataFrame to store the similarity scores with node names as index and columns
    similarity_df = pd.DataFrame(similarity_matrix, columns=failure_ambalance_nodes["node_name"].tolist(), index=component_nodes["node_name"].tolist())

    # Display the similarity DataFrame with node names
similarity_df

Unnamed: 0,Imbalance,Structural fault,Misalignment,looseness,Bearing Fault,Gear Fault
Pump,0.053,0.053052,0.054528,0.048762,0.059751,0.05366
Fan,0.057632,0.052623,0.052484,0.044053,0.046166,0.049566
Engine,0.053329,0.052492,0.059957,0.040529,0.052007,0.057265


In [75]:
# Find the maximum similarity value for each column
max_similarities = similarity_df.max(axis=0)

# Iterate through each column and find the corresponding row(s) with the maximum value
for column in max_similarities.index:
    max_similarity = max_similarities[column]
    max_rows = similarity_df[similarity_df[column] == max_similarity].index.tolist()
    
    # Print the result
    if len(max_rows) == 1:
        print(f"{max_rows[0]} ===== leads to =====> {column}")
    else:
        rows_str = ", ".join(max_rows)
        print(f"{rows_str} ===== leads to =====> {column}")
    print()

Fan ===== leads to =====> Imbalance

Pump ===== leads to =====> Structural fault

Engine ===== leads to =====> Misalignment

Pump ===== leads to =====> looseness

Pump ===== leads to =====> Bearing Fault

Engine ===== leads to =====> Gear Fault



NameError: name 'adj_matrix' is not defined

In [59]:
# Exemple d'embeddings de nœuds (remplacez cela par vos embeddings réels)
node_embeddings = torch.rand(100, 128)

# Génération des données pour la prédiction de liens
num_nodes = node_embeddings.shape[0]
X = torch.zeros((num_nodes * (num_nodes - 1) // 2, node_embeddings.shape[1]))
y = torch.zeros(num_nodes * (num_nodes - 1) // 2, dtype=torch.float)
count = 0
for i in range(num_nodes):
    for j in range(i + 1, num_nodes):
        X[count] = torch.abs(node_embeddings[i] - node_embeddings[j])  # Exemple de caractéristique : distance euclidienne
        y[count] = 1 if are_linked(i, j) else 0  # Remplacez are_linked(i, j) par votre fonction pour déterminer si les nœuds sont liés
        count += 1

# Définition du modèle d'apprentissage profond
class DeepModel(nn.Module):
    def __init__(self, input_dim):
        super(DeepModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.out = nn.Linear(32, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.out(x)
        return x

# Initialize the deep model
model = DeepModel(node_embeddings.shape[1])

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()
    output = model(X)
    loss = criterion(output.squeeze(), y)
    loss.backward()
    optimizer.step()

# Evaluation du modèle sur l'ensemble de test
with torch.no_grad():
    test_output = model(X_test)
    test_predictions = (test_output.squeeze() >= 0.5).float()
    accuracy = (test_predictions == y_test).float().mean()
print("Accuracy: ", accuracy.item())


NameError: name 'adj_matrix' is not defined

In [33]:
session.close()
driver.close()