In [10]:
import rdflib
import networkx as nx
from node2vec import Node2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [3]:
kg_path = "./defect_knowledge_graph_final.ttl"  
g = rdflib.Graph()
g.parse(kg_path, format="turtle")

# Convert RDF Graph to NetworkX Graph
G = nx.Graph()
for subj, pred, obj in g:
    G.add_edge(str(subj), str(obj), label=str(pred))

print(f"Graph Loaded: {len(G.nodes())} nodes, {len(G.edges())} edges")

Graph Loaded: 43628 nodes, 114974 edges


In [5]:
# Generate node embeddings from the KG
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Convert embeddings to DataFrame
embedding_df = pd.DataFrame([model.wv.get_vector(str(node)) for node in G.nodes], index=G.nodes)
embedding_df.reset_index(inplace=True)
embedding_df.rename(columns={"index": "ModuleID"}, inplace=True)

print(embedding_df.head())

Computing transition probabilities: 100%|████████████████████████████████████████| 43628/43628 [28:26<00:00, 25.57it/s]


PicklingError: Could not pickle the task to send it to the workers.

In [12]:
# Extract defect labels from KG
defect_labels = []
for subj, pred, obj in g:
    if "hasDefect" in str(pred):  # Relationship between CodeModule and Defect
        defect_labels.append({"ModuleID": str(subj), "DefectStatus": 1})  # 1 = Defective

# Convert to DataFrame
defect_df = pd.DataFrame(defect_labels)

# Merge embeddings with labels
df = embedding_df.merge(defect_df, on="ModuleID", how="left")
df["DefectStatus"].fillna(0, inplace=True)  # Modules without defects are non-defective

print(df.head())


NameError: name 'embedding_df' is not defined

In [57]:
# Select features and target
features = [col for col in df.columns if col not in ["DefectStatus", "ModuleID"]]
target = "DefectStatus"

# Split data
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

In [59]:
# Get feature importance
importance = rf_model.feature_importances_
feature_names = features

# Plot importance
plt.figure(figsize=(12, 6))
sns.barplot(x=importance, y=feature_names)
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Random Forest Feature Importance")
plt.show()