# **Graph Neural Networks**

In [7]:
%pip install torch_geometric

Note: you may need to restart the kernel to use updated packages.


In [8]:
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import DataLoader
from sklearn.decomposition import PCA
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import os
from torch.optim.lr_scheduler import StepLR

In [9]:
# Load the ISBSG dataset
df = pd.read_csv("ISBSG.csv")

# Preprocess the dataset
# Dropping the first column 'Project' as it is not needed for the analysis
df = df.drop('Project ID', axis=1)
df = df.drop('Data Quality Rating', axis=1)

# Split the dataset into features and labels
features = df.drop('Summary Work Effort', axis=1)
labels = df['Summary Work Effort']

# Use PCA to reduce the dimensionality of the features
pca = PCA(n_components=10) # set the number of components to keep
features = pca.fit_transform(features)


# Normalize the features
scaler = StandardScaler()
features = scaler.fit_transform(features)

# Converting the dataset to a NetworkX graph
# Here, we create a fully connected graph where edge weights are the cosine similarity between feature vectors


G = nx.Graph()



###**DO NOT RUN THIS CELL** 
if you have already converted the dataset to a graph

In [None]:
for i in range(len(features)):
    for j in range(i+1, len(features)):
        similarity = cosine_similarity(features[i].reshape(1, -1), features[j].reshape(1, -1))
        G.add_edge(i, j, weight=similarity[0][0])

# Save the graph to a file
with open("graph.pkl", "wb") as f:
    pickle.dump(G, f)

In [13]:
# Save the graph to a file
with open("C:\Work\Estimation-of-Software-Development-Effort-main\graph.pkl", "wb") as f:
    pickle.dump(G, f)

In [14]:
# Convert NetworkX graph to PyTorch Geometric Data
edge_index = torch.tensor(list(G.edges), dtype=torch.long).t().contiguous()
x = torch.tensor(features, dtype=torch.float)
y = torch.tensor(labels.values, dtype=torch.float).view(-1, 1)

data = Data(x=x, edge_index=edge_index, y=y)

# Define the GNN architecture
class GNN(torch.nn.Module):
    def __init__(self):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(data.num_features, 64)
        self.conv2 = GCNConv(64, 64)
        self.conv3 = GCNConv(64, 32)
        self.conv4 = GCNConv(32, 1)
        

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.conv3(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.conv4(x, edge_index)

        return x


# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")


# Create the model, optimizer, and data loader
model = GNN().to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=0.004)
scheduler = StepLR(optimizer, step_size=30, gamma=0.001)
loader = DataLoader([data], batch_size=32, shuffle=True)

# Train the GNN model
def train():
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(device)

        optimizer.zero_grad()
        out = model(data)
        loss = F.mse_loss(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Train the model for multiple epochs
for epoch in range(200):
    loss = train()
    scheduler.step()
    print(f"Epoch: {epoch+1}, Loss: {loss:.4f}")

Using CPU
Epoch: 1, Loss: 505026976.0000
Epoch: 2, Loss: 505027168.0000




Epoch: 3, Loss: 505026432.0000
Epoch: 4, Loss: 505022944.0000
Epoch: 5, Loss: 505024032.0000
Epoch: 6, Loss: 505017056.0000
Epoch: 7, Loss: 505022720.0000
Epoch: 8, Loss: 505016576.0000
Epoch: 9, Loss: 505017536.0000
Epoch: 10, Loss: 505016256.0000
Epoch: 11, Loss: 505012096.0000
Epoch: 12, Loss: 505008000.0000
Epoch: 13, Loss: 505003744.0000
Epoch: 14, Loss: 504990752.0000
Epoch: 15, Loss: 504988224.0000
Epoch: 16, Loss: 504991168.0000
Epoch: 17, Loss: 504988320.0000
Epoch: 18, Loss: 504988416.0000
Epoch: 19, Loss: 504947328.0000
Epoch: 20, Loss: 504977120.0000
Epoch: 21, Loss: 504974016.0000
Epoch: 22, Loss: 504965344.0000
Epoch: 23, Loss: 504904544.0000
Epoch: 24, Loss: 504920352.0000
Epoch: 25, Loss: 504918112.0000
Epoch: 26, Loss: 504887264.0000
Epoch: 27, Loss: 504886720.0000
Epoch: 28, Loss: 504927776.0000
Epoch: 29, Loss: 504849280.0000
Epoch: 30, Loss: 504893024.0000
Epoch: 31, Loss: 504732512.0000
Epoch: 32, Loss: 504843104.0000
Epoch: 33, Loss: 504786112.0000
Epoch: 34, Loss

In the above code, `loss` is the objective that the GNN model is trying to minimize during training. Specifically, it's the Mean Squared Error (MSE) loss between the predicted effort values (`out`) and the actual effort values (`data.y`). The goal of the training process is to find the model parameters that minimize the MSE loss.

Here's a brief breakdown of the key lines related to the loss:

```python
out = model(data)  # Get predicted effort values from the GNN model
loss = F.mse_loss(out, data.y)  # Compute the MSE loss between predictions and actual effort values
loss.backward()  # Compute the gradients of the loss with respect to the model parameters
optimizer.step()  # Update the model parameters using the computed gradients
```

During each training epoch, the model processes the data, makes predictions, and updates its parameters based on the gradients of the loss function to reduce the error in the predictions. The `loss` variable is used to keep track of how well the model is performing during training and to provide feedback on the learning progress.


In [15]:
# Set the model to evaluation mode
model.eval()

# Move the input data to the same device as the model
data = data.to(device)

# Get the predicted effort values
with torch.no_grad():
    predicted_effort = model(data)

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Calculate the MAE and RMSE
mae = mean_absolute_error(data.y.cpu().numpy(), predicted_effort.cpu().numpy())
rmse = np.sqrt(mean_squared_error(data.y.cpu().numpy(), predicted_effort.cpu().numpy()))

# Print the MAE and RMSE
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


Mean Absolute Error (MAE): 6043.5483
Root Mean Squared Error (RMSE): 22466.8828


In [16]:
def mean_magnitude_relative_error(y_true, y_pred):
    relative_errors = np.abs(y_true - y_pred) / y_true
    return np.mean(relative_errors)

# Calculate the MMRE
mmre = mean_magnitude_relative_error(data.y.cpu().numpy(), predicted_effort.cpu().numpy())

# Print the MMRE
print(f"Mean Magnitude of Relative Error (MMRE): {mmre:.4f}")

Mean Magnitude of Relative Error (MMRE): 0.9713


To improve the model's performance, you can try:

1. Increase the number of training epochs or adjust the learning rate.
2. Experiment with different GNN architectures or other machine learning models.
3. Fine-tune the process of converting the dataset into a graph representation.
4. Perform feature engineering, feature selection, or dimensionality reduction to improve the quality of the input features.
5. Divide your dataset into training and testing sets to better evaluate the model's performance and avoid overfitting.