In [11]:
!pip install --upgrade jupyter jupyterlab ipywidgets

Collecting jupyterlab
  Downloading jupyterlab-4.2.1-py3-none-any.whl.metadata (16 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.3-py3-none-any.whl.metadata (2.4 kB)
Collecting httpx>=0.25.0 (from jupyterlab)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting jupyterlab-server<3,>=2.27.1 (from jupyterlab)
  Downloading jupyterlab_server-2.27.2-py3-none-any.whl.metadata (5.9 kB)
Collecting comm>=0.1.3 (from ipywidgets)
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting widgetsnbextension~=4.0.11 (from ipywidgets)
  Downloading widgetsnbextension-4.0.11-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.11 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.11-py3-none-any.whl.metadata (4.1 kB)
Collecting httpcore==1.* (from httpx>=0.25.0->jupyterlab)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx>=0.25.0->jupyterlab)
  Downloading h11-0.14.0-py3-

In [12]:
# Rebuild the JupyterLab application assets by shelling out to the `jupyter` CLI.
# NOTE(review): the output recorded for this cell reports "Build failed" and shows
# JupyterLab 4.0.13 under /opt/anaconda3 -- presumably a different installation than
# the one pip upgraded in the previous cell; confirm which environment is in use.
!jupyter lab build

[LabBuildApp] JupyterLab 4.0.13
[LabBuildApp] Building in /opt/anaconda3/share/jupyter/lab
[LabBuildApp] ERROR | Build failed.
Troubleshooting: If the build failed due to an out-of-memory error, you
may be able to fix it by disabling the `dev_build` and/or `minimize` options.

If you are building via the `jupyter lab build` command, you can disable
these options like so:

jupyter lab build --dev-build=False --minimize=False

You can also disable these options for all JupyterLab builds by adding these
lines to a Jupyter config file named `jupyter_config.py`:

c.LabBuildApp.minimize = False
c.LabBuildApp.dev_build = False

If you don't already have a `jupyter_config.py` file, you can create one by
adding a blank file of that name to any of the Jupyter config directories.
The config directories can be listed by running:

jupyter --paths

Explanation:

- `dev-build`: This option controls whether a `dev` or a more streamlined
`production` build is used. This option will default to `False` (

# Libraries


In [22]:
!pip install torch torchvision torchaudio
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-<torch_version>+<cpu_or_cuda>.html
!pip install torch-geometric
!pip install transformers
import re
import torch
import pandas as pd
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import json
import numpy as np

zsh:1: no such file or directory: torch_version


# Utilities


In [23]:
# AST parsing helper
def parse_ast(ast_text):
    """Turn an indented, tree-style AST dump into node and edge lists.

    Every non-blank line becomes one node. The depth of a line is the width of
    its leading run of whitespace and tree-drawing characters (``|``, `````,
    ``-``), so both plain indentation and ``|-`` / `````-`` style prefixes are
    understood. A node's parent is the closest preceding line with a strictly
    smaller depth.

    Args:
        ast_text: The AST dump as a single string, one node per line.

    Returns:
        A ``(nodes, edges)`` tuple. ``nodes`` is a list of ``(index, label)``
        pairs in line order, where ``label`` is the stripped line (any
        tree-drawing prefix is kept in the label). ``edges`` is a list of
        ``(parent_index, child_index)`` pairs. Both lists are empty for blank
        input.
    """
    nodes = []
    edges = []
    # Chain of (depth, node_index) from the root down to the previous node.
    ancestors = []

    for line in ast_text.split("\n"):
        if not line.strip():
            continue

        node_index = len(nodes)
        depth = len(line) - len(line.lstrip(" \t|`-"))
        nodes.append((node_index, line.strip()))

        # Drop everything that is not a proper ancestor of the current line
        # (siblings and deeper nodes of an already finished subtree).
        while ancestors and ancestors[-1][0] >= depth:
            ancestors.pop()

        if ancestors:
            edges.append((ancestors[-1][1], node_index))

        ancestors.append((depth, node_index))

    return nodes, edges

# BERT encoder used to embed AST node labels
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# The encoder is bound to its own name: the notebook later assigns `model = GCN()`,
# which would otherwise silently replace the encoder used by create_bert_embedding.
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()


def create_bert_embedding(nodes, batch_size=32):
    """Embed every node label with BERT (mean over the label's tokens).

    Args:
        nodes: Iterable of ``(index, label)`` pairs as returned by ``parse_ast``.
        batch_size: Number of labels encoded per forward pass.

    Returns:
        A float tensor with one row per node and one column per BERT hidden
        unit; it has zero rows when ``nodes`` is empty.
    """
    labels = [label for _, label in nodes]
    if not labels:
        return torch.empty((0, bert_model.config.hidden_size), dtype=torch.float)

    pooled_chunks = []
    # Inference only: no_grad avoids building an autograd graph for every label.
    with torch.no_grad():
        for start in range(0, len(labels), batch_size):
            inputs = tokenizer(
                labels[start:start + batch_size],
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=128,
            )
            hidden = bert_model(**inputs).last_hidden_state
            # Mask out padding positions so each label is averaged over its own
            # tokens only, as it was when labels were encoded one at a time.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            pooled_chunks.append((hidden * mask).sum(dim=1) / mask.sum(dim=1))

    return torch.cat(pooled_chunks, dim=0).float()

# Build the graph object for one sample
def create_data(nodes, edges, embeddings, label):
    """Assemble a ``Data`` graph from parsed edges, node features and a label.

    Args:
        nodes: Node list of the graph (not read here; kept for call
            compatibility).
        edges: List of ``(parent_index, child_index)`` pairs; may be empty.
        embeddings: Node feature tensor stored as ``x``.
        label: Integer class label stored as a one-element ``y`` tensor.

    Returns:
        A ``Data`` object with ``x``, ``edge_index`` and ``y`` set.
    """
    # reshape(-1, 2) keeps an empty edge list two-dimensional, so the transpose
    # yields shape (2, 0) instead of a 1-D tensor for graphs without edges.
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
    y = torch.tensor([label], dtype=torch.long)
    return Data(x=embeddings, edge_index=edge_index, y=y)

# Dataset


In [24]:
file_path = 'Dataset/DiverseVul_AST_primo.json'
with open(file_path, 'r', encoding='utf-8') as file:
    records = json.load(file)

df = pd.DataFrame(records)

# Build the dataset: one graph per row. The loaded records and the per-row graph
# use distinct names, so neither is overwritten by the other while the loop runs.
dataset = []
for ast_text, target in zip(df['func'], df['target']):
    nodes, edges = parse_ast(ast_text)
    embeddings = create_bert_embedding(nodes)
    dataset.append(create_data(nodes, edges, embeddings, target))

# Split the dataset into train, validation and test sets
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

KeyboardInterrupt: 

# Model


In [None]:
# GCN model definition
class GCN(torch.nn.Module):
    """Two-layer GCN followed by per-graph mean pooling and a linear classifier.

    ``forward`` returns one row of log-probabilities (two classes) per graph
    in the input.
    """

    def __init__(self):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(768, 32)  # input width 768 matches the BERT embedding size
        self.conv2 = GCNConv(32, 64)
        self.fc = torch.nn.Linear(64, 2)

    def forward(self, data):
        """Classify every graph contained in ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.
                If it also exposes ``batch`` (graph id of each node), nodes
                are pooled per graph; otherwise all nodes are treated as one
                graph.

        Returns:
            Tensor of shape ``(num_graphs, 2)`` with log-probabilities.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))

        batch = getattr(data, 'batch', None)
        if batch is None:
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)
        num_graphs = getattr(data, 'num_graphs', None)
        if num_graphs is None:
            num_graphs = int(batch.max().item()) + 1 if batch.numel() > 0 else 1

        # Mean-pool node features graph by graph. Averaging over dim 0 of the
        # whole mini-batch would mix nodes of different graphs and produce a
        # single 1-D vector, on which log_softmax(dim=1) cannot operate.
        sums = x.new_zeros((num_graphs, x.size(1))).index_add(0, batch, x)
        counts = torch.bincount(batch, minlength=num_graphs).clamp(min=1)
        x = sums / counts.unsqueeze(1).to(x.dtype)

        x = self.fc(x)
        return F.log_softmax(x, dim=1)

# Instantiate the GCN classifier. NOTE: this binds the global name `model`; anything
# an earlier cell bound to `model` is no longer reachable under that name afterwards.
model = GCN()
# Adam over all GCN parameters, learning rate 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# NOTE(review): GCN.forward already ends in log_softmax, and CrossEntropyLoss applies
# log-softmax to its input as well. The second application looks redundant rather
# than harmful -- confirm whether NLLLoss (or returning raw logits) was intended.
criterion = torch.nn.CrossEntropyLoss()

def train(loader):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Uses the module-level ``model``, ``optimizer`` and ``criterion``.
    """
    model.train()
    batch_losses = []
    for batch in loader:
        optimizer.zero_grad()
        loss = criterion(model(batch), batch.y)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Average over the number of batches, not the number of graphs.
    return sum(batch_losses) / len(loader)

def evaluate(loader):
    """Return the accuracy of the module-level ``model`` over ``loader``.

    Accuracy is the number of predictions equal to ``y`` divided by the number
    of labels seen; gradients are disabled while predicting.
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch in loader:
            predicted = model(batch).argmax(dim=1)
            hits += (predicted == batch.y).sum().item()
            seen += batch.y.size(0)
    return hits / seen

# Training


In [None]:
# 200 epochs, numbered from 1; report training loss and validation accuracy after each.
for epoch in range(1, 201):
    train_loss, val_acc = train(train_loader), evaluate(val_loader)
    print(f'Epoch: {epoch}, Train Loss: {train_loss:.4f}, Validation Accuracy: {val_acc:.4f}')

# Test


In [None]:
# Evaluate the trained model on the held-out test loader and report its accuracy
test_acc = evaluate(test_loader)
print(f'Test Accuracy: {test_acc:.4f}')