In [45]:
%pip install transformers
%pip install torch
%pip install pandas
%pip install scikit-learn
%pip install matplotlib
%pip install jupyter ipywidgets


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting jupyter
  Downloading jupyter-1.0.0-py2.py3-none-any.whl.metadata (995 bytes)
Collecting ipywidgets
  Downloading ipywidgets-8.1.3-py3-none-any.whl.metadata (2.4 kB)
Collecting notebook (from jupyter)
  Downloading notebook-7.2.0-py3-none-any.whl.metadata (10 kB)
Collecting qtconsole (from jupyter)
  Downloading qtconsole-5.5.2-py3-none-any.whl.metadata (5.1 kB)
Collecting jupyter-console (from jupyter)
  Downloading jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Downloading nbconvert-7.16.4-py3-none-any.whl.metadata (8.5 kB)
Collecting widgetsnbextension~=4.0.11 (from ipywidgets)
  Downloading widgetsnb

In [57]:
from pymongo import MongoClient
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertConfig
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [47]:
client = MongoClient('mongodb://localhost:27017/')
db = client['Interaction_and_Contract_State_Vulnerabilities']

vulnerabilities = ['reentrancy', 'delegatecall', 'unchecked_external_call', 'unchecked_send']
labels = {vulnerability: i for i, vulnerability in enumerate(vulnerabilities)}
data = []

def fetch_data_from_collection(collection_name, label):
    collection = db[collection_name]
    documents = collection.find({})
    count = 0  # Debug counter
    for doc in documents:
        if 'extract_feature' in doc:
            for feature in doc['extract_feature']:
                tokens = feature.get('tokens', [])
                if tokens:  # Ensure tokens are not empty
                    tokens = [str(token) for token in tokens]  # Convert all tokens to strings
                    data.append((tokens, label))
                    count += 1
    print(f"Fetched {count} records from collection {collection_name}")  # Debug statement

for vulnerability in vulnerabilities:
    fetch_data_from_collection(vulnerability, labels[vulnerability])

# Kiểm tra xem dữ liệu có được lấy đúng hay không
print(f"Number of samples: {len(data)}")
if len(data) > 0:
    print(f"Sample data: {data[1]}")

Fetched 3850 records from collection reentrancy
Fetched 3750 records from collection delegatecall
Fetched 1032 records from collection unchecked_external_call
Fetched 1268 records from collection unchecked_send
Number of samples: 9900
Sample data: (['function', 'confirmOwner', '(', ')', 'public', '{', 'require', '(', 'newOwner', '=', '=', 'msg', '.', 'sender', ')', ';', 'owner', '=', 'newOwner', ';', 'delete', 'newOwner', ';', '}'], 0)


In [49]:
class SolidityDataset(Dataset):
    def __init__(self, data, max_len):
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_ids, label = self.data[index]

        # Debug: Check the data types and structure
        print(f"Original input_ids: {input_ids}")
        
        # Ensure input_ids are integers
        input_ids = [int(token) for token in input_ids if token.isdigit()]

        # Debug: Check the transformed input_ids
        print(f"Transformed input_ids: {input_ids}")

        # Padding and truncation
        padding_length = self.max_len - len(input_ids)
        if padding_length > 0:
            input_ids = input_ids + [0] * padding_length
            attention_mask = [1] * len(input_ids) + [0] * padding_length
        else:
            input_ids = input_ids[:self.max_len]
            attention_mask = [1] * self.max_len
        
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Khởi tạo các biến cần thiết
MAX_LEN = 64
BATCH_SIZE = 8

# Chia dữ liệu thành tập huấn luyện và kiểm tra
if len(data) > 0:  # Ensure data is not empty
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_dataset = SolidityDataset(train_data, MAX_LEN)
    test_dataset = SolidityDataset(test_data, MAX_LEN)

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
else:
    print("No data available to split into training and test sets.")

In [50]:
class CustomBERTModel(torch.nn.Module):
    def __init__(self, config, num_classes):
        super(CustomBERTModel, self).__init__()
        self.bert = BertModel(config)
        self.classifier = torch.nn.Linear(config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

# Thiết lập cấu hình tùy chỉnh cho BERT
config = BertConfig(
    hidden_size=768,
    num_attention_heads=4,
    num_hidden_layers=12,
    intermediate_size=3072,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1
)

# Số lớp phân loại
NUM_CLASSES = len(vulnerabilities)

model = CustomBERTModel(config, NUM_CLASSES)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Các siêu tham số
EPOCHS = 15
LEARNING_RATE = 2e-3
EPSILON = 1e-5

# Optimizer và scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Loss function
criterion = torch.nn.CrossEntropyLoss()

In [58]:
import tqdm


def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", unit="batch")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix({"Loss": loss.item()})
    return total_loss / len(dataloader)


In [59]:
def eval_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    all_labels = []
    all_preds = []
    progress_bar = tqdm(dataloader, desc="Evaluating", unit="batch")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            progress_bar.set_postfix({"Loss": loss.item(), "Accuracy": correct_predictions.item() / len(dataloader.dataset)})
    return total_loss / len(dataloader), correct_predictions.double() / len(dataloader.dataset), all_labels, all_preds

In [60]:
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    train_loss = train_epoch(model, train_dataloader, criterion, optimizer, device)
    val_loss, val_acc, _, _ = eval_model(model, test_dataloader, criterion, device)
    scheduler.step()
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

Exception ignored in: <function tqdm.__del__ at 0x000002755B2FEAC0>
Traceback (most recent call last):
  File "d:\GitHub\Blockchain-Smart-Contract-Security\.venv\Lib\site-packages\tqdm\std.py", line 1149, in __del__
    self.close()
  File "d:\GitHub\Blockchain-Smart-Contract-Security\.venv\Lib\site-packages\tqdm\notebook.py", line 278, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Epoch 1/15


TypeError: 'module' object is not callable

In [54]:
test_loss, test_acc, test_labels, test_preds = eval_model(model, test_dataloader, criterion, device)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

cm = confusion_matrix(test_labels, test_preds)
plt.figure(figsize=(8, 8))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.colorbar()
tick_marks = np.arange(len(set(test_labels)))
plt.xticks(tick_marks, set(test_labels), rotation=45)
plt.yticks(tick_marks, set(test_labels))
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()

# Cell 9: Hàm xuất các token trong dataset
def export_tokens(dataset, filename):
    with open(filename, 'w') as f:
        for item in dataset:
            tokens, label = item['input_ids'].tolist(), item['label'].item()
            token_str = ' '.join(map(str, tokens))
            f.write(f"{token_str}\t{label}\n")

# Xuất các token trong tập huấn luyện và tập kiểm tra
if len(data) > 0:  # Ensure data is not empty
    export_tokens(train_dataset, 'train_tokens.txt')
    export_tokens(test_dataset, 'test_tokens.txt')
else:
    print("No data available to export.")

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html