In [11]:
from pymongo import MongoClient
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, AdamW
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [12]:
# Environment check: report the PyTorch build and whether a CUDA GPU is usable.
print(torch.__version__)
print(torch.cuda.is_available())
# get_device_name(0) raises when no CUDA device is present, so only query the
# device name when CUDA is actually available.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available")
print(torch.version.cuda)


2.3.0+cu118
True
NVIDIA GeForce RTX 2060
11.8


In [13]:
# Cell 2: Connect to MongoDB and query the data
# Connects to a MongoDB server on localhost:27017 (no credentials in the URI).
client = MongoClient('mongodb://localhost:27017/')
db = client['Interaction_and_Contract_State_Vulnerabilities']

# Vulnerability classes. Each name is also used as the name of the collection
# the samples of that class are read from.
vulnerabilities = ['reentrancy', 'delegatecall', 'unchecked_external_call', 'unchecked_send']
# Map each vulnerability name to its integer class id (its position in the list).
labels = {vulnerability: i for i, vulnerability in enumerate(vulnerabilities)}
# Accumulates (text, label) samples; filled by fetch_data_from_collection.
data = []

def fetch_data_from_collection(collection_name, label):
    """Append one (text, label) sample per tokenised feature of a collection.

    Reads every document of ``db[collection_name]``. For each entry of a
    document's ``extract_feature`` field that carries a non-empty ``tokens``
    value, the tokens are joined with single spaces and the pair
    ``(joined_text, label)`` is appended to the module-level ``data`` list.
    Documents without an ``extract_feature`` key are skipped.

    Args:
        collection_name: Name of the collection to read from ``db``.
        label: Value stored as the second element of every appended sample.
    """
    for record in db[collection_name].find({}):
        if 'extract_feature' not in record:
            continue
        for entry in record['extract_feature']:
            token_list = entry.get('tokens', [])
            if not token_list:
                continue
            data.append((' '.join(token_list), label))

# Load the samples of every vulnerability collection, tagged with its class id.
for vulnerability in vulnerabilities:
    fetch_data_from_collection(vulnerability, labels[vulnerability])

print(f"Number of samples: {len(data)}")
if data:
    # Preview sample 8 when it exists; otherwise fall back to the last sample
    # so that a dataset with fewer than nine samples does not raise IndexError.
    print(f"Sample data: {data[min(8, len(data) - 1)]}")

Number of samples: 9900
Sample data: ('function transfer ( address _to , uint256 _value ) public returns ( bool success ) ; function transferFrom ( address _from , address _to , uint256 _value ) public returns ( bool success ) ; function approve ( address _spender , uint256 _value ) public returns ( bool success ) ; function allowance ( address _owner , address _spender ) public constant returns ( uint256 remaining ) ; event Transfer ( address indexed _from , address indexed _to , uint256 _value ) ; event Approval ( address indexed _owner , address indexed _spender , uint256 _value ) ; } contract GTO is ERC20Interface { uint8 public constant decimals = 5 ; string public constant symbol = GTO ; string public constant name = GTO ; bool public _selling = false ; uint256 public _totalSupply = 10 * * 14 ; uint256 public _originalBuyPrice = 45 * 10 * * 7 ; address public owner ; mapping ( address = > uint256 ) private balances ; mapping ( address = > mapping ( address = > uint256 ) ) private

In [14]:
# Cell 3: Define the SolidityDataset class
class SolidityDataset(Dataset):
    """Dataset of ``(text, label)`` pairs that tokenises each text on access.

    Args:
        data: Indexable collection of ``(text, label)`` pairs.
        tokenizer: Object exposing ``encode_plus``; it is called once per
            item lookup.
        max_len: Passed to the tokenizer as ``max_length`` together with
            ``padding='max_length'`` and ``truncation=True``.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data, self.tokenizer, self.max_len = data, tokenizer, max_len

    def __len__(self):
        """Return the number of ``(text, label)`` pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenise sample ``index`` and return its tensors.

        Returns:
            A dict with the flattened ``input_ids`` and ``attention_mask``
            tensors produced by the tokenizer and ``label`` as a
            ``torch.long`` tensor.
        """
        text, label = self.data[index]
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)
        # return_tensors='pt' yields tensors; flatten them for the item dict.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(label, dtype=torch.long)
        return item

In [15]:
# Cell 4: Prepare the data and the dataloaders
# MAX_LEN is handed to SolidityDataset as max_len (the tokenizer's max_length);
# BATCH_SIZE is the batch size of both dataloaders.
MAX_LEN = 128
BATCH_SIZE = 16

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

if data:
    # 80/20 split with a fixed random_state so the split is repeatable.
    # NOTE(review): the split is not stratified by label — confirm the class
    # proportions of train and test are acceptable.
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_dataset = SolidityDataset(train_data, tokenizer, MAX_LEN)
    test_dataset = SolidityDataset(test_data, tokenizer, MAX_LEN)

    # Only the training loader shuffles; the test loader keeps a fixed order.
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
else:
    # In this branch train_dataloader / test_dataloader are never defined, so
    # cells that use them will fail with NameError.
    print("No data available to split into training and test sets.")

In [None]:
# Preview only the first few training samples; displaying the whole list would
# flood the notebook output.
train_data[:5]

In [16]:
# Cell 5: Initialise the model and the training parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch's random number generators before the model is built: the
# classification head is newly initialised (it is not part of the checkpoint)
# and the training dataloader shuffles, so without a seed every run differs.
# The value matches the random_state used for the train/test split.
SEED = 42
torch.manual_seed(SEED)

# Define the hyperparameters of the BERT model
config = BertConfig(
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act='gelu',
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    pad_token_id=0,
    gradient_checkpointing=False,
    position_embedding_type='absolute',
    use_cache=True,
    classifier_dropout=None,
    # One output logit per vulnerability class.
    num_labels=len(vulnerabilities)
)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
model = model.to(device)

EPOCHS = 13
LEARNING_RATE = 2e-5
# NOTE(review): torch.optim.AdamW defaults to eps=1e-8; 1e-4 is far larger and
# damps the adaptive step size — confirm this value is intentional.
EPSILON = 1e-4

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Cell 6: Define the train_epoch and eval_model functions
def train_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Args:
        model: Callable as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes ``.loss``; ``model.train()`` is called first.
        dataloader: Sized iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    bar = tqdm(dataloader, desc="Training", unit="batch")
    for batch in bar:
        optimizer.zero_grad()
        tensors = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        }
        result = model(
            tensors['input_ids'],
            attention_mask=tensors['attention_mask'],
            labels=tensors['label'],
        )
        step_loss = result.loss
        loss_value = step_loss.item()
        running_loss += loss_value
        step_loss.backward()
        optimizer.step()
        bar.set_postfix({"Loss": loss_value})
    return running_loss / len(dataloader)

def eval_model(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` with gradients disabled.

    Args:
        model: Callable as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes ``.loss`` and ``.logits``; ``model.eval()``
            is called first.
        dataloader: Sized iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        A 4-tuple ``(mean_loss, accuracy, all_labels, all_preds)``:
        the summed batch losses divided by ``len(dataloader)``, the fraction
        of evaluated samples predicted correctly (0-dim ``torch.double``
        tensor), and the true / predicted labels in iteration order.

    Raises:
        ValueError: If ``dataloader`` contains no batches.
    """
    if len(dataloader) == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("eval_model received an empty dataloader")

    model.eval()
    total_loss = 0
    correct_predictions = 0
    samples_seen = 0
    all_labels = []
    all_preds = []
    progress_bar = tqdm(dataloader, desc="Evaluating", unit="batch")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            batch_loss = outputs.loss.item()
            total_loss += batch_loss
            # Predicted class = index of the largest logit in each row.
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels).item()
            samples_seen += labels.size(0)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            # Running accuracy over the samples evaluated so far; dividing by
            # the full dataset size would under-report until the last batch.
            progress_bar.set_postfix({
                "Loss": batch_loss,
                "Accuracy": correct_predictions / samples_seen,
            })
    # Kept as a 0-dim double tensor, matching what callers received before.
    accuracy = torch.tensor(correct_predictions / samples_seen, dtype=torch.double)
    return total_loss / len(dataloader), accuracy, all_labels, all_preds

In [18]:
def evaluate_model(true_labels, predicted_labels):
    """Print accuracy and weighted precision, recall and F1 of the predictions.

    All four metrics are computed first and then printed one per line, in the
    order accuracy, precision, recall, F1-score. Nothing is returned.

    Args:
        true_labels: Ground-truth class labels.
        predicted_labels: Predicted class labels, aligned with ``true_labels``.
    """
    weighted = {'average': 'weighted'}
    report = [
        ("Accuracy:", accuracy_score(true_labels, predicted_labels)),
        ("Precision:", precision_score(true_labels, predicted_labels, **weighted)),
        ("Recall:", recall_score(true_labels, predicted_labels, **weighted)),
        ("F1-score:", f1_score(true_labels, predicted_labels, **weighted)),
    ]

    for title, value in report:
        print(title, value)


In [19]:
# Train the model
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    # The per-epoch "validation" pass runs on test_dataloader, i.e. the same
    # data that the final test evaluation below uses.
    # NOTE(review): consider a separate validation split so the test set
    # stays untouched until the final evaluation.
    val_loss, val_acc, _, _ = eval_model(model, test_dataloader, device)
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

# Evaluate the model on the test set
test_loss, test_acc, test_labels, test_preds = eval_model(model, test_dataloader, device)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

# Use evaluate_model to compute the evaluation metrics
print("\nEvaluation Metrics:")
evaluate_model(test_labels, test_preds)

Epoch 1/13


Training: 100%|██████████| 495/495 [02:35<00:00,  3.19batch/s, Loss=0.119] 
Evaluating: 100%|██████████| 124/124 [00:14<00:00,  8.68batch/s, Loss=0.487, Accuracy=0.951] 


Train Loss: 0.2832
Validation Loss: 0.1256, Validation Accuracy: 0.9505
Epoch 2/13


Training: 100%|██████████| 495/495 [03:17<00:00,  2.51batch/s, Loss=0.0807] 
Evaluating: 100%|██████████| 124/124 [00:17<00:00,  7.29batch/s, Loss=0.406, Accuracy=0.952]  


Train Loss: 0.1184
Validation Loss: 0.0931, Validation Accuracy: 0.9515
Epoch 3/13


Training: 100%|██████████| 495/495 [03:10<00:00,  2.60batch/s, Loss=0.155]  
Evaluating: 100%|██████████| 124/124 [00:16<00:00,  7.43batch/s, Loss=0.22, Accuracy=0.957]   


Train Loss: 0.0991
Validation Loss: 0.0855, Validation Accuracy: 0.9571
Epoch 4/13


Training: 100%|██████████| 495/495 [02:53<00:00,  2.86batch/s, Loss=0.112]  
Evaluating: 100%|██████████| 124/124 [00:14<00:00,  8.39batch/s, Loss=0.52, Accuracy=0.958]    


Train Loss: 0.0920
Validation Loss: 0.0845, Validation Accuracy: 0.9581
Epoch 5/13


Training: 100%|██████████| 495/495 [02:49<00:00,  2.92batch/s, Loss=0.173]   
Evaluating: 100%|██████████| 124/124 [00:15<00:00,  7.82batch/s, Loss=0.586, Accuracy=0.958]   


Train Loss: 0.0880
Validation Loss: 0.0833, Validation Accuracy: 0.9576
Epoch 6/13


Training: 100%|██████████| 495/495 [02:49<00:00,  2.92batch/s, Loss=0.0619]  
Evaluating: 100%|██████████| 124/124 [00:14<00:00,  8.33batch/s, Loss=0.179, Accuracy=0.959]   


Train Loss: 0.0856
Validation Loss: 0.0779, Validation Accuracy: 0.9591
Epoch 7/13


Training: 100%|██████████| 495/495 [02:49<00:00,  2.92batch/s, Loss=0.0573]  
Evaluating: 100%|██████████| 124/124 [00:16<00:00,  7.54batch/s, Loss=0.149, Accuracy=0.959]   


Train Loss: 0.0905
Validation Loss: 0.0783, Validation Accuracy: 0.9591
Epoch 8/13


Training: 100%|██████████| 495/495 [03:31<00:00,  2.34batch/s, Loss=0.0554]  
Evaluating: 100%|██████████| 124/124 [00:18<00:00,  6.87batch/s, Loss=0.188, Accuracy=0.959]   


Train Loss: 0.0840
Validation Loss: 0.0786, Validation Accuracy: 0.9591
Epoch 9/13


Training: 100%|██████████| 495/495 [02:44<00:00,  3.01batch/s, Loss=0.000331]
Evaluating: 100%|██████████| 124/124 [00:14<00:00,  8.84batch/s, Loss=0.129, Accuracy=0.959]   


Train Loss: 0.0840
Validation Loss: 0.0848, Validation Accuracy: 0.9591
Epoch 10/13


Training:  63%|██████▎   | 312/495 [01:38<00:59,  3.08batch/s, Loss=0.0785]  