# Installation

In [14]:
# !conda clean --all
# !conda update -n base -c defaults conda

# CUDA Toolkit 11.3
# https://developer.nvidia.com/cuda-11.3.0-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exe_local
#https://pytorch.org/get-started/previous-versions/

# !conda create --name vulscriber-cluster-env python=3.8
# !conda install pytorch==1.10.0 torchvision==0.11.1 torchaudio==0.10.0 cudatoolkit=11.3 -c pytorch
# !conda install transformers pandas scikit-learn

# https://stackoverflow.com/questions/75242037/failed-to-import-transformers-onnx-config
# !conda install -c conda-forge sentence-transformers
# !conda install Pillow

# Load Dataset

In [None]:
import json 
import pandas as pd

# Load JSON & Convert to DataFrame
# NOTE(review): absolute, machine-specific path — point it at the local copy of
# megavul_simple.json before running on another machine.
filename = r"C:\Users\Administrator\Downloads\mapr\VulScribeR-main\dataset\megavul_simple.json"
# Decode explicitly as UTF-8 (the JSON interchange encoding). Without it, open()
# falls back to the platform's locale encoding, which can fail or garble
# non-ASCII characters on Windows.
with open(filename, "r", encoding="utf-8") as f:
    json_data = json.load(f)
# Build the DataFrame from the parsed JSON; later cells read its "func",
# "is_vul", "diff_line_info" and "func_before" columns.
mega_vul_df = pd.DataFrame(json_data)


# Tokenizer

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

# Pick CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and encoder are both loaded from the same pretrained checkpoint name.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")

# Kept as the cell's final expression, so the notebook echoes what .to() returns.
model.to(device)


In [None]:
# # Encode vectors
# from tqdm import tqdm
# import torch
# import numpy as np
# import pickle
# import gc

# BATCH_SIZE = 128  
# SAVE_PATH = "encoded_vectors.pkl"  # Use pickle instead of JSON
# SAVE_EVERY = 50  

# torch.backends.cudnn.benchmark = True  

# # Load the checkpoint if one exists
# try:
#     with open(SAVE_PATH, "rb") as f:
#         index_to_vector_map = pickle.load(f)
#         print(f"✅ Resume từ checkpoint: {len(index_to_vector_map)} vectors")
# except FileNotFoundError:
#     index_to_vector_map = {}

# # Build the list of samples that have not been encoded yet, before the loop
# code_samples = [(i, str(row["func"])) for i, row in mega_vul_df.iterrows() if str(row["func"]).strip()]
# code_samples = [(i, code) for i, code in code_samples if i not in index_to_vector_map]  
# total_samples = len(code_samples)

# # If every sample is already encoded, exit early
# if total_samples == 0:
#     print("✅ Tất cả mẫu đã được encode!")
#     exit()

# batch_count = 0  

# for i in tqdm(range(0, total_samples, BATCH_SIZE), desc="Encoding..."):
#     batch_indices, batch_codes = zip(*code_samples[i : i + BATCH_SIZE])

#     # Encode batch
#     inputs = tokenizer(batch_codes, return_tensors="pt", truncation=True, padding=True, max_length=512)
#     inputs = {k: v.to("cuda:0", non_blocking=True) for k, v in inputs.items()}

#     with torch.inference_mode(), torch.cuda.amp.autocast():
#         outputs = model(**inputs)

#     batch_vectors = outputs.last_hidden_state[:, 0, :].cpu().numpy()

#     # Store the batch in the dict
#     index_to_vector_map.update({idx: vec.tolist() for idx, vec in zip(batch_indices, batch_vectors)})

#     batch_count += 1
#     if batch_count % SAVE_EVERY == 0:  # Save a checkpoint only every SAVE_EVERY batches
#         with open(SAVE_PATH, "wb") as f:
#             pickle.dump(index_to_vector_map, f)

#         torch.cuda.empty_cache()  
#         gc.collect()

# # Final save
# with open(SAVE_PATH, "wb") as f:
#     pickle.dump(index_to_vector_map, f)

# print(f"✅ Hoàn tất encoding! Đã lưu vào {SAVE_PATH}")


# Load encoded vectors pickle file

In [1]:
import pickle

# Relative path, resolved against the kernel's working directory. It is the same
# file name the (commented-out) encoding cell saves its vectors to.
file_path = "encoded_vectors.pkl"

# NOTE(security): pickle.load can execute arbitrary code embedded in the file —
# only load pickles produced by a trusted run.
try:
    with open(file_path, "rb") as f:
        data = pickle.load(f)
    print("✅ File pickle tải thành công!", len(data))
except Exception as e:
    print("❌ Lỗi khi tải file pickle:", e)
    # Re-raise instead of carrying on: the following cells read `data`, so
    # swallowing the error would surface later as a NameError, or silently
    # reuse a stale `data` left in the kernel by an earlier run.
    raise


✅ File pickle tải thành công! 353873


In [None]:

# Collect (row id, source text) pairs for rows whose "func" is non-blank and
# that have no entry in `data` yet; `total_samples` is how many are left.
#
# The "func" column is read directly and both filters run in one pass.
# iterrows() materialises every row of the frame just to read one field, which
# is needlessly slow here; the resulting list is the same.
# NOTE(review): str() turns a missing value into text such as "nan", which
# passes the strip() test — rows without a func are therefore kept, as before.
code_samples = [
    (row_id, code)
    for row_id, code in zip(mega_vul_df.index, map(str, mega_vul_df["func"]))
    if code.strip() and row_id not in data
]
total_samples = len(code_samples)

In [9]:
# Flatten the loaded mapping into a plain list of its values, in the mapping's
# own iteration order, and report how many were collected.
encoded_code_vectors = [vector for vector in data.values()]
print(f"✅ Đã lấy được {len(encoded_code_vectors)} vectors từ pickle!")


✅ Đã lấy được 353873 vectors từ pickle!


In [None]:
from sklearn.cluster import KMeans

# Partition the encoded vectors into 5 clusters with a fixed seed.
# n_init is pinned to 10, the default in effect when this cell was last run
# (its captured output mentions default_n_init=10). Leaving it unset makes the
# number of restarts, and so possibly the clustering, depend on the installed
# scikit-learn version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(encoded_code_vectors)

# Get the cluster labels for each code piece
# (one label per entry of encoded_code_vectors, in the same order)
cluster_labels = kmeans.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [12]:
import os

import pandas as pd

# Ensure the directory exists
output_dir = './dataset/container_data'
os.makedirs(output_dir, exist_ok=True)

# The clustered vectors were taken from `data` in its iteration order, so the
# k-th label belongs to the k-th key of `data`. Assigning the labels to the
# frame positionally is only correct when every row has a vector and both
# orders coincide; otherwise it raises a length error or silently mislabels
# rows. Attach each label to its row id and let pandas align on the index.
# NOTE(review): assumes the keys of `data` are mega_vul_df index labels, as the
# `i not in data` check in the earlier cell implies — confirm.
vector_row_ids = list(data.keys())
if len(cluster_labels) != len(vector_row_ids):
    raise ValueError(
        f"{len(cluster_labels)} cluster labels but {len(vector_row_ids)} encoded vectors; "
        "re-run the clustering cell on the currently loaded vectors"
    )
labels_by_row = pd.Series(cluster_labels, index=vector_row_ids)
mega_vul_df['cluster'] = labels_by_row.reindex(mega_vul_df.index)

# Rows without an encoded vector end up with a missing cluster value.
unlabeled_rows = int(mega_vul_df['cluster'].isna().sum())
if unlabeled_rows:
    print(f"{unlabeled_rows} rows have no encoded vector and are left out of the cluster files")

# One JSON-lines file per cluster. groupby skips rows whose key is missing, and
# int() keeps the file name free of a ".0" suffix if the column became float.
for cluster_number, group_df in mega_vul_df.groupby('cluster'):
    file_name = os.path.join(output_dir, f'megavul_vuls_cls_{int(cluster_number)}.jsonl')
    group_df[['func', 'is_vul', 'diff_line_info', 'func_before']].to_json(file_name, orient='records', lines=True)


In [None]:
# Only rows that have a "func" are meant to be encoded, but some rows may be
# missing it.
# NOTE(review): str() turns a missing value into text such as "nan", which
# passes the strip() test, so this filter only drops blank/whitespace-only
# strings — confirm whether missing values should be skipped as well.
#
# The "func" column is read directly: iterrows() materialises every row of the
# frame just to read one field, which is needlessly slow here. The resulting
# list is the same.
code_samples = [
    (row_id, code)
    for row_id, code in zip(mega_vul_df.index, map(str, mega_vul_df["func"]))
    if code.strip()
]


In [None]:
from collections import Counter

# Tally how many samples were assigned to each cluster id.
label_counts = Counter(int(label) for label in cluster_labels)

# Keep the original six slots so the printed list keeps its shape, but grow the
# list when a label id of 6 or more appears; the fixed six-element list raised
# IndexError in that case.
# NOTE(review): negative ids are not expected from the clustering cell; they
# would be tallied but not printed.
slot_count = max(6, max(label_counts, default=-1) + 1)
arr = [label_counts[cluster_id] for cluster_id in range(slot_count)]

print(arr) # avg -> BIGVUL [1238, 1187, 1069, 1372, 3917, 0] order is [4,3,1,0,2] and kmeans indices are 0,1,2,3,4 for 5 clusters
# MEGAVUL [41918, 65107, 68454, 85557, 92837, 0]


[41918, 65107, 68454, 85557, 92837, 0]
