## 1. Semantic search with all-MiniLM-L6-v2 embedding

In [2]:
from sentence_transformers import SentenceTransformer
import chromadb
import pandas as pd
from tqdm import tqdm
from rich import print
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None  # default='warn'

### Preprocessing and splitting

In [20]:
df = pd.read_csv('data/bigvul_devign_cvefixes_neuralsentry_commits.csv')
df = df.drop_duplicates(subset=['commit_hash'],keep='first')
df = df.dropna(subset=['commit_msg', 'commit_hash'])
train_df, test_df = train_test_split(df, test_size=0.2, random_state=420)

linux_test_df = pd.read_csv('data/linux_test.csv')
vulnfix = linux_test_df[linux_test_df['labels'] == 1]
non_vulnfix = linux_test_df[linux_test_df['labels'] == 0]
linux_test_df = pd.concat([non_vulnfix, vulnfix])
linux_test_df.shape, test_df.shape

((2330, 6), (6998, 8))

### Embed commit for train and test dataset

In [4]:
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(commit):
  pbar.update()
  return model.encode(commit).tolist()


with tqdm(total=len(train_df), desc="Generate embedding for train dataset") as pbar:
  train_df['embedded_commit'] = train_df['commit_msg'].apply(lambda x: generate_embeddings(x))

Generate embedding for train dataset: 100%|██████████| 27992/27992 [27:09<00:00, 17.18it/s]  


In [5]:
with tqdm(total=len(test_df), desc="Generate embedding for test dataset") as pbar:
  test_df['embedded_commit'] = test_df['commit_msg'].apply(lambda x: generate_embeddings(x))

with tqdm(total=len(linux_test_df), desc="Generate embedding for linux test dataset") as pbar:
  linux_test_df['embedded_commit'] = linux_test_df['commit_msg'].apply(lambda x: generate_embeddings(x))

Generate embedding for test dataset: 100%|██████████| 6998/6998 [09:06<00:00, 12.81it/s]
Generate embedding for linux test dataset: 100%|██████████| 2330/2330 [05:29<00:00,  7.08it/s]


### Feeding training data into ChromaDB

In [7]:
train_embeddings = train_df['embedded_commit']
train_labels = train_df['labels'].tolist()
train_hash = train_df['commit_hash']

client = chromadb.PersistentClient(path='db/all-MiniLM-L6-v2 embedding/')
collection = client.create_collection("all-MiniLM-L6-v2_embedding")

collection.add(
    embeddings=[embed for embed in train_embeddings],
    metadatas=[{"label": str(train_labels[i])} for i in range(len(train_df))],
    ids=[str(hashes) for hashes in train_hash]
)


### Prediction (BigVul, Devign, CVEfixes, NeuralSentry commits)

In [13]:
client = chromadb.PersistentClient(path='db/all-MiniLM-L6-v2 embedding/')
collection = client.get_collection(name="all-MiniLM-L6-v2_embedding")
y_pred = []
for index, row in test_df.iterrows():
    query_result = collection.query(
        query_embeddings=[ row['embedded_commit'] ],
        n_results=2,
    )
    y_pred.append(query_result['metadatas'][0][0]['label'])

print(query_result) # Last row of the test dataset
y_pred = [int(x) for x in y_pred]

### Generate Metrics for (BigVul, Devign, CVEfixes, NeuralSentry commits)

In [14]:
y_test = test_df['labels'].tolist()
report = classification_report(y_test,y_pred,target_names=["non-bugfix", "bugfix"],)
confusion_matrix_metric = confusion_matrix(y_test,y_pred)
accuracy_metric = accuracy_score(y_test, y_pred)
print(  "\n\n>>> Confusion Matrix:",
  f"\nTP: {confusion_matrix_metric[1][1]}, FP: {confusion_matrix_metric[0][1]}",
  f"\nFN: {confusion_matrix_metric[1][0]}, TN: {confusion_matrix_metric[0][0]}",
  "\n\n>>> Accuracy: ",
  accuracy_metric,
  "\n\n>>>: Classification Report:\n",
  report)

### Prediction (Linux CVEs)

In [15]:
client = chromadb.PersistentClient(path='db/all-MiniLM-L6-v2 embedding/')
collection = client.get_collection(name="all-MiniLM-L6-v2_embedding")
y_pred = []
for index, row in linux_test_df.iterrows():
    query_result = collection.query(
        query_embeddings=[ row['embedded_commit'] ],
        n_results=2,
    )
    y_pred.append(query_result['metadatas'][0][0]['label'])

print(query_result) # Last row of the test dataset
y_pred = [int(x) for x in y_pred]

### Generate Metrics for (Linux CVEs)

In [16]:
y_test = linux_test_df['labels'].tolist()
report = classification_report(y_test,y_pred,target_names=["non-bugfix", "bugfix"],)
confusion_matrix_metric = confusion_matrix(y_test,y_pred)
accuracy_metric = accuracy_score(y_test, y_pred)
print(  "\n\n>>> Confusion Matrix:",
  f"\nTP: {confusion_matrix_metric[1][1]}, FP: {confusion_matrix_metric[0][1]}",
  f"\nFN: {confusion_matrix_metric[1][0]}, TN: {confusion_matrix_metric[0][0]}",
  "\n\n>>> Accuracy: ",
  accuracy_metric,
  "\n\n>>>: Classification Report:\n",
  report)