## 2. Semantic search with OpenAI embeddings

### Loading libraries

In [23]:
from sentence_transformers import SentenceTransformer
import chromadb
import pandas as pd
from tqdm import tqdm
from rich import print
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None  # default='warn'

import os
from dotenv import load_dotenv
import openai

# load_dotenv() runs before the lookup below; presumably it makes a key kept in
# a local .env file visible through the process environment (python-dotenv).
load_dotenv()
# .get() yields None when OPENAI_API_KEY is not set, so a missing key is not
# reported here but only when the first API request is made.
openai.api_key = os.environ.get('OPENAI_API_KEY')

### Preprocessing and Splitting

In [24]:
# Load the combined commit dataset, keep only the first row seen for each
# commit hash, then discard rows that lack a commit message or a commit hash.
df = (
    pd.read_csv('data/bigvul_devign_cvefixes_neuralsentry_commits.csv')
    .drop_duplicates(subset=['commit_hash'], keep='first')
    .dropna(subset=['commit_msg', 'commit_hash'])
)

# 80/20 train/test partition; the fixed random_state makes the split repeatable.
openai_train_df, openai_test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=420,
)

In [25]:
# Upper bound, in characters, on the commit message text kept for embedding.
MAX_COMMIT_MSG_CHARS = 10000

# Read the Linux test set once and separate it by label.
linux_test_df = pd.read_csv('data/linux_test.csv')
vulnfix = linux_test_df[linux_test_df['labels'] == 1]
non_vulnfix = linux_test_df[linux_test_df['labels'] == 0]

# Rows labelled 0 come first, followed by rows labelled 1; rows carrying any
# other label value are left out of the evaluation frame.
openai_linux_test_df = pd.concat([non_vulnfix, vulnfix])

# Vectorised truncation: `.str[:n]` slices each message and leaves missing
# values as NaN instead of raising on them.
openai_linux_test_df['commit_msg'] = (
    openai_linux_test_df['commit_msg'].str[:MAX_COMMIT_MSG_CHARS]
)
openai_linux_test_df.shape

(2330, 6)

In [27]:
def generate_embeddings(commit, model="text-embedding-ada-002", progress=None):
    """Return the OpenAI embedding vector for a single commit message.

    Args:
        commit: Text to embed; it is sent as a one-element ``input`` list.
        model: Name of the OpenAI embedding model to request.
        progress: Optional progress bar (any object with an ``update()``
            method). When omitted, a module-level ``pbar`` is used if one
            exists, so existing ``with tqdm(...) as pbar:`` call sites keep
            working; when neither is available no progress is reported.

    Returns:
        The ``embedding`` field of the first entry in the response's ``data``.

    NOTE(review): ``openai.Embedding`` looks like the interface of the pre-1.0
    ``openai`` package — confirm against the installed version.
    """
    response = openai.Embedding.create(input=[commit], model=model)

    # Advance the bar only after the request has returned, so a request that
    # raises is not counted as completed.
    bar = progress if progress is not None else globals().get("pbar")
    if bar is not None:
        bar.update()

    return response['data'][0]['embedding']

# openai_train_df = pd.read_csv('data/train.csv')
# Embed every training commit message, one generate_embeddings call per row.
# The bar is bound to the name `pbar`, which generate_embeddings advances.
with tqdm(
    total=len(openai_train_df),
    desc="Generate embedding for train dataset",
) as pbar:
    train_messages = openai_train_df['commit_msg']
    openai_train_df['embedded_commit'] = train_messages.apply(generate_embeddings)

Generate embedding for train dataset: 100%|██████████| 27992/27992 [2:53:07<00:00,  2.69it/s]  


In [34]:
# Embed both evaluation sets with the same procedure: one progress bar per
# frame, bound to the name `pbar` that generate_embeddings advances, and the
# resulting vectors stored in an 'embedded_commit' column on that frame.
for frame, bar_title in (
    (openai_test_df, "Generate embedding for test dataset"),
    (openai_linux_test_df, "Generate embedding for linux test dataset"),
):
    with tqdm(total=len(frame), desc=bar_title) as pbar:
        frame['embedded_commit'] = frame['commit_msg'].apply(generate_embeddings)

Generate embedding for test dataset: 100%|██████████| 6998/6998 [43:05<00:00,  2.71it/s]  
Generate embedding for linux test dataset: 100%|██████████| 2330/2330 [14:32<00:00,  2.67it/s]


### Feeding training data into ChromaDB

In [33]:
# Columns from the training frame that feed the vector store.
openai_train_embeddings = openai_train_df['embedded_commit']
openai_train_labels = openai_train_df['labels'].tolist()
openai_train_hash = openai_train_df['commit_hash']

# Open the on-disk Chroma database and fetch the collection, creating it when
# it does not exist yet.
client = chromadb.PersistentClient(path='db/text-embedding-ada-002 embedding/')
collection = client.get_or_create_collection("text-embedding-ada-002_embedding")

# One entry per training commit: the embedding vector, its label stored as a
# string under the "label" metadata key, and the commit hash as the record id.
collection.add(
    embeddings=list(openai_train_embeddings),
    metadatas=[{"label": str(label)} for label in openai_train_labels],
    ids=list(map(str, openai_train_hash)),
)

### Prediction (BigVul, Devign, CVEfixes, NeuralSentry commits)

In [35]:
# Re-open the persisted collection stored under db/.
client = chromadb.PersistentClient(path='db/text-embedding-ada-002 embedding/')
collection = client.get_collection("text-embedding-ada-002_embedding")

# For each test embedding, ask the collection for two matches and take the
# label of the first one returned (presumably the closest match — Chroma's
# result ordering is not shown in this notebook).
pred_labels = []
for test_embedding in openai_test_df['embedded_commit']:
    query_result = collection.query(query_embeddings=[test_embedding], n_results=2)
    first_match_metadata = query_result['metadatas'][0][0]
    pred_labels.append(first_match_metadata['label'])

# Displays the raw response of the final query only.
print(query_result)

# Labels were stored as strings in the metadata; convert them back to ints.
pred_labels = list(map(int, pred_labels))

### Generate Metrics (BigVul, Devign, CVEfixes, NeuralSentry commits)

In [36]:
# Ground-truth labels for the held-out split, in the same row order used to
# build pred_labels.
test_labels = openai_test_df['labels'].tolist()

# Pin the class order to [0, 1] so the confusion matrix is always 2x2 and the
# two target names always line up, even if one class is absent from both the
# true and predicted labels (otherwise the [1][1] lookups below would raise).
# assumes labels are only ever 0 or 1 — TODO confirm for this dataset
report = classification_report(
    test_labels,
    pred_labels,
    labels=[0, 1],
    target_names=["non-bugfix", "bugfix"],
)
confusion_matrix_metric = confusion_matrix(test_labels, pred_labels, labels=[0, 1])
accuracy_metric = accuracy_score(test_labels, pred_labels)

# confusion_matrix rows are true classes and columns are predicted classes,
# so [1][1] = TP, [0][1] = FP, [1][0] = FN and [0][0] = TN.
print(  "\n\n>>> Confusion Matrix:",
  f"\nTP: {confusion_matrix_metric[1][1]}, FP: {confusion_matrix_metric[0][1]}",
  f"\nFN: {confusion_matrix_metric[1][0]}, TN: {confusion_matrix_metric[0][0]}",
  "\n\n>>> Accuracy: ",
  accuracy_metric,
  "\n\n>>>: Classification Report:\n",
  report)

### Prediction (Linux CVEs)

In [37]:
# Re-open the persisted collection stored under db/.
client = chromadb.PersistentClient(path='db/text-embedding-ada-002 embedding/')
collection = client.get_collection("text-embedding-ada-002_embedding")

# For each Linux test embedding, ask the collection for two matches and take
# the label of the first one returned (presumably the closest match — Chroma's
# result ordering is not shown in this notebook).
pred_labels = []
for linux_embedding in openai_linux_test_df['embedded_commit']:
    query_result = collection.query(query_embeddings=[linux_embedding], n_results=2)
    first_match_metadata = query_result['metadatas'][0][0]
    pred_labels.append(first_match_metadata['label'])

# Displays the raw response of the final query only.
print(query_result)

# Labels were stored as strings in the metadata; convert them back to ints.
pred_labels = list(map(int, pred_labels))

### Generate Metrics (Linux CVEs)

In [38]:
# Ground-truth labels for the Linux test set, in the same row order used to
# build pred_labels.
test_labels = openai_linux_test_df['labels'].tolist()

# Pin the class order to [0, 1] so the confusion matrix is always 2x2 and the
# two target names always line up, even if one class is absent from both the
# true and predicted labels (otherwise the [1][1] lookups below would raise).
report = classification_report(
    test_labels,
    pred_labels,
    labels=[0, 1],
    target_names=["non-bugfix", "bugfix"],
)
confusion_matrix_metric = confusion_matrix(test_labels, pred_labels, labels=[0, 1])
accuracy_metric = accuracy_score(test_labels, pred_labels)

# confusion_matrix rows are true classes and columns are predicted classes,
# so [1][1] = TP, [0][1] = FP, [1][0] = FN and [0][0] = TN.
print(  "\n\n>>> Confusion Matrix:",
  f"\nTP: {confusion_matrix_metric[1][1]}, FP: {confusion_matrix_metric[0][1]}",
  f"\nFN: {confusion_matrix_metric[1][0]}, TN: {confusion_matrix_metric[0][0]}",
  "\n\n>>> Accuracy: ",
  accuracy_metric,
  "\n\n>>>: Classification Report:\n",
  report)