# RQ3: Detect using custom DL models

In [1]:
from google import genai
from google.genai import types
import os
import numpy as np
import pandas as pd
import configparser
from tqdm import tqdm
import ast
import time

In [2]:
class GeminiEmbeddingGenerator:
    """Embed text through the Gemini API using one fixed model and task type."""

    def __init__(self, model_name, embedding_type):
        """Remember the embedding settings and build the API client.

        Args:
            model_name: Passed as ``model`` to ``embed_content``.
            embedding_type: Passed as ``task_type`` in the embedding config.
        """
        self.model_name = model_name
        self.embedding_type = embedding_type
        self.__setup()

    def __setup(self):
        """Read the API key from the local ``.config`` file and create the client."""
        parser = configparser.ConfigParser()
        parser.read(".config")
        api_key = parser["API_KEYS"]["GEMINI_API_KEY"]
        self.__client = genai.Client(api_key=api_key)

    def get_embeddings(self, text):
        """Embed ``text`` and return the values of the first embedding in the response."""
        embed_config = types.EmbedContentConfig(task_type=self.embedding_type)
        response = self.__client.models.embed_content(
            model=self.model_name,
            contents=text,
            config=embed_config,
        )
        first_embedding = response.embeddings[0]
        return first_embedding.values

## Simple classification task using LLM embeddings

In [3]:
# Create the results directory up front; exist_ok=True makes re-running this cell a no-op.
os.makedirs("./Results/RQ3/Model1", exist_ok=True)

In [4]:
# Load the gold-set commits and build one combined text field per commit
# from the commit message and the diff.
df = pd.read_csv("./data/Complete_GoldSet.csv")
message_part = "CommitMessage: " + df["CommitMessage"]
diff_part = "\nDiff:\n" + df["Diff"]
df["text"] = message_part + diff_part

In [6]:
# Accumulators filled by the embedding loop: commit hashes, embedding vectors, labels.
id, X, Y = [], [], []
# Generator that requests the CLASSIFICATION task type for the combined message + diff text.
egen = GeminiEmbeddingGenerator(
    model_name="text-embedding-004",
    embedding_type="CLASSIFICATION",
)

In [7]:
# Embed every commit's combined text and collect hash, vector and label together.
# df.iterrows() is a generator without a length, so tqdm needs total= to show
# a percentage and ETA instead of a bare iteration counter.
for _, row in tqdm(df.iterrows(), total=len(df)):
    emb = egen.get_embeddings(row['text'])
    # Append only once the API call has returned so the three lists stay aligned.
    id.append(row["CommitHash"])
    X.append(emb)
    Y.append(row["Decision"])
    # Pause between requests (presumably to stay under the API rate limit).
    time.sleep(0.25)

1764it [16:59,  1.73it/s]


In [8]:
# Persist the embeddings in two forms: a CSV with hash/label metadata and a raw .npy matrix.
# Neither to_csv nor np.save creates missing parent directories, so make sure the
# target exists before writing (otherwise the collected embeddings fail to save).
os.makedirs("./data/Embeddings", exist_ok=True)

df_emb = pd.DataFrame(data={"Hash": id, "Embeddings": X, "Label": Y})
# Keep each embedding as a single Python object per cell.
df_emb['Embeddings'] = df_emb['Embeddings'].astype('object')
df_emb.to_csv("./data/Embeddings/MsgDiff.csv", index=False)

arr_x = np.array(X)
np.save("./data/Embeddings/MsgDiff.npy", arr_x)

In [None]:
# Reload the saved embeddings. The CSV stores each embedding cell as text,
# so parse every cell back into a Python literal.
df_emb = pd.read_csv("./data/Embeddings/MsgDiff.csv")
df_emb["Embeddings"] = df_emb["Embeddings"].map(ast.literal_eval)

## Multi-step Classification

In [None]:
# Two generators over the same model: one requests the CLASSIFICATION task type,
# the other SEMANTIC_SIMILARITY.
egen_cls = GeminiEmbeddingGenerator(
    model_name="text-embedding-004",
    embedding_type="CLASSIFICATION",
)
egen_sim = GeminiEmbeddingGenerator(
    model_name="text-embedding-004",
    embedding_type="SEMANTIC_SIMILARITY",
)

# Per-commit accumulators: identifiers/labels and the three embedding variants.
hashes, labels = [], []
msg_cls_emb, msg_sim_emb, diff_sim_emb = [], [], []

In [None]:
# For each commit, embed the message (classification + similarity) and the diff (similarity).
# total= gives tqdm a length, since df.iterrows() is a generator without one.
for _, row in tqdm(df.iterrows(), total=len(df)):
    message = row['CommitMessage']

    # Fetch all three embeddings before touching any list: if one API call raises,
    # nothing has been appended for this row and the lists keep equal lengths.
    msg_cls = egen_cls.get_embeddings(message)
    msg_sim = egen_sim.get_embeddings(message)
    diff_sim = egen_sim.get_embeddings(row['Diff'])

    msg_cls_emb.append(msg_cls)
    msg_sim_emb.append(msg_sim)
    diff_sim_emb.append(diff_sim)
    hashes.append(row["CommitHash"])
    labels.append(row["Decision"])

    # Pause between rows (presumably to stay under the API rate limit).
    time.sleep(0.3)

In [None]:
# Persist the multi-step embeddings: one CSV with all columns plus one .npy matrix per variant.
# Neither to_csv nor np.save creates missing parent directories, so ensure the target exists.
os.makedirs("./data/Embeddings", exist_ok=True)

df_emb = pd.DataFrame(data={"Hash": hashes, "Msg_Cls_Embeddings": msg_cls_emb, "Msg_Sim_Embeddings": msg_sim_emb, "Diff_Sim_Embeddings": diff_sim_emb, "Label": labels})
# This frame has three embedding columns and no single "Embeddings" column
# (indexing that name raised KeyError), so cast each real column instead.
for emb_col in ("Msg_Cls_Embeddings", "Msg_Sim_Embeddings", "Diff_Sim_Embeddings"):
    df_emb[emb_col] = df_emb[emb_col].astype('object')
df_emb.to_csv("./data/Embeddings/MsgCls_MsgDiffSim.csv", index=False)

arr_x = np.array(msg_cls_emb)
np.save("./data/Embeddings/MsgCls.npy", arr_x)
arr_x = np.array(msg_sim_emb)
np.save("./data/Embeddings/MsgSim.npy", arr_x)
arr_x = np.array(diff_sim_emb)
np.save("./data/Embeddings/DiffSim.npy", arr_x)