# RQ3: Detect using custom DL models

In [102]:
from google import genai
from google.genai import types
import os
import numpy as np
import pandas as pd
import configparser
from tqdm import tqdm
import ast
import time

In [91]:
class GeminiEmbeddingGenerator:
    """Small wrapper that requests text embeddings from the Gemini API.

    The API key is read from the ``GEMINI_API_KEY`` option of the
    ``[API_KEYS]`` section of a ``.config`` file in the current working
    directory.
    """

    def __init__(self, model_name, embedding_type):
        """Store the request settings and create the API client.

        Args:
            model_name: Forwarded as ``model`` to ``embed_content``.
            embedding_type: Forwarded as ``task_type`` in the embedding
                request configuration.

        Raises:
            KeyError: If ``.config`` cannot be read or does not define the
                API key.
        """
        self.model_name = model_name
        self.embedding_type = embedding_type
        self.__setup()

    def __setup(self):
        """Load the API key from ``.config`` and build the client."""
        config = configparser.ConfigParser()
        # ConfigParser.read() skips files it cannot open and returns the list
        # of files it actually parsed, so an empty result means no config.
        if not config.read(".config"):
            raise KeyError(
                "Could not read '.config' in the current working directory; "
                "it must define GEMINI_API_KEY under [API_KEYS]"
            )
        try:
            api_key = config["API_KEYS"]["GEMINI_API_KEY"]
        except KeyError as err:
            raise KeyError(
                "'.config' must define GEMINI_API_KEY under [API_KEYS]"
            ) from err

        self.__client = genai.Client(api_key=api_key)

    def get_embeddings(self, text):
        """Return the values of the first embedding produced for *text*.

        Args:
            text: Content forwarded as ``contents`` to ``embed_content``.

        Returns:
            ``result.embeddings[0].values`` from the API response.
        """
        # Offline stub kept from the original for runs without API access:
        # return np.arange(1,768,1).tolist()
        result = self.__client.models.embed_content(
            model=self.model_name,
            contents=text,
            config=types.EmbedContentConfig(task_type=self.embedding_type),
        )
        return result.embeddings[0].values

## Simple classification task using LLM embeddings

In [92]:
# Create the output folder for this experiment; a no-op if it already exists.
os.makedirs(
    name="./Results/RQ3/Model1",
    exist_ok=True,
)

In [93]:
# Load the gold set and build the text that is embedded for each commit.
df = pd.read_csv("./data/Complete_GoldSet.csv")
# read_csv turns empty cells into NaN, and str + NaN evaluates to NaN, which
# would leave the whole 'text' cell as NaN. Substitute empty strings so every
# row still gets a well-formed string.
df['text'] = (
    "CommitMessage: " + df['CommitMessage'].fillna("")
    + "\nDiff:\n" + df['Diff'].fillna("")
)

In [None]:
# Embedding client: model "text-embedding-004" with the CLASSIFICATION task type.
egen = GeminiEmbeddingGenerator(
    model_name="text-embedding-004",
    embedding_type="CLASSIFICATION",
)
# Parallel accumulators filled by the embedding loop: commit hashes,
# embedding vectors, and labels. NOTE: `id` shadows the builtin; the name is
# kept because the other cells read it.
id, X, Y = [], [], []

In [104]:
# Embed each distinct commit once. The set is seeded from `id`, so re-running
# this cell after an interruption skips commits that were already embedded
# while keeping the membership test O(1) instead of a list scan per row.
seen_hashes = set(id)
# iterrows() is a generator with no length; pass `total` so tqdm can render a
# real progress bar and an ETA.
for _, row in tqdm(df.iterrows(), total=len(df)):
    commit_hash = row["CommitHash"]
    if commit_hash in seen_hashes:
        continue
    emb = egen.get_embeddings(row['text'])
    # Record the hash only after the request succeeded, so a failed request
    # is retried on the next run.
    seen_hashes.add(commit_hash)
    id.append(commit_hash)
    X.append(emb)
    Y.append(row["Decision"])
    # Brief pause between requests (presumably to stay under the API rate
    # limit - confirm against the current quota).
    time.sleep(0.25)

1776it [15:24,  1.92it/s]


In [105]:
# Make sure the output folder exists before writing, so the (slow to compute)
# embeddings are not lost to a missing-directory error.
os.makedirs("./data/Embeddings", exist_ok=True)

# Tabular copy: one row per commit with its hash, embedding vector and label.
df_emb = pd.DataFrame(data={"Hash": id, "Embeddings": X, "Label": Y})
df_emb['Embeddings'] = df_emb['Embeddings'].astype('object')
df_emb.to_csv("./data/Embeddings/MsgDiff.csv", index=False)

# Array copy of the embedding vectors only, in the same row order as the CSV.
arr_x = np.array(X)
np.save("./data/Embeddings/MsgDiff.npy", arr_x)

In [None]:
# Reload the saved embeddings. The CSV stores each vector as its textual list
# representation, so parse every cell back into a Python object with the
# literal-only (non-executing) evaluator.
df_emb = pd.read_csv("./data/Embeddings/MsgDiff.csv")
df_emb["Embeddings"] = df_emb["Embeddings"].map(ast.literal_eval)

In [126]:
import glob

# Remove rows with a repeated "Diff" value from every results CSV, keeping the
# first occurrence. Dedicated names are used so the loop does not overwrite
# the notebook-level `df` (the gold set) or shadow the stdlib `csv` module.
for csv_path in glob.glob("./Results/**/*.csv", recursive=True):
    results_df = pd.read_csv(csv_path)
    deduped_df = results_df.drop_duplicates(subset=["Diff"], keep="first")
    # Only rewrite the file when rows were actually dropped; files without
    # duplicates are left untouched on disk.
    if len(deduped_df) < len(results_df):
        deduped_df.to_csv(csv_path, index=False)