In [3]:
import pandas as pd
from tqdm import tqdm, tqdm_pandas
from vertexai.preview.language_models import TextEmbeddingModel
import git
from git.repo.base import Repo
import tempfile

from dotenv import load_dotenv
import os

load_dotenv()

# Function to generate embeddings
# Loaded embedding models keyed by model name, so from_pretrained() runs once
# per model instead of once per embedded text.
_EMBEDDING_MODELS = {}


def generate_embeddings(text, model="textembedding-gecko@001", max_tokens=8191):
    """Return the embedding vector for ``text`` from a Vertex AI text model.

    Args:
        text: The text to embed, as ``str`` or as UTF-8 encoded ``bytes``
            (bytes are decoded first; undecodable bytes are replaced).
        model: Name passed to ``TextEmbeddingModel.from_pretrained``.
        max_tokens: Upper bound on the length of the text that is sent.
            Despite the name this is a plain slice, i.e. a limit in
            *characters*, not tokenizer tokens.
            NOTE(review): the service may enforce its own, smaller token
            limit for the chosen model -- confirm against the model docs.

    Returns:
        The ``values`` attribute of the first embedding returned by
        ``get_embeddings`` for the (truncated) text.
    """
    # Callers in this file read rule files in binary mode; decode so the
    # model receives the rule text itself rather than a bytes object.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("utf-8", errors="replace")

    truncated_text = text[:max_tokens]
    # Log the size only: echoing the full document for every file drowns
    # the useful output.
    print(f"generating embeddings for {len(truncated_text)} characters of text")

    embeddings_model = _EMBEDDING_MODELS.get(model)
    if embeddings_model is None:
        embeddings_model = TextEmbeddingModel.from_pretrained(model)
        _EMBEDDING_MODELS[model] = embeddings_model

    return embeddings_model.get_embeddings([truncated_text])[0].values


def clone(source: str, dest: str = None):
    """Clone the git repository at ``source`` and report progress on stdout.

    Args:
        source: URL handed to ``Repo.clone_from``.
        dest: Directory to clone into. When falsy (``None`` or empty), a
            fresh directory is created with ``tempfile.mkdtemp()``.

    Returns:
        A ``(repo, dest)`` tuple: the object returned by ``Repo.clone_from``
        and the path the repository was cloned into.
    """

    class _PrintingProgress(git.remote.RemoteProgress):
        """Progress handler that prints each update callback it receives."""

        def update(self, op_code, cur_count, max_count=None, message=""):
            print(f"update({op_code}, {cur_count}, {max_count}, {message})")

    target = dest if dest else tempfile.mkdtemp()

    # custom_environment() is entered without any overrides, as in the
    # original; the clone itself goes through Repo.clone_from.
    with git.Git().custom_environment():
        cloned = Repo.clone_from(
            url=source,
            to_path=target,
            progress=_PrintingProgress(),
        )
    return cloned, target

# Registers tqdm's pandas integration (progress_apply) with this description.
tqdm.pandas(desc="Generating embeddings")

# clone rules repo
repo, dest = clone("https://github.com/returntocorp/semgrep-rules")

# One record per ".yaml" file found anywhere under the clone directory.
embeddings = []
for root, dirs, files in os.walk(dest, topdown=False):
    for name in files:
        if not name.endswith(".yaml"):
            continue

        # Read as text so the embedding model and the saved "content" column
        # get the rule text rather than raw bytes; undecodable bytes are
        # replaced instead of aborting the whole run.
        with open(os.path.join(root, name), encoding="utf-8", errors="replace") as f:
            content = f.read()

        # The file is already closed here, so no handle is held open during
        # the embedding request.
        embeddings.append(
            {
                "name": name,
                "content": content,
                "embeddings": generate_embeddings(content),
            }
        )
        print(f"successfully generated embedding for {name}")

rulesToEmbeddingsDF = pd.DataFrame(embeddings)
# Save the DataFrame as a Parquet file
rulesToEmbeddingsDF.to_parquet('semgrep.rules.to.embeddings.parquet')

# Print the resulting dataframe
print(rulesToEmbeddingsDF)


update(5, 1.0, 1939.0, )
update(4, 20.0, 1939.0, )
update(4, 39.0, 1939.0, )
update(4, 59.0, 1939.0, )
update(4, 78.0, 1939.0, )
update(4, 97.0, 1939.0, )
update(4, 117.0, 1939.0, )
update(4, 136.0, 1939.0, )
update(4, 156.0, 1939.0, )
update(4, 175.0, 1939.0, )
update(4, 194.0, 1939.0, )
update(4, 214.0, 1939.0, )
update(4, 233.0, 1939.0, )
update(4, 253.0, 1939.0, )
update(4, 272.0, 1939.0, )
update(4, 291.0, 1939.0, )
update(4, 311.0, 1939.0, )
update(4, 330.0, 1939.0, )
update(4, 350.0, 1939.0, )
update(4, 369.0, 1939.0, )
update(4, 388.0, 1939.0, )
update(4, 408.0, 1939.0, )
update(4, 427.0, 1939.0, )
update(4, 446.0, 1939.0, )
update(4, 466.0, 1939.0, )
update(4, 485.0, 1939.0, )
update(4, 505.0, 1939.0, )
update(4, 524.0, 1939.0, )
update(4, 543.0, 1939.0, )
update(4, 563.0, 1939.0, )
update(4, 582.0, 1939.0, )
update(4, 602.0, 1939.0, )
update(4, 621.0, 1939.0, )
update(4, 640.0, 1939.0, )
update(4, 660.0, 1939.0, )
update(4, 679.0, 1939.0, )
update(4, 699.0, 1939.0, )
update(4