In [0]:
%pip install openai
%pip install aiohttp
%pip install pymilvus
%pip install PyGithub
%pip install mmh3

In [0]:
%restart_python

In [0]:
import openai
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
import numpy as np
import logging
import uuid
import json
import httpx
import requests
from github import Github
import mmh3
import datetime
from typing import List
from dotenv import load_dotenv
import os

In [0]:
# Declare the "repo" notebook widget and read its current value into `repo`;
# that value is what the final cell passes to delete_github_repo().
dbutils.widgets.text("repo", "", "GitHub Repo")
repo = dbutils.widgets.get("repo")

In [0]:
# load_dotenv() is expected to merge a .env file (when one exists) into os.environ.
load_dotenv()

# Credentials / endpoints read from the environment; each is None when unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
GITHUB_PAT = os.environ.get("GITHUB_PAT")
ZILLIZ_CLOUD_URI = os.environ.get("ZILLIZ_CLOUD_URI")
ZILLIZ_API_KEY = os.environ.get("ZILLIZ_API_KEY")

# OpenAI model settings
OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"
OPENAI_CHAT_MODEL = "gpt-4.1-mini"
EMBEDDING_DIMENSION = 1536

In [0]:
# Names of the Zilliz/Milvus collections that hold the GitHub indexes.
MILVUS_COLLECTION_NAME = "github_dense_index"
MILVUS_GITHUB_SPARSE_COLLECTION = "github_sparse_index"

# Notebook-level collection handles; they stay None until the connect_* cells assign them.
collection = None
collection_github_sparse = None


In [0]:
def get_openai_client():
    """Return the notebook-level OpenAI client.

    Returns:
        The object bound to the global name ``client``.

    Raises:
        Exception: if ``client`` is ``None`` or has not been defined yet.
            This function is defined before the cell that assigns ``client``,
            so an out-of-order run must produce the configuration error
            rather than a ``NameError``.
    """
    # Look the name up defensively: the cell that binds `client` may not have run.
    openai_client = globals().get("client")
    if openai_client is None:
        raise Exception("OpenAI API key not configured")
    return openai_client


In [0]:
# Milvus setup
def connect_github_dense_collection(collection_name: str):
    """Connect to an existing dense vector index collection.

    This helper never creates a collection; it only looks one up.

    Args:
        collection_name: Name of the collection to look up.

    Returns:
        A ``Collection`` handle when the collection exists, otherwise ``None``.
    """
    if not utility.has_collection(collection_name):
        # Report the miss only; nothing is created here, so no "created" message.
        print(f"Did not find collection in Zilliz: {collection_name}")
        return None

    dense_collection = Collection(collection_name)
    print(f"Connected to existing collection: {collection_name}")
    return dense_collection

In [0]:
def connect_github_sparse_collection(collection_name: str):
    """Look up the sparse (BM25-like) collection by name.

    Args:
        collection_name: Name of the collection to look up.

    Returns:
        A ``Collection`` handle when the collection exists, otherwise ``None``.
    """
    # Guard clause: report the miss and bail out early.
    if not utility.has_collection(collection_name):
        print(f"Did not find sparse collection in Zilliz: {collection_name}")
        return None

    sparse_collection = Collection(collection_name)
    print(f"Connected to existing sparse collection: {collection_name}")
    return sparse_collection


In [0]:
# Shared async OpenAI client; it remains None when no API key is configured,
# which get_openai_client() reports as a configuration error.
client = None
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY
    # Hand the SDK an explicitly constructed httpx client instead of its default one.
    async_http_client = httpx.AsyncClient()
    client = openai.AsyncOpenAI(
        api_key=OPENAI_API_KEY,
        http_client=async_http_client,
    )

In [0]:
# Register the "default" pymilvus connection using the Zilliz URI and API key
# read from the environment. The utility.has_collection()/Collection() calls in
# the connect_* helpers pass no alias, so they presumably use this connection
# (TODO confirm against pymilvus defaults).
connections.connect(
    alias="default",
    uri=ZILLIZ_CLOUD_URI,
    token=ZILLIZ_API_KEY,
    secure=True
)
# Reached only if connect() returned without raising.
print("Successfully connected to Zilliz Cloud!")

In [0]:
# Dense index handle; None when the collection does not exist in Zilliz.
collection = connect_github_dense_collection(MILVUS_COLLECTION_NAME)

In [0]:
# Sparse index handle; None when the collection does not exist in Zilliz.
collection_github_sparse = connect_github_sparse_collection(MILVUS_GITHUB_SPARSE_COLLECTION)

In [0]:
async def delete_github_repo(repo_name: str, collection=None, collection_github_sparse=None):
    """Delete every index entry of a GitHub repo from the dense and sparse collections.

    Args:
        repo_name: Repository in ``owner/repo`` form. Surrounding whitespace is
            stripped before validation.
        collection: Dense collection handle, or ``None`` when it is not connected.
        collection_github_sparse: Sparse collection handle, or ``None`` when it
            is not connected.

    Returns:
        ``None`` when no connected collection holds entries for the repo,
        otherwise a dict with a human-readable ``message`` and the total
        ``deleted_count`` across both collections.

    Raises:
        Exception: if ``repo_name`` is empty, has no ``/``, or contains a quote
            or backslash (which would corrupt the filter expression).

    Side effects:
        Deletes and flushes in the connected collections, and appends one row per
        affected collection to the Delta log table via the notebook's ``spark``.
    """
    repo_name = (repo_name or "").strip()
    # Quotes and backslashes would break out of the string literal in the filter
    # expression below, so reject them as a malformed name.
    has_unsafe_chars = any(ch in repo_name for ch in ("'", '"', "\\"))
    if "/" not in repo_name or has_unsafe_chars:
        raise Exception(f"Invalid repo name format. Use 'owner/repo': {repo_name}")

    print(f"Attempting to delete all entries for repo: {repo_name}")

    # Filter expression matching every entry that belongs to the repo.
    expr = f"repo == '{repo_name}'"

    # Only work with collections that are actually connected; the connect_*
    # helpers return None when a collection does not exist.
    connected = []
    for index_type, col in (("dense", collection), ("sparse", collection_github_sparse)):
        if col is None:
            print(f"Skipping {index_type} collection: not connected.")
        else:
            connected.append((index_type, col))

    # Query everything first so nothing is deleted when the repo is unknown.
    matches = [
        (index_type, col, col.query(expr, output_fields=["id"]))
        for index_type, col in connected
    ]

    if not any(rows for _, _, rows in matches):
        print(f"Repo '{repo_name}' not found in any vector index.")
        return None

    total_deleted = 0
    log_list = []
    deleted_at = datetime.datetime.now().isoformat()
    table_name = "tabular.dataexpert.mlivshutz54984_vector_delete_log"

    for index_type, col, rows in matches:
        if not rows:
            continue
        delete_result = col.delete(expr)
        print(f"Deleted from {index_type} collection '{col.name}': {delete_result}")
        total_deleted += len(rows)
        log_list.append({
            # Per-collection count, not the running total across collections.
            "files_deleted": len(rows),
            "vector_index_type": index_type,
            "collection": col.name,
            "deleted_at": deleted_at,
            "repo": repo_name,
        })

    # Persist the deletions
    for _, col, _ in matches:
        col.flush()

    if log_list:
        spark.createDataFrame(log_list).write.mode("append").saveAsTable(table_name)
        spark.sql(f"alter table {table_name} set TBLPROPERTIES (delta.enableChangeDataFeed = true)")

    print(f"Successfully deleted {total_deleted} entries for repo: {repo_name}")
    return {"message": f"Successfully deleted {total_deleted} entries for repo '{repo_name}'.", "deleted_count": total_deleted}



In [0]:
# Delete every dense and sparse index entry for the repo selected in the "repo" widget
await delete_github_repo(repo, collection, collection_github_sparse)
