In [3]:
# Load environment variables
from dotenv import load_dotenv
import os

load_dotenv()

# Echo the settings this notebook relies on, together with the fallback that
# is displayed when a variable is missing from the environment / .env file.
print(
    "\n".join(
        f"{key}: {os.getenv(key, fallback)}"
        for key, fallback in (
            ("GOOGLE_CLOUD_PROJECT", None),
            ("GOOGLE_CLOUD_LOCATION", "us-central1"),
            ("GOOGLE_CLOUD_MODEL", "gemini-2.5-flash-lite"),
        )
    )
)



GOOGLE_CLOUD_PROJECT: kaggle-capstone-112025
GOOGLE_CLOUD_LOCATION: global
GOOGLE_CLOUD_MODEL: gemini-2.5-flash-lite


In [5]:
from google.cloud import discoveryengine_v1 as discoveryengine
from google.api_core.client_options import ClientOptions

def create_data_store(
    project_id: str,
    location: str,
    data_store_id: str,
    display_name: str
):
    """Provision a Vertex AI Search data store and block until it is ready.

    Args:
        project_id: Google Cloud project used to build the parent resource path.
        location: Data store location. "global" uses the client's default
            endpoint; any other value selects
            ``{location}-discoveryengine.googleapis.com``.
        data_store_id: ID sent with the create request.
        display_name: Display name set on the new data store.

    Returns:
        The result of the long-running create operation (its ``name`` is
        printed before returning).
    """
    # Only non-global locations need an explicit regional endpoint.
    if location == "global":
        endpoint_options = None
    else:
        endpoint_options = ClientOptions(
            api_endpoint=f"{location}-discoveryengine.googleapis.com"
        )
    store_client = discoveryengine.DataStoreServiceClient(client_options=endpoint_options)

    # Generic-vertical store under the project's default collection.
    create_request = discoveryengine.CreateDataStoreRequest(
        parent=f"projects/{project_id}/locations/{location}/collections/default_collection",
        data_store=discoveryengine.DataStore(
            display_name=display_name,
            industry_vertical=discoveryengine.IndustryVertical.GENERIC,
            content_config=discoveryengine.DataStore.ContentConfig.CONTENT_REQUIRED,
        ),
        data_store_id=data_store_id,
    )

    # Creation is a long-running operation; result() waits for it to finish.
    pending = store_client.create_data_store(request=create_request)
    print("Waiting for data store creation to complete...")
    created = pending.result()
    print(f"Data store created: {created.name}")

    return created


def import_documents_from_gcs(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: str
):
    """Import documents from GCS into the data store and report the outcome.

    Args:
        project_id: Google Cloud project used to build the parent resource path.
        location: Data store location. "global" uses the client's default
            endpoint; any other value selects
            ``{location}-discoveryengine.googleapis.com``.
        data_store_id: ID of the target data store; documents are imported
            into its ``default_branch``.
        gcs_uri: Cloud Storage URI (may contain a wildcard) passed as the
            single input URI of the import.

    Returns:
        The result of the long-running import operation. Any entries in its
        ``error_samples`` are printed; the success message is only printed
        when there are none.
    """

    # Create client with appropriate endpoint
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )

    client = discoveryengine.DocumentServiceClient(client_options=client_options)

    parent = f"projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{data_store_id}/branches/default_branch"

    request = discoveryengine.ImportDocumentsRequest(
        parent=parent,
        gcs_source=discoveryengine.GcsSource(
            input_uris=[gcs_uri],
            data_schema="content"  # or "custom" for structured data
        ),
        # INCREMENTAL reconciliation mode, as opposed to a full replacement.
        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
    )

    # Import documents (this is a long-running operation)
    operation = client.import_documents(request=request)
    print("Waiting for document import to complete...")
    response = operation.result()

    # The operation can finish while individual documents failed to import;
    # those failures are surfaced through the response's error samples, so do
    # not claim success without looking at them.
    error_samples = list(response.error_samples)
    if error_samples:
        print(f"Document import completed with errors ({len(error_samples)} sampled):")
        for error in error_samples:
            print(f"  - {error.message}")
    else:
        print("Documents imported successfully")

    return response


# Usage example: create a data store, then load documents into it from GCS.
if __name__ == "__main__":
    # Placeholder settings -- replace with real values before running.
    PROJECT_ID = "your-project-id"
    LOCATION = "global"  # Options: "global", "us", "eu"
    DATA_STORE_ID = "my-datastore"
    DISPLAY_NAME = "My Data Store"
    GCS_URI = "gs://your-bucket-name/path/*"  # Use /* for all files in path

    # Step 1: the data store has to exist before anything can be imported.
    create_data_store(
        project_id=PROJECT_ID,
        location=LOCATION,
        data_store_id=DATA_STORE_ID,
        display_name=DISPLAY_NAME,
    )

    # Step 2: import every object matched by GCS_URI into that data store.
    import_documents_from_gcs(
        project_id=PROJECT_ID,
        location=LOCATION,
        data_store_id=DATA_STORE_ID,
        gcs_uri=GCS_URI,
    )

ImportError: cannot import name 'discoveryengine_v1' from 'google.cloud' (unknown location)