In [0]:
%pip install databricks.sdk

In [0]:
# Import necessary modules

# Storage
from azure.storage.filedatalake import DataLakeServiceClient
import databricks.sdk

# Other
import random
import os
from databricks.sdk.service.catalog import VolumeType

In [0]:
# Storage account and the two containers this notebook works with.
account_name = "rougestorageacc1"
source_container, dest_container = "stagekaggledata", "processkaggledata"

# The account key is fetched from the "rouge4kv1" secret scope, not hard-coded.
account_key = dbutils.secrets.get(key="rougestorageacc1key1", scope="rouge4kv1")

# Presumably the training-split fraction; it is not used in the cells shown here.
train_ratio = 0.8

In [0]:
# Data Lake service client for the storage account, authenticated with the account key.
service_client = DataLakeServiceClient(
    credential=account_key,
    account_url=f"https://{account_name}.dfs.core.windows.net",
)

# One file-system client per container: source first, destination second.
source_fs, dest_fs = (
    service_client.get_file_system_client(file_system=container_name)
    for container_name in (source_container, dest_container)
)

In [0]:
# Register the account key in the Spark session configuration, then list the
# "train" folder of the destination container to verify that access works.
spark.conf.set(f"fs.azure.account.key.{account_name}.dfs.core.windows.net", account_key)

display(
    dbutils.fs.ls(
        f"abfss://{dest_container}@{account_name}.dfs.core.windows.net/train"
    )
)

Create a Unity Catalog volume for downstream processing of the data; the downstream ML libraries cannot take a URL as the data location.

In [0]:
# Settings for the Unity Catalog volume used for downstream processing of the data
# (datagen.flow_from_directory() requires a volume, i.e. it doesn't work with an external location)

# Workspace client used for the Unity Catalog API calls.
db_client = databricks.sdk.WorkspaceClient()

catalog = "main"
schema = "ml_data"
volume_name = "processkaggledata"

# Build the storage URLs from account_name / dest_container instead of repeating
# the literals, so they cannot drift from the values used for the storage client
# and the Spark access check.
volume_path = f"abfss://{dest_container}@{account_name}.dfs.core.windows.net/"
managed_location = f"abfss://metastore-root@{account_name}.dfs.core.windows.net/unity-catalog"

In [0]:
# Create a storage credential from the Azure access connector's managed identity.
# An "already exists" BadRequest is reported and ignored; anything else propagates.

cred_name = "rouge4cred1"
rouge4acmi = databricks.sdk.service.catalog.AzureManagedIdentity(
    access_connector_id=(
        "/subscriptions/f26a4615-66a4-4095-9a0d-dc252e5dba73"
        "/resourceGroups/rouge_databricks_managed_rg"
        "/providers/Microsoft.Databricks"
        "/accessConnectors/unity-catalog-access-connector"
    )
)

try:
    db_client.storage_credentials.create(
        name=cred_name,
        azure_managed_identity=rouge4acmi,
    )
    # Success message, matching the other create cells in this notebook.
    print(f"✅ Storage credential '{cred_name}' created.")
except databricks.sdk.errors.platform.BadRequest as e:
    if "already exists" not in str(e):
        raise
    print(f"Credential '{cred_name}' already exists.")

In [0]:
# Create the external location that ties volume_path to the storage credential.
# An "already exists" BadRequest is reported and ignored; anything else propagates.
ext_loc_name = f"{volume_name}_extloc"

try:
    db_client.external_locations.create(
        name=ext_loc_name,
        url=volume_path,
        # Reuse the credential name from the storage-credential cell rather than
        # repeating the literal, so the two cannot get out of sync.
        credential_name=cred_name,
    )
    print(f"✅ External location '{ext_loc_name}' created.")
except databricks.sdk.errors.platform.BadRequest as e:
    if "already exists" not in str(e):
        raise
    print(f"External location '{ext_loc_name}' already exists.")

In [0]:
# Create the catalog with managed_location as its storage root.
# A BadRequest whose message says "already exists" is reported; any other is re-raised.
try:
    db_client.catalogs.create(
        name=catalog,
        storage_root=managed_location,
        comment="Catalog for rice disease ML data",
    )
except databricks.sdk.errors.platform.BadRequest as e:
    if "already exists" not in str(e):
        raise e
    print(f"Catalog '{catalog}' already exists.")
else:
    print(f"✅ Catalog '{catalog}' created.")

In [0]:
# Create the schema inside the catalog.
# A BadRequest whose message says "already exists" is reported; any other is re-raised.
try:
    db_client.schemas.create(
        catalog_name=catalog,
        name=schema,
        comment="Schema for training volumes and models",
    )
except databricks.sdk.errors.platform.BadRequest as e:
    if "already exists" not in str(e):
        raise e
    print(f"Schema '{schema}' already exists in catalog '{catalog}'.")
else:
    print(f"✅ Schema '{schema}' created in catalog '{catalog}'.")


In [0]:
# Create the external volume over volume_path. Whether it is created now or
# already existed, `volume` ends up bound to the volume's metadata, so a re-run
# on a fresh kernel leaves the same state.
try:
    volume = db_client.volumes.create(
        catalog_name=catalog,
        schema_name=schema,
        name=volume_name,
        volume_type=VolumeType.EXTERNAL,  # external volume backed by storage_location
        storage_location=volume_path,
    )
    print(f"✅ Volume created: {volume.full_name}")
except databricks.sdk.errors.ResourceAlreadyExists:
    print(f"Volume '{volume_name}' already exists.")
    # Look up the existing volume so `volume` is defined on this path too.
    volume = db_client.volumes.read(name=f"{catalog}.{schema}.{volume_name}")