In [0]:
# List the Databricks secret scopes available to this notebook, so the scope name used below can be checked.
dbutils.secrets.listScopes()

In [0]:
# List the entries registered in the "rouge4kv1" secret scope.
dbutils.secrets.list(scope="rouge4kv1")

In [0]:
%pip install kaggle azure-storage-blob

In [0]:
#import necessary modules and libraries
import os
from azure.storage.blob import BlobServiceClient
import shutil

In [0]:
# Pull credentials out of the Databricks secret scope and into variables.

# Every secret used by this notebook lives in the same scope.
SECRET_SCOPE = "rouge4kv1"

# Kaggle API credentials
kaggle_username = dbutils.secrets.get(scope=SECRET_SCOPE, key="kaggleusername")
kaggle_key = dbutils.secrets.get(scope=SECRET_SCOPE, key="kagglekey")

# Azure Data Lake Storage: account, target container and account access key
STORAGE_ACCOUNT_NAME = "rougestorageacc1"
CONTAINER_NAME = "rawkaggledata"
STORAGE_ACCOUNT_KEY = dbutils.secrets.get(scope=SECRET_SCOPE, key="rougestorageacc1key1")

In [0]:
# The Kaggle library authenticates at import time, so its credentials have to be
# present in the environment before the library is imported.
os.environ.update(
    {
        "KAGGLE_USERNAME": kaggle_username,
        "KAGGLE_KEY": kaggle_key,
    }
)

In [0]:
# Import the kaggle library
from kaggle.api.kaggle_api_extended import KaggleApi

In [0]:
# Initialize Kaggle API
# Create the client and authenticate it. The credentials are presumably picked up from the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables set earlier in this notebook —
# TODO confirm against the installed kaggle package version.
api = KaggleApi()
api.authenticate()

In [0]:
# Connect to Azure Blob Storage (ADLS Gen2): first a client for the storage account's
# blob endpoint, then a client scoped to the target container.
account_url = f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
blob_service_client = BlobServiceClient(account_url=account_url, credential=STORAGE_ACCOUNT_KEY)
container_client = blob_service_client.get_container_client(CONTAINER_NAME)

In [0]:
# Kaggle datasets to ingest, given as "<owner>/<dataset>" identifiers.
#
# Notes to self:
# - It is probably best to order the list by increasing dataset size and to parallelize the work. This is
#   executed on a single node, but given the huge dataset sizes and the nested for loops it might be more
#   cost-effective and efficient to run it on multiple nodes.
# - The job crashed the first time it was run, so the order of the datasets was rearranged.
# - The dataset "minhhuy2810/rice-diseases-image-dataset" is too large to download in one go: running it
#   always ends with the error "Fatal error: The Python kernel is unresponsive.", so it has been removed
#   from the list. A more powerful cluster would be required to download that dataset in full.

datasets = [
    "vbookshelf/rice-leaf-diseases",
    "maimunulkjisan/rice-leaf-dataset-from-mendeley-data",
    "anshulm257/rice-disease-dataset",
]

In [0]:
# Download each Kaggle dataset into a scratch directory on DBFS, upload every extracted
# file to the Azure container, then delete the scratch copy before moving on to the next one.
#
# NOTE(review): blob names are relative to the scratch directory and are not prefixed with
# the dataset identifier, so two datasets containing the same relative path will overwrite
# each other in the container (uploads use overwrite=True). Confirm that this is intended.
print("Downloading dataset from Kaggle...")

# The scratch location is the same for every dataset, so it is defined once.
temp_dir = "/dbfs/tmp/kaggle_datasets/"

for dataset in datasets:
    # A previous run that crashed part-way leaves files behind in the scratch directory.
    # Without this reset they would be walked and uploaded together with this dataset.
    if os.path.isdir(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        api.dataset_download_files(dataset, path=temp_dir, unzip=True)

        # Upload each file to Azure Data Lake
        for root, _, files in os.walk(temp_dir):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                blob_path = os.path.relpath(file_path, temp_dir)  # Preserve folder structure

                print(f"Uploading {blob_path} to Azure Data Lake...")

                blob_client = container_client.get_blob_client(blob_path)
                # Read file in binary mode and hand the open stream to the upload call
                with open(file_path, "rb") as data:
                    blob_client.upload_blob(data, overwrite=True)
    finally:
        # Clean up local files even when the download or an upload fails, so a failed
        # dataset does not keep occupying DBFS space or leak into a later run.
        shutil.rmtree(temp_dir)

    # Only reached when the download, every upload and the cleanup succeeded.
    print(f"{dataset} uploaded successfully and temporary files deleted.")

print("All datasets uploaded successfully.")


In [0]:
# Verify that the data is available in the specified storage account/container:
# register the account key with Spark, then list the root of the container.
account_key_setting = f"fs.azure.account.key.{STORAGE_ACCOUNT_NAME}.dfs.core.windows.net"
spark.conf.set(account_key_setting, f"{STORAGE_ACCOUNT_KEY}")

container_root = f"abfss://{CONTAINER_NAME}@{STORAGE_ACCOUNT_NAME}.dfs.core.windows.net/"
display(dbutils.fs.ls(container_root))
