### Install and import dependencies

In [0]:
%pip install kaggle
%restart_python

In [0]:
import os
import shutil
from datetime import datetime

#### Set up Kaggle login and import Kaggle

In [0]:
# Make the Kaggle credentials available as KAGGLE_USERNAME / KAGGLE_KEY
# environment variables before the Kaggle client is imported.
#
# Resolution order:
#   1. Databricks secret scope "hackathon_secrets" (keys "kaggle_username"
#      and "kaggle_key").
#   2. Whatever is already present in the process environment.
#
# Raises EnvironmentError when neither source provides both values.
print("Configuring Kaggle credentials...")
try:
    # Fetch BOTH secrets before touching os.environ so a failure on the second
    # lookup cannot leave a secret-store username paired with a local key.
    # `dbutils` is not defined in this file (presumably the Databricks-injected
    # global); where it is missing the NameError is caught below as well.
    _kaggle_credentials = {
        'KAGGLE_USERNAME': dbutils.secrets.get(
            scope="hackathon_secrets", key="kaggle_username"),
        'KAGGLE_KEY': dbutils.secrets.get(
            scope="hackathon_secrets", key="kaggle_key"),
    }
except Exception as e:
    # Report only the exception type: enough to diagnose the fallback without
    # echoing anything that might contain secret material.
    print(f"Notice: Could not fetch secrets ({type(e).__name__}). Checking local environment.")
    if not os.environ.get('KAGGLE_USERNAME') or not os.environ.get('KAGGLE_KEY'):
        raise EnvironmentError(
            "KAGGLE_USERNAME and KAGGLE_KEY must be set via Databricks Secrets or local environment variables.") from e
else:
    os.environ.update(_kaggle_credentials)
    # Do not keep the secret values around in the notebook namespace.
    del _kaggle_credentials

# Imported only after the credentials are in place (hence the isort/E402
# suppressions). NOTE(review): presumably the kaggle package reads the
# credentials at import/authenticate time - confirm before moving this import.
from kaggle.api.kaggle_api_extended import KaggleApi  # isort: skip # noqa: E402

### Set up volume

In [0]:
# Unity Catalog coordinates of the volume that receives the raw files, and the
# Kaggle dataset slug ("<owner>/<dataset>") to ingest.
CATALOG = "workspace"
SCHEMA = "car_sales"
VOLUME = "raw_data"
DATASET_NAME = "austinreese/craigslist-carstrucks-data"

# Filesystem-style path under which the volume's files are read and written.
TARGET_VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}/"

# Create the schema and the volume if they do not exist yet. Both statements
# use IF NOT EXISTS, so re-running this cell is harmless.
try:
    print(f"🔧 Creating schema: {CATALOG}.{SCHEMA}")
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
    print(f"🔧 Creating volume: {CATALOG}.{SCHEMA}.{VOLUME}")
    spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.{VOLUME}")
    print("✅ Volume setup completed successfully!")
except Exception as e:
    # Deliberately best-effort: a failure here is reported as a warning and
    # the notebook continues, so an already-provisioned volume can still be
    # used by the following cells.
    print(f"⚠️  Warning: Auto-creation failed. Error: {e}")
    print("💡 You may need to create the catalog/schema/volume manually in the Databricks UI")

### Store Data

In [0]:
def _format_size(num_bytes):
    """Render a byte count as "<x.xx> GB" when it is at least 1 GiB, else "<x.xx> MB"."""
    size_gb = num_bytes / (1024 * 1024 * 1024)
    # ">=" so that exactly 1 GiB is shown as "1.00 GB" rather than "1024.00 MB".
    if size_gb >= 1:
        return f"{size_gb:.2f} GB"
    return f"{num_bytes / (1024 * 1024):.2f} MB"


print(f"Starting ingestion of {DATASET_NAME} to {TARGET_VOLUME_PATH}...")

api = KaggleApi()
api.authenticate()

# Download the dataset archive straight into the volume and extract it there.
print("Downloading from Kaggle...")
print(f"📥 Dataset: {DATASET_NAME}")
print(f"📁 Local download path: {TARGET_VOLUME_PATH}")
api.dataset_download_files(DATASET_NAME, path=TARGET_VOLUME_PATH, unzip=True)

# Report every regular file under the target path and accumulate the total in
# the same pass. Walking the tree (instead of a flat listdir) keeps directory
# entries out of the size figures and includes files extracted into
# sub-folders. Note that files already present before the download are listed
# and counted as well.
print("📋 Download completed. Files found:")
total_size = 0
for dirpath, dirnames, filenames in os.walk(TARGET_VOLUME_PATH):
    dirnames.sort()  # in-place sort makes the traversal order deterministic
    for filename in sorted(filenames):
        file_path = os.path.join(dirpath, filename)
        file_size = os.path.getsize(file_path)
        total_size += file_size
        # Top-level files print under their bare name; nested files print
        # with their path relative to the volume root.
        display_name = os.path.relpath(file_path, TARGET_VOLUME_PATH)
        print(f"  📄 {display_name}: {_format_size(file_size)}")

total_size_gb = total_size / (1024 * 1024 * 1024)
print(f"📊 Total download size: {total_size_gb:.2f} GB")

# Informational only: nothing is created or verified at this point.
print(f"🏗️  Ensuring Volume exists at {CATALOG}.{SCHEMA}.{VOLUME}...")
print(f"📍 Target volume path: {TARGET_VOLUME_PATH}")

print("Ingestion complete!")