In [None]:
import pandas as pd
from datetime import datetime

# --- AZURE CONFIGURATION ---
# Hard-coded for this notebook; production runs typically receive these
# values as parameters instead.
storage_account = "beehavenstorage"  # <--- REPLACE with your actual Storage Account Name
container_bronze = "bronze"
container_silver = "silver"

# Both containers live on the same Data Lake Gen2 endpoint, so the host part
# of the ABFSS URI is built once and shared.
_dfs_endpoint = storage_account + ".dfs.core.windows.net"

# Raw input files are read from the "new" folder of the bronze container.
base_uri = "abfss://{}@{}/new".format(container_bronze, _dfs_endpoint)
# Processed output is written under the root of the silver container.
silver_uri = "abfss://{}@{}".format(container_silver, _dfs_endpoint)

# --- 1. INGESTION (Reading from Data Lake) ---
def _load_bronze_csv(sensor):
    """Read ``<sensor>_schwartau.csv`` from the bronze folder into a DataFrame."""
    return pd.read_csv(f"{base_uri}/{sensor}_schwartau.csv")


print(f"Reading data from: {base_uri}")

# The CSV files are read straight from the lake, one DataFrame per sensor.
# Note: access relies on the Azure user holding the
# 'Storage Blob Data Contributor' role on the storage account.
df_flow = _load_bronze_csv("flow")
df_humidity = _load_bronze_csv("humidity")
df_temp = _load_bronze_csv("temperature")
df_weight = _load_bronze_csv("weight")

# --- 2. TRANSFORMATION (Cleaning) ---
# [Placeholder: insert the existing cleaning logic here, e.g. renaming
#  columns and converting timestamps]
# Example: parse the flow timestamps, then drop any timezone information so
# the column is stored as timezone-naive datetimes.
_flow_timestamps = pd.to_datetime(df_flow["timestamp"])
df_flow["timestamp"] = _flow_timestamps.dt.tz_localize(None)

# --- 3. LOADING (Writing to Silver) ---
# Each run writes a new file whose name carries the run's wall-clock time
# (taken from datetime.now(), i.e. naive local time of the executing host).
_STAMP_FORMAT = "%Y-%m-%d_%Hh%Mm%Ss"
current_time = datetime.now().strftime(_STAMP_FORMAT)

# Output goes to the "flow" folder of the silver container as Parquet
# (columnar format optimized for analytics); the DataFrame index is not stored.
_output_name = f"flow_processed_{current_time}.parquet"
save_path = f"{silver_uri}/flow/{_output_name}"
df_flow.to_parquet(save_path, index=False)
print(f"Saved processed data to: {save_path}")