In [0]:

# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timedelta
import random

# Optional session setting, currently disabled (commented out). If enabled it
# would set spark.sql.execution.arrow.pyspark.enabled to false.
#spark.sql("SET spark.sql.execution.arrow.pyspark.enabled = false")

# Confirmation that the import cell ran to completion
print("Lab environment initialized successfully")


In [0]:
%sql
-- Lab catalog: create it on first run, then make it the current catalog
CREATE CATALOG IF NOT EXISTS hr_analytics_lab;
USE CATALOG hr_analytics_lab;

-- One schema per business unit, all inside hr_analytics_lab
CREATE SCHEMA IF NOT EXISTS hr_analytics_lab.hr_data;
CREATE SCHEMA IF NOT EXISTS hr_analytics_lab.sales_data;
CREATE SCHEMA IF NOT EXISTS hr_analytics_lab.analytics;

-- Make hr_data the default schema
USE hr_analytics_lab.hr_data;

In [0]:
# Generate comprehensive employee dataset
def generate_employee_data(num_employees=100, seed=None):
    """Build a Spark DataFrame of synthetic employee records.

    Args:
        num_employees: Number of records to generate. Employee IDs run from
            1 to ``num_employees`` inclusive.
        seed: Optional seed for a private random generator, so the same
            random fields can be regenerated on a later run. When ``None``
            (the default) the module-level ``random`` generator is used.

    Returns:
        Whatever ``spark.createDataFrame`` returns for the generated list of
        dicts (one dict per employee). ``spark`` is not defined in this cell;
        presumably it is the session provided by the notebook runtime.
    """
    cities = ["New York", "San Francisco", "Chicago", "Austin", "Seattle", "Boston", "Los Angeles", "Miami"]
    departments = ["Engineering", "Sales", "Marketing", "HR", "Finance", "Operations"]
    job_levels = ["Junior", "Senior", "Lead", "Manager", "Director"]

    # A private generator keeps seeded runs independent of the global random state.
    rng = random.Random(seed) if seed is not None else random

    # One reference timestamp for the whole batch so every date is relative
    # to the same "now".
    now = datetime.now()

    def random_past_date(min_days, max_days):
        """Return an ISO date string between min_days and max_days before now."""
        offset = timedelta(days=rng.randint(min_days, max_days))
        return (now - offset).strftime("%Y-%m-%d")

    employees = []
    for i in range(1, num_employees + 1):
        # 8000-15000 days back is roughly 22 to 41 years of age.
        birth_date = random_past_date(8000, 15000)
        hire_date = random_past_date(30, 2000)

        # Draw the department once and reuse it in the job title so the two
        # columns never contradict each other.
        department = rng.choice(departments)

        employee = {
            "employee_id": i,
            "first_name": f"Employee_{i}",
            "last_name": f"LastName_{i}",
            "email": f"employee{i}@company.com",
            "phone": f"555-{rng.randint(1000, 9999)}",
            "hire_date": hire_date,
            "birth_date": birth_date,
            "salary": rng.randint(40000, 200000),
            "department": department,
            "job_title": f"{rng.choice(job_levels)} {department} Specialist",
            "city": rng.choice(cities),
            "is_active": rng.choice([True, True, True, False])  # 75% active
        }
        employees.append(employee)

    return spark.createDataFrame(employees)

# Build the 100-employee sample dataset and preview its first ten rows
employee_df = generate_employee_data(num_employees=100)
employee_df.show(n=10)

In [0]:
%sql
-- Start clean: remove any previous copy of the table
DROP TABLE IF EXISTS hr_analytics_lab.hr_data.employees_managed;

-- No LOCATION clause is given, so this is created as a managed table
CREATE TABLE hr_analytics_lab.hr_data.employees_managed (
    employee_id  INT     NOT NULL,
    first_name   STRING  NOT NULL,
    last_name    STRING  NOT NULL,
    email        STRING,
    phone        STRING,
    hire_date    DATE,
    birth_date   DATE,
    salary       DOUBLE,
    department   STRING,
    job_title    STRING,
    city         STRING,
    is_active    BOOLEAN
)
USING DELTA
COMMENT "Managed table containing employee information"
TBLPROPERTIES (
    'department' = 'HR',
    'data_classification' = 'PII',
    'created_by' = 'data_engineering_team'
);

In [0]:
# Insert data into managed table
from pyspark.sql.types import BooleanType, DoubleType, IntegerType

# Column order of the CREATE TABLE statement for employees_managed, so the
# DataFrame lines up with the declared table definition.
managed_columns = [
    "employee_id", "first_name", "last_name", "email", "phone",
    "hire_date", "birth_date", "salary", "department", "job_title",
    "city", "is_active",
]

employee_df = (
    employee_df
    # Table declares employee_id as INT
    .withColumn("employee_id", col("employee_id").cast(IntegerType()))
    # Table declares salary as DOUBLE, so cast to double rather than long
    .withColumn("salary", col("salary").cast(DoubleType()))
    # Table declares is_active as BOOLEAN
    .withColumn("is_active", col("is_active").cast(BooleanType()))
    # The generator emits dates as "yyyy-MM-dd" strings; the table declares DATE
    .withColumn("hire_date", to_date(col("hire_date"), "yyyy-MM-dd"))
    .withColumn("birth_date", to_date(col("birth_date"), "yyyy-MM-dd"))
    # employee_id is NOT NULL in the table, so drop rows missing it
    .na.drop(subset=["employee_id"])
    .select(*managed_columns)
)

# Overwrite the rows only. "overwriteSchema" is deliberately not set: with it,
# the DataFrame's schema would replace the one declared in the CREATE TABLE
# statement instead of being checked against it.
employee_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("hr_analytics_lab.hr_data.employees_managed")

print("Data inserted into managed table")

In [0]:
# Print the schema of employee_df (column names and types) for inspection
employee_df.printSchema()

In [0]:
# Inspect the managed table. Each statement is run and displayed on its own,
# because a multi-statement %sql cell only renders the result of its final
# statement and the earlier outputs would be lost.
managed_table = "hr_analytics_lab.hr_data.employees_managed"

# Describe the managed table in detail
display(spark.sql(f"DESCRIBE EXTENDED {managed_table}"))

# Show table properties
display(spark.sql(f"SHOW TBLPROPERTIES {managed_table}"))

# Check the location (notice it's managed by Unity Catalog)
display(spark.sql(f"DESCRIBE DETAIL {managed_table}"))

In [0]:
%sql
-- List the existing external locations; the result may be empty if none exist.
-- Context for the external-table cell later in this lab.
SHOW EXTERNAL LOCATIONS;


In [0]:
%sql
-- List the existing storage credentials; the result may be empty if none exist.
SHOW STORAGE CREDENTIALS;


In [0]:
%sql
-- Demonstration only: an external table bound to an explicit storage path.
-- In production this needs a proper external location and storage credential,
-- and creating it may require admin privileges.

-- Same column layout as the managed table, plus a LOCATION clause.
CREATE TABLE IF NOT EXISTS hr_analytics_lab.hr_data.employees_external (
    employee_id  INT     NOT NULL,
    first_name   STRING  NOT NULL,
    last_name    STRING  NOT NULL,
    email        STRING,
    phone        STRING,
    hire_date    DATE,
    birth_date   DATE,
    salary       DOUBLE,
    department   STRING,
    job_title    STRING,
    city         STRING,
    is_active    BOOLEAN
)
USING DELTA
COMMENT "External table containing employee information"
-- Replace the path below with an actual external location
LOCATION 's3://databricks-miraj/hr-data/employees/'
TBLPROPERTIES (
    'department' = 'HR',
    'data_classification' = 'PII',
    'storage_type' = 'external'
);

In [0]:
%sql
-- Comparison of record counts, one row per table type.
-- Both branches read employees_managed: the 'external' row uses the managed
-- table as a stand-in for the demo, since the external table might not be
-- accessible.
SELECT
    'managed'                       AS table_type,
    COUNT(*)                        AS record_count,
    'Unity Catalog managed storage' AS location_type
FROM hr_analytics_lab.hr_data.employees_managed

UNION ALL

SELECT
    'external'               AS table_type,
    COUNT(*)                 AS record_count,
    'External cloud storage' AS location_type
FROM hr_analytics_lab.hr_data.employees_managed;