In [1]:
! pip install faker==23.0.0

Collecting faker==23.0.0
  Downloading Faker-23.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-23.0.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m1.1 MB/s[0m  [33m0:00:02[0m eta [36m0:00:01[0m0m
[?25hInstalling collected packages: faker
  Attempting uninstall: faker
    Found existing installation: Faker 23.3.0
    Uninstalling Faker-23.3.0:
      Successfully uninstalled Faker-23.3.0
Successfully installed faker-23.0.0


In [1]:
# VARIANT Data Type Exploration in Databricks
# This notebook demonstrates various features and capabilities of the VARIANT data type

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
import random
from datetime import datetime, timedelta
from faker import Faker
import os

# =============================================================================
# CONFIGURATION PARAMETERS
# =============================================================================
# Update these parameters to customize the notebook for your environment
CATALOG = "users"                    # Unity Catalog name
SCHEMA = "pavan_naidu"               # Schema name within the catalog
VOLUME = "raw_data"                  # Volume name for storing data
# =============================================================================

def get_spark() -> SparkSession:
    """Return a Spark session, preferring Databricks Connect when it is usable.

    A ``DatabricksSession`` is tried first. If importing ``databricks.connect``
    or building that session raises any ``Exception``, the function falls back
    to ``SparkSession.builder.getOrCreate()``.

    Returns:
        The session object produced by whichever builder succeeded.
    """
    try:
        from databricks.connect import DatabricksSession
        session = DatabricksSession.builder.getOrCreate()
    except Exception:
        # Best-effort fallback: any failure above (missing package or failed
        # session creation) results in a plain SparkSession.
        session = SparkSession.builder.getOrCreate()
    return session

# Seed for the synthetic-data generators. With a fixed seed the pseudo-random
# draws are repeatable across Restart & Run All; note that Faker's relative
# date helpers ('-5y', 'now', ...) still depend on the current time.
SEED = 42

spark = get_spark()

# Seed both sources of randomness used by the data-generation cell:
# Faker's shared generator and the stdlib ``random`` module.
Faker.seed(SEED)
random.seed(SEED)
fake = Faker()

# Set catalog and schema using parameters
spark.sql(f"USE CATALOG {CATALOG}")
spark.sql(f"USE SCHEMA {SCHEMA}")

print("✅ Spark session initialized")
print(f"Current catalog: {spark.sql('SELECT current_catalog()').collect()[0][0]}")
print(f"Current schema: {spark.sql('SELECT current_schema()').collect()[0][0]}")
print(f"Configuration: CATALOG={CATALOG}, SCHEMA={SCHEMA}, VOLUME={VOLUME}")

✅ Spark session initialized


HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

Current catalog: users


HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

Current schema: pavan_naidu
Configuration: CATALOG=users, SCHEMA=pavan_naidu, VOLUME=raw_data


## 1. Setup: Create UC Volume and Generate Fake JSON Data

First, let's create a folder in a Unity Catalog volume and generate diverse JSON data to showcase VARIANT functionality.


In [2]:
# Make sure the Unity Catalog volume exists (CREATE ... IF NOT EXISTS is a no-op
# when it is already there). Any failure is reported but does not stop the cell.
volume_fqn = f"{CATALOG}.{SCHEMA}.{VOLUME}"
try:
    spark.sql(f"CREATE VOLUME IF NOT EXISTS {volume_fqn}")
except Exception as err:
    print(f"Volume might already exist or error: {err}")
else:
    print(f"✅ Volume '{VOLUME}' is ready")

# Filesystem-style path under which the volume's files are addressed
volume_path = "/".join(["/Volumes", CATALOG, SCHEMA, VOLUME])
print(f"Volume path: {volume_path}")


✅ Volume 'raw_data' is ready
Volume path: /Volumes/users/pavan_naidu/raw_data


In [3]:
# Generate fake user data to demonstrate VARIANT capabilities

def _coin_flip():
    """Return ``True`` or ``False``, picked with ``random.choice``."""
    return random.choice([True, False])


def _maybe(threshold, make_value):
    """Return ``make_value()`` if a fresh ``random.random()`` draw exceeds ``threshold``, else ``None``."""
    if random.random() > threshold:
        return make_value()
    return None


def _fake_profile():
    """Build the nested ``profile`` object (bio, job, company, interests, skills)."""
    return {
        "bio": fake.text(max_nb_chars=200),
        "occupation": fake.job(),
        "company": fake.company(),
        # 1-5 interests; 0-3 skills (so ``skills`` may be an empty list)
        "interests": [fake.word() for _ in range(random.randint(1, 5))],
        "skills": [fake.job() for _ in range(random.randint(0, 3))],
    }


def _fake_address():
    """Build the nested ``address`` object, including a ``coordinates`` sub-object."""
    return {
        "street": fake.street_address(),
        "city": fake.city(),
        "state": fake.state(),
        "country": fake.country(),
        "postal_code": fake.postcode(),
        "coordinates": {
            "latitude": float(fake.latitude()),
            "longitude": float(fake.longitude()),
        },
    }


def _fake_preferences():
    """Build the nested ``preferences`` object made of boolean flags and a frequency."""
    return {
        "newsletter": _coin_flip(),
        "notifications": {
            "email": _coin_flip(),
            "sms": _coin_flip(),
            "push": _coin_flip(),
            "frequency": random.choice(["daily", "weekly", "monthly", "never"]),
        },
        "privacy": {
            "profile_visible": _coin_flip(),
            "show_email": _coin_flip(),
            "show_location": _coin_flip(),
        },
    }


def generate_user_data(num_records=1000):
    """Generate fake user records whose structure varies to mimic schema evolution.

    Every record carries the base fields plus ``profile``, ``address`` and
    ``preferences``. Records whose index is greater than 30% of ``num_records``
    also get ``social_media``; those past 60% additionally get ``subscription``
    and ``metrics``. ``phone`` and the ``referral_code``/``referred_by`` pair
    are added at random to simulate sparse data.

    Args:
        num_records: Number of user dictionaries to create.

    Returns:
        A list of ``num_records`` dictionaries, in generation order.
    """
    social_media_cutoff = num_records * 0.3
    subscription_cutoff = num_records * 0.6
    records = []

    for position in range(num_records):
        # Fields present on every record, followed by the always-present nested objects
        record = {
            "user_id": str(fake.uuid4()),
            "username": fake.user_name(),
            "email": fake.email(),
            "name": fake.name(),
            "age": random.randint(18, 80),
            "created_at": str(fake.date_time_between(start_date='-5y', end_date='now')),
            "last_login": str(fake.date_time_between(start_date='-30d', end_date='now')),
            "profile": _fake_profile(),
            "address": _fake_address(),
            "preferences": _fake_preferences(),
        }

        # "Newer" records: each handle is independently present or None
        if position > social_media_cutoff:
            record["social_media"] = {
                "twitter": _maybe(0.3, lambda: f"@{fake.user_name()}"),
                "linkedin": _maybe(0.5, fake.url),
                "github": _maybe(0.7, lambda: f"github.com/{fake.user_name()}"),
            }

        # "Newest" records: subscription details and activity metrics
        if position > subscription_cutoff:
            record["subscription"] = {
                "tier": random.choice(["free", "basic", "premium", "enterprise"]),
                "start_date": str(fake.date_between(start_date='-2y', end_date='today')),
                "auto_renew": _coin_flip(),
            }
            record["metrics"] = {
                "login_count": random.randint(1, 1000),
                "posts_created": random.randint(0, 500),
                "comments_made": random.randint(0, 2000),
                "last_activity": str(fake.date_time_between(start_date='-7d', end_date='now')),
            }

        # Sparse optional fields
        if random.random() > 0.7:
            record["phone"] = fake.phone_number()

        if random.random() > 0.8:
            record["referral_code"] = fake.bothify(text='REF-####-????')
            record["referred_by"] = _maybe(0.5, lambda: str(fake.uuid4()))

        records.append(record)

    return records

# Build the dataset once; later cells write `user_data` to the volume
print("Generating fake user data...")
user_data = generate_user_data(1000)
print(f"✅ Generated {len(user_data)} user records with evolving schemas")

# One example per schema "generation": (heading, index into user_data).
# The indices are chosen for a 1000-record run (below 30%, between 30% and 60%, above 60%).
schema_samples = [
    ("\n1. Basic user (early schema):", 100),
    ("\n2. User with social media (mid evolution):", 500),
    ("\n3. User with full schema (latest):", 900),
]

print("\nSample user records showing schema evolution:")
for heading, position in schema_samples:
    print(heading)
    # Only the first 600 characters of the pretty-printed record are shown
    print(json.dumps(user_data[position], indent=2)[:600] + "...")


Generating fake user data...
✅ Generated 1000 user records with evolving schemas

Sample user records showing schema evolution:

1. Basic user (early schema):
{
  "user_id": "424a5cad-531e-4730-857d-f7f9c945fb24",
  "username": "bryan93",
  "email": "jasmineking@example.net",
  "name": "Matthew Quinn",
  "age": 66,
  "created_at": "2025-08-07 08:15:29.517631",
  "last_login": "2025-09-06 07:01:41.772613",
  "profile": {
    "bio": "Probably enjoy him friend option.\nHigh admit determine each drug human. Health before run collection ahead training.",
    "occupation": "Emergency planning/management officer",
    "company": "Campbell Inc",
    "interests": [
      "newspaper",
      "inside",
      "that"
    ],
    "skills": []
  },
  "address": {
  ...

2. User with social media (mid evolution):
{
  "user_id": "a682f442-b29e-4a78-8f3f-da3c52a876ae",
  "username": "laurenmckinney",
  "email": "felicia39@example.com",
  "name": "Cristina Berry MD",
  "age": 54,
  "created_at": "2022-04-0

In [4]:
# Target folder inside the volume for the generated user files
users_folder = f"{volume_path}/users"
try:
    dbutils.fs.mkdirs(users_folder)
except Exception as err:
    print(f"Folder might already exist: {err}")
else:
    print(f"✅ Created folder: {users_folder}")

# Destination paths: one JSON array document and one JSON Lines file
json_file_path = f"{users_folder}/user_data.json"
jsonl_file_path = f"{users_folder}/user_data.jsonl"

# Whole dataset serialized as a single JSON document
json_content = json.dumps(user_data)
dbutils.fs.put(json_file_path, json_content, overwrite=True)
print(f"✅ Saved JSON data to: {json_file_path}")

# Same records, one JSON object per line; this is the file the load cells read
jsonl_content = "\n".join(json.dumps(user) for user in user_data)
dbutils.fs.put(jsonl_file_path, jsonl_content, overwrite=True)
print(f"✅ Saved JSONL data to: {jsonl_file_path}")

# # Optionally, save data partitioned by subscription tier for those who have it
# print("\n📁 Creating partitioned data by subscription tier...")
# for user in user_data:
#     if "subscription" in user:
#         tier = user.get("subscription", {}).get("tier", "unknown")
#         tier_folder = f"{users_folder}/by_tier/{tier}"
        
#         # Create tier folder if it doesn't exist
#         try:
#             dbutils.fs.mkdirs(tier_folder)
#         except:
#             pass  # Folder exists
        
#         # Save user to appropriate tier folder
#         user_file = f"{tier_folder}/{user['user_id']}.json"
#         dbutils.fs.put(user_file, json.dumps(user), overwrite=True)

# print("✅ Created partitioned user data by subscription tier")

# Show what now exists under the users folder
print("\n📂 Folder structure created:")
folder_listing = dbutils.fs.ls(users_folder)
display(folder_listing)


✅ Created folder: /Volumes/users/pavan_naidu/raw_data/users
✅ Saved JSON data to: /Volumes/users/pavan_naidu/raw_data/users/user_data.json
✅ Saved JSONL data to: /Volumes/users/pavan_naidu/raw_data/users/user_data.jsonl

📂 Folder structure created:


[FileInfo(path='/Volumes/users/pavan_naidu/raw_data/users/user_data.json', name='user_data.json', size=1159727, modificationTime=1758225995000),
 FileInfo(path='/Volumes/users/pavan_naidu/raw_data/users/user_data.jsonl', name='user_data.jsonl', size=1158726, modificationTime=1758225996000)]

## 2. Creating Tables with VARIANT Data Type

Let's create tables to store our JSON data using the VARIANT type.


In [5]:
# Start from a clean slate so this cell can be re-run
for table_name in ("users_variant", "users_extracted"):
    spark.sql(f"DROP TABLE IF EXISTS {table_name}")

# Table that keeps the whole record in a single VARIANT column.
# The allowColumnDefaults table feature is set because ingestion_timestamp
# declares a DEFAULT.
users_variant_ddl = """
    CREATE TABLE IF NOT EXISTS users_variant (
        user_id STRING,
        username STRING,
        email STRING,
        user_data VARIANT,
        ingestion_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
    )
    USING DELTA
    TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported')
    COMMENT 'Table storing user data in VARIANT column'
"""
spark.sql(users_variant_ddl)

print("✅ Created table: users_variant with column defaults enabled")

# Second table: frequently used fields are also stored as typed columns
# next to the VARIANT, for comparison with the VARIANT-only layout.
users_extracted_ddl = """
    CREATE TABLE IF NOT EXISTS users_extracted (
        user_id STRING,
        username STRING,
        email STRING,
        name STRING,
        age INT,
        city STRING,
        country STRING,
        subscription_tier STRING,
        user_data VARIANT,
        ingestion_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
    )
    USING DELTA
    TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported')
    COMMENT 'Table with commonly accessed fields extracted from VARIANT'
"""
spark.sql(users_extracted_ddl)

print("✅ Created table: users_extracted")

# Print the column list of both tables
print("\nTable Schemas:")
for table_name in ("users_variant", "users_extracted"):
    spark.sql(f"DESCRIBE TABLE {table_name}").show()


✅ Created table: users_variant with column defaults enabled
✅ Created table: users_extracted

Table Schemas:
+-------------------+---------+-------+
|           col_name|data_type|comment|
+-------------------+---------+-------+
|            user_id|   string|   NULL|
|           username|   string|   NULL|
|              email|   string|   NULL|
|          user_data|  variant|   NULL|
|ingestion_timestamp|timestamp|   NULL|
+-------------------+---------+-------+

+-------------------+---------+-------+
|           col_name|data_type|comment|
+-------------------+---------+-------+
|            user_id|   string|   NULL|
|           username|   string|   NULL|
|              email|   string|   NULL|
|               name|   string|   NULL|
|                age|      int|   NULL|
|               city|   string|   NULL|
|            country|   string|   NULL|
|  subscription_tier|   string|   NULL|
|          user_data|  variant|   NULL|
|ingestion_timestamp|timestamp|   NULL|
+---------

## 3. Loading Data into VARIANT Columns

Now let's load our JSON data into the tables using different methods.


In [6]:
# Method 1: Load from JSON file using PARSE_JSON
# The JSONL file is read as text (one `value` string per line). The key columns
# are pulled out of the raw string with get_json_object and the full line is
# stored as VARIANT via PARSE_JSON. ingestion_timestamp is omitted from the
# column list, so the table's column DEFAULT applies.
# NOTE: this is an append (INSERT INTO); re-running the cell without re-running
# the DROP/CREATE cell adds the rows again.
spark.sql(f"""
    INSERT INTO users_variant (user_id, username, email, user_data)
    SELECT 
        get_json_object(value, '$.user_id') as user_id,
        get_json_object(value, '$.username') as username,
        get_json_object(value, '$.email') as email,
        PARSE_JSON(value) as user_data
    FROM (
        SELECT value 
        FROM text.`{jsonl_file_path}`
    )
""")

# Deliberately NOT named `count`: that would shadow the `count` function pulled
# in by `from pyspark.sql.functions import *` for every later cell.
variant_row_count = spark.sql("SELECT COUNT(*) FROM users_variant").collect()[0][0]
print(f"✅ Loaded {variant_row_count} records into users_variant using PARSE_JSON")

# Show sample data
print("\nSample data from users_variant:")
spark.sql("SELECT * FROM users_variant LIMIT 5").show(truncate=False)


HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

✅ Loaded 1000 records into users_variant using PARSE_JSON

Sample data from users_variant:


HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+--------------+----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
# Method 2: Load data with extracted fields for optimized queries
# No column list is given, so the SELECT must supply every table column in
# declaration order, including ingestion_timestamp. subscription_tier comes out
# NULL for records that have no `subscription` object.
# NOTE: this is an append (INSERT INTO); re-running the cell without re-running
# the DROP/CREATE cell adds the rows again.
spark.sql(f"""
    INSERT INTO users_extracted 
    SELECT 
        get_json_object(value, '$.user_id') as user_id,
        get_json_object(value, '$.username') as username,
        get_json_object(value, '$.email') as email,
        get_json_object(value, '$.name') as name,
        CAST(get_json_object(value, '$.age') AS INT) as age,
        get_json_object(value, '$.address.city') as city,
        get_json_object(value, '$.address.country') as country,
        get_json_object(value, '$.subscription.tier') as subscription_tier,
        PARSE_JSON(value) as user_data,
        CURRENT_TIMESTAMP() as ingestion_timestamp
    FROM (
        SELECT value 
        FROM text.`{jsonl_file_path}`
    )
""")

# Deliberately NOT named `count`: that would shadow the `count` function pulled
# in by `from pyspark.sql.functions import *` for every later cell.
extracted_row_count = spark.sql("SELECT COUNT(*) FROM users_extracted").collect()[0][0]
print(f"✅ Loaded {extracted_row_count} records into users_extracted with extracted fields")

# Show sample
print("\nSample data from users_extracted (with extracted fields):")
spark.sql("SELECT user_id, name, age, city, subscription_tier FROM users_extracted LIMIT 5").show(truncate=False)


HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

✅ Loaded 1000 records into users_extracted with extracted fields

Sample data from users_extracted (with extracted fields):


HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+-------------+---+--------------+-----------------+
|user_id                             |name         |age|city          |subscription_tier|
+------------------------------------+-------------+---+--------------+-----------------+
|3517c88f-ccdd-4085-b4c2-ddaec1b81062|David Hudson |18 |West Jamesbury|NULL             |
|59676281-f3fb-4464-a503-5c4b20ced876|Kevin Dean   |24 |Campbellmouth |NULL             |
|6a0d1f3e-787b-4fff-aeb9-aaabb258dcdb|Sandra Lester|23 |West Phillip  |NULL             |
|76e70ec6-f201-4510-9de5-2c61af2af97b|Diana Pratt  |38 |Thompsontown  |NULL             |
|d4bcbf2c-d0ac-4ac2-9c9f-b18d263e894f|Matthew Brown|27 |Wrightshire   |NULL             |
+------------------------------------+-------------+---+--------------+-----------------+



## 4. Querying VARIANT Data - Basic Access Patterns

Let's explore different ways to access and query data stored in VARIANT columns.


In [8]:
# Path access with the `:` operator, chained to reach nested objects.
# No cast is applied here, so the selected values stay VARIANT
# (string values are displayed with their JSON quotes).
print("=== Accessing Nested Fields ===\n")

nested_fields_sql = """
    SELECT 
        user_id,
        user_data:name as name,
        user_data:age as age,
        user_data:profile:bio as bio,
        user_data:profile:occupation as occupation,
        user_data:address:city as city,
        user_data:address:coordinates:latitude as latitude,
        user_data:address:coordinates:longitude as longitude
    FROM users_variant
    LIMIT 5
"""
nested_fields_df = spark.sql(nested_fields_sql)
nested_fields_df.show(truncate=False)


=== Accessing Nested Fields ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+---------------+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------+----------------+-----------+-----------+
|user_id                             |name           |age|bio                                                                                                                                                                              |occupation                             |city            |latitude   |longitude  |
+------------------------------------+---------------+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------+----------------+-----------+-----------+
|3517c88f-ccdd-4085-b4c2-ddaec1b81062|"David H

In [9]:
# Index into an array stored inside the VARIANT with [n]; an index beyond the
# end of the array comes back as NULL. SIZE needs a typed array, hence the CAST.
print("=== Accessing Array Elements in VARIANT ===\n")

array_elements_sql = """
    SELECT 
        user_id,
        username,
        user_data:profile:interests[0] as first_interest,
        user_data:profile:interests[1] as second_interest,
        SIZE(CAST(user_data:profile:interests AS ARRAY<STRING>)) as total_interests,
        user_data:profile:skills as all_skills
    FROM users_variant
    WHERE user_data:profile:interests IS NOT NULL
    LIMIT 5
"""
spark.sql(array_elements_sql).show(truncate=False)

# One output row per array element: cast the VARIANT array to ARRAY<STRING>
# and flatten it with LATERAL VIEW EXPLODE.
print("\n=== Exploding Arrays from VARIANT ===\n")

explode_interests_sql = """
    SELECT 
        user_id,
        username,
        interest
    FROM users_variant
    LATERAL VIEW EXPLODE(CAST(user_data:profile:interests AS ARRAY<STRING>)) AS interest
    LIMIT 10
"""
spark.sql(explode_interests_sql).show(truncate=False)


=== Accessing Array Elements in VARIANT ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+--------------+--------------+---------------+---------------+-----------------------------------------------------------------------------------+
|user_id                             |username      |first_interest|second_interest|total_interests|all_skills                                                                         |
+------------------------------------+--------------+--------------+---------------+---------------+-----------------------------------------------------------------------------------+
|3517c88f-ccdd-4085-b4c2-ddaec1b81062|thomaslong    |"report"      |NULL           |1              |["Systems analyst","Bookseller","Aeronautical engineer"]                           |
|59676281-f3fb-4464-a503-5c4b20ced876|brendanelson  |"local"       |"reflect"      |2              |["Engineer, communications"]                                                       |
|6a0d1f3e-787b-4fff-aeb9-aaabb258dcdb|hernandezpaige|"should"      |"art"  

HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+--------------+----------+
|user_id                             |username      |interest  |
+------------------------------------+--------------+----------+
|3517c88f-ccdd-4085-b4c2-ddaec1b81062|thomaslong    |report    |
|59676281-f3fb-4464-a503-5c4b20ced876|brendanelson  |local     |
|59676281-f3fb-4464-a503-5c4b20ced876|brendanelson  |reflect   |
|6a0d1f3e-787b-4fff-aeb9-aaabb258dcdb|hernandezpaige|should    |
|6a0d1f3e-787b-4fff-aeb9-aaabb258dcdb|hernandezpaige|art       |
|6a0d1f3e-787b-4fff-aeb9-aaabb258dcdb|hernandezpaige|hot       |
|6a0d1f3e-787b-4fff-aeb9-aaabb258dcdb|hernandezpaige|generation|
|6a0d1f3e-787b-4fff-aeb9-aaabb258dcdb|hernandezpaige|morning   |
|76e70ec6-f201-4510-9de5-2c61af2af97b|brownpatrick  |impact    |
|76e70ec6-f201-4510-9de5-2c61af2af97b|brownpatrick  |or        |
+------------------------------------+--------------+----------+



## 5. Type Casting and Conversion with VARIANT

VARIANT columns can be cast to specific types when needed.


In [10]:
# CAST turns a VARIANT field into a concrete SQL type. Fields that are absent
# from a record (e.g. metrics on older users) simply yield NULL.
print("=== Type Casting VARIANT Fields ===\n")

cast_sql = """
    SELECT 
        user_id,
        user_data:age as age_variant,
        CAST(user_data:age AS INT) as age_int,
        CAST(user_data:created_at AS TIMESTAMP) as created_timestamp,
        CAST(user_data:preferences:newsletter AS BOOLEAN) as newsletter_bool,
        CAST(user_data:metrics:login_count AS INT) as login_count_int
    FROM users_variant
    LIMIT 5
"""
spark.sql(cast_sql).show()

# TRY_CAST is the tolerant form: a value that cannot be converted to the
# target type produces NULL instead of failing the query.
print("\n=== Safe Casting with TRY_CAST ===\n")

try_cast_sql = """
    SELECT 
        user_id,
        TRY_CAST(user_data:age AS INT) as age,
        TRY_CAST(user_data:subscription:start_date AS DATE) as subscription_date,
        TRY_CAST(user_data:metrics:posts_created AS INT) as posts_count,
        TRY_CAST(user_data:phone AS STRING) as phone_number,
        TRY_CAST(user_data:last_login AS TIMESTAMP) as last_login_time
    FROM users_variant
    LIMIT 5
"""
spark.sql(try_cast_sql).show()


=== Type Casting VARIANT Fields ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+--------------------+-----------+-------+--------------------+---------------+---------------+
|             user_id|age_variant|age_int|   created_timestamp|newsletter_bool|login_count_int|
+--------------------+-----------+-------+--------------------+---------------+---------------+
|3517c88f-ccdd-408...|         18|     18|2023-09-15 23:49:...|           true|           NULL|
|59676281-f3fb-446...|         24|     24|2025-06-18 13:08:...|           true|           NULL|
|6a0d1f3e-787b-4ff...|         23|     23|2022-12-27 12:48:...|          false|           NULL|
|76e70ec6-f201-451...|         38|     38|2024-03-07 14:08:...|           true|           NULL|
|d4bcbf2c-d0ac-4ac...|         27|     27|2022-04-28 22:59:...|           true|           NULL|
+--------------------+-----------+-------+--------------------+---------------+---------------+


=== Safe Casting with TRY_CAST ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+--------------------+---+-----------------+-----------+------------+--------------------+
|             user_id|age|subscription_date|posts_count|phone_number|     last_login_time|
+--------------------+---+-----------------+-----------+------------+--------------------+
|3517c88f-ccdd-408...| 18|             NULL|       NULL|        NULL|2025-08-26 20:40:...|
|59676281-f3fb-446...| 24|             NULL|       NULL|        NULL|2025-09-13 19:43:...|
|6a0d1f3e-787b-4ff...| 23|             NULL|       NULL|        NULL|2025-09-17 07:39:...|
|76e70ec6-f201-451...| 38|             NULL|       NULL|        NULL|2025-08-19 19:29:...|
|d4bcbf2c-d0ac-4ac...| 27|             NULL|       NULL|        NULL|2025-08-31 09:03:...|
+--------------------+---+-----------------+-----------+------------+--------------------+



## 6. VARIANT Functions and Operations

Databricks provides several functions specifically for working with VARIANT data.


In [11]:
# VARIANT_GET(variant, path, type): extract the value at a JSON path and return
# it as the requested type. A path that does not exist in the record yields
# NULL (see the `missing_field` column).
print("=== Using VARIANT_GET Function ===\n")

variant_get_sql = """
    SELECT 
        user_id,
        VARIANT_GET(user_data, '$.name', 'STRING') as name,
        VARIANT_GET(user_data, '$.age', 'INT') as age,
        VARIANT_GET(user_data, '$.profile.interests', 'ARRAY<STRING>') as interests,
        VARIANT_GET(user_data, '$.non_existent_field', 'STRING') as missing_field
    FROM users_variant
    LIMIT 3
"""
variant_get_df = spark.sql(variant_get_sql)
variant_get_df.show(truncate=False)


=== Using VARIANT_GET Function ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+-------------+---+---------------------------------------+-------------+
|user_id                             |name         |age|interests                              |missing_field|
+------------------------------------+-------------+---+---------------------------------------+-------------+
|3517c88f-ccdd-4085-b4c2-ddaec1b81062|David Hudson |18 |[report]                               |NULL         |
|59676281-f3fb-4464-a503-5c4b20ced876|Kevin Dean   |24 |[local, reflect]                       |NULL         |
|6a0d1f3e-787b-4fff-aeb9-aaabb258dcdb|Sandra Lester|23 |[should, art, hot, generation, morning]|NULL         |
+------------------------------------+-------------+---+---------------------------------------+-------------+



In [12]:
# TO_JSON and PARSE_JSON: serialize a VARIANT to a JSON string and parse a
# JSON string back into a VARIANT (round trip shown on the address sub-object).
print("=== Converting between VARIANT and JSON strings ===\n")

json_round_trip_sql = """
    SELECT 
        user_id,
        TO_JSON(user_data) as json_string,
        LENGTH(TO_JSON(user_data)) as json_length,
        PARSE_JSON(TO_JSON(user_data:address)) as address_variant
    FROM users_variant
    LIMIT 2
"""
spark.sql(json_round_trip_sql).show(truncate=False)

# SCHEMA_OF_VARIANT reports the schema of each individual value; grouping by it
# lists the distinct record shapes present in the table (at most 5 shown).
print("\n=== Discovering Schema of VARIANT Data ===\n")

distinct_schemas_sql = """
    SELECT 
        SCHEMA_OF_VARIANT(user_data) as inferred_schema
    FROM users_variant
    GROUP BY SCHEMA_OF_VARIANT(user_data)
    LIMIT 5
"""
spark.sql(distinct_schemas_sql).show(truncate=False)


=== Converting between VARIANT and JSON strings ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+----------------

HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|inferred_schema                                                                                                                                                                                                                                                                                                                       

## 7. Filtering and Conditional Logic with VARIANT

Let's explore how to filter and apply conditions on VARIANT data.


In [13]:
# Predicates on VARIANT fields: cast to a concrete type for comparisons,
# and use IS NOT NULL to require that a field is present.
print("=== Filtering on VARIANT Fields ===\n")

simple_filter_sql = """
    SELECT 
        user_id,
        user_data:name as name,
        user_data:age as age,
        user_data:address:country as country
    FROM users_variant
    WHERE CAST(user_data:age AS INT) > 30
        AND CAST(user_data:preferences:newsletter AS BOOLEAN) = true
        AND user_data:address:country IS NOT NULL
    LIMIT 5
"""
spark.sql(simple_filter_sql).show()

# Several nested conditions combined: only records that carry both the
# subscription and metrics objects qualify; results are ordered by login count.
print("\n=== Complex Filtering with Nested VARIANT Fields ===\n")

nested_filter_sql = """
    SELECT 
        user_id,
        user_data:name as name,
        user_data:age as age,
        CAST(user_data:subscription:tier AS STRING) as subscription_tier,
        user_data:metrics:login_count as login_count
    FROM users_variant
    WHERE user_data:subscription:tier IS NOT NULL
        AND user_data:metrics:login_count IS NOT NULL
        AND CAST(user_data:age AS INT) BETWEEN 25 AND 60
        AND CAST(user_data:metrics:login_count AS INT) > 10
    ORDER BY CAST(user_data:metrics:login_count AS INT) DESC
    LIMIT 5
"""
spark.sql(nested_filter_sql).show()


=== Filtering on VARIANT Fields ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+--------------------+------------------+---+------------+
|             user_id|              name|age|     country|
+--------------------+------------------+---+------------+
|76e70ec6-f201-451...|     "Diana Pratt"| 38|   "Germany"|
|c7bfa6fd-fdc6-44b...|   "Mike Valencia"| 79|  "Mongolia"|
|439e284c-b2f8-4be...|  "Julian Jenkins"| 59|"Montenegro"|
|9cbb5c54-147c-481...|    "Angela Marsh"| 57| "Singapore"|
|e86bc742-de06-4b2...|"Michelle English"| 38|   "Bahamas"|
+--------------------+------------------+---+------------+


=== Complex Filtering with Nested VARIANT Fields ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+--------------------+-----------------+---+-----------------+-----------+
|             user_id|             name|age|subscription_tier|login_count|
+--------------------+-----------------+---+-----------------+-----------+
|17e18134-80c3-4cb...| "Daniel Goodwin"| 46|            basic|        996|
|2bfd2540-ec82-4a3...|"Margaret Harris"| 44|       enterprise|        996|
|f196bbe1-68d9-4f9...|   "Tyler Dillon"| 46|             free|        994|
|19fcd2aa-b042-408...| "Sabrina Moreno"| 34|       enterprise|        988|
|4a9edbcc-5c4a-463...|  "Anthony Bates"| 40|       enterprise|        982|
+--------------------+-----------------+---+-----------------+-----------+



In [14]:
# CASE expressions driven by values cast out of the VARIANT column:
# one bucket for age, one label for the subscription tier.
case_banner = "=== CASE Statements with VARIANT ===\n"
print(case_banner)

case_query = """
    SELECT 
        user_id,
        user_data:name as name,
        user_data:age as age,
        CAST(user_data:subscription:tier AS STRING) as subscription_tier,
        CASE 
            WHEN CAST(user_data:age AS INT) < 25 THEN 'Young'
            WHEN CAST(user_data:age AS INT) BETWEEN 25 AND 50 THEN 'Adult'
            WHEN CAST(user_data:age AS INT) > 50 THEN 'Senior'
            ELSE 'Unknown'
        END as age_group,
        CASE 
            WHEN CAST(user_data:subscription:tier AS STRING) = 'free' THEN 'Free User'
            WHEN CAST(user_data:subscription:tier AS STRING) = 'basic' THEN 'Basic User'
            WHEN CAST(user_data:subscription:tier AS STRING) = 'premium' THEN 'Premium User'
            WHEN CAST(user_data:subscription:tier AS STRING) = 'enterprise' THEN 'Enterprise User'
            ELSE 'No Subscription'
        END as user_category
    FROM users_variant
    LIMIT 10
"""
categorized_users_df = spark.sql(case_query)
# Show full column values rather than the default truncated display.
categorized_users_df.show(truncate=False)


=== CASE Statements with VARIANT ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+-------------------+---+-----------------+---------+---------------+
|user_id                             |name               |age|subscription_tier|age_group|user_category  |
+------------------------------------+-------------------+---+-----------------+---------+---------------+
|3517c88f-ccdd-4085-b4c2-ddaec1b81062|"David Hudson"     |18 |NULL             |Young    |No Subscription|
|59676281-f3fb-4464-a503-5c4b20ced876|"Kevin Dean"       |24 |NULL             |Young    |No Subscription|
|6a0d1f3e-787b-4fff-aeb9-aaabb258dcdb|"Sandra Lester"    |23 |NULL             |Young    |No Subscription|
|76e70ec6-f201-4510-9de5-2c61af2af97b|"Diana Pratt"      |38 |NULL             |Adult    |No Subscription|
|d4bcbf2c-d0ac-4ac2-9c9f-b18d263e894f|"Matthew Brown"    |27 |NULL             |Adult    |No Subscription|
|c7bfa6fd-fdc6-44b3-8b65-f2f07f7e7cc2|"Mike Valencia"    |79 |NULL             |Senior   |No Subscription|
|104841c8-816b-47dd-900c-c29f5d1c2773

## 8. Aggregations with VARIANT Data

Performing aggregations on data stored in VARIANT columns.


In [15]:
# Aggregate values cast out of the VARIANT column: first over every user,
# then grouped by subscription tier (users without a tier are excluded).
overall_stats_sql = """
    SELECT 
        COUNT(*) as user_count,
        AVG(CAST(user_data:age AS INT)) as avg_age,
        MIN(CAST(user_data:age AS INT)) as min_age,
        MAX(CAST(user_data:age AS INT)) as max_age
    FROM users_variant
"""

tier_stats_sql = """
    SELECT 
        CAST(user_data:subscription:tier AS STRING) as subscription_tier,
        COUNT(*) as user_count,
        AVG(CAST(user_data:age AS INT)) as avg_age,
        AVG(CAST(user_data:metrics:login_count AS INT)) as avg_login_count,
        AVG(CAST(user_data:metrics:posts_created AS INT)) as avg_posts
    FROM users_variant
    WHERE user_data:subscription:tier IS NOT NULL
    GROUP BY CAST(user_data:subscription:tier AS STRING)
    ORDER BY user_count DESC
"""

# Each section prints its banner and then displays the query result.
for banner, statement in (
    ("=== Aggregations on VARIANT Data ===\n", overall_stats_sql),
    ("\n=== Subscription Tier Aggregations ===\n", tier_stats_sql),
):
    print(banner)
    spark.sql(statement).show()


=== Aggregations on VARIANT Data ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+----------+-------+-------+-------+
|user_count|avg_age|min_age|max_age|
+----------+-------+-------+-------+
|      1000| 48.526|     18|     80|
+----------+-------+-------+-------+


=== Subscription Tier Aggregations ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+-----------------+----------+------------------+-----------------+------------------+
|subscription_tier|user_count|           avg_age|  avg_login_count|         avg_posts|
+-----------------+----------+------------------+-----------------+------------------+
|          premium|       108|49.361111111111114|532.5370370370371| 237.4537037037037|
|            basic|       101|45.495049504950494|522.4554455445544|243.63366336633663|
|             free|        99| 47.97979797979798|456.7676767676768|260.42424242424244|
|       enterprise|        91|46.967032967032964|555.3516483516484| 258.7802197802198|
+-----------------+----------+------------------+-----------------+------------------+



In [16]:
# Aggregations over array-valued fields: each array is cast to ARRAY<STRING>
# so SIZE() can count its elements.
print("=== Aggregating Array Data in VARIANT ===\n")

interest_stats_sql = """
    SELECT 
        AVG(SIZE(CAST(user_data:profile:interests AS ARRAY<STRING>))) as avg_interests_per_user,
        MAX(SIZE(CAST(user_data:profile:interests AS ARRAY<STRING>))) as max_interests,
        MIN(SIZE(CAST(user_data:profile:interests AS ARRAY<STRING>))) as min_interests
    FROM users_variant
    WHERE user_data:profile:interests IS NOT NULL
"""
interest_stats_df = spark.sql(interest_stats_sql)
interest_stats_df.show()

# Same idea for skills, with the per-user count computed in a CTE first.
print("\n=== Skills Aggregation ===\n")

skill_stats_sql = """
    WITH user_skills AS (
        SELECT 
            user_id,
            user_data:name as name,
            SIZE(CAST(user_data:profile:skills AS ARRAY<STRING>)) as skill_count,
            user_data:profile:skills as skills
        FROM users_variant
        WHERE user_data:profile:skills IS NOT NULL
    )
    SELECT 
        COUNT(*) as users_with_skills,
        AVG(skill_count) as avg_skills_per_user,
        MAX(skill_count) as max_skills,
        MIN(skill_count) as min_skills
    FROM user_skills
"""
skill_stats_df = spark.sql(skill_stats_sql)
skill_stats_df.show()


=== Aggregating Array Data in VARIANT ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+----------------------+-------------+-------------+
|avg_interests_per_user|max_interests|min_interests|
+----------------------+-------------+-------------+
|                  2.99|            5|            1|
+----------------------+-------------+-------------+


=== Skills Aggregation ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+-----------------+-------------------+----------+----------+
|users_with_skills|avg_skills_per_user|max_skills|min_skills|
+-----------------+-------------------+----------+----------+
|             1000|              1.527|         3|         0|
+-----------------+-------------------+----------+----------+



## 9. Window Functions with VARIANT

Using window functions to analyze VARIANT data.


In [17]:
# Window functions over values cast out of the VARIANT column.

# Per-country age ranking, a global login ranking, and the per-country average age.
country_ranking_sql = """
    SELECT 
        user_id,
        user_data:name as name,
        user_data:age as age,
        CAST(user_data:address:country AS STRING) as country,
        CAST(user_data:metrics:login_count AS INT) as login_count,
        RANK() OVER (PARTITION BY CAST(user_data:address:country AS STRING) ORDER BY CAST(user_data:age AS INT) DESC) as age_rank_in_country,
        DENSE_RANK() OVER (ORDER BY CAST(user_data:metrics:login_count AS INT) DESC) as login_rank,
        AVG(CAST(user_data:age AS INT)) OVER (PARTITION BY CAST(user_data:address:country AS STRING)) as country_avg_age
    FROM users_variant
    WHERE user_data:metrics:login_count IS NOT NULL
        AND user_data:address:country IS NOT NULL
    ORDER BY country, age_rank_in_country
    LIMIT 15
"""

# Three-row moving average of logins and the running maximum, ordered by creation time.
cumulative_stats_sql = """
    WITH user_activity AS (
        SELECT 
            user_id,
            user_data:name as name,
            CAST(user_data:created_at AS TIMESTAMP) as created_time,
            CAST(user_data:metrics:login_count AS INT) as login_count,
            CAST(user_data:metrics:posts_created AS INT) as posts_created
        FROM users_variant
        WHERE user_data:metrics:login_count IS NOT NULL
            AND user_data:created_at IS NOT NULL
    )
    SELECT 
        user_id,
        name,
        created_time,
        login_count,
        posts_created,
        AVG(login_count) OVER (
            ORDER BY created_time 
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) as moving_avg_logins,
        MAX(login_count) OVER (
            ORDER BY created_time
        ) as max_logins_so_far
    FROM user_activity
    ORDER BY created_time
    LIMIT 10
"""

# Print each banner, then display its result without truncating column values.
for banner, statement in (
    ("=== Window Functions on VARIANT Data ===\n", country_ranking_sql),
    ("\n=== Cumulative Statistics with VARIANT ===\n", cumulative_stats_sql),
):
    print(banner)
    spark.sql(statement).show(truncate=False)


=== Window Functions on VARIANT Data ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+-------------------+---+--------------+-----------+-------------------+----------+------------------+
|user_id                             |name               |age|country       |login_count|age_rank_in_country|login_rank|country_avg_age   |
+------------------------------------+-------------------+---+--------------+-----------+-------------------+----------+------------------+
|80c267df-127a-4ecf-acc4-b01ac30cb228|"David Edwards"    |61 |Albania       |926        |1                  |26        |42.0              |
|03934429-e1e0-46cd-88a4-0c9942d22c68|"Brittany Park"    |53 |Albania       |29         |2                  |322       |42.0              |
|302b9797-6576-4587-9627-c80b8f157d90|"Christina Olsen"  |43 |Albania       |328        |3                  |234       |42.0              |
|3bad45a6-6f9c-4bf7-9a7c-84ab71200bcf|"Jamie Thompson"   |33 |Albania       |116        |4                  |293       |42.0              |
|e44d385b-1156-4f55-

HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+--------------------+--------------------------+-----------+-------------+-----------------+-----------------+
|user_id                             |name                |created_time              |login_count|posts_created|moving_avg_logins|max_logins_so_far|
+------------------------------------+--------------------+--------------------------+-----------+-------------+-----------------+-----------------+
|de4d199b-d5f6-4c2c-b847-be3a384021c4|"Amy Barrett"       |2020-09-20 11:01:09.461979|310        |491          |310.0            |310              |
|3f75a1c3-ef6d-4dae-bd4f-f417c6906b78|"Matthew Williams"  |2020-09-22 04:28:35.17108 |808        |299          |559.0            |808              |
|92d052ab-db4c-4af1-aac4-73e1ed8ee296|"Olivia Barber"     |2020-09-26 18:36:44.866098|177        |14           |431.6666666666667|808              |
|f265fea2-2725-40f9-b749-9dfc0e5c1234|"Heather Jones"     |2020-10-15 03:55:42.5799  |487        |96      

## 10. JOINs with VARIANT Data

Demonstrating how to join tables containing VARIANT columns.


In [18]:
# Self-join: pair up distinct users who share the same city. The
# `u1.user_id < u2.user_id` condition keeps each pair once and drops self-pairs.
print("=== Self-Join on VARIANT Data ===\n")

same_city_pairs_sql = """
    SELECT 
        u1.user_id as user1_id,
        u1.user_data:name as user1_name,
        CAST(u1.user_data:address:city AS STRING) as user1_city,
        u2.user_id as user2_id,
        u2.user_data:name as user2_name,
        CAST(u2.user_data:address:city AS STRING) as user2_city
    FROM users_variant u1
    JOIN users_variant u2
        ON CAST(u1.user_data:address:city AS STRING) = CAST(u2.user_data:address:city AS STRING)
        AND u1.user_id < u2.user_id
    WHERE u1.user_data:address:city IS NOT NULL
    LIMIT 10
"""
same_city_pairs_df = spark.sql(same_city_pairs_sql)
same_city_pairs_df.show(truncate=False)

# Join the VARIANT table to the pre-extracted table on user_id, showing the
# VARIANT-derived and extracted values side by side.
print("\n=== Join with Extracted Table for Performance ===\n")

variant_vs_extracted_sql = """
    WITH user_analysis AS (
        SELECT 
            v.user_id,
            v.user_data:name as name,
            v.user_data:age as age,
            CAST(v.user_data:address:city AS STRING) as city,
            CAST(v.user_data:subscription:tier AS STRING) as subscription_tier,
            e.name as extracted_name,
            e.age as extracted_age,
            e.city as extracted_city
        FROM users_variant v
        LEFT JOIN users_extracted e
            ON v.user_id = e.user_id
        WHERE v.user_data:subscription:tier IS NOT NULL
    )
    SELECT * FROM user_analysis
    WHERE name IS NOT NULL
    LIMIT 5
"""
variant_vs_extracted_df = spark.sql(variant_vs_extracted_sql)
variant_vs_extracted_df.show(truncate=False)


=== Self-Join on VARIANT Data ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+------------------+-----------------+------------------------------------+-----------------+-----------------+
|user1_id                            |user1_name        |user1_city       |user2_id                            |user2_name       |user2_city       |
+------------------------------------+------------------+-----------------+------------------------------------+-----------------+-----------------+
|612db342-6f94-4dd1-9724-d8bb6301cca5|"Anthony Huffman" |Mitchellmouth    |7a8b57ec-2735-4a23-beca-9e1cf1829369|"Bruce Mosley"   |Mitchellmouth    |
|7dedd795-f9b3-49c7-bab5-4b007080f776|"Amanda Marshall" |South Brian      |dfe4aae3-3394-4431-ac0e-c5847faf7b14|"Kyle Day"       |South Brian      |
|36742531-a6d1-4118-870c-a310d93e9e3e|"William Hardy"   |Port James       |70d5bc49-19b1-4eb4-a975-b053260749b7|"Nicholas Haley" |Port James       |
|2dc4aad9-4f39-40d8-9a8c-58387c063038|"Vincent Stafford"|Port Melissa     |b411dafe-028e-4a53-b003-0e5f6af

HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+-----------------+---+-----------------+-----------------+---------------+-------------+-----------------+
|user_id                             |name             |age|city             |subscription_tier|extracted_name |extracted_age|extracted_city   |
+------------------------------------+-----------------+---+-----------------+-----------------+---------------+-------------+-----------------+
|da424b78-b197-4296-a2c8-6042f78ef358|"Joshua Lee"     |61 |South Matthewport|enterprise       |Joshua Lee     |61           |South Matthewport|
|f665d0fe-5ff9-438c-b230-2f2495d1960e|"Nathan Thompson"|75 |Bakerbury        |premium          |Nathan Thompson|75           |Bakerbury        |
|ca40f72a-8fc0-4b10-83c8-b5d6ae326906|"Anna Lopez"     |31 |Port Kevin       |premium          |Anna Lopez     |31           |Port Kevin       |
|66bd3b80-5334-41c1-bf06-9c1f6a08c6ac|"Jon Owens"      |77 |Juanstad         |premium          |Jon Owens      |77           |Juan

## 11. Advanced VARIANT Use Cases

Let's explore some advanced use cases and patterns.


In [19]:
# Schema evolution: one VARIANT column holds records written with different
# sets of fields, and a single query can read all of them.
print("=== Demonstrating Schema Evolution with VARIANT ===\n")

# Describe the schema versions the sample data is said to contain.
schema_summary_lines = (
    "Schema Evolution in our user data:",
    "- First 30% of users: Basic schema",
    "- Next 30% of users: Added social_media fields",
    "- Last 40% of users: Added subscription and metrics fields\n",
)
for summary_line in schema_summary_lines:
    print(summary_line)

# Sample three rows by position (offsets 100, 500 and 900) and project fields
# from every schema version; fields a record does not have come back as NULL.
# NOTE(review): the subqueries use LIMIT/OFFSET without ORDER BY, so which rows
# are picked depends on scan order — confirm this reliably yields one row per
# schema version.
schema_versions_sql = """
    SELECT 
        user_id,
        user_data:name as name,
        user_data:email as email,
        user_data:phone as phone,  -- Optional field
        user_data:social_media:twitter as twitter,  -- Added in v2
        user_data:subscription:tier as subscription_tier,  -- Added in v3
        user_data:metrics:login_count as login_count  -- Added in v3
    FROM users_variant
    WHERE user_id IN (
        (SELECT user_id FROM users_variant LIMIT 1 OFFSET 100)
        UNION ALL
        (SELECT user_id FROM users_variant LIMIT 1 OFFSET 500)
        UNION ALL
        (SELECT user_id FROM users_variant LIMIT 1 OFFSET 900)
    )
"""
schema_versions_df = spark.sql(schema_versions_sql)
schema_versions_df.show(truncate=False)

print("\n✅ VARIANT seamlessly handles schema evolution - old and new fields coexist!")


=== Demonstrating Schema Evolution with VARIANT ===

Schema Evolution in our user data:
- First 30% of users: Basic schema
- Next 30% of users: Added social_media fields
- Last 40% of users: Added subscription and metrics fields



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+------------------------------------+-------------------+-------------------------+--------------+----------------+-----------------+-----------+
|user_id                             |name               |email                    |phone         |twitter         |subscription_tier|login_count|
+------------------------------------+-------------------+-------------------------+--------------+----------------+-----------------+-----------+
|424a5cad-531e-4730-857d-f7f9c945fb24|"Matthew Quinn"    |"jasmineking@example.net"|NULL          |NULL            |NULL             |NULL       |
|a682f442-b29e-4a78-8f3f-da3c52a876ae|"Cristina Berry MD"|"felicia39@example.com"  |NULL          |"@melissacrosby"|NULL             |NULL       |
|bc9c5934-c811-4f72-b574-8fa9f9838f27|"Rebecca Mckinney" |"donald01@example.net"   |"477.434.3033"|"@deanarmstrong"|"free"           |958        |
+------------------------------------+-------------------+-------------------------+--------------+----------------+--

In [20]:
# Pivot: count users per (country, subscription tier) and spread the tiers
# into columns, one row per country.
print("=== Pivoting VARIANT Data ===\n")

tier_pivot_sql = """
    WITH subscription_counts AS (
        SELECT 
            CAST(user_data:address:country AS STRING) as country,
            CAST(user_data:subscription:tier AS STRING) as subscription_tier,
            COUNT(*) as user_count
        FROM users_variant
        WHERE user_data:subscription:tier IS NOT NULL
            AND user_data:address:country IS NOT NULL
        GROUP BY country, subscription_tier
    )
    SELECT * FROM subscription_counts
    PIVOT (
        SUM(user_count)
        FOR subscription_tier IN ('free', 'basic', 'premium', 'enterprise')
    )
    ORDER BY country
    LIMIT 10
"""
tier_pivot_df = spark.sql(tier_pivot_sql)
tier_pivot_df.show()

# Expose the VARIANT data through a typed TEMPORARY VIEW (a session-scoped
# view, not a materialized one) so later queries can use plain columns.
print("\n=== Creating Structured View from VARIANT ===\n")

user_details_view_sql = """
    CREATE OR REPLACE TEMPORARY VIEW user_details AS
    SELECT 
        user_id,
        CAST(user_data:name AS STRING) as name,
        CAST(user_data:email AS STRING) as email,
        CAST(user_data:age AS INT) as age,
        CAST(user_data:address:city AS STRING) as city,
        CAST(user_data:address:country AS STRING) as country,
        CAST(user_data:preferences:newsletter AS BOOLEAN) as newsletter_opt_in,
        CAST(user_data:created_at AS TIMESTAMP) as created_date,
        SIZE(CAST(user_data:profile:interests AS ARRAY<STRING>)) as interest_count
    FROM users_variant
"""
spark.sql(user_details_view_sql)

print("✅ Created structured view from VARIANT data")

# Preview the first rows of the new view.
user_details_preview_sql = "SELECT * FROM user_details LIMIT 5"
spark.sql(user_details_preview_sql).show()


=== Pivoting VARIANT Data ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+-------------------+----+-----+-------+----------+
|            country|free|basic|premium|enterprise|
+-------------------+----+-----+-------+----------+
|            Albania|   3|    1|      1|      NULL|
|            Algeria|NULL| NULL|      1|         1|
|     American Samoa|   2|    1|      1|      NULL|
|            Andorra|NULL|    1|      1|         1|
|           Anguilla|NULL|    1|      1|      NULL|
|Antigua and Barbuda|NULL| NULL|      2|      NULL|
|          Argentina|   2| NULL|   NULL|         2|
|            Armenia|NULL|    1|      3|      NULL|
|              Aruba|NULL| NULL|      1|      NULL|
|            Austria|NULL| NULL|   NULL|         1|
+-------------------+----+-----+-------+----------+


=== Creating Structured View from VARIANT ===

✅ Created structured view from VARIANT data


HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+--------------------+-------------+--------------------+---+--------------+--------------------+-----------------+--------------------+--------------+
|             user_id|         name|               email|age|          city|             country|newsletter_opt_in|        created_date|interest_count|
+--------------------+-------------+--------------------+---+--------------+--------------------+-----------------+--------------------+--------------+
|3517c88f-ccdd-408...| David Hudson|mooreelizabeth@ex...| 18|West Jamesbury|             Belgium|             true|2023-09-15 23:49:...|             1|
|59676281-f3fb-446...|   Kevin Dean|woodsgabrielle@ex...| 24| Campbellmouth|British Indian Oc...|             true|2025-06-18 13:08:...|             2|
|6a0d1f3e-787b-4ff...|Sandra Lester|danielhowell@exam...| 23|  West Phillip|              Canada|            false|2022-12-27 12:48:...|             5|
|76e70ec6-f201-451...|  Diana Pratt|sullivanjason@exa...| 38|  Thompsontown|            

## 12. Performance Optimization with VARIANT

Best practices for optimizing queries on VARIANT data.


In [21]:
# Performance tip: extract frequently accessed VARIANT fields into typed columns
print("=== Optimizing with Extracted Columns ===\n")

# Create a Delta table that keeps the raw VARIANT next to typed copies of the
# most-used fields. This is a plain string on purpose: nothing is interpolated,
# and an f-prefix would make any literal `{`/`}` later added to the SQL a
# formatting error.
spark.sql("""
    CREATE OR REPLACE TABLE variant_optimized
    USING DELTA
    TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported')
    AS
    SELECT 
        user_id,
        username,
        email,
        user_data,
        -- Extract frequently accessed fields as separate columns
        CAST(user_data:name AS STRING) as extracted_name,
        CAST(user_data:age AS INT) as extracted_age,
        CAST(user_data:created_at AS TIMESTAMP) as extracted_timestamp,
        ingestion_timestamp
    FROM users_variant
""")

print("✅ Created optimized table with extracted columns")

# Compare query performance (conceptual — nothing is timed here)
print("\n📊 Query Performance Comparison:")
print("Original: Scanning full VARIANT for every query")
print("Optimized: Can use extracted columns for filtering and joins")

# Show the optimized table structure
spark.sql("DESCRIBE TABLE variant_optimized").show()

# Example of filtering on an extracted column while still reading a field
# from the VARIANT in the projection
print("\n=== Efficient Filtering with Extracted Columns ===\n")

spark.sql("""
    SELECT 
        extracted_name,
        username,
        user_data:email as email
    FROM variant_optimized
    WHERE extracted_name LIKE 'J%'  -- Uses extracted column for efficient filtering
    LIMIT 5
""").show()


=== Optimizing with Extracted Columns ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

✅ Created optimized table with extracted columns

📊 Query Performance Comparison:
Original: Scanning full VARIANT for every query
Optimized: Can use extracted columns for filtering and joins
+-------------------+---------+-------+
|           col_name|data_type|comment|
+-------------------+---------+-------+
|            user_id|   string|   NULL|
|           username|   string|   NULL|
|              email|   string|   NULL|
|          user_data|  variant|   NULL|
|     extracted_name|   string|   NULL|
|      extracted_age|      int|   NULL|
|extracted_timestamp|timestamp|   NULL|
|ingestion_timestamp|timestamp|   NULL|
+-------------------+---------+-------+


=== Efficient Filtering with Extracted Columns ===



HBox(children=(IntProgress(value=0, bar_style='success'), Label(value='')))

+-----------------+--------------+--------------------+
|   extracted_name|      username|               email|
+-----------------+--------------+--------------------+
|James Christensen|       tmiller|"larry93@example....|
|  Jennifer Carter|brittanywilson|"zachary63@exampl...|
|   Julian Jenkins|     raymond68|"lisatrujillo@exa...|
| Jeffrey Williams|      gerald70|"judithmann@examp...|
|  Jacob Schneider|      victor14|"natasha71@exampl...|
+-----------------+--------------+--------------------+



## 13. Cleanup Script

This section provides a comprehensive cleanup script to remove all resources created during this exploration.


In [22]:
# Cleanup Script - Removes all resources created during VARIANT exploration

def cleanup():
    """Remove every resource created during the VARIANT exploration.

    Drops the demo tables and the ``user_details`` view, deletes the generated
    files under the configured volume's ``users`` folder, then switches the
    session to the ``main`` catalog and ``default`` schema.

    Deleting the volume files is best-effort: an ordinary error (including
    ``dbutils`` not being defined in this session) is reported and cleanup
    continues. ``KeyboardInterrupt`` and ``SystemExit`` are not swallowed, so
    the run can still be interrupted during the delete.
    """
    print("🧹 Cleaning up VARIANT exploration resources...")

    # Drop tables
    for table in ('users_variant', 'users_extracted', 'variant_optimized'):
        spark.sql(f"DROP TABLE IF EXISTS {table}")
        print(f"✅ Dropped table: {table}")

    # Drop views
    spark.sql("DROP VIEW IF EXISTS user_details")
    print("✅ Dropped view: user_details")

    # Clean volume files
    users_folder = f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}/users"
    try:
        dbutils.fs.rm(users_folder, recurse=True)
        print("✅ Cleaned volume files")
    except Exception as exc:
        # Catch only ordinary errors and report the real cause, rather than
        # assuming the files are missing.
        print(f"⚠️  Volume files were not removed (they may not exist): {exc}")

    # Reset to default catalog/schema
    # NOTE(review): assumes this workspace has a `main` catalog with a
    # `default` schema — confirm.
    spark.sql("USE CATALOG main")
    spark.sql("USE SCHEMA default")
    print("✅ Reset to main catalog, default schema")

    print("\n🎉 Cleanup complete!")

# Run cleanup now. This is destructive: it drops the demo tables and view and
# deletes the generated files from the volume.
cleanup()


🧹 Cleaning up VARIANT exploration resources...
✅ Dropped table: users_variant
✅ Dropped table: users_extracted
✅ Dropped table: variant_optimized
✅ Dropped view: user_details
✅ Cleaned volume files
✅ Reset to main catalog, default schema

🎉 Cleanup complete!


In [23]:
# Optional: Individual cleanup functions

# def drop_tables_only():
#     """Drop only the tables"""
#     tables = ['users_variant', 'users_extracted', 'variant_optimized']
#     for table in tables:
#         spark.sql(f"DROP TABLE IF EXISTS {table}")
#         print(f"✅ Dropped {table}")

# def clean_volume_only():
#     """Clean only the volume files"""
#     dbutils.fs.rm(f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}/users", recurse=True)
#     print("✅ Cleaned volume files")

# Usage:
# cleanup()           # Full cleanup (already run above)
# drop_tables_only()  # Drop only tables
# clean_volume_only() # Clean only volume files
