In [0]:
%pip install databricks-labs-dqx --quiet
# NOTE(review): the install above carries no version specifier, so each run
# resolves to whatever release is current; consider pinning a version so the
# notebook's results stay reproducible.
# Restart the Python process after the install; presumably so the newly
# installed package is importable by the cells that follow.
dbutils.library.restartPython()

In [0]:
from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.engine import DQEngine
from databricks.labs.dqx.config import WorkspaceFileChecksStorageConfig
from databricks.sdk import WorkspaceClient
from pyspark.sql import SparkSession
import json
import time

# Initialize the shared handles used by the rest of the notebook.
# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# Workspace API client constructed with default arguments.
ws = WorkspaceClient()
# DQX profiler bound to the workspace client above.
profiler = DQProfiler(ws)

In [0]:
# Unity Catalog Volume directory that holds the IMDB TSV files
BASE_PATH = "/Volumes/workspace/imdb_data_analysis/datastore"

# Paths of all 7 IMDB files, keyed by dataset name. Each file name is the
# dataset key with "_" swapped for "." plus a ".tsv" suffix
# (e.g. "title_basics" -> "title.basics.tsv").
file_paths = {
    dataset: f"{BASE_PATH}/{dataset.replace('_', '.')}.tsv"
    for dataset in (
        "title_basics",
        "title_ratings",
        "name_basics",
        "title_principals",
        "title_crew",
        "title_episode",
        "title_akas",
    )
}

In [0]:
def load_imdb_tsv(path, infer_schema=False):
    r"""Read one IMDB TSV file into a Spark DataFrame.

    The reader treats the first line as the header, splits fields on tabs,
    sets both the quote and the escape character to the empty string, and
    maps the literal ``\N`` marker to null.

    Args:
        path: Location of the TSV file to read.
        infer_schema: When True, ask Spark to infer column types (this costs
            an extra pass over the file). The default, False, leaves every
            column as a string because no schema is supplied to the reader.

    Returns:
        The DataFrame produced by the CSV reader for ``path``.
    """
    return (
        spark.read
        .option("header", "true")
        .option("sep", "\t")
        .option("quote", "")
        .option("escape", "")
        .option("nullValue", "\\N")
        .option("inferSchema", "true" if infer_schema else "false")
        .csv(path)
    )


# Load every IMDB file with the same reader settings, keyed by dataset name
# for easier iteration.
# NOTE(review): all columns are loaded as strings here; the numeric-looking
# columns named later as outlier columns (e.g. runtimeMinutes, numVotes)
# presumably need a cast or infer_schema=True before numeric profiling
# applies to them - confirm against the DQX profiler behaviour.
datasets = {name: load_imdb_tsv(path) for name, path in file_paths.items()}

# Individual handles, kept for cells that refer to a dataset directly
df_title_basics = datasets["title_basics"]
df_title_ratings = datasets["title_ratings"]
df_name_basics = datasets["name_basics"]
df_title_principals = datasets["title_principals"]
df_title_crew = datasets["title_crew"]
df_title_episode = datasets["title_episode"]
df_title_akas = datasets["title_akas"]

In [0]:
def _dataset_config(columns, *, max_in_count, distinct_ratio, max_null_ratio,
                    max_empty_ratio, trim_strings=True, outlier_columns=None):
    """Build the profiling config entry for one dataset.

    Every entry shares sample_fraction=None, limit=None, round=True and
    num_sigmas=3. "remove_outliers" is True exactly when ``outlier_columns``
    is given, and the "outlier_columns" key is only present in that case.

    Returns:
        A dict with an "options" dict and the "columns" list.
    """
    options = {
        "sample_fraction": None,
        "limit": None,
        "round": True,
        "max_in_count": max_in_count,
        "distinct_ratio": distinct_ratio,
        "max_null_ratio": max_null_ratio,
        "remove_outliers": outlier_columns is not None,
    }
    if outlier_columns is not None:
        options["outlier_columns"] = outlier_columns
    options["num_sigmas"] = 3
    options["trim_strings"] = trim_strings
    options["max_empty_ratio"] = max_empty_ratio
    return {"options": options, "columns": columns}


# Profiling configurations for each dataset (full data, no sampling)
profiling_configs = {
    "title_basics": _dataset_config(
        ["tconst", "titleType", "primaryTitle", "isAdult", "startYear", "endYear", "runtimeMinutes", "genres"],
        max_in_count=20,
        distinct_ratio=0.001,
        max_null_ratio=0.15,
        max_empty_ratio=0.01,
        outlier_columns=["runtimeMinutes", "startYear"],
    ),
    "title_ratings": _dataset_config(
        ["tconst", "averageRating", "numVotes"],
        max_in_count=100,
        distinct_ratio=0.01,
        max_null_ratio=0.0,
        max_empty_ratio=0.0,
        trim_strings=False,
        outlier_columns=["numVotes"],
    ),
    "name_basics": _dataset_config(
        ["nconst", "primaryName", "birthYear", "deathYear", "primaryProfession", "knownForTitles"],
        max_in_count=50,
        distinct_ratio=0.001,
        max_null_ratio=0.25,
        max_empty_ratio=0.01,
        outlier_columns=["birthYear", "deathYear"],
    ),
    "title_principals": _dataset_config(
        ["tconst", "ordering", "nconst", "category", "job", "characters"],
        max_in_count=15,
        distinct_ratio=0.0001,
        max_null_ratio=0.5,
        max_empty_ratio=0.3,
        outlier_columns=["ordering"],
    ),
    "title_crew": _dataset_config(
        ["tconst", "directors", "writers"],
        max_in_count=20,
        distinct_ratio=0.01,
        max_null_ratio=0.4,
        max_empty_ratio=0.35,
    ),
    "title_episode": _dataset_config(
        ["tconst", "parentTconst", "seasonNumber", "episodeNumber"],
        max_in_count=30,
        distinct_ratio=0.001,
        max_null_ratio=0.3,
        max_empty_ratio=0.25,
        outlier_columns=["seasonNumber", "episodeNumber"],
    ),
    "title_akas": _dataset_config(
        ["titleId", "ordering", "title", "region", "language", "types", "attributes", "isOriginalTitle"],
        max_in_count=100,
        distinct_ratio=0.0001,
        max_null_ratio=0.5,
        max_empty_ratio=0.4,
    ),
}

In [0]:
# Profile all datasets with DQX, collecting results keyed by dataset name
all_profiles = {}
all_summary_stats = {}
profiling_times = {}

for dataset_name, df in datasets.items():
    config = profiling_configs[dataset_name]

    # Restrict profiling to the columns configured for this dataset
    df_filtered = df.select(config["columns"])

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()

    # Profile dataset
    summary_stats, profiles = profiler.profile(
        df=df_filtered,
        options=config["options"]
    )

    elapsed_time = time.perf_counter() - start_time

    # Store results
    all_summary_stats[dataset_name] = summary_stats
    all_profiles[dataset_name] = profiles
    profiling_times[dataset_name] = elapsed_time

    print(f"{dataset_name}: {len(profiles)} rules in {elapsed_time:.2f}s")

In [0]:
# Display the generated profiles and summary statistics for each dataset
for dataset_name, profiles in all_profiles.items():
    print(f"\n{'='*100}")
    print(f"{dataset_name.upper()} - {len(profiles)} Rules")
    print(f"{'='*100}")

    # Number the profiles starting from 1 for readability
    for i, profile in enumerate(profiles, 1):
        print(f"{i}. {profile}")

    print("\nSummary Stats:")
    # default=str keeps the dump from raising TypeError if the stats hold
    # values json cannot encode natively (e.g. Decimal or date objects);
    # values that already serialise are printed exactly as before.
    print(json.dumps(all_summary_stats[dataset_name], indent=2, default=str))