In [1]:
import pydeequ
from pyspark.sql import SparkSession

# Spark session settings, grouped by concern.
spark_configs = {
    # Cluster to connect to
    'spark.master': 'spark://spark-iceberg:7077',

    # Iceberg catalog named "airline" (REST catalog, S3FileIO against MinIO)
    'spark.sql.catalog.airline': 'org.apache.iceberg.spark.SparkCatalog',
    'spark.sql.catalog.airline.io-impl': 'org.apache.iceberg.aws.s3.S3FileIO',
    'spark.sql.catalog.airline.s3.endpoint': 'http://minio:9000',
    'spark.sql.catalog.airline.type': 'rest',
    'spark.sql.catalog.airline.uri': 'http://rest:8181',
    'spark.sql.catalog.airline.warehouse': 's3://warehouse',
    'spark.sql.defaultCatalog': 'airline',

    # Driver / executor memory
    'spark.driver.memory': '2G',
    'spark.executor.memory': '2G',

    # Deequ jar coordinates exposed by the pydeequ package
    "spark.jars.packages": pydeequ.deequ_maven_coord,
    "spark.jars.excludes": pydeequ.f2j_maven_coord,
}

# Build the session step by step (or reuse the active one).
# NOTE(review): when a session already exists, Spark warns that only runtime
# SQL configurations take effect, so restart the kernel after editing the
# settings above.
session_builder = SparkSession.builder.appName('Deequ Experiment')
session_builder = session_builder.config(map=spark_configs)
spark = session_builder.getOrCreate()

# Load the flights table from the Iceberg catalog.
# Caching is currently disabled; uncomment the next line to enable it.
df = spark.table('airline.db.flights')
# df.cache()

24/12/31 04:24:55 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [9]:
from pyspark.sql import Window
from pyspark.sql.functions import count, rank

# Window over the composite key that is expected to identify one flight.
# It is not used by the active code in this cell; it is kept for window-based
# duplicate checks (e.g. count('*').over(w)).
w = Window.partitionBy('date', 'airline', 'flight_number', 'scheduled_departure')

# Find composite keys that occur more than once: group by the key, count the
# rows per group and keep only groups with more than one row.
grouped_by_key = df.groupBy(['date', 'airline', 'flight_number', 'scheduled_departure'])
key_counts = grouped_by_key.count()
temp = key_counts.filter('count > 1')

# Show the physical plan first, then the duplicated keys themselves.
temp.explain()
temp.show()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (count#1301L > 1)
   +- HashAggregate(keys=[date#1166, airline#1167, flight_number#1168, scheduled_departure#1172], functions=[count(1)])
      +- Exchange hashpartitioning(date#1166, airline#1167, flight_number#1168, scheduled_departure#1172, 200), ENSURE_REQUIREMENTS, [plan_id=67]
         +- HashAggregate(keys=[date#1166, airline#1167, flight_number#1168, scheduled_departure#1172], functions=[partial_count(1)])
            +- BatchScan airline.db.flights[date#1166, airline#1167, flight_number#1168, scheduled_departure#1172] airline.db.flights (branch=null) [filters=, groupedBy=] RuntimeFilters: []




24/12/31 04:20:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:20:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:20:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:20:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:20:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:20:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:20:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:20:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:20:32 WARN RowBasedKeyValueBatch: Calling spill() on

+----------+-------+-------------+-------------------+-----+
|      date|airline|flight_number|scheduled_departure|count|
+----------+-------+-------------+-------------------+-----+
|2015-10-04|     UA|          707|               2101|    2|
|2015-08-29|     AA|          803|               1435|    2|
|2015-03-06|     AA|         1103|                600|    2|
|2015-10-16|     EV|         5660|                704|    2|
+----------+-------+-------------+-------------------+-----+



In [2]:
# Inspect the full rows behind one of the duplicated keys reported above
# (2015-10-04 / UA / 707). scheduled_departure is not constrained here, so
# every departure of that flight on that date is shown.
duplicate_key_predicate = ' AND '.join([
    'date = "2015-10-04"',
    'airline = "UA"',
    'flight_number = "707"',
])
df.where(duplicate_key_predicate).show()

                                                                                

+----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+----------+
|      date|airline|flight_number|tail_number|origin_airport|destination_airport|scheduled_departure|departure_time|departure_delay|taxi_out|wheels_off|scheduled_time|elapsed_time|air_time|distance|wheels_on|taxi_in|scheduled_arrival|arrival_time|arrival_delay|diverted|cancelled|cancellation_reason|air_system_delay|security_delay|airline_delay|late_aircraft_delay|weather_delay|is_delayed|
+----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+

## Run profiler on data

In [6]:
from pydeequ.profiles import *

# Profile every column of the flights table.
# ColumnProfilerRunner.onData returns a ColumnProfilerRunBuilder; run()
# executes the profiling job.
profiler_builder = ColumnProfilerRunner(spark).onData(df)
result = profiler_builder.run()

# Print the profile computed for each column (the column name is already
# part of the printed profile, so only the value is used).
for _column, column_profile in result.profiles.items():
    print(column_profile)

                                                                                

Unable to map type DateType


                                                                                

NumericProfiles for column: wheels_on: {
    "completeness": 0.9841017796802553,
    "approximateNumDistinctValues": 1496,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": null,
    "kll": "None",
    "mean": 1471.4686087613413,
    "maximum": 2400.0,
    "minimum": 1.0,
    "sum": 8426462105.0,
    "stdDev": 522.1878993765856,
    "approxPercentiles": []
}
NumericProfiles for column: departure_delay: {
    "completeness": 0.9851947361429532,
    "approximateNumDistinctValues": 1198,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": null,
    "kll": "None",
    "mean": 9.370158275198389,
    "maximum": 1988.0,
    "minimum": -82.0,
    "sum": 53718424.0,
    "stdDev": 37.08093926275402,
    "approxPercentiles": []
}
StandardProfiles for column: origin_airport: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 593,
    "dataType": "String",
    "isDataTypeInferred": false,
 

## Run constraint suggestion on data

In [14]:
from pydeequ.suggestions import ConstraintSuggestionRunner, DEFAULT

# Ask Deequ to suggest constraints for the flights table using the DEFAULT
# rule set.
suggestionResult = (
    ConstraintSuggestionRunner(spark)
    .onData(df)
    .addConstraintRule(DEFAULT())
    .run()
)

# Print each suggestion as a description plus the check code that
# implements it, separated by a blank line.
for constraint in suggestionResult['constraint_suggestions']:
    description_line = f'Constraint suggestion for {constraint["description"]}'
    code_line = f'Code suggestion is \"{constraint["code_for_constraint"]}\"'
    print(description_line)
    print(code_line)
    print()

                                                                                

Unable to map type DateType


[Stage 65:>                                                       (0 + 12) / 12]

Constraint suggestion for 'wheels_on' has no negative values
Code suggestion is ".isNonNegative("wheels_on")"

Constraint suggestion for 'wheels_on' has less than 2% missing values
Code suggestion is ".hasCompleteness("wheels_on", lambda x: x >= 0.98, "It should be above 0.98!")"

Constraint suggestion for 'departure_delay' has less than 2% missing values
Code suggestion is ".hasCompleteness("departure_delay", lambda x: x >= 0.98, "It should be above 0.98!")"

Constraint suggestion for 'origin_airport' is not null
Code suggestion is ".isComplete("origin_airport")"

Constraint suggestion for 'taxi_in' has no negative values
Code suggestion is ".isNonNegative("taxi_in")"

Constraint suggestion for 'taxi_in' has less than 2% missing values
Code suggestion is ".hasCompleteness("taxi_in", lambda x: x >= 0.98, "It should be above 0.98!")"

Constraint suggestion for 'weather_delay' has no negative values
Code suggestion is ".isNonNegative("weather_delay")"

Constraint suggestion for 'taxi_out

                                                                                

In [15]:
# Print only the suggested check code, one snippet per suggestion, each
# followed by a blank line so the snippets are easy to copy.
for constraint in suggestionResult['constraint_suggestions']:
    print(constraint["code_for_constraint"], end='\n\n')

.isNonNegative("wheels_on")

.hasCompleteness("wheels_on", lambda x: x >= 0.98, "It should be above 0.98!")

.hasCompleteness("departure_delay", lambda x: x >= 0.98, "It should be above 0.98!")

.isComplete("origin_airport")

.isNonNegative("taxi_in")

.hasCompleteness("taxi_in", lambda x: x >= 0.98, "It should be above 0.98!")

.isNonNegative("weather_delay")

.isNonNegative("taxi_out")

.hasCompleteness("taxi_out", lambda x: x >= 0.98, "It should be above 0.98!")

.isContainedIn("diverted", ["0", "1"])

.isComplete("diverted")

.isContainedIn("diverted", ["0"], lambda x: x >= 0.99, "It should be above 0.99!")

.isNonNegative("diverted")

.isNonNegative("departure_time")

.hasCompleteness("departure_time", lambda x: x >= 0.98, "It should be above 0.98!")

.isNonNegative("arrival_time")

.hasCompleteness("arrival_time", lambda x: x >= 0.98, "It should be above 0.98!")

.isNonNegative("air_system_delay")

.isComplete("scheduled_arrival")

.isNonNegative("scheduled_arrival")

.isContaine

## Write checks and run verification

In [2]:
from pydeequ.checks import *
from pydeequ.verification import *

# Completeness Checks: each listed column must contain no missing values.
completeness_check = Check(spark, CheckLevel.Error, "Completeness Checks")
for required_column in (
    "date",
    "airline",
    "flight_number",
    "origin_airport",
    "destination_airport",
    "scheduled_departure",
    "scheduled_arrival",
    "distance",
    "is_delayed",
    "cancelled",
    "diverted",
):
    completeness_check = completeness_check.isComplete(required_column)

# Non-Negative Checks: each listed column must contain no negative values.
non_negative_check = Check(spark, CheckLevel.Error, "Non-Negative Checks")
for non_negative_column in (
    "flight_number",
    "scheduled_departure",
    "departure_time",
    "taxi_out",
    "wheels_off",
    "scheduled_time",
    "elapsed_time",
    "air_time",
    "distance",
    "wheels_on",
    "taxi_in",
    "scheduled_arrival",
    "arrival_time",
    "air_system_delay",
    "security_delay",
    "airline_delay",
    "late_aircraft_delay",
    "weather_delay",
):
    non_negative_check = non_negative_check.isNonNegative(non_negative_column)

# Containment Checks: each column may only hold values from its allowed list.
containment_check = Check(spark, CheckLevel.Error, "Containment Checks")
for contained_column, allowed_values in (
    ("airline", ["WN", "DL", "AA", "OO", "EV", "UA", "MQ", "B6", "US", "AS", "NK", "F9", "HA", "VX"]),
    ("is_delayed", ["0", "1"]),
    ("diverted", ["0", "1"]),
    ("cancelled", ["0", "1"]),
    ("cancellation_reason", ["A", "B", "C", "D"]),
):
    containment_check = containment_check.isContainedIn(contained_column, allowed_values)

# Extra constraint on cancellation_reason with a custom assertion: the value
# passed to the lambda must be at least 0.98.
# NOTE(review): presumably that value is the fraction of rows satisfying the
# containment constraint; confirm against the Deequ documentation.
containment_check = containment_check.isContainedIn(
    "cancellation_reason", [""], lambda x: x >= 0.98
)


Python Callback server started!


In [11]:
# Run the completeness, non-negative and containment checks against the
# flights table in a single verification suite.
verification_builder = VerificationSuite(spark).onData(df)
for data_check in (completeness_check, non_negative_check, containment_check):
    verification_builder = verification_builder.addCheck(data_check)
checkResult = verification_builder.run()

# Overall status of the run.
print(f"Verification Run Status: {checkResult.status}")

# Per-constraint results as a Spark DataFrame; show up to 100 rows without
# truncating the long constraint descriptions.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show(100, truncate=False)

                                                                                

Verification Run Status: Success
+-------------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
|check              |check_level|check_status|constraint                                                                                                                                                                                                                        |constraint_status|constraint_message|
+-------------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
|Completeness Checks|Error      |S



In [6]:
# Uniqueness check for the composite key
# (date, airline, flight_number, scheduled_departure): the uniqueness metric
# passed to the lambda must be exactly 1.0.
# NOTE(review): the check description is an empty string, so the "check"
# column of the results table is blank. Consider giving it a name.
unique_check = Check(spark, CheckLevel.Error, '').hasUniqueness(
    ['date', 'airline', 'flight_number', 'scheduled_departure'],
    lambda x: x == 1.0,
)

# Run the check on its own; the results are inspected in a separate cell.
uniqueResult = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(unique_check)
    .run()
)

24/12/31 04:28:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:28:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:28:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:28:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:28:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:28:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:28:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:28:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/31 04:28:07 WARN RowBasedKeyValueBatch: Calling spill() on

In [7]:
# Overall status of the composite-key uniqueness verification run.
print(f"Verification Run Status: {uniqueResult.status}")

# Per-constraint results as a Spark DataFrame, shown without truncation so
# the failure message is readable.
# NOTE(review): this reuses the name checkResult_df, which the earlier
# verification cell also assigns. After this cell runs it refers to the
# uniqueness results only.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, uniqueResult)
checkResult_df.show(truncate=False)

Verification Run Status: Error
+-----+-----------+------------+-----------------------------------------------------------+-----------------+-------------------------------------------------------------------+
|check|check_level|check_status|constraint                                                 |constraint_status|constraint_message                                                 |
+-----+-----------+------------+-----------------------------------------------------------+-----------------+-------------------------------------------------------------------+
|     |Error      |Error       |UniquenessConstraint(Uniqueness(Stream(date, ?),None,None))|Failure          |Value: 0.9999986252119966 does not meet the constraint requirement!|
+-----+-----------+------------+-----------------------------------------------------------+-----------------+-------------------------------------------------------------------+



In [None]:
from 