In [1]:
## Optional
# Format output of Jupyter Notebook
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Hide python warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import lit, isnull, when, count, col, regexp_extract, concat_ws, to_date, expr, quarter, when, date_add, year, month, day, dayofweek, broadcast, avg, min, max, like

# Define spark session config
spark_configs = {
    'spark.master': 'spark://spark-iceberg:7077',
    'spark.sql.catalog.prod': 'org.apache.iceberg.spark.SparkCatalog',
    'spark.sql.catalog.prod.io-impl': 'org.apache.iceberg.aws.s3.S3FileIO',
    'spark.sql.catalog.prod.s3.endpoint': 'http://minio:9000',
    'spark.sql.catalog.prod.type': 'rest',
    'spark.sql.catalog.prod.uri': 'http://rest:8181',
    'spark.sql.catalog.prod.warehouse': 's3://warehouse',
    'spark.sql.defaultCatalog': 'prod',
    'spark.driver.memory': '1G',
    'spark.executor.memory': '1G'
}

# Initialize SparkSession
spark = (
    SparkSession
    .builder
    .appName('Dedup testing')
    .config(map=spark_configs)
    .getOrCreate()
)

25/01/04 17:46:45 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
def do_raw_flights_transformation(spark: SparkSession, input_df: DataFrame) -> DataFrame:
    # Add date column and remove other date-related columns
    flights_df = input_df \
        .withColumn('date', to_date(concat_ws('-', 'year', 'month', 'day'))) \
        .drop('year', 'month', 'day', 'day_of_week')

    # Add is_delayed column (when scheduled_departure > 0)
    flights_df = flights_df \
        .withColumn(
            'is_delayed',
            when(col('departure_delay') > 0, lit(1)).otherwise(lit(0))
        )

    # Rearrange date to be first column,
    flights_df = flights_df.select('date', *[col(c) for c in flights_df.columns if c != 'date'])

    # Sort by date and schedule_departure time
    flights_df = flights_df.sort(['date', 'scheduled_departure'])
    
    return flights_df

def remove_duplicate_flights(spark: SparkSession, input_df: DataFrame) -> DataFrame:
    """
    Remove duplicate flights based on unique columns in the input DataFrame.

    Args:
        spark: SparkSession object
        input_df: Input DataFrame
    
    Returns:
        DataFrame with duplicates removed
    """
    # Deduplicate over unique columns
    deduped_df = input_df.dropDuplicates(["date", "airline", "flight_number", "scheduled_departure"])
    return deduped_df

flights_schema = StructType([
    StructField("year", IntegerType(), False),   # Non-Nullable
    StructField("month", IntegerType(), False),  # Non-Nullable
    StructField("day", IntegerType(), False),    # Non-Nullable
    StructField("day_of_week", IntegerType(), False),    # Non-Nullable
    StructField("airline", StringType(), False),         # Non-Nullable
    StructField("flight_number", IntegerType(), False),  # Non-Nullable
    StructField("tail_number", StringType(), True),
    StructField("origin_airport", StringType(), False),  # Non-Nullable
    StructField("destination_airport", StringType(), False),   # Non-Nullable
    StructField("scheduled_departure", IntegerType(), False),  # Non-Nullable
    StructField("departure_time", IntegerType(), True),
    StructField("departure_delay", IntegerType(), True),
    StructField("taxi_out", IntegerType(), True),
    StructField("wheels_off", IntegerType(), True),
    StructField("scheduled_time", IntegerType(), True),
    StructField("elapsed_time", IntegerType(), True),
    StructField("air_time", IntegerType(), True),
    StructField("distance", IntegerType(), False),  # Non-Nullable
    StructField("wheels_on", IntegerType(), True),
    StructField("taxi_in", IntegerType(), True),
    StructField("scheduled_arrival", IntegerType(), False),  # Non-Nullable
    StructField("arrival_time", IntegerType(), True),
    StructField("arrival_delay", IntegerType(), True),
    StructField("diverted", IntegerType(), False),   # Non-Nullable
    StructField("cancelled", IntegerType(), False),  # Non-Nullable
    StructField("cancellation_reason", StringType(), True),
    StructField("air_system_delay", IntegerType(), True),
    StructField("security_delay", IntegerType(), True),
    StructField("airline_delay", IntegerType(), True),
    StructField("late_aircraft_delay", IntegerType(), True),
    StructField("weather_delay", IntegerType(), True)
])

filename = '/home/iceberg/data/flights.csv'

input_df = spark.read \
                .schema(flights_schema) \
                .option('header', True) \
                .csv(filename)

input_df = do_raw_flights_transformation(spark, input_df)
input_df = remove_duplicate_flights(spark, input_df)

input_df = input_df.filter(month(col('date')) == 1)

input_df.persist()

25/01/04 17:46:50 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DataFrame[date: date, airline: string, flight_number: int, tail_number: string, origin_airport: string, destination_airport: string, scheduled_departure: int, departure_time: int, departure_delay: int, taxi_out: int, wheels_off: int, scheduled_time: int, elapsed_time: int, air_time: int, distance: int, wheels_on: int, taxi_in: int, scheduled_arrival: int, arrival_time: int, arrival_delay: int, diverted: int, cancelled: int, cancellation_reason: string, air_system_delay: int, security_delay: int, airline_delay: int, late_aircraft_delay: int, weather_delay: int, is_delayed: int]

In [4]:
source_df = spark.table('prod.db.fact_flights')
source_df = source_df.filter(month(col('date')) == 1)

source_df.persist()

DataFrame[date: date, airline: string, flight_number: int, tail_number: string, origin_airport: string, destination_airport: string, scheduled_departure: int, departure_time: int, departure_delay: int, taxi_out: int, wheels_off: int, scheduled_time: int, elapsed_time: int, air_time: int, distance: int, wheels_on: int, taxi_in: int, scheduled_arrival: int, arrival_time: int, arrival_delay: int, diverted: int, cancelled: int, cancellation_reason: string, air_system_delay: int, security_delay: int, airline_delay: int, late_aircraft_delay: int, weather_delay: int, is_delayed: int]

In [5]:
input_df.show()
source_df.show()

                                                                                

+----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+----------+
|      date|airline|flight_number|tail_number|origin_airport|destination_airport|scheduled_departure|departure_time|departure_delay|taxi_out|wheels_off|scheduled_time|elapsed_time|air_time|distance|wheels_on|taxi_in|scheduled_arrival|arrival_time|arrival_delay|diverted|cancelled|cancellation_reason|air_system_delay|security_delay|airline_delay|late_aircraft_delay|weather_delay|is_delayed|
+----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+



+----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+----------+
|      date|airline|flight_number|tail_number|origin_airport|destination_airport|scheduled_departure|departure_time|departure_delay|taxi_out|wheels_off|scheduled_time|elapsed_time|air_time|distance|wheels_on|taxi_in|scheduled_arrival|arrival_time|arrival_delay|diverted|cancelled|cancellation_reason|air_system_delay|security_delay|airline_delay|late_aircraft_delay|weather_delay|is_delayed|
+----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+

                                                                                

In [7]:
# Generate null-safe join condition for all columns
join_condition = " AND ".join([f"input_df.{col} <=> source_df.{col}" for col in input_df.columns])
print(join_condition)

anti_join_df = input_df.alias('input_df').join(
    source_df.alias('source_df'),
    on=expr(join_condition),
    how='left_anti'
)
anti_join_df.show()
anti_join_df is None

input_df.date <=> source_df.date AND input_df.airline <=> source_df.airline AND input_df.flight_number <=> source_df.flight_number AND input_df.tail_number <=> source_df.tail_number AND input_df.origin_airport <=> source_df.origin_airport AND input_df.destination_airport <=> source_df.destination_airport AND input_df.scheduled_departure <=> source_df.scheduled_departure AND input_df.departure_time <=> source_df.departure_time AND input_df.departure_delay <=> source_df.departure_delay AND input_df.taxi_out <=> source_df.taxi_out AND input_df.wheels_off <=> source_df.wheels_off AND input_df.scheduled_time <=> source_df.scheduled_time AND input_df.elapsed_time <=> source_df.elapsed_time AND input_df.air_time <=> source_df.air_time AND input_df.distance <=> source_df.distance AND input_df.wheels_on <=> source_df.wheels_on AND input_df.taxi_in <=> source_df.taxi_in AND input_df.scheduled_arrival <=> source_df.scheduled_arrival AND input_df.arrival_time <=> source_df.arrival_time AND input_d



+----+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+----------+
|date|airline|flight_number|tail_number|origin_airport|destination_airport|scheduled_departure|departure_time|departure_delay|taxi_out|wheels_off|scheduled_time|elapsed_time|air_time|distance|wheels_on|taxi_in|scheduled_arrival|arrival_time|arrival_delay|diverted|cancelled|cancellation_reason|air_system_delay|security_delay|airline_delay|late_aircraft_delay|weather_delay|is_delayed|
+----+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+

                                                                                

False

In [21]:
df1 = input_df.filter('date="2015-01-01" AND airline="AA" AND flight_number="310"')
df2 = source_df.filter('date="2015-01-01" AND airline="AA" AND flight_number="310"')

df1.show()
df2.show()

import chispa
chispa.assert_df_equality(df1, df2, ignore_nullable=True)

                                                                                

+----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+----------+
|      date|airline|flight_number|tail_number|origin_airport|destination_airport|scheduled_departure|departure_time|departure_delay|taxi_out|wheels_off|scheduled_time|elapsed_time|air_time|distance|wheels_on|taxi_in|scheduled_arrival|arrival_time|arrival_delay|diverted|cancelled|cancellation_reason|air_system_delay|security_delay|airline_delay|late_aircraft_delay|weather_delay|is_delayed|
+----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+

                                                                                

In [27]:
# input_df.filter('date="2015-01-01" AND airline="AA" AND flight_number="310"').join(
#     source_df.filter('date="2015-01-01" AND airline="AA" AND flight_number="310"'),
#     on=input_df.filter('date="2015-01-01" AND airline="AA" AND flight_number="310"').columns,
#     how='left_anti'
# ).show()

# Generate null-safe join condition for all columns
join_condition = " AND ".join([f"df1.{col} <=> df2.{col}" for col in df1.columns])
print(join_condition)

df1.alias('df1').join(
    df2.alias('df2'),
    on=expr(join_condition),
    how='left_anti'
).show()

df1.id <=> df2.id AND df1.name <=> df2.name
+---+----+
| id|name|
+---+----+
+---+----+



In [25]:
# Create example DataFrames
df1 = spark.createDataFrame([(1, "Alice"), (2, "Bob"), (None, "Charlie")], ["id", "name"])
df2 = spark.createDataFrame([(1, "Alice"), (2, "Bob"), (None, "Charlie")], ["id", "name"])
# df2 = spark.createDataFrame([(2, "Bob"), (3, "Charlie")], ["id", "name"])

# Perform anti join on all columns
anti_join_df = df1.join(
    df2,
    on=df1.columns,
    how="left_anti"
)
anti_join_df.show()

+----+-------+
|  id|   name|
+----+-------+
|NULL|Charlie|
+----+-------+



In [26]:
from pyspark.sql.functions import col, expr

# Generate null-safe join condition for all columns
join_condition = " AND ".join([f"df1.{col} <=> df2.{col}" for col in df1.columns])

# Perform anti join with the null-safe condition
anti_join_df = df1.alias("df1").join(df2.alias("df2"), expr(join_condition), how="left_anti")
anti_join_df.show()

+---+----+
| id|name|
+---+----+
+---+----+



In [53]:
input_df.columns

str

In [10]:
input_df.printSchema()
source_df.printSchema()

root
 |-- date: date (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight_number: integer (nullable = true)
 |-- tail_number: string (nullable = true)
 |-- origin_airport: string (nullable = true)
 |-- destination_airport: string (nullable = true)
 |-- scheduled_departure: integer (nullable = true)
 |-- departure_time: integer (nullable = true)
 |-- departure_delay: integer (nullable = true)
 |-- taxi_out: integer (nullable = true)
 |-- wheels_off: integer (nullable = true)
 |-- scheduled_time: integer (nullable = true)
 |-- elapsed_time: integer (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- wheels_on: integer (nullable = true)
 |-- taxi_in: integer (nullable = true)
 |-- scheduled_arrival: integer (nullable = true)
 |-- arrival_time: integer (nullable = true)
 |-- arrival_delay: integer (nullable = true)
 |-- diverted: integer (nullable = true)
 |-- cancelled: integer (nullable = true)
 |-- cancellation_reason:

In [28]:
input_df.unpersist()
source_df.unpersist()

DataFrame[date: date, airline: string, flight_number: int, tail_number: string, origin_airport: string, destination_airport: string, scheduled_departure: int, departure_time: int, departure_delay: int, taxi_out: int, wheels_off: int, scheduled_time: int, elapsed_time: int, air_time: int, distance: int, wheels_on: int, taxi_in: int, scheduled_arrival: int, arrival_time: int, arrival_delay: int, diverted: int, cancelled: int, cancellation_reason: string, air_system_delay: int, security_delay: int, airline_delay: int, late_aircraft_delay: int, weather_delay: int, is_delayed: int]