# Data Quality Checks with PySpark
This notebook demonstrates basic data quality checks on a Parquet dataset using PySpark: schema validation (expected column names), null-value detection per column, and detection of exact duplicate rows.

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when

In [8]:
# Start the Spark session used by every check below; getOrCreate() reuses
# an already-active session instead of creating a second one.
spark = (
    SparkSession.builder
    .appName("Data Quality Checks")
    .getOrCreate()
)

In [9]:
# Load the dataset under inspection from its Parquet file
# (the path is relative to the notebook's working directory).
df = spark.read.parquet(
    "./../datasets/etl_input.parquet"
)

In [10]:
# Schema validation
# Compares column NAMES only, ignoring column order; data types are not checked.
expected_schema = ["id", "name", "age", "department", "salary"]

# Compute both directions of the difference so a failure says exactly what is wrong
# instead of only reporting that "something" differs.
missing_columns = sorted(set(expected_schema) - set(df.columns))
unexpected_columns = sorted(set(df.columns) - set(expected_schema))

# Passes exactly when the two column sets are equal (both differences are empty).
assert not missing_columns and not unexpected_columns, (
    "Schema mismatch detected! "
    f"Missing columns: {missing_columns}; unexpected columns: {unexpected_columns}"
)

In [11]:
# Null values check
# For each column, when() yields a non-null marker (the column's name) on rows where
# the column is null and null everywhere else; count() skips nulls, so each output
# cell is the number of null entries in that column.
df.select(
    *(
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    )
).show()

+---+----+---+----------+------+
| id|name|age|department|salary|
+---+----+---+----------+------+
|  0|   0|  0|         0|     0|
+---+----+---+----------+------+



In [12]:
# Duplicate check
# Two rows are duplicates when they agree on every column: group by all columns
# and keep only the groups that occur more than once.
duplicates = df.groupBy(df.columns).count().filter(col("count") > 1)

# Both count() and show() below are Spark actions; without caching, each one would
# re-run the full group-by aggregation over the input.
duplicates.cache()
duplicate_group_count = duplicates.count()

if duplicate_group_count > 0:
    print("Duplicates found!")
    duplicates.show()
else:
    # Report success explicitly so a clean result is distinguishable from a cell
    # that was never executed.
    print("No duplicates found.")

# Release the cached result; `duplicates` stays usable and is recomputed on demand.
duplicates.unpersist()

In [13]:
# Shut down the Spark session now that all checks have run.
spark.stop()