# Databricks Notebook: Data Profiling

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, stddev, count, percentile_approx

In [7]:
# Initialize Spark Session
# getOrCreate() hands back an already-active session when one exists, so
# re-running this cell does not create a second session.
spark = (
    SparkSession.builder
    .appName("Data Profiling")
    .getOrCreate()
)

In [8]:
# Load Dataset
# Reads the ETL input as Parquet into the DataFrame profiled by the cells below.
# NOTE(review): the path is relative, so where it resolves depends on the
# environment the notebook runs in — confirm before moving the notebook.
df = spark.read.format("parquet").load("./../datasets/etl_input.parquet")

In [9]:
# Summary Statistics
# describe() builds a small DataFrame of count / mean / stddev / min / max
# per column; show() prints it as a text table.
summary_stats = df.describe()
summary_stats.show()

+-------+---+-------+------------------+----------+-------+
|summary| id|   name|               age|department| salary|
+-------+---+-------+------------------+----------+-------+
|  count|  3|      3|                 3|         3|      3|
|   mean|2.0|   NULL|              28.0|      NULL|55000.0|
| stddev|1.0|   NULL|2.6457513110645907|      NULL| 5000.0|
|    min|  1|  Alice|                25|   Finance|  50000|
|    max|  3|Charlie|                30|        IT|  60000|
+-------+---+-------+------------------+----------+-------+



In [10]:
# IQR Outlier Detection
# Both quartiles are requested in ONE approxQuantile call: each call launches
# its own Spark job, and relativeError=0 asks for exact quantiles (the most
# expensive setting), so asking twice scans the data twice for no benefit.
quartiles = df.approxQuantile("salary", [0.25, 0.75], 0)

# approxQuantile ignores nulls and returns an empty list when the column has
# no non-null values; fail with a clear message instead of an index error.
if len(quartiles) != 2:
    raise ValueError(
        "Cannot compute salary quartiles: 'salary' has no non-null values"
    )

q1, q3 = quartiles
# Interquartile range: spread of the middle 50% of salaries.
iqr = q3 - q1

In [11]:
# Keep only rows whose salary lies outside the 1.5 * IQR fences around [q1, q3].
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
salary = col("salary")
outliers = df.filter((salary < lower_fence) | (salary > upper_fence))

# count() triggers a Spark job; show() then evaluates the filter again to
# print the offending rows. Nothing is printed when no row is flagged.
if outliers.count() > 0:
    print("Outliers detected!")
    outliers.show()

In [12]:
# Stop Spark Session
# Shuts the session down; `df` and `outliers` can no longer be evaluated after
# this call, so this must stay the last cell of the notebook.
# NOTE(review): the title marks this as a Databricks notebook — on a managed
# cluster the session is typically shared, so confirm stopping it is intended.
spark.stop()