In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
local = True

if local:
    spark = SparkSession.builder \
        .master("local[4]") \
        .appName("manual_data_validation_example")\
        .getOrCreate()
else:
    spark = SparkSession.builder\
        .master("k8s://https://kubernetes.default.svc:443")\
        .appName("manual_data_validation_example")\
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:py3.9.7-spark3.2.0")\
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])\
        .config("spark.executor.instances", "4")\
        .config("spark.executor.memory", "8g")\
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])\
        .getOrCreate()

22/02/22 11:17:28 WARN Utils: Your hostname, pliu-SATELLITE-P850 resolves to a loopback address: 127.0.1.1; using 172.22.0.33 instead (on interface wlp3s0)
22/02/22 11:17:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/02/22 11:17:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
data_file_path = "../data/adult_with_duplicates.csv"

In [4]:
df_raw = spark.read.csv(data_file_path, header=True, inferSchema=True)

In [22]:
df_raw.show(5,truncate=False)

+----+--------------+------+---------+-------------+--------------+------------+-------------+-----+----+------------+------------+--------------+--------------+------+
|age |workclass     |fnlwgt|education|education-num|marital-status|occupation  |relationship |race |sex |capital-gain|capital-loss|hours-per-week|native-country|income|
+----+--------------+------+---------+-------------+--------------+------------+-------------+-----+----+------------+------------+--------------+--------------+------+
|139 |State-gov     |77516 |Bachelors|13           |Never-married |Adm-clerical|Not-in-family|White|Male|2174        |0           |40            |United-States |<=50K |
|-12 |State-gov     |77516 |Bachelors|13           |Never-married |Adm-clerical|Not-in-family|White|Male|2174        |0           |40            |United-States |<=50K |
|null|emp-by-pengfei|77516 |Bachelors|13           |Never-married |Adm-clerical|Not-in-family|White|Male|2174        |0           |40            |United-St

# Validate age in range

Three validation rule:
1. Column Age must have value between 0 and 120
2. Column Age can't have null
3. Table can't have duplicate rows

## Validation Rule 1

In [8]:
df_age_range_anomaly = df_raw.filter((col("age") > 120) | (col("age") < 0))

In [10]:
anomaly_number = df_age_range_anomaly.count()
print(f"Anomaly number: {anomaly_number}")

Anomaly number: 4


In [23]:
df_age_range_anomaly.show(5, truncate=False)

+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|age|workclass       |fnlwgt|education|education-num|marital-status    |occupation     |relationship |race |sex   |capital-gain|capital-loss|hours-per-week|native-country|income|
+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|139|State-gov       |77516 |Bachelors|13           |Never-married     |Adm-clerical   |Not-in-family|White|Male  |2174        |0           |40            |United-States |<=50K |
|-12|State-gov       |77516 |Bachelors|13           |Never-married     |Adm-clerical   |Not-in-family|White|Male  |2174        |0           |40            |United-States |<=50K |
|152|Self-emp-not-inc|209642|HS-grad  |9            |Married-civ-spouse|Exec-managerial|Husband      |Whi

## Validation Rule 2

In [13]:
df_age_null = df_raw.filter(col("age").isNull())

In [14]:
df_age_null.count()

0

## Validation Rule 3

In [19]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import count

In [24]:

def check_duplicate_rows(df: DataFrame, indicator_columns=None) -> bool:
    """
    This function checks if a dataFrame contains duplicate rows
    :param indicator_columns: The group of columns that can indicate if two row is identical
    :param df: input data frame
    :return: If no duplicate is found, return True, else return False
    """
    if indicator_columns:
        indicator_cols=indicator_columns
    else:
        indicator_cols=df.columns
    df_indicate_duplicate = df.join(
        df.groupBy(indicator_cols).agg((count("*") > 1).cast("int").alias("Duplicate_indicator")), on=df.columns,
        how="left")
    df_duplicate=df_indicate_duplicate.filter(col("Duplicate_indicator") == 1)
    duplicate_row_num=df_duplicate.count()
    df_duplicate.show()
    if duplicate_row_num>1:
        return False
    else:
        return True

In [25]:
check_duplicate_rows(df_raw)

+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+-------------------+
|age|       workclass|fnlwgt|education|education-num|    marital-status|     occupation| relationship| race|   sex|capital-gain|capital-loss|hours-per-week|native-country|income|Duplicate_indicator|
+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+-------------------+
| 19|         Private| 97261|  HS-grad|            9|     Never-married|Farming-fishing|Not-in-family|White|  Male|           0|           0|            40| United-States| <=50K|                  1|
| 19|         Private| 97261|  HS-grad|            9|     Never-married|Farming-fishing|Not-in-family|White|  Male|           0|           0|            40| United-States| <=50K|                  1|
| 31|

True