In [1]:
import os

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import col, count
from pyspark.sql.types import StringType,IntegerType,StructType,StructField


In [2]:
# Toggle between a local Spark session and a session running on Kubernetes.
local = True

# Settings shared by both deployment modes.
session_builder = SparkSession.builder.appName("manual_data_validation_example")

if local:
    # Local mode with 4 worker threads.
    session_builder = session_builder.master("local[4]")
else:
    # All cluster settings are read with os.environ[...] so that a missing
    # variable raises KeyError here, instead of os.getenv() handing None to
    # the Spark configuration.
    session_builder = (
        session_builder
        .master("k8s://https://kubernetes.default.svc:443")
        .config("spark.kubernetes.container.image", os.environ['IMAGE_NAME'])
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "8g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
    )

spark = session_builder.getOrCreate()

22/05/03 14:59:51 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.184.146 instead (on interface ens33)
22/05/03 14:59:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/03 14:59:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Path of the CSV file that is loaded and validated in the cells below.
data_file_path = "../../data/adult_with_duplicates.csv"
# Alternative input file; uncomment to run the same checks against it.
# data_file_path = "../../data/adult_cleaned.csv"

In [4]:
# (column name, Spark type) pairs, listed in the order used by the schema.
# Note that "age" is declared as a string, not an integer; the column level
# checks further down inspect its raw text (presumably so that malformed
# values survive loading - confirm against the data source).
schema_columns = [
    ("age", StringType),
    ("workclass", StringType),
    ("fnlwgt", IntegerType),
    ("education", StringType),
    ("education-num", IntegerType),
    ("marital-status", StringType),
    ("occupation", StringType),
    ("relationship", StringType),
    ("race", StringType),
    ("sex", StringType),
    ("capital-gain", IntegerType),
    ("capital-loss", IntegerType),
    ("hours-per-week", IntegerType),
    ("native-country", StringType),
    ("income", StringType),
]

# Every field is nullable.
schema = StructType(
    [
        StructField(field_name, field_type(), nullable=True)
        for field_name, field_type in schema_columns
    ]
)

In [5]:
# Load the CSV with the explicit schema defined above; the first line of the
# file is treated as the header.
df_raw = spark.read.csv(
    path=data_file_path,
    schema=schema,
    header=True,
)

In [6]:
# Preview the first 5 rows without truncating long cell values.
df_raw.show(5, truncate=False)

+-----+--------------+------+---------+-------------+--------------+------------+-------------+-----+----+------------+------------+--------------+--------------+------+
|age  |workclass     |fnlwgt|education|education-num|marital-status|occupation  |relationship |race |sex |capital-gain|capital-loss|hours-per-week|native-country|income|
+-----+--------------+------+---------+-------------+--------------+------------+-------------+-----+----+------------+------------+--------------+--------------+------+
|139  |State-gov     |77516 |Bachelors|13           |Never-married |Adm-clerical|Not-in-family|White|Male|2174        |0           |40            |United-States |<=50K |
|34.56|State-gov     |77516 |Bachelors|13           |Never-married |Adm-clerical|Not-in-family|White|Male|2174        |0           |40            |United-States |<=50K |
|-12  |State-gov     |77516 |Bachelors|13           |Never-married |Adm-clerical|Not-in-family|White|Male|2174        |0           |40            |Uni

In [7]:
# Print the column names, types and nullability of the loaded data frame.
df_raw.printSchema()

root
 |-- age: string (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



# Validation rules

Table-level validation rules:
1. The table must have 32603 rows and 15 columns
2. The table must not contain duplicate rows

Column-level validation rules:
1. Column Age must be a number
2. Column Age must not contain null values
3. Column Age must have a value between 0 and 120




## Check table level validation rule 1

Table must have 32603 rows and 15 columns

In [8]:
# Expected table shape, taken from table level validation rule 1.
expected_row_numbers = 32603
expected_column_numbers = 15

row_numbers = df_raw.count()
column_numbers = len(df_raw.columns)

print(f"row numbers: {row_numbers}, column numbers: {column_numbers}")

# Evaluate the rule explicitly instead of leaving the comparison to the reader.
if (row_numbers, column_numbers) == (expected_row_numbers, expected_column_numbers):
    print("Table shape is valid")
else:
    print(
        f"Table shape is invalid: expected {expected_row_numbers} rows "
        f"and {expected_column_numbers} columns"
    )

row numbers: 32608, column numbers: 15


## Check table level validation rule 2

Table can't have duplicate rows

In [9]:
# Flag every row that belongs to a group of identical rows.
# A window count is used instead of a groupBy followed by a self-join: an
# equality join never matches NULL keys, so duplicated rows that contain a
# NULL in any column would silently be left unflagged.
duplicate_window = Window.partitionBy(*df_raw.columns)
df_indicate_duplicate = df_raw.withColumn(
    "Duplicate_indicator",
    (count("*").over(duplicate_window) > 1).cast("int"),
)
df_duplicate = df_indicate_duplicate.filter(col("Duplicate_indicator") == 1)
duplicate_row_num = df_duplicate.count()
# Any flagged row means the table has duplicates.
if duplicate_row_num > 0:
    print("Table has Duplicate rows")
    print(f"Duplicate row number: {duplicate_row_num}")
    print("Duplicate row samples: ")
    df_duplicate.show()
else:
    print("Table does not have Duplicate rows")

                                                                                

Table has Duplicate rows
Duplicate row number: 102
Duplicate row samples: 


                                                                                

+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+-------------------+
|age|       workclass|fnlwgt|education|education-num|    marital-status|     occupation| relationship| race|   sex|capital-gain|capital-loss|hours-per-week|native-country|income|Duplicate_indicator|
+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+-------------------+
| 19|         Private| 97261|  HS-grad|            9|     Never-married|Farming-fishing|Not-in-family|White|  Male|           0|           0|            40| United-States| <=50K|                  1|
| 19|         Private| 97261|  HS-grad|            9|     Never-married|Farming-fishing|Not-in-family|White|  Male|           0|           0|            40| United-States| <=50K|                  1|
| 31|

To make this check reusable on other data frames, we can wrap it in a function.

In [25]:
def check_duplicate_rows(df: DataFrame, indicator_columns=None) -> bool:
    """
    Check whether a data frame contains duplicate rows.

    Two rows are treated as duplicates when they hold the same values in every
    indicator column. When duplicates are found, their number and a sample are
    printed.

    :param df: input data frame
    :param indicator_columns: a column name, or a list of column names, used to
        decide whether two rows are identical. Defaults to all columns of ``df``
    :return: If no duplicate is found, return True, else return False
    """
    if not indicator_columns:
        indicator_cols = df.columns
    elif isinstance(indicator_columns, str):
        # A single column name; wrap it so it is not unpacked character by character.
        indicator_cols = [indicator_columns]
    else:
        indicator_cols = list(indicator_columns)

    # Count the rows sharing the same indicator values with a window instead of
    # a groupBy + join: the join needed every column of df on both sides (which
    # failed for a subset of columns) and never matched rows with NULL keys.
    duplicate_window = Window.partitionBy(*indicator_cols)
    df_duplicate = df.withColumn(
        "Duplicate_indicator",
        (count("*").over(duplicate_window) > 1).cast("int"),
    ).filter(col("Duplicate_indicator") == 1)

    duplicate_row_num = df_duplicate.count()
    if duplicate_row_num > 0:
        print("Table has Duplicate rows")
        print(f"Duplicate row number: {duplicate_row_num}")
        print("Duplicate row samples: ")
        df_duplicate.show()
        return False

    print("Table does not have Duplicate rows")
    return True

In [26]:
# Run the duplicate check on the raw table, using all columns as indicator columns.
check_duplicate_rows(df_raw)

Table has Duplicate rows
Duplicate row number: 102
Duplicate row samples: 
+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+-------------------+
|age|       workclass|fnlwgt|education|education-num|    marital-status|     occupation| relationship| race|   sex|capital-gain|capital-loss|hours-per-week|native-country|income|Duplicate_indicator|
+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+-------------------+
| 19|         Private| 97261|  HS-grad|            9|     Never-married|Farming-fishing|Not-in-family|White|  Male|           0|           0|            40| United-States| <=50K|                  1|
| 19|         Private| 97261|  HS-grad|            9|     Never-married|Farming-fishing|Not-in-family|White|  Male|           0| 

False

## Column level validation rule 1

Column Age must be a number

In [10]:
# Optional sign followed by digits only. Written as a raw string so that the
# backslash in \d is not parsed as a (invalid) Python string escape.
regex = r"^[-+]?\d+$"

# Keep the rows whose age does not match the pattern.
df_age_not_number = df_raw.filter(~col("age").rlike(regex))
df_age_not_number.select(
    "age", "education", "education-num", "marital-status", "occupation", "workclass", "race"
).show()

+-----+---------+-------------+--------------+------------+---------+-----+
|  age|education|education-num|marital-status|  occupation|workclass| race|
+-----+---------+-------------+--------------+------------+---------+-----+
|34.56|Bachelors|           13| Never-married|Adm-clerical|State-gov|White|
| 39.5|Bachelors|           13| Never-married|Adm-clerical|State-gov|White|
+-----+---------+-------------+--------------+------------+---------+-----+



## Column level validation rule 2

Column Age can't have null

In [11]:
col_name = "age"

# A value counts as null when it is a real NULL or the literal text "null".
checked_column = col(col_name)
is_missing = checked_column.isNull()
is_null_text = checked_column == "null"
df_age_null = df_raw.filter(is_missing | is_null_text)

print(f"Column {col_name} has null value : {df_age_null.count()}")
print("Null value sample: ")
df_age_null.show()

Column age has null value : 1
Null value sample: 
+----+--------------+------+---------+-------------+--------------+------------+-------------+-----+----+------------+------------+--------------+--------------+------+
| age|     workclass|fnlwgt|education|education-num|marital-status|  occupation| relationship| race| sex|capital-gain|capital-loss|hours-per-week|native-country|income|
+----+--------------+------+---------+-------------+--------------+------------+-------------+-----+----+------------+------------+--------------+--------------+------+
|null|emp-by-pengfei| 77516|Bachelors|           13| Never-married|Adm-clerical|Not-in-family|White|Male|        2174|           0|            40| United-States| <=50K|
+----+--------------+------+---------+-------------+--------------+------------+-------------+-----+----+------------+------------+--------------+--------------+------+



## Column level validation rule 3

Column Age must have value between 0 and 120

In [13]:
# Rows whose value is above 120 or below 0 break column level rule 3.
range_checked_column = col(col_name)
is_above_maximum = range_checked_column > 120
is_below_minimum = range_checked_column < 0
df_age_range_anomaly = df_raw.filter(is_above_maximum | is_below_minimum)

In [14]:
# Report how many rows are out of range, then show up to 5 of them in full.
anomaly_number = df_age_range_anomaly.count()
for report_line in (
    f"Anomaly row number for column {col_name}: {anomaly_number}",
    f"Anomaly row sample for column {col_name}:",
):
    print(report_line)
df_age_range_anomaly.show(n=5, truncate=False)

Anomaly row number for column age: 4
Anomaly row sample for column age:
+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|age|workclass       |fnlwgt|education|education-num|marital-status    |occupation     |relationship |race |sex   |capital-gain|capital-loss|hours-per-week|native-country|income|
+---+----------------+------+---------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|139|State-gov       |77516 |Bachelors|13           |Never-married     |Adm-clerical   |Not-in-family|White|Male  |2174        |0           |40            |United-States |<=50K |
|-12|State-gov       |77516 |Bachelors|13           |Never-married     |Adm-clerical   |Not-in-family|White|Male  |2174        |0           |40            |United-States |<=50K |
|152|Self-emp-not-inc|209642|HS-g