# Validate data with Spark

In [None]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import sum as spark_sum
from pyspark.sql.functions import col, when, isnan, trim
import pyspark.sql.types as spark_types

In [None]:
# Build (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Use_pyspark_in_CASD")
    .getOrCreate()
)

In [None]:
# You can get and set the configuration of your Spark session at any moment.
# Get all configuration entries of the current Spark context:
spark.sparkContext.getConf().getAll()

In [None]:

def get_empty_row_count_per_column(df: DataFrame):
    """
    Count the "empty" values of every column of a dataframe.

    Four kinds of empty value are counted separately:
    1. null value (every column type)
    2. NaN value (numeric columns only)
    3. Empty string value, i.e. empty after trimming (string columns only)
    4. Special character used to represent null, "?" or "-" (string columns only)

    A check that does not apply to the type of a column is reported as 0.
    Any error raised by Spark while aggregating is propagated to the caller.

    :param df: dataframe to inspect
    :return: a dataframe with one row per column of ``df`` and the columns
        column_name, null_count, nan_count, blank_count, null_symbol_count,
        total_empty_row_count and total_row_count. It is created with the
        module-level ``spark`` session.
    """
    total_row_count = df.count()

    null_symbols = ["?", "-"]
    agg_expressions = []

    def count_if(condition, alias):
        # Number of rows for which `condition` is true.
        return spark_sum(when(condition, 1).otherwise(0)).alias(alias)

    # step 1: build one counting expression per (column, applicable null case)
    for col_name in df.columns:
        # Back-quote the name so that a column called e.g. "a.b" is resolved as a
        # top-level column and not as field "b" of a struct column "a".
        quoted_name = "`" + col_name.replace("`", "``") + "`"
        c = col(quoted_name)
        col_type = df.schema[col_name].dataType
        # always test null
        agg_expressions.append(count_if(c.isNull(), f"{col_name}__null"))
        # test isnan for only numeric columns
        if isinstance(col_type, spark_types.NumericType):
            agg_expressions.append(count_if(isnan(c), f"{col_name}__nan"))
        # string null values only for string columns
        if isinstance(col_type, spark_types.StringType):
            agg_expressions.append(count_if(trim(c) == "", f"{col_name}__blank"))
            agg_expressions.append(count_if(c.isin(null_symbols), f"{col_name}__symbol"))

    # step 2: evaluate all counters in one single pass over the data.
    # Errors are deliberately not caught here: without the aggregated values
    # there is nothing meaningful to return.
    summed = df.agg(*agg_expressions).collect()[0].asDict()

    def counter(name):
        # A counter is missing when the check does not apply to the column type,
        # and it is None when the dataframe has no rows (sum over nothing is NULL).
        return summed.get(name) or 0

    # step 3: build one result row per input column
    result = []
    for col_name in df.columns:
        null_count = counter(f"{col_name}__null")
        nan_count = counter(f"{col_name}__nan")
        blank_count = counter(f"{col_name}__blank")
        symbol_count = counter(f"{col_name}__symbol")
        total_empty = null_count + nan_count + blank_count + symbol_count

        result.append((
            col_name, null_count, nan_count, blank_count,
            symbol_count, total_empty, total_row_count,
        ))

    # step 4: convert the list of rows into a new dataframe
    return spark.createDataFrame(result, ["column_name", "null_count", "nan_count", "blank_count",
                                          "null_symbol_count", "total_empty_row_count",
                                          "total_row_count"])


In [None]:
def get_duplicated_row_count(df: DataFrame):
    """
    Print the number of duplicated rows of a dataframe.

    The number is the total row count minus the row count that remains
    after ``dropDuplicates()``.

    :param df: dataframe to inspect
    """
    total_rows = df.count()
    distinct_rows = df.dropDuplicates().count()
    duplicate_row_count = total_rows - distinct_rows
    print(f"Duplicate row count: {duplicate_row_count}")