# In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length

# Initialize SparkSession
# Build the session (or reuse the one that is already running) under a
# descriptive application name.
spark = (
    SparkSession.builder
    .appName("Applying Rules to PySpark DataFrame")
    .getOrCreate()
)

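# NOTE: this script expects a PySpark DataFrame named 'df' to already exist.
# It is NOT created anywhere in this script, so load your data into 'df' before
# the rules are evaluated at the bottom; otherwise that step fails with a NameError.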

# Data-quality rules to evaluate. Every entry carries a "rule_type" and a
# human-readable "description"; depending on the type it also names one target
# column ("column"), several target columns ("columns") and/or a numeric "value".
rules = [
    dict(
        rule_type="rows_with_null_blank_whitespace",
        columns=["Column1", "Column2", "Column3"],
        description="Count of Rows with Null, Blank, or White Space Values in Columns Column1, Column2, and Column3",
    ),
    dict(
        rule_type="average_length",
        column="Column1",
        description="Average Length of Column1",
    ),
    dict(
        rule_type="percentage_null_values",
        column="Column2",
        value=0,
        description="Percentage of Null Values in Column2",
    ),
    dict(
        rule_type="percentage_specific_length",
        column="Column3",
        value=3,
        description="Percentage of Values with Specific Length in Column3",
    ),
    dict(
        rule_type="maximum_length",
        column="Column4",
        description="Maximum Length in Column4",
    ),
    dict(
        rule_type="minimum_length",
        column="Column5",
        description="Minimum Length in Column5",
    ),
    dict(
        rule_type="length_greater_than_10",
        columns=["Column5", "Column6", "Column7"],
        value=10,
        description="Count of Rows with Length Greater Than 10 in Columns Column5, Column6, and Column7",
    ),
]

# Function to apply a rule to the DataFrame and return the result
def apply_rule(df, rule):
    """Evaluate a single data-quality rule against a PySpark DataFrame.

    Args:
        df: The PySpark DataFrame to profile.
        rule: Mapping describing the rule. ``"rule_type"`` and
            ``"description"`` are always required. The multi-column rules
            (``rows_with_null_blank_whitespace``, ``length_greater_than_10``)
            read ``"columns"`` (a non-empty list of column names); every
            other rule reads ``"column"``. ``percentage_specific_length``
            and ``length_greater_than_10`` additionally read ``"value"``.

    Returns:
        A ``(description, result)`` tuple. ``result`` is a row count for the
        counting rules, a percentage in the range 0-100 for the percentage
        rules, and the value of the Spark aggregate for the avg/max/min
        rules (which may be ``None`` when there is nothing to aggregate).

    Raises:
        KeyError: If ``"rule_type"`` or ``"description"`` is missing.
        ValueError: If the rule type is unknown, or a field required by the
            rule type is missing or empty.
    """
    rule_type = rule["rule_type"]
    columns = rule.get("columns") or []
    column_name = rule.get("column")
    value = rule.get("value")
    description = rule["description"]

    multi_column_rules = (
        "rows_with_null_blank_whitespace",
        "length_greater_than_10",
    )
    single_column_rules = (
        "average_length",
        "percentage_null_values",
        "percentage_specific_length",
        "maximum_length",
        "minimum_length",
    )
    value_rules = ("percentage_specific_length", "length_greater_than_10")

    # Validate up front so a malformed rule fails with a clear message rather
    # than an IndexError / UnboundLocalError (or a silently wrong count from
    # comparing against NULL) further down.
    if rule_type in multi_column_rules:
        if not columns:
            raise ValueError(
                "Rule {!r} requires a non-empty 'columns' list".format(rule_type)
            )
    elif rule_type in single_column_rules:
        if column_name is None:
            raise ValueError("Rule {!r} requires a 'column'".format(rule_type))
    else:
        raise ValueError("Unknown rule_type: {!r}".format(rule_type))
    if rule_type in value_rules and value is None:
        raise ValueError("Rule {!r} requires a 'value'".format(rule_type))

    # Imported under a namespace on purpose: the bare names ``max`` and ``min``
    # are Python builtins, which cannot aggregate a Column ("Column is not
    # iterable"); the Spark aggregates must be used instead.
    from pyspark.sql import functions as F

    def any_column(condition):
        """Return ``condition(column)`` OR-ed over every configured column."""
        combined = condition(F.col(columns[0]))
        for name in columns[1:]:
            combined = combined | condition(F.col(name))
        return combined

    if rule_type == "rows_with_null_blank_whitespace":
        # The regex matches the empty string as well as values made up only
        # of whitespace (one space, several spaces, tabs, ...).
        result = df.filter(
            any_column(lambda c: c.isNull() | c.rlike(r"^\s*$"))
        ).count()

    elif rule_type == "average_length":
        # Aggregate directly instead of looking the column up by Spark's
        # auto-generated "length(<name>)" label.
        result = df.agg(F.avg(F.length(F.col(column_name)))).first()[0]

    elif rule_type == "percentage_null_values":
        total_count = df.count()
        if total_count > 0:
            null_count = df.filter(F.col(column_name).isNull()).count()
            result = (null_count * 100.0) / total_count
        else:
            result = 0.0

    elif rule_type == "percentage_specific_length":
        # The percentage is relative to the non-null values of the column.
        non_null = df.filter(F.col(column_name).isNotNull())
        total_count = non_null.count()
        if total_count > 0:
            matching = non_null.filter(F.length(F.col(column_name)) == value).count()
            result = (matching * 100.0) / total_count
        else:
            result = 0.0

    elif rule_type == "maximum_length":
        result = df.select(F.max(F.length(F.col(column_name)))).first()[0]

    elif rule_type == "minimum_length":
        result = df.select(F.min(F.length(F.col(column_name)))).first()[0]

    else:  # "length_greater_than_10" (the only rule type left after validation)
        result = df.filter(
            any_column(lambda c: c.isNotNull() & (F.length(c) > value))
        ).count()

    return description, result

# Evaluate every rule in order, keeping one (description, result) pair per rule
results_list = [apply_rule(df, rule) for rule in rules]

# Show the collected pairs
print(results_list)


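# Output of the cell above: NameError: name 'df' is not defined
# ('df' is never created in this script; it must be loaded before the rules run.)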