In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Audit Report Testing") \
    .getOrCreate()


In [2]:
data = [
    ("2024-01-01", 1001, "ProductA", 500),
    ("2024-01-01", 1002, "ProductB", None),
    ("2024-01-02", 1003, "ProductA", 300),
    (None, 1004, "ProductC", 200),
    ("2024-01-03", 1005, "ProductB", 1500)
]
columns = ["date", "transaction_id", "product", "amount"]
df = spark.createDataFrame(data, columns)


In [3]:
from pyspark.sql.functions import col

def audit_dataframe(df, source_name):
    audit_results = {}

    # Check for missing values
    missing_values = df.select([col(c).isNull().sum().alias(c) for c in df.columns])
    missing_values = missing_values.collect()
    missing = {row[0]: row[1] for row in missing_values[0]}  # Convert to dictionary
    audit_results['missing_values'] = missing
    print(f"Missing values in {source_name}: {missing}")

    # Check for duplicates (assuming 'transaction_id' is the unique key)
    duplicates = df.count() - df.dropDuplicates().count()
    audit_results['duplicates'] = duplicates
    print(f"Duplicates in {source_name}: {duplicates}")

    # Check for data type inconsistencies
    schema = {field.name: field.dataType.simpleString() for field in df.schema.fields}
    audit_results['data_types'] = schema
    print(f"Data types in {source_name}: {schema}")

    # Check for outliers (numeric columns)
    numeric_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, (IntegerType, FloatType))]
    outliers = {}
    for col_name in numeric_cols:
        summary = df.describe(col_name).collect()
        min_value = float(summary[0][1])
        max_value = float(summary[1][1])
        outliers[col_name] = {"min": min_value, "max": max_value}
    audit_results['outliers'] = outliers
    print(f"Outliers in {source_name}: {outliers}")

    return audit_results

# Test the function on the sample data
audit_results = audit_dataframe(df, "sales")


TypeError: 'Column' object is not callable