In [None]:
import json
import os
import sys

import numpy as np
import pandas as pd
import sklearn as skl

# Print the process search paths and the Spark / Python versions in use.
# Environment variables are read with .get() because PYTHONPATH (and, rarely,
# PATH) may be unset; direct indexing would abort the cell with a KeyError.
print("PATH: {}".format(os.environ.get('PATH', '<not set>')))
print("PYTHONPATH: {}".format(os.environ.get('PYTHONPATH', '<not set>')))
print("")
# NOTE(review): `spark` is not created in the visible cells; presumably the
# kernel injects a SparkSession under this name -- confirm.
print("Spark: {}".format(spark.version))
print("Python: {}".format(sys.version))
# Bare expression so the notebook renders the SparkContext as cell output.
spark.sparkContext

## Import PyDeequ and init PySpark DataFrame

In [None]:
from pyspark.sql import SparkSession, Row
import pydeequ

# Build a three-row sample DataFrame with columns a, b and c.
# The last row carries c=None, so column "c" contains a missing value.
df = (
    spark.sparkContext
    .parallelize([
        Row(a="foo", b=1, c=5),
        Row(a="bar", b=2, c=6),
        Row(a="baz", b=3, c=None),
    ])
    .toDF()
)
# Convert to pandas so the notebook renders the rows as a table.
df.toPandas()

## Example Analyzer

In [None]:
from pydeequ.analyzers import *

# Configure an analysis run on `df` with two analyzers:
# Size() and Completeness("b").
analyzer = (
    AnalysisRunner(spark)
    .onData(df)
    .addAnalyzer(Size())
    .addAnalyzer(Completeness("b"))
)
analysisResult = analyzer.run()

# Turn the successful metrics into a Spark DataFrame, then display via pandas.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.toPandas()

## Example Profile

In [None]:
from pydeequ.profiles import *

# Set up a column-profiling run over `df` and execute it.
profiler = (
    ColumnProfilerRunner(spark)
    .onData(df)
)
result = profiler.run()

# `result.profiles` is iterated as a mapping; print one line per entry.
for col, profile in result.profiles.items():
    print(f"{col} => {profile}")

## Example Constraint Suggestions

In [None]:
from pydeequ.suggestions import *

# Run constraint suggestion over `df` using the DEFAULT() rule set.
csrunner = ConstraintSuggestionRunner(spark).onData(df).addConstraintRule(DEFAULT())
suggestionResult = csrunner.run()

# Constraint Suggestions in JSON format.
# json.dumps is required here: print() on the parsed object would emit a
# Python repr (single quotes, None/True), which is not valid JSON.
# NOTE(review): assumes the suggestion entries are JSON-serializable -- confirm.
print(json.dumps(suggestionResult['constraint_suggestions'], indent=2))

## Example Constraint Verification

In [None]:
from pydeequ.checks import *
from pydeequ.verification import *

# A warning-level check that groups several constraints under one description.
check = Check(spark, CheckLevel.Warning, "Review Check")

# One constraint per line. With the sample rows built earlier in this notebook
# (b is 1..3, one c is None), the hasMin("b") == 0 and isComplete("c")
# assertions do not hold -- presumably intentional, to show non-passing
# constraints in the result table.
checkCond = (
    check.hasSize(lambda x: x >= 3)
    .hasMin("b", lambda x: x == 0)
    .isUnique("a")
    .isNonNegative("b")
    .isComplete("c")
    .isContainedIn("a", ["foo", "bar", "baz"])
)

# Attach the check to a verification run on `df` and execute it.
verificator = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(checkCond)
)
checkResult = verificator.run()

# Convert the per-constraint results to a DataFrame and display via pandas.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.toPandas()