* Master DAC - BDLE
* Author: Mohamed-Amine Baazizi
* Affiliation: LIP6 - Faculté des Sciences - Sorbonne Université
* Email: mohamed-amine.baazizi@lip6.fr
* October 2023

# Data quality verification

https://github.com/awslabs/deequ

and its Python-based version

https://github.com/awslabs/python-deequ

https://www.amazon.science/publications/automating-large-scale-data-quality-verification

https://medium.com/codex/how-to-check-data-quality-in-pyspark-8a882e45bc95


## Spark setup

In [1]:
!pip install --upgrade -q pyspark==3.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.7/204.7 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# Print the version banner of the installed PySpark distribution.
!pyspark --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.0.0
      /_/
                        
Using Scala version 2.12.10, OpenJDK 64-Bit Server VM, 11.0.20.1
Branch HEAD
Compiled by user ubuntu on 2020-06-06T11:32:25Z
Revision 3fdfce3120f307147244e5eaf46d61419a723d50
Url https://gitbox.apache.org/repos/asf/spark.git
Type --help for more information.


In [3]:
import os
import sys

# Spark line that pydeequ should target; set before pydeequ is installed and
# imported further down.
os.environ["SPARK_VERSION"] = "3.0"

# Point PySpark at the interpreter running this kernel. This is done here,
# before the SparkSession is created, because PySpark picks up PYSPARK_PYTHON
# when the SparkContext starts; setting it afterwards does not affect an
# already-running session.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [4]:
!pip install  --upgrade -q pydeequ

In [5]:
from pyspark.sql import SparkSession, Row
import pydeequ

# Local Spark session for the notebook. The Deequ jar is requested through
# `spark.jars.packages` using the Maven coordinate exposed by pydeequ, and the
# f2j coordinate exposed by pydeequ is listed in `spark.jars.excludes`.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyDeequ")
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

In [6]:
# Display the session object to confirm it was created.
spark

In [7]:
import sys

# Make PySpark use the same interpreter as this kernel.
# NOTE(review): this cell runs after the SparkSession has already been created
# in an earlier cell, so it presumably has no effect on that session - confirm,
# and consider moving it above the session-creation cell.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

## usage

### Examples from the official documentation

In [8]:
# Three-row toy dataset; column `c` contains one missing value on purpose so
# that completeness-style metrics have something to detect.
example_rows = [
    Row(a="foo", b=1, c=5),
    Row(a="bar", b=2, c=6),
    Row(a="baz", b=3, c=None),
]
df = spark.sparkContext.parallelize(example_rows).toDF()
df.show()

+---+---+----+
|  a|  b|   c|
+---+---+----+
|foo|  1|   5|
|bar|  2|   6|
|baz|  3|null|
+---+---+----+



#### analyzers

In [16]:
from pydeequ.analyzers import *

# Run two analyzers on the toy frame: Size (dataset level) and
# Completeness of column "b" (column level).
analysisResult = (
    AnalysisRunner(spark)
    .onData(df)
    .addAnalyzer(Size())
    .addAnalyzer(Completeness("b"))
    .run()
)

# Turn the successful metrics into a Spark DataFrame and print it.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show()

+-------+--------+------------+-----+
| entity|instance|        name|value|
+-------+--------+------------+-----+
|Dataset|       *|        Size|  3.0|
| Column|       b|Completeness|  1.0|
+-------+--------+------------+-----+



#### profile

In [17]:
from pydeequ.profiles import *

# Profile every column of the toy frame.
result = ColumnProfilerRunner(spark).onData(df).run()

# `result.profiles` is keyed by column name; only the profile objects are
# needed here, and printing one renders it as the JSON-like text below.
for profile in result.profiles.values():
    print(profile)

StandardProfiles for column: a: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 3,
    "dataType": "String",
    "isDataTypeInferred": false,
    "typeCounts": {
        "Boolean": 0,
        "Fractional": 0,
        "Integral": 0,
        "Unknown": 0,
        "String": 3
    },
    "histogram": [
        [
            "baz",
            1,
            0.3333333333333333
        ],
        [
            "foo",
            1,
            0.3333333333333333
        ],
        [
            "bar",
            1,
            0.3333333333333333
        ]
    ]
}
NumericProfiles for column: b: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 3,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": [
        [
            "1",
            1,
            0.3333333333333333
        ],
        [
            "2",
            1,
            0.3333333333333333
        ],
        [
            "3",
            1,
           

#### constraint suggestion

In [18]:
from pydeequ.suggestions import *

# Ask Deequ to propose constraints for the toy frame using the DEFAULT rule set.
suggestionResult = (
    ConstraintSuggestionRunner(spark)
    .onData(df)
    .addConstraintRule(DEFAULT())
    .run()
)

# The result is a dict holding a 'constraint_suggestions' list; print it raw.
print(suggestionResult)

{'constraint_suggestions': [{'constraint_name': 'CompletenessConstraint(Completeness(b,None))', 'column_name': 'b', 'current_value': 'Completeness: 1.0', 'description': "'b' is not null", 'suggesting_rule': 'CompleteIfCompleteRule()', 'rule_description': 'If a column is complete in the sample, we suggest a NOT NULL constraint', 'code_for_constraint': '.isComplete("b")'}, {'constraint_name': "ComplianceConstraint(Compliance('b' has no negative values,b >= 0,None))", 'column_name': 'b', 'current_value': 'Minimum: 1.0', 'description': "'b' has no negative values", 'suggesting_rule': 'NonNegativeNumbersRule()', 'rule_description': 'If we see only non-negative numbers in a column, we suggest a corresponding constraint', 'code_for_constraint': '.isNonNegative("b")'}, {'constraint_name': 'UniquenessConstraint(Uniqueness(List(b),None))', 'column_name': 'b', 'current_value': 'ApproxDistinctness: 1.0', 'description': "'b' is unique", 'suggesting_rule': 'UniqueIfApproximatelyUniqueRule()', 'rule_

#### constraint verification

In [None]:
from pydeequ.checks import *
from pydeequ.verification import *

# A warning-level check that groups all constraints declared below.
check = Check(spark, CheckLevel.Warning, "Review Check")

checkResult = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(
        check
        .hasSize(lambda size: size >= 3)
        # The smallest value of `b` in the toy data is 1, so this constraint is
        # expected to fail - presumably on purpose, to show a failing constraint.
        .hasMin("b", lambda minimum: minimum == 0)
        # Column `c` contains a null, so this one is expected to fail as well.
        .isComplete("c")
        .isUnique("a")
        .isContainedIn("a", ["foo", "bar", "baz"])
        .isNonNegative("b")
    )
    .run()
)

# Flatten the verification outcome into a DataFrame and print it.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show()

Python Callback server started!
+------------+-----------+------------+--------------------+-----------------+--------------------+
|       check|check_level|check_status|          constraint|constraint_status|  constraint_message|
+------------+-----------+------------+--------------------+-----------------+--------------------+
+------------+-----------+------------+--------------------+-----------------+--------------------+



### other examples

In [19]:
# Download the dataset archive used by the remaining examples to /tmp.
!wget --no-verbose https://nuage.lip6.fr/s/89BG8HD9r3iE693/download/MLData.tgz -O /tmp/MLData.tgz

2023-10-20 14:32:27 URL:https://nuage.lip6.fr/s/89BG8HD9r3iE693/download/MLData.tgz [19397838/19397838] -> "/tmp/MLData.tgz" [1]


In [20]:
# Unpack the archive under /tmp/ (the files land in /tmp/MLData/).
!tar -xzvf /tmp/MLData.tgz  --directory /tmp/

MLData/
MLData/._loan.csv
MLData/loan.csv
MLData/autos.csv


In [21]:
!rm  /tmp/MLData.tgz
!rm /tmp/MLData/\._loan.csv
!ls -hal /tmp/MLData

total 73M
drwxr-xr-x 2  501 staff 4.0K Oct 20 14:32 .
drwxrwxrwt 1 root root  4.0K Oct 20 14:32 ..
-rw-r--r-- 1  501 staff  66M Jan  6  2022 autos.csv
-rw-r--r-- 1  501 staff 6.8M Jan  6  2022 loan.csv


In [22]:
# Load autos.csv with a header row, letting Spark infer the column types.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because other cells may rely on it.
dir = "/tmp/MLData/"
data = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load(dir + "autos.csv")
)

In [23]:
# Number of rows in the full autos dataset.
data.count()

371824

In [24]:
# Work on a ~10% random sample to keep the examples fast. The fraction is a
# per-row sampling probability, so the resulting count is only approximately
# 10% of the data. A fixed seed makes the sample - and every metric computed
# from it below - reproducible across kernel restarts.
SAMPLE_SEED = 42
sample = data.sample(fraction=0.1, seed=SAMPLE_SEED)
sample.count()

37093

In [25]:
# Preview the sample; long values are truncated in the printed table.
sample.show()

+-------------------+--------------------+------+---------+-----+-------+-----------+------------------+---------+-------+--------+---------+-------------------+--------+-------------+-----------------+-------------------+------------+----------+-------------------+
|        dateCrawled|                name|seller|offerType|price| abtest|vehicleType|yearOfRegistration|  gearbox|powerPS|   model|kilometer|monthOfRegistration|fuelType|        brand|notRepairedDamage|        dateCreated|nrOfPictures|postalCode|           lastSeen|
+-------------------+--------------------+------+---------+-----+-------+-----------+------------------+---------+-------+--------+---------+-------------------+--------+-------------+-----------------+-------------------+------------+----------+-------------------+
|2016-03-23 15:48:05|Ford_C_MAX_2.0_TD...|privat|  Angebot| 7550|   test|        bus|              2007|  manuell|    136|   c_max|   150000|                  6|  diesel|         ford|             ne

In [26]:
# Inspect the column names and the types inferred by the CSV reader.
sample.printSchema()

root
 |-- dateCrawled: string (nullable = true)
 |-- name: string (nullable = true)
 |-- seller: string (nullable = true)
 |-- offerType: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- abtest: string (nullable = true)
 |-- vehicleType: string (nullable = true)
 |-- yearOfRegistration: integer (nullable = true)
 |-- gearbox: string (nullable = true)
 |-- powerPS: integer (nullable = true)
 |-- model: string (nullable = true)
 |-- kilometer: integer (nullable = true)
 |-- monthOfRegistration: integer (nullable = true)
 |-- fuelType: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- notRepairedDamage: string (nullable = true)
 |-- dateCreated: string (nullable = true)
 |-- nrOfPictures: integer (nullable = true)
 |-- postalCode: integer (nullable = true)
 |-- lastSeen: string (nullable = true)



#### Analyzers

In [27]:
# Parameters of the analysis: the DataFrame to analyse and the column of interest.
_data = sample
_colName = "vehicleType"

# Same pattern as the toy example above, with two extra distinct-value metrics.
from pydeequ.analyzers import *

analysisResult = (
    AnalysisRunner(spark)
    .onData(_data)
    .addAnalyzer(Size())
    .addAnalyzer(Completeness(_colName))
    .addAnalyzer(CountDistinct(_colName))
    .addAnalyzer(Distinctness(_colName))
    .run()
)



In [28]:
# Convert the computed metrics into a DataFrame and print it without
# truncating the values.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show(truncate=False)

+-------+-----------+-------------+---------------------+
|entity |instance   |name         |value                |
+-------+-----------+-------------+---------------------+
|Dataset|*          |Size         |37093.0              |
|Column |vehicleType|Completeness |0.9010055805677621   |
|Column |vehicleType|CountDistinct|8.0                  |
|Column |vehicleType|Distinctness |2.3937045570150505E-4|
+-------+-----------+-------------+---------------------+



Now consider more metrics (consistency)

In [29]:
# Column pair for the correlation metric.
_targetColName = 'price'
_otherColName = 'powerPS'

# Entropy of the categorical column selected earlier, plus the correlation
# between the two numeric columns.
analysisResult = (
    AnalysisRunner(spark)
    .onData(_data)
    .addAnalyzer(Entropy(_colName))
    .addAnalyzer(Correlation(_otherColName, _targetColName))
    .run()
)



In [30]:
# Convert the entropy/correlation metrics into a DataFrame and print it
# without truncating the values.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show(truncate=False)

+-----------+-------------+-----------+-------------------+
|entity     |instance     |name       |value              |
+-----------+-------------+-----------+-------------------+
|Mutlicolumn|powerPS,price|Correlation|0.02011889664147869|
|Column     |vehicleType  |Entropy    |1.7716931190362275 |
+-----------+-------------+-----------+-------------------+



#### profile

In [31]:
from pydeequ.profiles import *

# Profile every column of the sampled autos data.
result = ColumnProfilerRunner(spark).onData(_data).run()



In [32]:
# Print each column profile; the column-name keys are not needed here.
for profile in result.profiles.values():
    print(profile)

StandardProfiles for column: lastSeen: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 31610,
    "dataType": "String",
    "isDataTypeInferred": false,
    "typeCounts": {
        "Boolean": 0,
        "Fractional": 0,
        "Integral": 0,
        "Unknown": 0,
        "String": 37093
    },
    "histogram": null
}
StandardProfiles for column: name: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 28466,
    "dataType": "String",
    "isDataTypeInferred": false,
    "typeCounts": {
        "Boolean": 0,
        "Fractional": 0,
        "Integral": 8,
        "Unknown": 0,
        "String": 37085
    },
    "histogram": null
}
NumericProfiles for column: yearOfRegistration: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 94,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": [
        [
            "2014",
            475,
            0.012805650661849945
        ],
        [
            "196

In [33]:
# Mapping from profile-class name to the pydeequ class implementing it.
result.columnProfileClasses

{'StandardColumnProfile': pydeequ.profiles.StandardColumnProfile,
 'NumericColumnProfile': pydeequ.profiles.NumericColumnProfile}

In [34]:
# Mapping from column name to that column's profile object.
result.profiles

{'lastSeen': <pydeequ.profiles.StandardColumnProfile at 0x7de888113820>,
 'name': <pydeequ.profiles.StandardColumnProfile at 0x7de888113580>,
 'yearOfRegistration': <pydeequ.profiles.NumericColumnProfile at 0x7de888112a10>,
 'model': <pydeequ.profiles.StandardColumnProfile at 0x7de8880fba00>,
 'abtest': <pydeequ.profiles.StandardColumnProfile at 0x7de8880f9ff0>,
 'powerPS': <pydeequ.profiles.NumericColumnProfile at 0x7de888111ba0>,
 'fuelType': <pydeequ.profiles.StandardColumnProfile at 0x7de888113370>,
 'notRepairedDamage': <pydeequ.profiles.StandardColumnProfile at 0x7de8880fb9d0>,
 'price': <pydeequ.profiles.NumericColumnProfile at 0x7de888110f70>,
 'vehicleType': <pydeequ.profiles.StandardColumnProfile at 0x7de888111b40>,
 'dateCrawled': <pydeequ.profiles.StandardColumnProfile at 0x7de8880fb4f0>,
 'offerType': <pydeequ.profiles.StandardColumnProfile at 0x7de8880fac20>,
 'gearbox': <pydeequ.profiles.StandardColumnProfile at 0x7de888113280>,
 'monthOfRegistration': <pydeequ.profiles.

In [35]:
# Profile only two columns. The column name is spelled `powerPS`, exactly as in
# the schema printed above, so the selection does not depend on Spark's
# case-insensitive column resolution and the profile is labelled with the
# real column name.
ex_result = (
    ColumnProfilerRunner(spark)
    .onData(_data.select('name', 'powerPS'))
    .run()
)

In [36]:
# Print the profiles of the two selected columns.
for profile in ex_result.profiles.values():
    print(profile)

StandardProfiles for column: name: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 28466,
    "dataType": "String",
    "isDataTypeInferred": false,
    "typeCounts": {
        "Boolean": 0,
        "Fractional": 0,
        "Integral": 8,
        "Unknown": 0,
        "String": 37085
    },
    "histogram": null
}
NumericProfiles for column: powerPs: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 406,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": null,
    "kll": "None",
    "mean": 115.45855013075243,
    "maximum": 20000.0,
    "minimum": 0.0,
    "sum": 4282704.0,
    "stdDev": 199.11718427917157,
    "approxPercentiles": []
}


##### [TODO] Import the JSON object into a DataFrame to facilitate its processing

#### constraint suggestion

In [51]:
from pydeequ.suggestions import *

# Constraint suggestions for the whole sampled dataset, using the DEFAULT rule
# set. The resulting dict is large, so it is pretty-printed in the next cell
# rather than printed raw here.
suggestionResult = (
    ConstraintSuggestionRunner(spark)
    .onData(_data)
    .addConstraintRule(DEFAULT())
    .run()
)



In [48]:
import pprint

# Pretty-print the nested suggestions dict so each field is on its own line.
pprint.pprint(suggestionResult)

{'constraint_suggestions': [{'code_for_constraint': '.isComplete("lastSeen")',
                             'column_name': 'lastSeen',
                             'constraint_name': 'CompletenessConstraint(Completeness(lastSeen,None))',
                             'current_value': 'Completeness: 1.0',
                             'description': "'lastSeen' is not null",
                             'rule_description': 'If a column is complete in '
                                                 'the sample, we suggest a NOT '
                                                 'NULL constraint',
                             'suggesting_rule': 'CompleteIfCompleteRule()'},
                            {'code_for_constraint': '.isComplete("name")',
                             'column_name': 'name',
                             'constraint_name': 'CompletenessConstraint(Completeness(name,None))',
                             'current_value': 'Completeness: 1.0',
                           

In [49]:
import pprint

from pydeequ.suggestions import *

# Constraint suggestions restricted to two columns. `powerPS` is spelled as in
# the schema so the selection does not depend on case-insensitive column
# resolution. Note that this overwrites the `suggestionResult` computed for the
# full set of columns.
suggestionResult = (
    ConstraintSuggestionRunner(spark)
    .onData(_data.select('fuelType', 'powerPS'))
    .addConstraintRule(DEFAULT())
    .run()
)

# Pretty-print the suggestions dict. `pprint` is imported in this cell so it
# can run on its own.
pprint.pprint(suggestionResult)

{'constraint_suggestions': [{'code_for_constraint': '.isContainedIn("fuelType", '
                                                    '["benzin", "diesel", '
                                                    '"lpg", "cng", "hybrid", '
                                                    '"andere", "elektro"])',
                             'column_name': 'fuelType',
                             'constraint_name': "ComplianceConstraint(Compliance('fuelType' "
                                                "has value range 'benzin', "
                                                "'diesel', 'lpg', 'cng', "
                                                "'hybrid', 'andere', "
                                                "'elektro',`fuelType` IN "
                                                "('benzin', 'diesel', 'lpg', "
                                                "'cng', 'hybrid', 'andere', "
                                                "'elektro'),None))",
            

##### [TODO] Import the JSON object into a DataFrame to facilitate its processing

#### constraint verification

In [50]:
from pydeequ.checks import *
from pydeequ.verification import *

# Warning-level check on the sampled autos data.
check = Check(spark, CheckLevel.Warning, "Review Check")

# Constraints under test: `price` has no nulls, `price` is never negative, and
# `seller` values are unique.
checkResult = (
    VerificationSuite(spark)
    .onData(_data)
    .addCheck(
        check
        .isComplete('price')
        .isNonNegative('price')
        .isUnique('seller')
    )
    .run()
)





In [52]:
# Flatten the verification outcome into a DataFrame and print it.
# NOTE(review): the recorded output below is an empty table even though three
# constraints were declared; the cause is not visible in this notebook - verify
# the pydeequ / Spark version combination.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show()

+------------+-----------+------------+--------------------+-----------------+--------------------+
|       check|check_level|check_status|          constraint|constraint_status|  constraint_message|
+------------+-----------+------------+--------------------+-----------------+--------------------+
+------------+-----------+------------+--------------------+-----------------+--------------------+

