In [1]:
# Environment setup: install PySpark 3.2.0 (pinned), PyDrive and a headless Java 8 JDK via shell escapes.
!pip install pyspark==3.2.0
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
# Point JAVA_HOME at the Java 8 JDK; assumes the apt package above installs to
# this path -- TODO confirm on images other than Debian/Ubuntu amd64.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark==3.2.0
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.3/281.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.2 (from pyspark==3.2.0)
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.8/198.8 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805894 sha256=098fe2375a876996b08382b6a4db66389e44b22166d5c8c019adf7d1b8337120
  Stored in directory: /root/.cache/pip/wheels/32/97/d3/8b6d964c8700e4fbb561c71638a92ec55dac9be51eb5fea86d
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py

In [2]:
!pip install pydeequ

Collecting pydeequ
  Downloading pydeequ-1.1.1-py3-none-any.whl (36 kB)
Installing collected packages: pydeequ
Successfully installed pydeequ-1.1.1


In [3]:
import os
import pyspark
os.environ['SPARK_VERSION'] = str(pyspark.__version__)

!echo $SPARK_VERSION

3.2.0


In [4]:
import pydeequ
from pydeequ.profiles import *
from pydeequ.checks import *
from pydeequ.verification import *

In [5]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session. The Deequ jar is requested through
# spark.jars.packages using the Maven coordinate exposed by pydeequ, and the
# f2j coordinate exposed by pydeequ is excluded from dependency resolution.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

In [6]:
# Display the SparkSession object as this cell's output.
spark

In [7]:
# Load the shop clients dataset from a Parquet file located relative to the
# notebook's working directory.
df = spark.read.format("parquet").load("shop-clients.parquet")

# Preview the first rows (Spark's default row limit) and then print the schema.
df.show()
df.printSchema()

+--------------------+------+---------+----------+-------------+------------+---+------------+---------------+
|                  id|   sex|     name|   surname|  second_name|passport_num|age|total_orders|           city|
+--------------------+------+---------+----------+-------------+------------+---+------------+---------------+
|0000f59d342448a6a...|  male|    Артем|    Коваль|     Петрович|  8634393217| 64|          20|Санкт-Петербург|
|000389762aa34cf99...|female|    Ольга|    Попова|    Денисович|  1468464122| 64|          14|         Липецк|
|000494226d7149699...|  male|    Денис|     Тупин|     Олегович|  4111709475| 36|          55|           Омск|
|000b516589bf4024a...|female|    Елена|  Ермолова|     Павлович|  7119323175| 64|          76|       Новгород|
|004bb7d7b03a4715a...|  male|Александр|    Иванов|    Денисович|  5511145370| 74|          36|         Москва|
|004bc5df7dfe47a29...|  male|  Дмитрий|    Коваль|     Иванович|  4296589650| 20|          17|         Москва|
|

1) Analyzer

- Size - размер датафрейма (количество строк)
- Completeness - полнота колонки: доля значений, не равных null (по сути поиск пропусков)
- Compliance - доля строк, удовлетворяющих условию, которое мы задаем

In [8]:
from pydeequ.analyzers import *

# Columns whose share of non-null values is measured.
completeness_columns = ["name", "surname", "second_name", "passport_num", "city"]

# (metric label, SQL predicate) pairs; each metric is the fraction of rows
# for which the predicate holds.
compliance_rules = [
    ("age less than 0", 'age<0'),
    ("age great than 100", 'age>100'),
    ("orders less than 0", 'total_orders<0'),
]

# Assemble the analysis run step by step: dataset size first, then the
# per-column completeness metrics, then the compliance predicates.
analysis_builder = AnalysisRunner(spark).onData(df)
analysis_builder = analysis_builder.addAnalyzer(Size())
for column_name in completeness_columns:
    analysis_builder = analysis_builder.addAnalyzer(Completeness(column_name))
for rule_label, rule_predicate in compliance_rules:
    analysis_builder = analysis_builder.addAnalyzer(Compliance(rule_label, rule_predicate))
analyzer = analysis_builder.run()

# Convert the successfully computed metrics into a Spark DataFrame and print it.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analyzer)
analysisResult_df.show()

+-------+------------------+------------+-------+
| entity|          instance|        name|  value|
+-------+------------------+------------+-------+
| Column|              city|Completeness|    1.0|
| Column|      passport_num|Completeness|    1.0|
| Column|orders less than 0|  Compliance| 7.5E-4|
| Column|   age less than 0|  Compliance|    0.0|
| Column|       second_name|Completeness|    1.0|
|Dataset|                 *|        Size|20000.0|
| Column|              name|Completeness|    1.0|
| Column|           surname|Completeness|    1.0|
| Column|age great than 100|  Compliance|  0.001|
+-------+------------------+------------+-------+



2) Constraint Verification

Аргументы объекта Check:

- сессия Spark
- уровень проверки (CheckLevel): в нашем случае Warning для первых трех чекеров и Error для четвертого
- наименование чекера

Далее указываются условия

Первый чекер проверяет возраст: максимум не более 100 лет, отсутствие отрицательных значений и отсутствие пропусков.

Второй чекер проверяет наличие ФИО.

Третий чекер проверяет наличие только male и female в столбце sex

Четвертый чекер проверяет, что размер датасета не менее 20к записей и уникальность столбца id.

In [9]:
# Age rules (Warning level): the column maximum must not exceed 100,
# values must be non-negative, and the column must have no nulls.
check_age = (
    Check(spark, CheckLevel.Warning, "Users Age Check")
    .hasMax("age", lambda max_age: max_age <= 100.0)
    .isNonNegative("age")
    .isComplete("age")
)

# Full-name rules (Warning level): name, surname and second name must all be present.
check_fio = (
    Check(spark, CheckLevel.Warning, "Users FIO Check")
    .isComplete("name")
    .isComplete("surname")
    .isComplete("second_name")
)

# Sex rule (Warning level): only the two listed values are allowed.
check_sex = (
    Check(spark, CheckLevel.Warning, "Users Sex Check")
    .isContainedIn("sex", ["male", "female"])
)

# Dataset-level rules (Error level): at least 20000 rows and a unique `id` column.
check_dataset = (
    Check(spark, CheckLevel.Error, "Users Dataset Check")
    .hasSize(lambda row_count: row_count >= 20000)
    .isUnique("id")
)

Python Callback server started!


In [10]:
# Register every check on one verification run over `df`, in the same order
# as they were defined, then execute the run.
verification_builder = VerificationSuite(spark).onData(df)
for registered_check in (check_age, check_fio, check_sex, check_dataset):
    verification_builder = verification_builder.addCheck(registered_check)
checkResult = verification_builder.run()

# One row per constraint: convert to a Spark DataFrame, then show up to 20 rows via pandas.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.toPandas().head(20)

Unnamed: 0,check,check_level,check_status,constraint,constraint_status,constraint_message
0,Users Age Check,Warning,Warning,"MaximumConstraint(Maximum(age,None))",Failure,Value: 19399.0 does not meet the constraint re...
1,Users Age Check,Warning,Warning,ComplianceConstraint(Compliance(age is non-neg...,Success,
2,Users Age Check,Warning,Warning,"CompletenessConstraint(Completeness(age,None))",Success,
3,Users FIO Check,Warning,Success,"CompletenessConstraint(Completeness(name,None))",Success,
4,Users FIO Check,Warning,Success,"CompletenessConstraint(Completeness(surname,No...",Success,
5,Users FIO Check,Warning,Success,CompletenessConstraint(Completeness(second_nam...,Success,
6,Users Sex Check,Warning,Success,ComplianceConstraint(Compliance(sex contained ...,Success,
7,Users Dataset Check,Error,Success,SizeConstraint(Size(None)),Success,
8,Users Dataset Check,Error,Success,"UniquenessConstraint(Uniqueness(List(id),None))",Success,


---

In [12]:
!pip install great_expectations

Collecting great_expectations
  Downloading great_expectations-0.18.3-py3-none-any.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting colorama>=0.4.3 (from great_expectations)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting jsonpatch>=1.22 (from great_expectations)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting makefun<2,>=1.7.0 (from great_expectations)
  Downloading makefun-1.15.2-py2.py3-none-any.whl (22 kB)
Collecting marshmallow<4.0.0,>=3.7.1 (from great_expectations)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml<0.17.18,>=0.16 (from great_expectations)
  Downloading ruamel.yaml-0.17.17-py3-none-any.whl (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 kB[0m [