# pydeequ

Use pydeequ to run several data validation checks on our movie dataset:
1. The `homepage` column is complete (contains no missing values)
2. Every `budget` value is at least 1000 dollars
3. The `id` column contains only unique values

Requirements
* Apache Spark 2.4.6
* Python 3.7.x
* Java 8

In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import IntegerType
import pydeequ

# Start (or reuse) a Spark session configured for pydeequ: the Deequ Maven
# coordinate is added to the session's jar packages and the f2j coordinate is
# listed among the excluded jars.
builder = SparkSession.builder
builder = builder.config("spark.jars.packages", pydeequ.deequ_maven_coord)
builder = builder.config("spark.jars.excludes", pydeequ.f2j_maven_coord)
spark = builder.getOrCreate()

In [72]:
# Read the movie CSV, treating the first row as the header and '"' as the
# quote character.
df = (
    spark.read
    .format('csv')
    .option("header", True)
    .option('quote', '"')
    .load('data/tmdb_5000_movies.csv')
)

# Cast `budget` to an integer column so it can be compared numerically.
df = df.withColumn("budget", df.budget.cast(IntegerType()))

In [73]:
# Print the column names and types of the loaded DataFrame to confirm that
# the `budget` cast took effect.
df.printSchema()

root
 |-- budget: integer (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nullable = true)



In [75]:
from pydeequ.checks import *
from pydeequ.verification import *

# Smallest budget (in dollars) that a movie is expected to have.
MIN_BUDGET = 1000

# Two checks with different severities: completeness and budget constraints
# are reported at Warning level, uniqueness of the id at Error level.
check_warning = Check(spark, CheckLevel.Warning, "Review Check Warning")
check_error = Check(spark, CheckLevel.Error, "Review Check Error")

checkResult = (
    VerificationSuite(spark)
        .onData(df)
        .addCheck(
            check_warning
                .isComplete("homepage")
                # hasMin applies the assertion to the column's minimum value, so
                # "every budget is at least MIN_BUDGET" means minimum >= MIN_BUDGET.
                # (An equality test would only pass when the minimum is exactly 1000.)
                .hasMin("budget", lambda x: x >= MIN_BUDGET)
        )
        .addCheck(
            check_error
                .isUnique("id")
        )
        .run()
)

# Convert the verification result into a Spark DataFrame (one row per
# constraint) and display it.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show()

+--------------------+-----------+------------+--------------------+-----------------+--------------------+
|               check|check_level|check_status|          constraint|constraint_status|  constraint_message|
+--------------------+-----------+------------+--------------------+-----------------+--------------------+
|  Review Check Error|      Error|     Success|UniquenessConstra...|          Success|                    |
+--------------------+-----------+------------+--------------------+-----------------+--------------------+



# Great Expectations

In [5]:
import great_expectations as ge
import pandas as pd

In [6]:
# Load the movie data with pandas and wrap it in a Great Expectations
# PandasDataset so the expect_* methods can be called on `df`.
df = ge.dataset.PandasDataset(pd.read_csv('data/tmdb_5000_movies.csv'))
print(df.columns)

Index(['budget', 'homepage', 'id', 'original_language', 'original_title',
       'overview', 'popularity', 'release_date', 'revenue', 'runtime',
       'status', 'tagline', 'title', 'vote_average', 'vote_count'],
      dtype='object')


In [8]:
# Expect every value in the `id` column to be unique; the returned result
# is displayed as the cell output.
df.expect_column_values_to_be_unique(column="id")

{
  "result": {
    "element_count": 4803,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true
}

In [9]:
# Expect `budget` values to respect a lower bound of 1000; no upper bound
# (max_value) is supplied.
df.expect_column_values_to_be_between(column='budget', min_value=1000)

{
  "result": {
    "element_count": 4803,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 1069,
    "unexpected_percent": 22.25692275661045,
    "unexpected_percent_total": 22.25692275661045,
    "unexpected_percent_nonmissing": 22.25692275661045,
    "partial_unexpected_list": [
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0
    ]
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": false
}

# TensorFlow Data Validation (TFDV)

Example of using TensorFlow Data Validation to:
1. generate descriptive statistics
2. infer a schema
3. check for anomalies

In [3]:
import tensorflow_data_validation as tfdv
import pandas as pd

In [4]:
# Load the movie dataset into a plain pandas DataFrame for use with TFDV.
df = pd.read_csv('data/tmdb_5000_movies.csv')

In [7]:
# Compute descriptive statistics over the DataFrame.
stats = tfdv.generate_statistics_from_dataframe(df)

# Infer a schema from those statistics and render it as a table.
schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'budget',INT,required,,-
'homepage',BYTES,optional,single,-
'id',INT,required,,-
'original_language',STRING,required,,'original_language'
'original_title',BYTES,required,,-
'overview',BYTES,optional,single,-
'popularity',FLOAT,required,,-
'release_date',BYTES,optional,single,-
'revenue',INT,required,,-
'runtime',FLOAT,optional,single,-


In [10]:
# Validate the statistics against the schema and display any anomalies found.
# NOTE(review): `schema` was inferred from these same `stats`, so this presumably
# reports no anomalies; validating statistics from a different dataset or split
# against this schema would be a more meaningful check. Confirm the intent.
anomalies = tfdv.validate_statistics(statistics=stats, schema=schema)
tfdv.display_anomalies(anomalies)