In [0]:
 %pip install great-expectations

In [0]:
from great_expectations.dataset import SparkDFDataset

import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

### Load Data and Initialize Great Expectations

In [0]:
# Read one hour of data (year 2022, month 05, day 01, hour 01) from the source
# table; the date/hour columns are compared against string literals.
# NOTE(review): `TABLE_NAME` is not defined in the visible cells - it must
# already exist in the session before this cell runs.
merged_op = (
  spark.read.table(TABLE_NAME)
  .where(
    (F.col('year') == '2022')
    & (F.col('month') == '05')
    & (F.col('day') == '01')
    & (F.col('hour') == '01')
  )
  # `conv` is cast to integer so the numeric expectations below can use it.
  .withColumn('conv', F.col('conv').cast(IntegerType()))
)

# Wrap the Spark DataFrame so Great Expectations `expect_*` methods can be
# called directly on it.
raw_test_df = SparkDFDataset(merged_op)

### Null test
Test if specific columns have `null` values.

In [0]:
# Expect that the `id` column contains no null values.
null_id_test_result = raw_test_df.expect_column_values_to_not_be_null('id')

In [0]:
# Display the whole result object held in `null_id_test_result`.
null_id_test_result

In [0]:
# Display only the `success` flag reported for this check.
null_id_test_result.success

In [0]:
# Display only the detailed `result` payload reported for this check.
null_id_test_result.result

### Uniqueness test
Test if specific columns have duplicate values.

In [0]:
# Expect that every value in the `id` column is unique (no duplicates).
# This section tests uniqueness, so it must use the uniqueness expectation;
# the not-null expectation is already covered by the null test.
unique_id_test_result = (
  raw_test_df
  .expect_column_values_to_be_unique('id')
)

In [0]:
# Display the whole result object held in `unique_id_test_result`.
unique_id_test_result

In [0]:
# Display only the `success` flag reported for this check.
unique_id_test_result.success

In [0]:
# Display only the detailed `result` payload reported for this check.
unique_id_test_result.result

In [0]:
# Expect that every value in the `country` column is unique.
# NOTE(review): this presumably fails whenever a country appears in more than
# one row - confirm it is meant as an example of a failing expectation.
unique_country_test_result = raw_test_df.expect_column_values_to_be_unique('country')

In [0]:
# Display the whole result object held in `unique_country_test_result`.
unique_country_test_result

In [0]:
# Display only the `success` flag reported for this check.
unique_country_test_result.success

In [0]:
# Display only the detailed `result` payload reported for this check.
unique_country_test_result.result

### Contains test
Test if the values of specific columns all belong to a defined set of values.

In [0]:
# Expect that the values in `game` are members of `GAMES_LIST`.
# NOTE(review): `GAMES_LIST` is not defined in the visible cells - it must
# already exist in the session before this cell runs.
contains_test_result = raw_test_df.expect_column_values_to_be_in_set('game', GAMES_LIST)

In [0]:
# Display the whole result object held in `contains_test_result`.
contains_test_result

In [0]:
# Display only the `success` flag reported for this check.
contains_test_result.success

In [0]:
# Display only the detailed `result` payload reported for this check.
contains_test_result.result

### Contains (mostly) test
Test if at least a defined percentage of the values in specific columns belong to a defined set of values.

In [0]:
# Same membership check as the plain "contains" test, but with `mostly=0.99`
# passed so the expectation is evaluated against a 0.99 threshold.
# NOTE(review): `GAMES_LIST` is not defined in the visible cells - it must
# already exist in the session before this cell runs.
contains_mostly_test_result = raw_test_df.expect_column_values_to_be_in_set(
  'game',
  GAMES_LIST,
  mostly=0.99,
)

In [0]:
# Display the whole result object held in `contains_mostly_test_result`.
contains_mostly_test_result

In [0]:
# Display only the `success` flag reported for this check.
contains_mostly_test_result.success

In [0]:
# Display only the detailed `result` payload reported for this check.
contains_mostly_test_result.result

### Column Mean value between test
Test if a specific numeric column's mean value is within a defined range of values.

In [0]:
# Expect that the mean of the `conv` column lies between 0.2 and 0.3.
# NOTE(review): the variable name mentions "click rate" while the column
# checked is `conv` - confirm which metric is intended.
click_rate_average_test = raw_test_df.expect_column_mean_to_be_between(
  'conv',
  min_value=0.2,
  max_value=0.3,
)

In [0]:
# Display the whole result object held in `click_rate_average_test`.
click_rate_average_test

In [0]:
# Display only the `success` flag reported for this check.
click_rate_average_test.success

In [0]:
# Display only the detailed `result` payload reported for this check.
click_rate_average_test.result