# Data Quality Dev

Data Quality tests:
- All Tables: has_rows, i.e. record count > 0 --> passed
- All Table Primary Keys: has_nulls, i.e. zero null record count --> test passed
- Fact Tables: All Foreign Keys: has_nulls, i.e. zero null record count --> test passed

In [1]:
from datetime import datetime, timedelta

import configparser
import os
from pprint import pprint

import pandas as pd, numpy as np
import matplotlib.pyplot as plt

import findspark
findspark.init()
print(findspark.find())
print(os.environ['SPARK_HOME'])
print(os.environ['JAVA_HOME'])
print(os.environ['HADOOP_HOME'])

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, asc, desc, min, max, coalesce, lit, md5
from pyspark.sql.types import *

pd.set_option('display.max_rows', 50)

c:\spark
c:\spark
C:\Program Files\Zulu\zulu-8-jre\
c:\Hadoop


In [2]:
def create_spark_session(local=True):
    """
    Creates and returns spark session.
    """
    spark = SparkSession \
        .builder \
        .enableHiveSupport() \
        .getOrCreate()
    return spark

# create spark session
spark = create_spark_session()

In [3]:
# test definition has_rows

def has_rows(path, df):
    d = tuple()
    # has rows test
    if df is None:
        d = (path, 'has_rows', 'failed', None, None)
    else:
        num_recs = df.count()
        if num_recs == 0:
            d = (path, 'has_rows', 'failed', None, num_recs)
        else:
            d = (path, 'has_rows', 'passed', None, num_recs)
    return d

In [4]:
# test definition has_nulls

def has_nulls(path, df, fields=[]):
    l = []
    d = tuple()
    for field in fields:
        num_recs = df.select(field).where(col(field).isNull()).count()
        if num_recs > 0:
            d = (path, 'has_nulls', 'failed', field, num_recs)
        else:
            d = (path, 'has_nulls', 'passed', field, num_recs)
        l.append(d)

    return l

In [17]:
input_data = {"../../model/i94.parquet":['cit_country_id', 'res_country_id', 'mode_id'], 
              "../../model/visa_categories.csv":['visa_id'],
              "../../model/us_states.csv": ['state_id'],
              "../../model/countries.csv":['country_id'],
              "../../model/temperature.csv":['country_id', 'month']}

In [18]:
has_rows_result = []
has_nulls_result = []

for path, fields in input_data.items():
    print(path)
    
    try:
        if "parquet" in path:
            df_spark = spark.read.parquet(path)
        else:
            df_spark = spark.read.csv(path, header=True, sep=";")       
    except:
        print("issue")
    else:
        has_rows_result.append(has_rows(path, df_spark))
        has_nulls_result.append(has_nulls(path, df_spark, fields))

../../model/i94.parquet
../../model/visa_categories.csv
../../model/us_states.csv
../../model/countries.csv
../../model/temperature.csv


In [19]:
flatten = lambda t: [item for sublist in t for item in sublist]


In [20]:
results = has_rows_result + flatten(has_nulls_result)

In [21]:
dq_schema = StructType([       
    StructField('path', StringType(), True),
    StructField('test', StringType(), True),
    StructField('result', StringType(), True),
    StructField('field', StringType(), True),
    StructField('num_recs', StringType(), True),
])
df_results = spark.createDataFrame(data=results, schema=dq_schema)

In [22]:
df_results.show(truncate=False)

+-------------------------------+---------+------+--------------+--------+
|path                           |test     |result|field         |num_recs|
+-------------------------------+---------+------+--------------+--------+
|../../model/i94.parquet        |has_rows |passed|null          |300     |
|../../model/visa_categories.csv|has_rows |passed|null          |63      |
|../../model/us_states.csv      |has_rows |passed|null          |55      |
|../../model/countries.csv      |has_rows |passed|null          |216     |
|../../model/temperature.csv    |has_rows |passed|null          |1836    |
|../../model/i94.parquet        |has_nulls|passed|cit_country_id|0       |
|../../model/i94.parquet        |has_nulls|passed|res_country_id|0       |
|../../model/i94.parquet        |has_nulls|passed|mode_id       |0       |
|../../model/visa_categories.csv|has_nulls|passed|visa_id       |0       |
|../../model/us_states.csv      |has_nulls|passed|state_id      |0       |
|../../model/countries.cs

In [28]:
# test if result is one of the results is failed
# note 'passed' must be exchanged with 'failed' in production environment
# used 'passed' to simulate the error

In [26]:
failed_count = df_results.select("result").where(col("result")=="passed").count()

In [27]:
if failed_count > 0:
    raise Exception("Data quality test did not pass.")

Exception: Data quality test did not pass.