# Data Quality

In this notebook we are going to run the following data quality checks on these tables:


In [7]:
# Names of the parquet datasets that the data quality checks iterate over.
tables = [
    'immigration',
    'personal',
    'city',
    'temperature',
]

## All tables have been correctly created

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get or create the Spark session shared by every check in this notebook.
# The hadoop-aws package is requested through spark.jars.packages and Hive
# support is enabled on the builder.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .enableHiveSupport()
    .getOrCreate()
)

In [8]:
# Data quality check: each table can be read back from parquet; print its
# schema so the columns and types can be inspected.
for table in tables:
    table_df = spark.read.parquet(table)
    table_df.printSchema()

root
 |-- immigration_id: double (nullable = true)
 |-- state_code: string (nullable = true)
 |-- arrival_date: timestamp (nullable = true)
 |-- departure_date: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- visa: string (nullable = true)
 |-- tem_id: string (nullable = true)

root
 |-- immigration_id: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- country_from: string (nullable = true)
 |-- biryear: double (nullable = true)

root
 |-- State: string (nullable = true)
 |-- Median_Age: double (nullable = true)
 |-- Male_Population: double (nullable = true)
 |-- Female_Population: double (nullable = true)
 |-- Total_Population: double (nullable = true)
 |-- State_Code: string (nullable = true)
 |-- Male_percentage: double (nullable = true)
 |-- Female_percentage: double (nullable = true)

root
 |-- dt: timestamp (nullable = true)
 |-- AverageTemperature: double (nullable = true)
 |-- Country: string (nullable = true)

## No table is empty

In [9]:
# Data quality check: every table must contain at least one row.
# Raises ValueError at the first empty table; otherwise prints one
# confirmation line per table.
for table in tables:
    num_rows = spark.read.parquet(table).count()
    if num_rows == 0:
        # Build a single formatted message: passing several positional
        # arguments to ValueError makes the error display as a tuple
        # ('ATTENTION!!', '<table>', 'EMPTY') instead of a readable sentence.
        raise ValueError('ATTENTION!! {} EMPTY'.format(table))
    else:
        print('TABLE', table, ' IS FILLED')

TABLE immigration  IS FILLED
TABLE personal  IS FILLED
TABLE city  IS FILLED
TABLE temperature  IS FILLED


## No immigrant ID is duplicated

In [12]:
# Data quality check: immigration_id must be unique in the immigration table.
# Compare the total row count with the row count after dropping rows that
# share the same immigration_id; a larger total means duplicates exist.
immigration_table = spark.read.parquet('immigration')
total_rows = immigration_table.count()
unique_id_rows = immigration_table.dropDuplicates(['immigration_id']).count()

if total_rows > unique_id_rows:
    raise ValueError('Data has duplicates')

print('DATA IS CORRECT')

DATA IS CORRECT
