**Udacity Data Engineering Capstone Project**<br/>
Avraam Marimpis <avraam.marimpis@gmail.com>, October 2020

- - -


# Imports 

In [1]:
import sys
sys.path.append('config/')
sys.path.append('common/')

import config
import data as cnf_data
import aws_dwh
import preprocess_fn
import redshift

In [2]:
import pyspark
import pyspark.sql.functions as fn
import pyspark.sql.types as t

In [3]:
import psycopg2

In [4]:
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Local AWS credentials and settings 

In [5]:
# Load the local data-warehouse settings via the project's aws_dwh helper.
# The connection cell reads dwh['redshift']['host'] and
# dwh['redshift']['db_pass'] from the returned mapping.
dwh = aws_dwh.parse_dwh()

# Connect to Redshift 

In [6]:
# Open the Redshift connection.
# The parameters are passed as keyword arguments instead of being interpolated
# into a DSN string: a host or password containing spaces, quotes or
# backslashes would otherwise be mis-parsed (or rejected) by libpq.
conn = psycopg2.connect(
    host=dwh['redshift']['host'],
    dbname="dev",
    port=5439,
    user="awsuser",
    password=dwh['redshift']['db_pass'],
)

In [7]:
# Cursor shared by the cells below: the quality-check loop executes its
# queries through it, and the clean-up cell closes it.
cur = conn.cursor()

# Run Quality Checks

In [8]:
# Run every data-quality check registered in redshift.Redshift.QualityChecks.
# Each entry maps a name (printed as the table name) to a dict holding the SQL
# under 'query' plus either an 'expected' or a 'not_expected' value, which is
# compared against the first column of the first returned row.
for qc, test in redshift.Redshift.QualityChecks.items():
    print(f"Running QC for table {qc}...", end=" ")

    q = test['query']

    try:
        cur.execute(q)
        result = cur.fetchall()

        if not result:
            # No rows means there is nothing to compare; report it explicitly
            # instead of surfacing an opaque "list index out of range".
            print("FAIL (query returned no rows)")
        else:
            num_records = result[0][0]

            # Reset the verdict on every iteration so that a check without a
            # comparison value can never inherit the previous check's result.
            if "expected" in test:
                test_pass = num_records == test['expected']
            elif "not_expected" in test:
                test_pass = num_records != test['not_expected']
            else:
                test_pass = None

            if test_pass is None:
                print("SKIPPED (no 'expected' or 'not_expected' value defined)")
            elif test_pass:
                print("PASS")
            else:
                print("FAIL")

    except Exception as err:
        # Report the failure and abort the broken transaction so the next
        # check starts from a clean state.
        print(err)
        conn.rollback()
    else:
        # Only commit when the check ran without raising.
        conn.commit()


Running QC for table stage_wildfires... PASS
Running QC for table stage_airquality... PASS
Running QC for table stage_temperatures... PASS
Running QC for table stage_droughts... PASS
Running QC for table wildfires... PASS
Running QC for table temperatures... PASS
Running QC for table fips... PASS
Running QC for table states_abbrv... PASS
Running QC for table states_counties... PASS
Running QC for table droughts... PASS
Running QC for table airquality... PASS
Running QC for table annual_reports... PASS


# Clean up 

In [9]:
# Release the database resources in order: the cursor first, then the
# connection it belongs to.
for resource in (cur, conn):
    resource.close()