In [None]:
import pandas as pd
from pathlib import Path
import os
import sqlite3
import hashlib
import datetime

In [None]:
# 'Batch' settings
# ODATE is the batch date of this run; ODATE_MINUS_1 is the day before it.
ODATE = datetime.datetime(year=2022, month=3, day=20)
ODATE_MINUS_1 = ODATE + datetime.timedelta(days=-1)
# Far-future timestamp, referenced by the dimension augment query as the
# effective_end_date of newly inserted rows.
HI_DATE = '9999-12-31 00:00:00'

# Database settings
# File name of the SQLite database, resolved against the data directory.
DBNAME = "DPC_IQ.db"
# Passed to initialise_db as drop_tables: when True, health_fact is dropped
# and recreated on every run.
DROP_INITIAL_DATABASE = True

In [None]:
# Data directory: the "data" folder that sits next to the notebook's working directory.
file_path = Path.cwd() / ".." / "data"

In [None]:
# Open the SQLite database file DBNAME inside the data directory. This single
# connection is shared by every later cell and is closed in the final cell.
conn = sqlite3.connect(file_path / DBNAME)

In [None]:
def execute_query(connection, query, auto_commit=True, params=()):
    """Run a single SQL statement on ``connection``.

    Parameters
    ----------
    connection
        DB-API connection exposing ``cursor()`` and ``commit()``
        (a ``sqlite3`` connection in this notebook).
    query : str
        SQL text. It may contain ``?`` or ``:name`` placeholders, in which
        case the values are taken from ``params``.
    auto_commit : bool, default True
        Commit the connection after the statement has executed. Pass False
        to group several statements into one transaction.
    params : sequence or mapping, default ()
        Values bound to the placeholders in ``query``. Prefer this over
        formatting values into the SQL string.

    Raises
    ------
    Whatever the underlying driver raises for an invalid statement; the
    cursor is closed in that case too and nothing is committed.
    """
    cur = connection.cursor()
    try:
        cur.execute(query, params)
    finally:
        # Release the cursor even when the statement fails.
        cur.close()

    if auto_commit:
        connection.commit()
        
def initialise_db(connection, drop_tables=False):
    """Make sure the ``health_fact`` table exists on ``connection``.

    Parameters
    ----------
    connection
        Database connection handed through to ``execute_query``.
    drop_tables : bool, default False
        When truthy, ``health_fact`` is dropped first so the table is
        recreated empty; otherwise an existing table is left untouched.
    """
    create_health_fact_sql = """
    create table if not exists health_fact (
       health_id INTEGER,
       country_id INTEGER,
       date_taken TEXT,
       nk_key TEXT,
       life_expectancy REAL,
       smoking_prevalence REAL,
       diabetes_prevalence REAL,
       infant_mortality_rate REAL,
       adult_male_mortality_rate REAL,
       adult_female_mortality_rate REAL,
       pollution_mortality_rate REAL,
       hospital_beds REAL
    )
    """

    # Nothing is dropped unless the caller asked for a clean slate.
    tables_to_drop = ("health_fact",) if drop_tables else ()
    for table_name in tables_to_drop:
        execute_query(connection, "DROP TABLE IF EXISTS " + table_name)

    execute_query(connection, create_health_fact_sql)

In [None]:
# Create health_fact; with DROP_INITIAL_DATABASE set, any existing copy is
# dropped first so the load below starts from an empty table.
initialise_db(conn, drop_tables=DROP_INITIAL_DATABASE)

In [None]:
# Locations of the two health extracts inside the data directory.
fact1_csv = file_path / "health-1.csv"
fact2_csv = file_path / "health-2.csv"

In [None]:
# Read each extract into its own DataFrame; row 0 of the file supplies the column names.
fact1_df = pd.read_csv(fact1_csv, header=0)
fact2_df = pd.read_csv(fact2_csv, header=0)

In [None]:
def process_fact(df, connection=None, odate=None, hi_date=None):
    """Stage a health extract and load it into ``health_fact``.

    The load runs in three steps:

    1. ``df`` replaces the ``stg_health_fact`` staging table.
    2. For every distinct staged ``key`` that has no current row in
       ``country_dim`` (``current_indicator = 1``), a placeholder row is
       inserted with 'UNKNOWN' attributes, ``augment_indicator = 1``,
       ``effective_start_date = odate`` and ``effective_end_date = hi_date``.
       New ``country_id`` values continue from the current maximum.
    3. Staged rows are joined to the current dimension row for their key
       and inserted into ``health_fact``; new ``health_id`` values continue
       from the current maximum.

    Steps 2 and 3 share one transaction: if the fact insert fails, the
    placeholder dimension rows are rolled back as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Extract to load. The queries read the columns ``key``,
        ``date_taken`` and the eight health measures named in the insert
        column list.
    connection : optional
        sqlite3 connection to use; defaults to the notebook-level ``conn``.
    odate : datetime-like or str, optional
        Batch date stamped on augmented dimension rows; defaults to
        ``ODATE``.
    hi_date : str, optional
        Open-ended end date stamped on augmented dimension rows; defaults
        to ``HI_DATE``.
    """
    # Fall back to the notebook-level settings only when nothing was passed,
    # so existing `process_fact(df)` calls behave as before.
    if connection is None:
        connection = conn
    if odate is None:
        odate = ODATE
    if hi_date is None:
        hi_date = HI_DATE

    # Render the batch date in the 'YYYY-MM-DD HH:MM:SS' layout used by HI_DATE.
    if hasattr(odate, "strftime"):
        effective_start = odate.strftime("%Y-%m-%d %H:%M:%S")
    else:
        effective_start = str(odate)

    df.to_sql('stg_health_fact', connection, if_exists='replace', index=False)

    # The two dates are bound parameters. Previously the text '{ODATE}' and
    # '{HI_DATE}' sat in a plain (non-f) string and was stored verbatim.
    augment_query = """
    insert into country_dim
    (country_id, key, place_id, country_code, country_name, subregion1_code, subregion1_name, subregion2_name, effective_start_date, effective_end_date, hash_val, current_indicator, augment_indicator)
    select row_number() over () + id_tbl.max_id as country_id,
           augment_key,
           'UNKNOWN',
           'UNKNOWN',
           'UNKNOWN',
           'UNKNOWN',
           'UNKNOWN',
           'UNKNOWN',
           ? as effective_start_date,
           ? as effective_end_date,
           -1,
           1 as current_indicator,
           1 as augment_indicator
    from
    (
        select distinct(key) as augment_key
          from stg_health_fact fact
        where not exists (
         select 1
           from country_dim dim
          where current_indicator = 1
            and dim.key = fact.key
        )
    ) augment
    cross join (select coalesce(max(country_id),0) as max_id from country_dim) id_tbl
    """

    fact_load_query = """
    insert into health_fact
    (health_id, country_id, date_taken, nk_key, life_expectancy, smoking_prevalence, diabetes_prevalence, infant_mortality_rate, adult_male_mortality_rate, adult_female_mortality_rate, pollution_mortality_rate, hospital_beds)
    select row_number() over () + max_id.id as health_id,
           dim.country_id,
           fact.date_taken,
           fact.key,
           fact.life_expectancy,
           fact.smoking_prevalence,
           fact.diabetes_prevalence,
           fact.infant_mortality_rate,
           fact.adult_male_mortality_rate,
           fact.adult_female_mortality_rate,
           fact.pollution_mortality_rate,
           fact.hospital_beds
    from stg_health_fact fact
    inner join country_dim dim
    on fact.key = dim.key
    and dim.current_indicator = 1
    cross join(select coalesce(max(health_id), 0) as id from health_fact) as max_id
    """

    # The connection context manager commits when both statements succeed
    # and rolls back when either raises.
    with connection:
        connection.execute(augment_query, (effective_start, hi_date))
        connection.execute(fact_load_query)

In [None]:
# Load the first extract: stage it, add placeholder dimension rows for
# unknown keys, then insert into health_fact.
process_fact(fact1_df)

In [None]:
# Load the second extract; its health_id values continue from the maximum
# already present in health_fact, so it must run after the first load.
process_fact(fact2_df)

In [None]:
# Release the shared database connection; cells above cannot be re-run
# after this without reconnecting.
conn.close()