In [None]:
import pandas as pd
from pathlib import Path
import os
import sqlite3
import hashlib
import datetime

#### Program variables

In [None]:
# Business date of this load; written as effective_start_date on newly inserted rows.
ODATE = datetime.datetime(year=2022, month=3, day=20)
# The day before ODATE; written as effective_end_date on rows that get expired.
ODATE_MINUS_1 = ODATE + datetime.timedelta(days=-1)

# File name of the SQLite database inside the data directory.
DBNAME = "DPC_IQ.db"

# Passed to initialise_db(): when True, country_dim is dropped and re-created.
DROP_INITIAL_DATABASE = False

### Connect to SQLite

In [None]:
# Data directory: the "data" folder next to the notebook's working directory.
file_path = Path.cwd() / ".." / "data"

In [None]:
# Open the SQLite database file DBNAME located in the data directory.
conn = sqlite3.connect(database=file_path / DBNAME)

In [None]:
def execute_query(connection, query, params=()):
    """Run a single SQL statement on ``connection`` and discard any result.

    Parameters
    ----------
    connection : sqlite3.Connection
        Open connection the statement is executed on.
    query : str
        SQL text; may contain ``?`` or ``:name`` placeholders.
    params : sequence or mapping, optional
        Values bound to the placeholders in ``query``.  Defaults to no
        parameters, which matches the original two-argument call.

    Returns
    -------
    None
        The statement is executed for its side effects only; the caller is
        responsible for committing.
    """
    cur = connection.cursor()
    try:
        cur.execute(query, params)
    finally:
        # Release the cursor even when the statement raises.
        cur.close()

def initialise_db(connection, drop_tables=False):
    """Make sure the ``country_dim`` table exists on ``connection``.

    When ``drop_tables`` is true, ``country_dim`` is dropped first, so the
    table is re-created empty.  Nothing is returned.
    """
    tables_to_drop = ["country_dim"] if drop_tables else []
    for table_name in tables_to_drop:
        execute_query(connection, f"DROP TABLE IF EXISTS {table_name}")

    # DDL for the dimension table; "if not exists" makes repeated runs a no-op.
    create_country_dim_sql = """
    create table if not exists country_dim (
        country_id INTEGER,
        key TEXT,
        place_id TEXT,
        country_code TEXT,
        country_name TEXT,
        subregion1_code TEXT,
        subregion1_name TEXT,
        subregion2_name TEXT,
        effective_start_date DATE,
        effective_end_date DATE,
        hash_val TEXT,
        current_indicator INTEGER
    )    
    """

    execute_query(connection, create_country_dim_sql)

### Some pandas helper functions

In [None]:
def add_hash(df, sep=""):
    """Add a per-row MD5 fingerprint to ``df`` in place.

    Every column value is rendered as text and the pieces of each row are
    joined with ``sep``.  The joined text is stored in the ``__str`` column
    and its MD5 hex digest in the ``hash_val`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fingerprint.  It is modified in place.
    sep : str, default ""
        Text placed between column values before hashing.  The default is
        plain concatenation, so hashes stay comparable with values that were
        computed without a separator.  With an empty separator, rows such as
        ("ab", "c") and ("a", "bc") produce the same hash; pass a separator
        that cannot occur in the data to rule that out.

    Returns
    -------
    None
    """
    # Drop any columns left by an earlier call so they are not hashed
    # themselves when this is run more than once.
    df.drop(['__str', 'hash_val'], axis=1, errors='ignore', inplace=True)

    # map(str, ...) keeps the join safe even if a cell is not already a str.
    cell_text = df.astype(str).to_numpy(dtype=object)
    row_text = [sep.join(map(str, row)) for row in cell_text]

    df['__str'] = row_text
    df['hash_val'] = [hashlib.md5(text.encode('utf-8')).hexdigest() for text in row_text]

# main()

In [None]:
# Create country_dim if it is missing; it is dropped first only when DROP_INITIAL_DATABASE is True.
initialise_db(conn, drop_tables=DROP_INITIAL_DATABASE)

In [None]:
# Key and hash of the dimension rows flagged as current, indexed by key so
# they can be joined onto the incoming data.
current_rows_sql = "select key, hash_val from country_dim where current_indicator='1'"
customer_dimension = pd.read_sql_query(current_rows_sql, conn).set_index('key')

print(customer_dimension.shape)

In [None]:
# Display the key -> hash_val lookup of current dimension rows.
customer_dimension

In [None]:
# Location of the incoming reference file inside the data directory.
ref = file_path.joinpath("reference.csv")

In [None]:
# By default pandas parses literal strings such as "NA" and "NULL" as missing
# values.  "NA" is also the ISO 3166-1 alpha-2 code for Namibia, so in a
# country reference file that default can blank out a real key/country_code.
# Only empty fields are treated as missing here.
# NOTE(review): assumes reference.csv marks missing values with empty fields — confirm.
df = pd.read_csv(
    filepath_or_buffer=ref,
    header=0,
    keep_default_na=False,
    na_values=[""],
)

In [None]:
# Adds the '__str' and 'hash_val' columns to df in place.
add_hash(df)

In [None]:
# Show the first record, including the hash columns that were just added.
df.iloc[0]

### Check for duplicate 'key' values

In [None]:
# Compare on 'key' only: rows that share a key but differ elsewhere are still
# duplicate keys.  keep=False flags every occurrence, not just the repeats.
duplicate_keys_df = df[df.duplicated(subset=['key'], keep=False)]

In [None]:
# Report how many rows were collected in duplicate_keys_df.
print(f"Found {duplicate_keys_df.shape[0]} records that are duplicates")

#### Show the duplicates (for debug)

In [None]:
# Rows whose 'key' already appeared earlier in the file (the first occurrence is not listed).
df[df.duplicated(subset=['key'], keep='first')]

#### Drop the duplicates

In [None]:
# Keep one row per 'key' (the first seen).  Comparing whole rows would let two
# rows with the same key but different content through, and both would then be
# loaded as current rows for that key.
df.drop_duplicates(subset=['key'], keep='first', inplace=True)

In [None]:
# Left-join the current dimension hashes onto the incoming rows by 'key'.
# The dimension's hash arrives as 'hash_val_existing'; it is null for keys
# that have no current row in the dimension.
customer_load_pre = df.join(customer_dimension, how='left', on=['key'], rsuffix='_existing')

In [None]:
# create insert: incoming rows with no current row in the dimension
customer_dim_insert = (
    customer_load_pre[customer_load_pre['hash_val_existing'].isnull()]
    .drop(columns=['hash_val_existing'])
)

In [None]:
# Incoming rows whose key already has a current row in the dimension.
customer_matched = customer_load_pre[customer_load_pre['hash_val_existing'].notnull()]

In [None]:
# create update: matched rows whose content hash differs from the stored one
customer_matched = customer_load_pre.loc[customer_load_pre['hash_val_existing'].notnull()]
customer_dim_upsert = customer_matched.loc[
    customer_matched['hash_val'] != customer_matched['hash_val_existing']
]

In [None]:
# Write both sets to staging tables for the SQL steps that follow.
# if_exists='replace' recreates each staging table on every run.
customer_dim_insert.to_sql('stg_country_dim_insert', conn, if_exists='replace')
customer_dim_upsert.to_sql('stg_country_dim_upsert', conn, if_exists='replace')

In [None]:
# Expire the current dimension row of every key present in
# stg_country_dim_upsert: its effective_end_date becomes ODATE_MINUS_1 and
# current_indicator is set to 0.
update_query = f"""
update country_dim
   set effective_end_date = '{ODATE_MINUS_1}',
       current_indicator = 0
 where current_indicator = 1
   and exists (
         select 1 
           from stg_country_dim_upsert
          where key = country_dim.key
          );
"""
execute_query(conn, update_query)
# NOTE(review): this commit makes the expiry durable before the replacement
# rows are inserted by a later statement; if that insert fails, the expired
# keys are left without a current row.
conn.commit()

In [None]:
# Insert the staged rows as new current dimension rows.
# - Source rows are the union of stg_country_dim_insert and stg_country_dim_upsert.
# - country_id is row_number() plus the largest country_id already in
#   country_dim (0 when the table is empty).
# - effective_start_date is ODATE, effective_end_date is the fixed
#   '9999-12-31 00:00:00' value, and current_indicator is 1.
insert_query = f"""
insert into country_dim
(country_id, key, place_id, country_code, country_name, subregion1_code, subregion1_name, subregion2_name, effective_start_date, effective_end_date, hash_val, current_indicator)
select row_number() over () + id_tbl.min_id as country_id,
       stg.key,
       stg.place_id,
       stg.country_code,
       stg.country_name,
       stg.subregion1_code,
       stg.subregion1_name,
       stg.subregion2_name,
       '{ODATE}' as 'effective_start_date',
       '9999-12-31 00:00:00' as 'effective_end_date',
       stg.hash_val,
       1 as current_indicator
from (
select key,
       place_id,
       country_code,
       country_name,
       subregion1_code,
       subregion1_name,
       subregion2_name,
       hash_val
 from stg_country_dim_insert
union
select key,
       place_id,
       country_code,
       country_name,
       subregion1_code,
       subregion1_name,
       subregion2_name,
       hash_val
  from stg_country_dim_upsert
     ) stg
cross join (select coalesce(max(country_id),0) as min_id from country_dim) id_tbl
"""

execute_query(conn, insert_query)
# Persist the newly inserted rows.
conn.commit()

In [None]:
# Marker that the load cells above have run.
print("Done.")

In [None]:
# Final commit before the connection is closed.
conn.commit()

In [None]:
# Release the SQLite connection; conn cannot be used after this cell.
conn.close()