In [1]:
# !pip install SQLAlchemy
# !pip install psycopg2
# !pip install pandas
# !pip install PyYAML

In [1]:
from sqlalchemy import create_engine, MetaData, Table, select
from dateutil.parser import parse
import pandas as pd
import numpy as np
import psycopg2
import yaml
import re

In [3]:
# Test code for Psycopg2
# conn = psycopg2.connect(
#     host=ENDPOINT,
#     port=PORT,
#     database=DATABASE,
#     user=USER,
#     password=PASSWORD
# )

In [2]:
yaml_file_path = '../db_creds.yaml'

# Read the YAML file and store its contents in a Python data structure (dictionary)
with open(yaml_file_path, 'r') as file:
    yaml_data = yaml.safe_load(file)


In [3]:
# credentials
DATABASE_TYPE = 'postgresql'
# DBAPI = 'psycopg2'
ENDPOINT = yaml_data['RDS_HOST']
USER = yaml_data['RDS_USER']
PASSWORD = yaml_data['RDS_PASSWORD']
PORT = yaml_data['RDS_PORT']
DATABASE = yaml_data['RDS_DATABASE']

In [4]:
# setup sql engine and connect
sql_engine = create_engine(f"{DATABASE_TYPE}://{USER}:{PASSWORD}@{ENDPOINT}:{PORT}/{DATABASE}")
sql_connection = sql_engine.connect()

# metadata, holds collection of table info, their data types, schema names etc., obtained from here : https://docs.sqlalchemy.org/en/20/core/metadata.html
# pros of MetaData() includes thread safety -> meaning it can handle concurrent tasks from multiple thread (computationally efficient when multiple threads need access to same resource)
metadata = MetaData()
metadata.reflect(sql_engine)
table_names = metadata.tables.keys()

# reflect allows us
names_ = list(table_names)

In [5]:
# read table
users_table = sql_connection.execute(select(Table('legacy_users', metadata, autoload=True, autoload_with=sql_engine)))
headers = users_table.keys()
users_df = pd.DataFrame(users_table.fetchall(), columns=headers)

In [6]:
table_name = 'legacy_users'

In [7]:
def is_alphanumeric(in_str):
    """
    @desc: function to check if the column has alphanumeric entries
    """
    return bool(re.fullmatch(r'^[a-zA-Z0-9_]*$', in_str))

def has_yyyy_mm_dd_format(in_str):
    """
    @desc: function to decide if the a column of a data has date format yyyy-mm-dd
    """
    return bool(re.fullmatch(r'\d{4}-\d{2}-\d{2}', in_str))

def convert_date_to_yyyy_mm_dd(in_column : pd.core.series.Series):
    """
    @desc: function to set the date column with date format yyyy-mm-dd
    """
    in_column = in_column.apply(parse)
    in_column = pd.to_datetime(in_column, infer_datetime_format=True, errors='coerce')
    
    return in_column

In [8]:
# check for nulls or NaNs, alternative is np.unique(users_df.isnull()), or just use df.info()
if users_df.isnull().sum().sum() and users_df.isna().sum().sum():
    raise f"The database : {table_name}, has total {users_df.isnull().sum().sum()} NULL values and {users_df.isna().sum().sum()} NaN values"
else:
    print(f"[usrmsg] No NULLs or NaNs found in {table_name}")

users_df_processed = users_df[~users_df.apply(lambda row: row.astype(str).str.contains('NULL').any(), axis=1)]

# check for data types 
#   -1) always begin with dropping duplicates and storing as a seperate file
users_df_processed = users_df_processed.drop_duplicates()
#   -2) set all columns except index to be of string format
str_convert_dict = {col: 'string' for col in users_df_processed.columns if col not in ['index']}
users_df_processed = users_df_processed.astype(str_convert_dict)
#   -3) remove all entries that are pure alphanumeric
users_df_processed = users_df_processed[~users_df_processed['email_address'].apply(is_alphanumeric)]
#   -4) DoB and join_date should be datetime format and of type yyyy-mm-dd
users_df_processed['date_of_birth'] = convert_date_to_yyyy_mm_dd(users_df_processed['date_of_birth'])
users_df_processed['join_date'] = convert_date_to_yyyy_mm_dd(users_df_processed['join_date'])
#   -5) convert all 'GGB' country code to 'GB'
users_df_processed['country_code'] = users_df_processed['country_code'].str.replace('GGB', 'GB', regex=False)


[usrmsg] No NULLs or NaNs found in legacy_users


  in_column = pd.to_datetime(in_column, infer_datetime_format=True, errors='coerce')
  in_column = pd.to_datetime(in_column, infer_datetime_format=True, errors='coerce')


In [9]:
users_df_processed.head(5)

Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
0,0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0 59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a Lynne terrace McCarthymouth TF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive Joanborough SK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow New Tracy W22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass Hunterborough NN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579


In [17]:
yaml_file_path = '../local_db_creds.yaml'

# Read the YAML file and store its contents in a Python data structure (dictionary)
with open(yaml_file_path, 'r') as file:
    yaml_data = yaml.safe_load(file)

In [21]:
# credentials
DATABASE_TYPE = 'postgresql+psycopg2'
ENDPOINT = yaml_data['LOCAL_HOST']
USER = yaml_data['LOCAL_USER']
PASSWORD = yaml_data['LOCAL_PASSWORD']
PORT = yaml_data['LOCAL_PORT']
DATABASE = yaml_data['LOCAL_DATABASE']

engine = create_engine(f"{DATABASE_TYPE}://{USER}:{PASSWORD}@{ENDPOINT}:{PORT}/{DATABASE}")
connection = engine.connect()
users_df_processed.to_sql("dim_users", engine, if_exists='replace', index=False)

284

In [15]:
users_df_processed

Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
0,0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0 59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a Lynne terrace McCarthymouth TF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive Joanborough SK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow New Tracy W22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass Hunterborough NN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579
...,...,...,...,...,...,...,...,...,...,...,...,...
15315,14913,Stephen,Jenkins,1943-08-09,"Thornton, Carroll and Newman",s.jenkins@smith.com,Studio 41I Jones lodge Oliviaborough E8 3DU,United Kingdom,GB,+44(0)292018946,2016-04-15,2bd3a12f-a92d-4cdd-b99c-fc70572db302
15316,14994,Stephen,Smith,1948-08-20,Robinson-Harris,s.smith@smith.com,530 Young parkway Millsfurt L4G 7NX,United Kingdom,GB,+44(0)1144960977,2020-07-20,d234c04b-c07c-46a5-a902-526f91478ecc
15317,15012,Stephen,Losekann,1940-10-09,Rosenow,s.losekann@smith.com,Viviane-Fritsch-Straße 3/5 15064 Bad Liebenwerda,Germany,DE,02984 08192,2021-03-07,1a0a8b7b-7c17-42d8-a946-8a85d5495651
15318,15269,Stephen,Rivera,1952-06-04,"Taylor, Fry and Jones",s.rivera@smith.com,"660 Ross Falls Suite 357 Anthonymouth, MA 09610",United States,US,239.711.3836,2011-01-03,187fe06e-bd5f-4381-af2f-d7ac37ca7572


In [None]:
print(users_df_processed.iloc[1996], users_df_processed.iloc[1046], users_df_processed.iloc[866])

In [None]:
# check if the date of the column has the format yyyy-mm-dd
# incorrect_date_format_df = filtered_alphnum_df[~filtered_alphnum_df['date_of_birth'].apply(has_yyyy_mm_dd_format)]

In [None]:
def null_n_nans_pass(df_):
    """
    @desc: this function checks if the input dataframe has any NaNs or Nulls
    """
    if df_.isnull().sum().sum() and df_.isna().sum().sum():
        raise f"The database : {table_name}, has total {df_.isnull().sum().sum()} NULL values and {df_.isna().sum().sum()} NaN values"
    else:
        print(f"[usrmsg] No NULLs or NaNs found in {table_name}")
