In [1]:
from dotenv import load_dotenv
import logging
import os

from config import date_times_config, order_config, product_config, store_config, card_config, \
    user_config, valid_months, valid_categories, valid_country_codes, valid_card_providers, endpoint
from data_cleaning import DataCleaning
from data_extraction import DataExtractor
from database_utils import DatabaseConnector

# Send every record at DEBUG level or above to pipeline.log (UTF-8). Each line
# carries the timestamp, level, logger name, the emitting function's name
# (truncated to 40 characters by the `.40s` precision) and the message.
logging.basicConfig(
    filename='pipeline.log',
    encoding='utf-8',
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(name)s - %(funcName).40s - %(message)s",
)
logger = logging.getLogger(__name__)

# Load variables from a .env file into the process environment before reading
# the API key; API_KEY is None when 'x-api-key' is not set.
load_dotenv()
API_KEY = os.environ.get('x-api-key')


def setup_database(filename):
    """Create a database connector and an engine from a credentials file.

    Args:
        filename: Path to the credentials file, handed to
            ``DatabaseConnector.read_db_creds`` (the callers in this notebook
            pass YAML files).

    Returns:
        tuple: ``(connector, engine)`` where ``connector`` is the new
        ``DatabaseConnector`` and ``engine`` is whatever
        ``init_db_engine`` returns for the loaded credentials
        (presumably a SQLAlchemy engine -- TODO confirm).
    """
    connector = DatabaseConnector()
    # Credentials are read first, then used to initialise the engine.
    engine = connector.init_db_engine(connector.read_db_creds(filename))
    return connector, engine


In [3]:
# Build one connector/engine pair per database: rows are read through the
# source engine and uploaded through the target engine.
src_db, src_engine = setup_database(filename='config/db_creds.yaml')
tgt_db, tgt_engine = setup_database(filename='config/db_creds_target.yaml')

# Extract -> Clean -> Load Order data
print("Processing Order Data")
extractor = DataExtractor()
cleaner = DataCleaning()

# Extract: read the whole orders table over an AUTOCOMMIT connection that is
# closed as soon as the read finishes.
with src_engine.execution_options(isolation_level='AUTOCOMMIT').connect() as conn:
    df_orders = extractor.read_rds_table(conn, 'orders_table')
n_extracted = len(df_orders.index)
print(f"Order rows extracted: {n_extracted}")
# Guard against a partial or unexpected extract before spending time cleaning.
assert n_extracted == order_config.extracted_count, (
    f"orders_table: extracted {n_extracted} rows, "
    f"expected {order_config.extracted_count}"
)

# Clean: the cleaned frame replaces df_orders, which later cells rely on.
df_orders = cleaner.clean_order_data(df=df_orders)
n_clean = len(df_orders.index)
print(f"Order rows after cleaning: {n_clean}")
# Guard against the cleaning step dropping or duplicating rows unexpectedly.
assert n_clean == order_config.clean_count, (
    f"orders_table: {n_clean} rows after cleaning, "
    f"expected {order_config.clean_count}"
)

# Load: upload the cleaned orders to the target database.
tgt_db.upload_to_db(tgt_engine, df=df_orders, table_name='orders_table')

Processing Order Data
Order rows extracted: 120123
Order rows after cleaning: 120123


In [8]:
# Show the column names, non-null counts and dtypes of df_orders.
df_orders.info()

# --- Disabled scratch code, kept for reference ---------------------------
# Earlier experiments on the card_number column:
# len(df_orders['card_number'])
# df_orders['card_number'] = df_orders['card_number'].astype(str)
#
# Longest string length in each object-dtype column (presumably used to size
# the text columns in the target table -- TODO confirm):
# import numpy as np
# measurer = np.vectorize(len)
# res2 = measurer(df_orders.select_dtypes(include=[object]).values.astype(str)).max(axis=0)
# res2
#
# Result recorded from a previous run:
# array([36, 36, 19, 12, 11])


<class 'pandas.core.frame.DataFrame'>
Index: 120123 entries, 0 to 120122
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   date_uuid         120123 non-null  object
 1   user_uuid         120123 non-null  object
 2   card_number       120123 non-null  object
 3   store_code        120123 non-null  object
 4   product_code      120123 non-null  object
 5   product_quantity  120123 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 6.0+ MB
