In [1]:
from dotenv import load_dotenv
import logging
import os

from data_cleaning import DataCleaning
from data_extraction import DataExtractor
from database_utils import DatabaseConnector

# Send every record (DEBUG and above) to pipeline.log. The `.40s` precision on
# funcName truncates long function names to 40 characters in each log line.
logging.basicConfig(
    filename='pipeline.log',
    encoding='utf-8',
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(name)s - %(funcName).40s - %(message)s",
)
logger = logging.getLogger(__name__)

# Load variables from a .env file into the environment before reading API_KEY.
load_dotenv()

# --- Data source locations ---------------------------------------------------
CARD_DATA_PDF_PATH = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf'
# os.getenv returns None when 'x-api-key' is not set; no check is done here.
API_KEY = os.getenv('x-api-key')
NUMBER_STORES_ENDPOINT_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'
STORE_ENDPOINT_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/'
S3_ADDRESS = 's3://data-handling-public/products.csv'
# Leading space removed so the value is a well-formed s3:// URI like S3_ADDRESS.
DATE_TIMES_S3_ADDRESS = 's3://data-handling-public/date_details.json'

def setup_database(filename):
    """Create a database connector and an engine from a credentials file.

    Args:
        filename: Path handed to ``DatabaseConnector.read_db_creds``.

    Returns:
        A ``(connector, engine)`` tuple: the ``DatabaseConnector`` instance and
        whatever its ``init_db_engine`` returns for the credentials read.
    """
    connector = DatabaseConnector()
    credentials = connector.read_db_creds(filename)
    return connector, connector.init_db_engine(credentials)


In [2]:
# One connector/engine pair per credentials file: source first, then target.
src_db, src_engine = setup_database('config/db_creds.yaml')
tgt_db, tgt_engine = setup_database('config/db_creds_target.yaml')

extractor = DataExtractor()
cleaner = DataCleaning()

# Fetch the raw date/time records; nothing is cleaned or uploaded in this cell.
df_date_times = extractor.extract_from_s3(s3_address=DATE_TIMES_S3_ADDRESS)
# TODO(review): the row-count assertion and upload step previously sketched here
# referenced names not defined in the cells above (df_orders, target_db,
# target_engine); write them for df_date_times once it has been cleaned.

In [3]:
# Schema check on the raw frame. In the recorded output every column is loaded
# as `object` dtype with no nulls reported, so nothing has been type-cast yet.
df_date_times.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120161 entries, 0 to 120160
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   timestamp    120161 non-null  object
 1   month        120161 non-null  object
 2   year         120161 non-null  object
 3   day          120161 non-null  object
 4   time_period  120161 non-null  object
 5   date_uuid    120161 non-null  object
dtypes: object(6)
memory usage: 6.4+ MB


In [5]:
# List the distinct raw `month` strings. The recorded output shows '1'..'12'
# plus the literal 'NULL' and 10-character alphanumeric values that are not months.
df_date_times['month'].unique()

array(['9', '2', '4', '11', '12', '8', '1', '3', '7', '10', '5', '6',
       '1YMRDJNU2T', '9GN4VIO5A8', 'NULL', 'NF46JOZMTA', 'LZLLPZ0ZUA',
       'YULO5U0ZAM', 'SAT4V9O2DL', '3ZZ5UCZR5D', 'DGQAH7M1HQ',
       '4FHLELF101', '22JSMNGJCU', 'EB8VJHYZLE', '2VZEREEIKB',
       'K9ZN06ZS1X', '9P3C0WBWTU', 'W6FT760O2B', 'DOIR43VTCM',
       'FA8KD82QH3', '03T414PVFI', 'FNPZFYI489', '67RMH5U2R6',
       'J9VQLERJQO', 'ZRH2YT3FR8', 'GYSATSCN88'], dtype=object)