In [1]:
from dotenv import load_dotenv
import logging
import os

from config import ColumnEntries
from data_cleaning import DataCleaning
from data_extraction import DataExtractor
from database_utils import DatabaseConnector

# Route every log record (DEBUG and above) to pipeline.log; funcName is
# truncated to 40 characters by the format string.
logging.basicConfig(
    filename='pipeline.log',
    encoding='utf-8',
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(name)s - %(funcName).40s - %(message)s",
)
logger = logging.getLogger(__name__)

# Must run before os.getenv below so values from a .env file are visible.
load_dotenv()

# --- Data source locations -------------------------------------------------
CARD_DATA_PDF_PATH = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf'
# Read from the environment so the key is never stored in the notebook;
# os.getenv returns None when the variable is not set.
API_KEY = os.getenv('x-api-key')
NUMBER_STORES_ENDPOINT_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'
STORE_ENDPOINT_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/'
PRODUCTS_S3_ADDRESS = 's3://data-handling-public/products.csv'
# Keep S3 addresses free of surrounding whitespace: this value is handed
# verbatim to DataExtractor.extract_from_s3, and a leading space would make
# the string start with ' s3:' instead of the 's3://' scheme.
DATE_TIMES_S3_ADDRESS = 's3://data-handling-public/date_details.json'


def setup_database(filename):
    """Create a database connector and an engine from a credentials file.

    A fresh ``DatabaseConnector`` is instantiated, ``filename`` is passed to
    its ``read_db_creds`` method, and the result of that call is given to
    ``init_db_engine``.

    Args:
        filename: Location of the credentials file, forwarded unchanged to
            ``DatabaseConnector.read_db_creds`` (callers in this notebook
            pass YAML paths).

    Returns:
        A ``(connector, engine)`` tuple: the ``DatabaseConnector`` instance
        and whatever ``init_db_engine`` returned for the loaded credentials.
    """
    connector = DatabaseConnector()
    engine = connector.init_db_engine(connector.read_db_creds(filename))
    return connector, engine


In [2]:
# Build one connector/engine pair per credentials file. Going by the file
# and variable names these are the source and target databases.
src_db, src_engine = setup_database(filename='config/db_creds.yaml')
tgt_db, tgt_engine = setup_database(filename='config/db_creds_target.yaml')

# Plain string literal: there is nothing to interpolate, so no f-prefix.
print("Processing Date & Times Data")
extractor = DataExtractor()
# The cleaner is configured with the 'month' column and the class-level
# DataCleaning.valid_months collection.
# NOTE(review): presumably used to keep only rows whose month is valid -
# confirm in data_cleaning.
cleaner = DataCleaning(
    column_entries=ColumnEntries(column_name='month', entries=DataCleaning.valid_months)
)

# Extract the raw date/time records from S3, then replace them with the
# cleaned frame returned by the cleaner.
df_date_times = extractor.extract_from_s3(s3_address=DATE_TIMES_S3_ADDRESS)
df_date_times = cleaner.clean_date_times_data(df=df_date_times)

Processing Date & Times Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = df['year'] + '-' + df['month'] + '-' + df['day'] + ' ' + df['timestamp']


In [3]:
# Summarise the cleaned frame: index range, per-column non-null counts and dtypes.
df_date_times.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120123 entries, 0 to 120160
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   timestamp    120123 non-null  object
 1   month        120123 non-null  object
 2   year         120123 non-null  object
 3   day          120123 non-null  object
 4   time_period  120123 non-null  object
 5   date_uuid    120123 non-null  object
 6   date         120123 non-null  object
dtypes: object(7)
memory usage: 7.3+ MB


In [21]:
# import pandas as pd
# df_date_times
# df_date_times['year'].unique() - change to int?
# df_date_times['day'].unique() - change to int?
# df_date_times['time_period'].unique() - change to category?
# Let's add a proper date column that combines the year, month, day and timestamp information.
# df_date_times['date'] = df_date_times['year'] + '-' + df_date_times['month'].astype(str) + '-' + df_date_times['day'] + ' ' + df_date_times['timestamp']
# df_date_times['date'] = pd.to_datetime(df_date_times['date'])

In [5]:
# Re-check the frame summary. The cell before this one contains only
# commented-out code, so nothing has modified df_date_times since the
# previous .info() call.
df_date_times.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120123 entries, 0 to 120160
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   timestamp    120123 non-null  object
 1   month        120123 non-null  object
 2   year         120123 non-null  object
 3   day          120123 non-null  object
 4   time_period  120123 non-null  object
 5   date_uuid    120123 non-null  object
 6   date         120123 non-null  object
dtypes: object(7)
memory usage: 7.3+ MB
