In [37]:
from dotenv import load_dotenv
import logging
import os

from data_cleaning import DataCleaning
from data_extraction import DataExtractor
from database_utils import DatabaseConnector

# Route every record (DEBUG and above) to pipeline.log, written as UTF-8.
logging.basicConfig(filename='pipeline.log', encoding='utf-8', level=logging.DEBUG,
                    format="%(asctime)s [%(levelname)s] %(name)s - %(funcName).40s - %(message)s",)
logger = logging.getLogger(__name__)

# load_dotenv() must run before os.getenv() below so that values defined in a
# .env file are visible in the process environment.
load_dotenv()
CARD_DATA_PDF_PATH = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf'
# The API key is read from the environment variable named 'x-api-key'
# instead of being hardcoded in the notebook.
API_KEY = os.getenv('x-api-key')
if not API_KEY:
    # os.getenv returns None for an unset variable; surface that here rather
    # than letting it flow silently into the X-API-KEY request header.
    logger.warning("Environment variable 'x-api-key' is not set or empty; "
                   "requests to the store API will not carry a valid key")
NUMBER_STORES_ENDPOINT_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'
STORE_ENDPOINT_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/'


def setup_database(filename):
    """Build a database connector and an engine from a credentials file.

    Args:
        filename: Path of the credentials file handed to
            ``DatabaseConnector.read_db_creds``.

    Returns:
        A ``(connector, engine)`` tuple: the ``DatabaseConnector`` instance and
        whatever ``init_db_engine`` returns for the credentials that were read.
    """
    connector = DatabaseConnector()
    # Read the credentials and feed them straight into the engine factory.
    engine = connector.init_db_engine(connector.read_db_creds(filename))
    return connector, engine

In [38]:
logger.info('****************************** Starting pipeline ******************************')
db_extractor = DataExtractor()
cleaner = DataCleaning()
source_db, source_engine = setup_database(filename='config/db_creds.yaml')

# Extract -> Clean store data (fetched from the stores API endpoints)
headers = {
    "Content-Type": "application/json",
    "X-API-KEY": API_KEY
}
num_stores = db_extractor.list_number_of_stores(url=NUMBER_STORES_ENDPOINT_URL, headers=headers)
# Keep the raw extract under its own name so it stays available for inspection
# after cleaning; `df_stores` remains the cleaned frame used by later cells.
df_stores_raw = db_extractor.retrieve_stores_data(url=STORE_ENDPOINT_URL, headers=headers, number_stores=num_stores)
df_stores = cleaner.clean_store_data(df_stores_raw)


In [39]:
# Summarise the cleaned stores frame: index range, per-column non-null counts and dtypes.
# NOTE(review): in the recorded output `lat` has 1 non-null value and `latitude`
# has 440 of 441 -- both columns need a closer look.
df_stores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 441 entries, 0 to 450
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   address        441 non-null    object        
 1   longitude      441 non-null    object        
 2   lat            1 non-null      object        
 3   locality       441 non-null    object        
 4   store_code     441 non-null    object        
 5   staff_numbers  441 non-null    uint16        
 6   opening_date   441 non-null    datetime64[ns]
 7   store_type     441 non-null    object        
 8   latitude       440 non-null    object        
 9   country_code   441 non-null    category      
 10  continent      441 non-null    category      
dtypes: category(2), datetime64[ns](1), object(7), uint16(1)
memory usage: 30.4+ KB


In [31]:
# copy_of_df_stores = df_stores.copy()

# df_stores

# NOTE: the row selected below (address == 'N/A') is a valid entry -- keep it.
# df_stores.loc[df_stores['address'] == 'N/A', :] 

# TODO: investigate the missing value in `latitude`, and drop the `lat` column.
# df_stores[df_stores['latitude'].isna()]
# df_stores

Unnamed: 0_level_0,address,longitude,lat,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,,,,,WEB-1388012W,325,2010-06-12,Web Portal,,GB,Europe
