In [1]:
from dotenv import load_dotenv
import logging
import os

from data_cleaning import DataCleaning
from data_extraction import DataExtractor
from database_utils import DatabaseConnector

# Route all log records (DEBUG and above) to pipeline.log as UTF-8 text.
# The function name in each record is truncated to 40 characters.
logging.basicConfig(
    filename='pipeline.log',
    encoding='utf-8',
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(name)s - %(funcName).40s - %(message)s",
)
logger = logging.getLogger(__name__)

# Load variables from a .env file (if any) into the environment before reading them.
load_dotenv()
CARD_DATA_PDF_PATH = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf'
# os.getenv returns None when the variable is missing; surface that in the log
# instead of letting the empty key flow silently into the request headers.
API_KEY = os.getenv('x-api-key')
if not API_KEY:
    logger.warning("Environment variable 'x-api-key' is not set or empty; API_KEY has no usable value.")
NUMBER_STORES_ENDPOINT_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'
# Base URL for per-store details (note the trailing slash).
STORE_ENDPOINT_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/'


def setup_database(filename):
    """Build a database connector and an engine from a credentials file.

    Args:
        filename: Path handed to ``DatabaseConnector.read_db_creds``.

    Returns:
        A ``(connector, engine)`` tuple: the ``DatabaseConnector`` instance and
        the value returned by its ``init_db_engine`` for the credentials read.
    """
    connector = DatabaseConnector()
    # Credentials are read first, then passed straight on to create the engine.
    return connector, connector.init_db_engine(connector.read_db_creds(filename))

In [2]:
# Write a banner to the log marking the start of a pipeline run.
logger.info('****************************** Starting pipeline ******************************')
# Helper objects from the project modules; later cells reuse `cleaner` and `df_stores`.
db_extractor = DataExtractor()
cleaner = DataCleaning()
# Connector and engine built from the YAML credentials file.
# NOTE(review): neither source_db nor source_engine is used in the cells visible here.
source_db, source_engine = setup_database(filename='config/db_creds.yaml')

# Extract -> Clean store data (no load step happens in this cell)
# The same headers are passed to both extractor calls; API_KEY comes from the
# environment and is None if the variable is unset.
headers = {
    "Content-Type": "application/json",
    "X-API-KEY": API_KEY
}
# The store count returned here is fed to retrieve_stores_data as number_stores.
num_stores = db_extractor.list_number_of_stores(url=NUMBER_STORES_ENDPOINT_URL, headers=headers)
df_stores = db_extractor.retrieve_stores_data(url=STORE_ENDPOINT_URL, headers=headers, number_stores=num_stores)
# Rebinds df_stores to the cleaned result; the raw extract is not kept under another name.
df_stores = cleaner.clean_store_data(df_stores)


In [4]:
# Display only the rows whose country_code equals 'US'.
df_stores.loc[df_stores['country_code'].eq('US')]

Unnamed: 0_level_0,address,longitude,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent,address_2,address_3,address_4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
8,510 Jill Mill,44.27804,Kaukauna,KA-FA7ED3B8,31,2022-09-05,Local,-88.27205,US,America,South Laura FL 38723,,
13,520 Fisher Inlet Suite 594,42.24113,Crystal Lake,CR-792AA8BB,138,2018-04-05,Super Store,-88.3162,US,America,Port Eric IL 71753,,
43,92863 Kathryn Ford Suite 747,42.35843,Boston,BO-17E7B6CE,75,2019-07-28,Super Store,-71.05977,US,America,East Craig DC 24439,,
49,0493 Alicia Station Suite 475,27.71809,Sun City Center,SU-95D20AE9,30,2004-05-23,Local,-82.35176,US,America,Elizabethfort RI 70019,,
52,220 Holt Unions Suite 688,39.45621,Martinsburg,MA-F0E23355,8,2011-07-07,Mall Kiosk,-77.96389,US,America,Ramirezstad IL 74968,,
56,9066 Rosales Port Suite 675,27.09978,Venice,VE-93DA8430,22,2006-02-23,Local,-82.45426,US,America,Meyerville NV 79336,,
69,406 Charles Ramp,36.06523,Porterville,PO-EB96293A,36,2003-04-19,Local,-119.01677,US,America,Port Jessicaview WI 37544,,
77,91374 Stanley Rapid Apt. 083,40.81,Morningside Heights,MO-E8CFF8FE,40,2004-12-19,Local,-73.9625,US,America,South Christian MI 26541,,
83,5608 Jason Falls Apt. 917,35.25064,Searcy,SE-F428A035,34,2012-05-10,Local,-91.73625,US,America,Ericaside ND 55443,,
93,14577 Thomas Station,32.9156,Mira Mesa,MI-20EE1BFA,20,2006-07-07,Local,-117.14392,US,America,North Andrew SD 63439,,


In [43]:
# Scratch work for splitting store addresses into locality / post code parts.
# Everything below is commented out except the final call.
# df_stores['address_locality'] = df_stores.loc[
#     :,
#     'address'
# ].apply(lambda x: x.split(',')[0])

# df_stores['post_code'] = df_stores.loc[
#     :,
#     'address_locality'
# ].apply(lambda x: x.split('\n')[-1])

# df_stores[df_stores['country_code'] == 'GB']  # works
# df_stores[df_stores['country_code'] == 'DE'] # Needs work!!!
# df_stores[df_stores['country_code'] == 'US']


# df_stores.loc[df_stores['country_code'] == 'GB', 'post_code'].unique()



# df_gb_stores = df_stores.loc[df_stores['country_code'] == 'GB', :].copy()
# df_stores['post_code'] = df_stores.loc[
#     :,
#     'address_locality'
# ].apply(lambda x: x.split('\n')[-1])
# df_gb_stores


# gb_address = "Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, High Wycombe"

# NOTE(review): the output recorded for this cell is
#   AttributeError: 'DataCleaning' object has no attribute '_clean_address'
# Confirm the method exists under this name in data_cleaning before re-running;
# a kernel that has not re-imported the module is another possible cause.
# The call also rebinds df_stores, so re-running this cell feeds it its own output.
df_stores = cleaner._clean_address(df_stores)

AttributeError: 'DataCleaning' object has no attribute '_clean_address'

In [36]:


# def gb_address_constituents(address):
#     """
#     Drop the locality that follows the last comma of a GB address, then
#     split the remainder into rows and print the post code (the final row).
#     """
#     locality_removed = address.split(',')[:-1]
#     address = "".join(locality_removed)
#     # print(f"locality removed {address}")
    
#     rows = address.split('\n')
#     print(f"rows {rows}")
    
#     post_code = rows.pop()
#     print(f"post code is {post_code}")
    
#     print(f"rows {rows}")

# gb_address = "Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, High Wycombe"
# gb_address_constituents(gb_address)


rows ['Flat 72W', 'Sally isle', 'East Deantown', 'E7B 8EB']
post code is E7B 8EB
rows ['Flat 72W', 'Sally isle', 'East Deantown']
