In [1]:
from dotenv import load_dotenv
import logging
import os

from data_cleaning import DataCleaning
from data_extraction import DataExtractor
from database_utils import DatabaseConnector

# Route every record (DEBUG and above) to pipeline.log, written as UTF-8.
# The ".40s" precision caps the function name at 40 characters per line.
logging.basicConfig(
    filename='pipeline.log',
    encoding='utf-8',
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(name)s - %(funcName).40s - %(message)s",
)
# Logger named after the namespace this code runs in.
logger = logging.getLogger(__name__)

# load_dotenv() has to run before the environment lookup just below.
load_dotenv()

# Evaluates to None when 'x-api-key' is absent from the environment.
# Presumably the key for the store endpoints below — confirm against the extractor.
API_KEY = os.environ.get('x-api-key')

# Locations of the remote data sources used by the pipeline.
CARD_DATA_PDF_PATH = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf'
S3_ADDRESS = 's3://data-handling-public/products.csv'
NUMBER_STORES_ENDPOINT_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'
STORE_ENDPOINT_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/'

def setup_database(filename):
    """Build a database connector and an engine from a credentials file.

    Args:
        filename: Location of the credentials file; handed unchanged to
            ``DatabaseConnector.read_db_creds``.

    Returns:
        A ``(connector, engine)`` tuple: the ``DatabaseConnector`` instance
        and whatever its ``init_db_engine`` returns for the loaded credentials.
    """
    connector = DatabaseConnector()
    credentials = connector.read_db_creds(filename)
    return connector, connector.init_db_engine(credentials)


In [2]:
logger.info('****************************** Starting pipeline ******************************')

# One connector/engine pair per credentials file (source and target).
src_db, source_engine = setup_database('config/db_creds.yaml')
tgt_db, tgt_engine = setup_database('config/db_creds_target.yaml')

extractor = DataExtractor()
cleaner = DataCleaning()

# Fetch the products data from S3, then pass it through the cleaner.
# NOTE(review): the saved output of this cell shows a pandas
# SettingWithCopyWarning raised from the cleaning code
# (df['category'] = ...astype('category')); the fix belongs in data_cleaning.
df_products = extractor.extract_from_s3(s3_address=S3_ADDRESS)
df_products = cleaner.clean_product_data(df_products)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'] = df['category'].astype('category')


In [5]:
# Remove the pound sign from every price string using the vectorised .str
# accessor. regex=False makes '£' a plain literal match. Missing values stay
# NaN rather than raising AttributeError; note that any other non-string
# element also comes back as NaN.
# The column still holds strings afterwards, and re-running the cell changes
# nothing because there is no symbol left to replace.
df_products['product_price'] = df_products['product_price'].str.replace('£', '', regex=False)


In [3]:
# Show the distinct values of the product_price column to check the price format.
# NOTE(review): this cell's saved output still shows '£' prefixes, and its
# execution count (3) is lower than the symbol-stripping cell's (5), so the
# cells were run out of order — restart the kernel and run all to refresh.
df_products['product_price'].unique()

array(['£39.99', '£12.99', '£7.00', '£30.00', '£89.99', '£24.99',
       '£22.00', '£20.00', '£16.99', '£25.00', '£45.00', '£18.99',
       '£29.99', '£15.99', '£19.99', '£11.99', '£14.99', '£32.99',
       '£120.00', '£10.00', '£21.99', '£13.99', '£30.99', '£270.00',
       '£40.99', '£42.99', '£71.99', '£34.99', '£52.99', '£13.00',
       '£2.25', '£3.00', '£3.50', '£4.00', '£26.99', '£17.99', '£31.99',
       '£44.99', '£27.99', '£33.49', '£23.49', '£15.00', '£22.99',
       '£6.99', '£12.00', '£5.99', '£3.99', '£50.00', '£49.99', '£36.49',
       '£1.50', '£2.50', '£8.00', '£2.99', '£4.49', '£5.49', '£4.35',
       '£2.75', '£3.69', '£6.49', '£9.49', '£7.99', '£1.29', '£2.19',
       '£2.69', '£0.39', '£2.00', '£1.69', '£1.39', '£1.99', '£3.25',
       '£4.29', '£4.69', '£3.49', '£10.99', '£1.15', '£35.00', '£38.00',
       '£18.00', '£32.00', '£14.00', '£6.00', '£17.00', '£16.00', '£5.00',
       '£40.00', '£65.00', '£70.00', '£80.00', '£60.00', '£99.00',
       '£110.00', '£100.0

In [4]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
df_products.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1846 entries, 0 to 1852
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   product_name   1846 non-null   object        
 1   product_price  1846 non-null   object        
 2   weight         1846 non-null   float64       
 3   category       1846 non-null   category      
 4   EAN            1846 non-null   object        
 5   date_added     1846 non-null   datetime64[ns]
 6   uuid           1846 non-null   object        
 7   removed        1846 non-null   category      
 8   product_code   1846 non-null   object        
dtypes: category(2), datetime64[ns](1), float64(1), object(5)
memory usage: 108.6+ KB
