In [1]:
import logging
import os

from dotenv import load_dotenv
import numpy as np

from config import endpoints, card, user, store, order, product, date_times, DataType
from data_cleaning import DataCleaning
from data_extraction import DataExtractor
from database_utils import DatabaseConnector

# Send DEBUG-and-above records to pipeline.log (UTF-8). Each record carries the
# timestamp, level, logger name and the emitting function's name (truncated to
# 40 characters) ahead of the message.
logging.basicConfig(
    filename='pipeline.log',
    encoding='utf-8',
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(name)s - %(funcName).40s - %(message)s",
)
logger = logging.getLogger(__name__)

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()


def setup_database(filename):
    """Build a database connector and its engine from a credentials file.

    Args:
        filename: Path of the credentials file, passed unchanged to
            ``DatabaseConnector.read_db_creds``.

    Returns:
        tuple: ``(connector, engine)`` -- the ``DatabaseConnector`` instance and
        whatever its ``init_db_engine`` returns for the credentials read.
    """
    connector = DatabaseConnector()
    # Credentials are read first, then handed straight to the engine factory.
    engine = connector.init_db_engine(connector.read_db_creds(filename))
    return connector, engine


In [2]:
# Connector/engine pairs for the source and target databases, each built from
# its own credentials file.
source_db, source_engine = setup_database(filename='config/db_creds.yaml')
target_db, target_engine = setup_database(filename='config/db_creds_target.yaml')

data_type = card

# Extract from a pdf file -> Clean -> Load Card data
print(f"Processing {data_type.name} Data")

# Extract
data_extractor = DataExtractor()
df_extracted = data_extractor.retrieve_pdf_data(pdf_path=endpoints.card_data)
extracted_rows = len(df_extracted.index)
print(f"{data_type.name} rows extracted: {extracted_rows}")
# Guard against a changed source document: the row count must match the
# expected figure configured on the data type.
assert extracted_rows == data_type.extracted_count, (
    f"{data_type.name}: expected {data_type.extracted_count} extracted rows, "
    f"got {extracted_rows}"
)

# Clean
data_cleaner = DataCleaning(column_entries=data_type.column_entries)
# Clean a copy so df_extracted stays available, untouched, for re-running this step.
df_cleaned = data_cleaner.clean_card_data(df=df_extracted.copy())
cleaned_rows = len(df_cleaned.index)
print(f"{data_type.name} rows after cleaning: {cleaned_rows}")
assert cleaned_rows == data_type.clean_count, (
    f"{data_type.name}: expected {data_type.clean_count} rows after cleaning, "
    f"got {cleaned_rows}"
)


Processing Card Data
Card rows extracted: 15309
Card rows after cleaning: 15284


In [33]:
# Preview the cleaned card frame (last expression of the cell, so it is rendered).
# NOTE(review): the saved output shows full card numbers -- confirm the data is
# synthetic, or clear the output, before sharing the notebook.
df_cleaned



Unnamed: 0,card_number,expiry_date,card_provider,date_payment_confirmed
0,30060773296197,09/26,Diners Club / Carte Blanche,2015-11-25
1,349624180933183,10/23,American Express,2001-06-18
2,3529023891650490,06/23,JCB 16 digit,2000-12-26
3,213142929492281,09/27,JCB 15 digit,2011-02-12
4,502067329974,10/25,Maestro,1997-03-13
...,...,...,...,...
15279,180036921556789,12/28,JCB 15 digit,1997-06-06
15280,180018030448512,11/24,JCB 15 digit,2004-06-16
15281,3569953313547220,04/24,JCB 16 digit,2020-02-05
15282,4444521712606810,06/27,VISA 16 digit,2008-06-16


In [3]:
# Summarise the cleaned frame: column names, non-null counts and dtypes.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15284 entries, 0 to 15283
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   card_number             15284 non-null  object        
 1   expiry_date             15284 non-null  object        
 2   card_provider           15284 non-null  object        
 3   date_payment_confirmed  15284 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 477.8+ KB


In [4]:
# Longest string representation found in each object-dtype column of
# df_cleaned, returned in column order as a NumPy array.
# NOTE(review): presumably used to choose VARCHAR(n) widths for the target
# table -- confirm.
measurer = np.vectorize(len)

# Cast to str first so every cell has a length, whatever Python object it holds.
object_values = df_cleaned.select_dtypes(include=[object]).values.astype(str)
res2 = measurer(object_values).max(axis=0)
res2


array([19,  5, 27])