In [1]:
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
from data_cleaning import DataCleaning

# Source side: connector built from the AWS credentials file (used for extraction).
aws_connector = DatabaseConnector(creds_file="aws_db_creds.yaml")

# Helper objects that perform the extraction and cleaning steps.
data_extractor = DataExtractor()
data_cleaner = DataCleaning()

# Target side: connector built from the local credentials file (used for uploading).
local_connector = DatabaseConnector(creds_file="local_db_creds.yaml")

## Task 3

In [4]:
# Step 1: Extract data from AWS RDS
table_names_list = aws_connector.list_db_tables()  # Discover the available tables
print("List of table names:", table_names_list)

# Select the users table by name instead of by list position, so the cell keeps
# working if the database returns its tables in a different order.
user_table_name = "legacy_users"
if user_table_name not in table_names_list:
    raise ValueError(
        f"Table {user_table_name!r} not found; available tables: {table_names_list}"
    )
user_df = data_extractor.read_rds_table(aws_connector, user_table_name)

# Step 2: Clean data
cleaned_user_df = data_cleaner.clean_user_data(user_df)
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
cleaned_user_df.info()

# Step 3: Use local connector for uploading
local_connector.upload_to_db(cleaned_user_df, "dim_users")
print("Uploaded cleaned user data to the local sales_data database.")

List of table names: ['legacy_store_details', 'dim_card_details', 'legacy_users', 'orders_table']
Initial number of rows: 15320
Rows after replacing 'NULL' strings: 15320
Rows after dropping NULLs: 15299
Rows after converting 'join_date': 15299
Final rows after date cleaning in user data: 15284
<class 'pandas.core.frame.DataFrame'>
Index: 15284 entries, 0 to 15319
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   index          15284 non-null  int64         
 1   first_name     15284 non-null  object        
 2   last_name      15284 non-null  object        
 3   date_of_birth  15284 non-null  object        
 4   company        15284 non-null  object        
 5   email_address  15284 non-null  object        
 6   address        15284 non-null  object        
 7   country        15284 non-null  object        
 8   country_code   15284 non-null  object        
 9   phone_number   15284 non-null  obj

## Task 4

In [5]:
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
from data_cleaning import DataCleaning

# Set up the helpers and the local connector used for uploading
data_extractor = DataExtractor()
data_cleaner = DataCleaning()
local_connector = DatabaseConnector(creds_file="local_db_creds.yaml")

# Step 1: Extract card data from the PDF document
pdf_link = "https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf"
card_df = data_extractor.retrieve_pdf_data(pdf_link)

# Step 2: Clean card data
cleaned_card_df = data_cleaner.clean_card_data(card_df)
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
cleaned_card_df.info()

# Step 3: Upload the cleaned data with the local connector
local_connector.upload_to_db(cleaned_card_df, "dim_card_details")
print("Uploaded cleaned card data to the local sales_data database.")

Initial number of rows: 15309
Rows after replacing 'NULL' strings: 15309
Rows after dropping NULLs: 15298
Rows after removing duplicates: 15298
Rows after removing non-numeric card numbers: 15284
Final rows after date cleaning: 15284
<class 'pandas.core.frame.DataFrame'>
Index: 15284 entries, 0 to 15308
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   card_number             15284 non-null  object        
 1   expiry_date             15284 non-null  object        
 2   card_provider           15284 non-null  object        
 3   date_payment_confirmed  15284 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 597.0+ KB
None
Uploaded cleaned card data to the local sales_data database.


## Task 5

In [1]:
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
from data_cleaning import DataCleaning
import logging

try:
    # Initialize classes
    data_extractor = DataExtractor()
    data_cleaner = DataCleaning()
    local_connector = DatabaseConnector(creds_file="local_db_creds.yaml")

    # Step 1: Retrieve the number of stores
    number_of_stores_endpoint = (
        "https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores"
    )
    num_stores = data_extractor.list_number_of_stores(number_of_stores_endpoint)
    print(f"Number of stores to retrieve: {num_stores}")

    # Step 2: Retrieve all stores data
    retrieve_store_endpoint = (
        "https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details"
    )
    stores_df = data_extractor.retrieve_stores_data(
        retrieve_store_endpoint, num_stores
    )
    print(f"Retrieved {len(stores_df)} stores from the API.")

    # Step 3: Clean the store data
    cleaned_stores_df = data_cleaner.clean_store_data(stores_df)
    print(f"Cleaned store data has {len(cleaned_stores_df)} rows.")
    # DataFrame.info() prints its own summary and returns None, so it is called
    # directly; wrapping it in print() would emit an extra "None" line.
    cleaned_stores_df.info()

    # Step 4: Upload the cleaned data to the database
    local_connector.upload_to_db(cleaned_stores_df, "dim_store_details")
    print("Uploaded cleaned store data to the local sales_data database.")

except Exception as e:
    # logging.exception records the full traceback alongside the message.
    # Re-raise so the cell is marked as failed instead of silently appearing
    # to succeed, which would hide a broken pipeline on "Run All".
    logging.exception("Error in main workflow: %s", e)
    raise

Number of stores to retrieve: 451
Retrieved 451 stores from the API.
Initial rows: 451
Rows after replacing invalid strings with pd.NA: 451
Rows after dropping NULLs: 447
Rows after staff number cleanup: 447
Rows after converting opening_date column into a datetime data type: 440
Store 0 reintegrated after cleaning.
Final rows after date validation: 441
Cleaned store data has 441 rows.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   index          441 non-null    int64 
 1   address        440 non-null    object
 2   longitude      440 non-null    object
 3   lat            0 non-null      object
 4   locality       440 non-null    object
 5   store_code     441 non-null    object
 6   staff_numbers  441 non-null    object
 7   opening_date   441 non-null    object
 8   store_type     441 non-null    object
 9   latitude       440 non-null 

## Task 6

In [1]:
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
from data_cleaning import DataCleaning

# Each pipeline stage gets its own name so intermediate frames stay inspectable
# and no variable silently changes meaning part-way through the cell.
# `products_df` still ends up bound to the fully cleaned frame.

# Step 1: Extract data from S3
data_extractor = DataExtractor()
s3_address = "s3://data-handling-public/products.csv"
products_raw_df = data_extractor.extract_from_s3(s3_address)

# Step 2: Convert product weights unit to kg
data_cleaner = DataCleaning()
products_kg_df = data_cleaner.convert_product_weights(products_raw_df)

# Step 3: Clean products data
products_df = data_cleaner.clean_products_data(products_kg_df)
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
products_df.info()

# Step 4: Upload to database
local_connector = DatabaseConnector(creds_file="local_db_creds.yaml")
local_connector.upload_to_db(products_df, "dim_products")

print("Uploaded cleaned products data to the local sales_data database.")

Initial rows: 1853
Rows after replacing invalid strings: 1853
Row 1779: Set weight to 0 (previously NULL) as all the other values are valid.
Rows after dropping NULLs: 1846
Rows after converting weights to kg: 1846
Final rows after cleaning: 1846
<class 'pandas.core.frame.DataFrame'>
Index: 1846 entries, 0 to 1852
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     1846 non-null   int64 
 1   product_name   1846 non-null   object
 2   product_price  1846 non-null   object
 3   weight         1846 non-null   object
 4   category       1846 non-null   object
 5   EAN            1846 non-null   object
 6   date_added     1846 non-null   object
 7   uuid           1846 non-null   object
 8   removed        1846 non-null   object
 9   product_code   1846 non-null   object
dtypes: int64(1), object(9)
memory usage: 158.6+ KB
None
Uploaded cleaned products data to the local sales_data database.


## Task 7

### Step 1: List All Tables in the Database

In [1]:
from database_utils import DatabaseConnector

# Build the connector for the AWS RDS source from its credentials file
aws_connector = DatabaseConnector(creds_file="aws_db_creds.yaml")

# Ask the connector for the available table names and show them
table_names = aws_connector.list_db_tables()
print("List of table names:", table_names)

List of table names: ['legacy_store_details', 'dim_card_details', 'legacy_users', 'orders_table']


### Step 2: Extract Orders Data

In [2]:
from data_extraction import DataExtractor

# Initialize the data extractor
data_extractor = DataExtractor()

# Extract the orders data (uses the aws_connector created in the previous cell)
orders_df = data_extractor.read_rds_table(aws_connector, "orders_table")
print("Orders data extracted successfully.")
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
orders_df.info()

Orders data extracted successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120123 entries, 0 to 120122
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   level_0           120123 non-null  int64  
 1   index             120123 non-null  int64  
 2   date_uuid         120123 non-null  object 
 3   first_name        15284 non-null   object 
 4   last_name         15284 non-null   object 
 5   user_uuid         120123 non-null  object 
 6   card_number       120123 non-null  int64  
 7   store_code        120123 non-null  object 
 8   product_code      120123 non-null  object 
 9   1                 0 non-null       float64
 10  product_quantity  120123 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 10.1+ MB
None


### Steps 3 & 4: Clean Orders Data and Upload Cleaned Data to Database

In [5]:
from data_cleaning import DataCleaning

# Initialize the data cleaner
data_cleaner = DataCleaning()

# Clean the orders data
cleaned_orders_df = data_cleaner.clean_orders_data(orders_df)
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
cleaned_orders_df.info()

# Initialize the local database connector
local_connector = DatabaseConnector(creds_file="local_db_creds.yaml")

# Upload the cleaned data to the local database
local_connector.upload_to_db(cleaned_orders_df, "orders_table")
print("Uploaded cleaned orders data to the local sales_data database.")

Rows after cleaning: 120123
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120123 entries, 0 to 120122
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   level_0           120123 non-null  int64 
 1   index             120123 non-null  int64 
 2   date_uuid         120123 non-null  object
 3   user_uuid         120123 non-null  object
 4   card_number       120123 non-null  int64 
 5   store_code        120123 non-null  object
 6   product_code      120123 non-null  object
 7   product_quantity  120123 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 7.3+ MB
None
Uploaded cleaned orders data to the local sales_data database.


## Task 8

In [2]:
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
from data_cleaning import DataCleaning

# Step 1: Extract JSON data from S3
data_extractor = DataExtractor()
s3_url = "https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json"
date_times_df = data_extractor.extract_json_from_s3(s3_url)

# Only clean and upload when the extraction actually produced rows.
if not date_times_df.empty:

    # Step 2: Clean the date times data
    data_cleaner = DataCleaning()
    cleaned_date_times_df = data_cleaner.clean_date_times_data(date_times_df)
    # Summarise the *cleaned* frame (the one that gets uploaded), not the raw
    # extract. DataFrame.info() prints its own summary and returns None, so it
    # is called directly rather than wrapped in print().
    cleaned_date_times_df.info()

    # Step 3: Upload cleaned data to the local database
    local_connector = DatabaseConnector(creds_file="local_db_creds.yaml")
    local_connector.upload_to_db(cleaned_date_times_df, "dim_date_times")
    print("Uploaded cleaned date times data to the local sales_data database.")
else:
    print("Failed to extract date times data from S3.")

Date times data extracted successfully.
Rows after cleaning: 120123
<class 'pandas.core.frame.DataFrame'>
Index: 120123 entries, 0 to 120160
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   timestamp    120123 non-null  object
 1   month        120123 non-null  int64 
 2   year         120123 non-null  int64 
 3   day          120123 non-null  int64 
 4   time_period  120123 non-null  object
 5   date_uuid    120123 non-null  object
dtypes: int64(3), object(3)
memory usage: 6.4+ MB
None
Uploaded cleaned date times data to the local sales_data database.
