# Checking Data Quality and Validations

In [52]:
import pandas as pd
import os

# Create the output folders for cleaned and rejected rows
# (exist_ok=True makes this a no-op when they are already present)
for _output_dir in ('../processed_data/cleaned_data', '../processed_data/bad_data'):
    os.makedirs(_output_dir, exist_ok=True)

# Read the raw orders and inventory CSV files into DataFrames;
# adjust the paths below if the source files live elsewhere
orders_df = pd.read_csv('../original_data/orders.csv')
inventory_df = pd.read_csv('../original_data/inventory.csv')



## Inventory Validation

In [54]:
import pandas as pd
import os

def validate_inventory(df):
    """Split an inventory frame into rows that pass validation and rows that fail.

    Columns are checked in this order: ``productId``, ``name``, ``quantity``,
    ``category``, ``subCategory``. The text columns must hold ``str`` values;
    ``quantity`` must be a non-negative whole number.

    A row that fails several checks is reported once, with the comment of the
    first check it failed.

    Args:
        df: DataFrame containing the five columns listed above. It is not
            modified. Index labels are assumed to be unique (as produced by
            ``pd.read_csv`` with its default index).

    Returns:
        A ``(cleaned_rows, bad_rows)`` tuple. ``cleaned_rows`` is ``df``
        without the failing rows; ``bad_rows`` is a copy of the failing rows
        with an extra ``comment`` column describing the failure.

    Raises:
        KeyError: if one of the expected columns is missing from ``df``.
    """

    def is_string(value):
        return isinstance(value, str)

    def is_non_negative_integer(value):
        # bool is deliberately excluded: pd.api.types.is_integer(True) is False.
        if pd.api.types.is_integer(value):
            return value >= 0
        # A single missing quantity makes pandas load the whole column as
        # float, so whole-number floats such as 5.0 must still count as
        # integers. NaN and inf are rejected because is_integer() is False.
        if pd.api.types.is_float(value):
            return float(value).is_integer() and value >= 0
        return False

    # (column, predicate, comment recorded when the predicate fails)
    checks = (
        ('productId', is_string, "productId is not a string"),
        ('name', is_string, "name is not a string"),
        ('quantity', is_non_negative_integer,
         "quantity is not a non-negative integer"),
        ('category', is_string, "category is not a string"),
        ('subCategory', is_string, "subCategory is not a string"),
    )

    # Maps index label -> first failure comment. A dict keeps insertion order
    # and gives O(1) "already flagged?" lookups.
    failures = {}
    for column, is_valid, comment in checks:
        for idx, value in df[column].items():
            if not is_valid(value):
                # setdefault keeps the first comment recorded for a row
                failures.setdefault(idx, comment)

    bad_indices = list(failures)

    # Failing rows, annotated with the reason they were rejected
    bad_rows = df.loc[bad_indices].copy()
    bad_rows['comment'] = list(failures.values())

    # Everything that was never flagged
    cleaned_rows = df.drop(bad_indices)

    return cleaned_rows, bad_rows



# Partition the previously loaded inventory_df into valid and rejected rows
cleaned_inventory, bad_inventory = validate_inventory(inventory_df)

# Write each partition to its CSV file (cleaned rows first, then rejected
# rows), leaving the DataFrame index out of the output
for _frame, _csv_path in (
    (cleaned_inventory, '../processed_data/cleaned_data/cleaned_inventory.csv'),
    (bad_inventory, '../processed_data/bad_data/bad_inventory.csv'),
):
    _frame.to_csv(_csv_path, index=False)



## Orders Validation

In [53]:
import pandas as pd
import os

def validate_orders(df):
    """Split an orders frame into rows that pass validation and rows that fail.

    Columns are checked in this order: ``orderId``, ``productId``,
    ``currency``, ``quantity``, ``shippingCost``, ``amount``, ``channel``,
    ``channelGroup``, ``campaign``, ``dateTime``. A row that fails several
    checks is reported once, with the comment of the first check it failed.

    ``dateTime`` is parsed with the format ``%Y-%m-%dT%H:%M:%SZ``; values that
    do not parse become ``NaT`` and the row is flagged.

    Args:
        df: DataFrame containing the columns listed above. It is not modified;
            the parsed ``dateTime`` column only appears in the returned
            frames. Index labels are assumed to be unique (as produced by
            ``pd.read_csv`` with its default index).

    Returns:
        A ``(cleaned_rows, bad_rows)`` tuple. Both carry the parsed
        ``dateTime`` column. ``bad_rows`` has two extra columns: ``comment``
        (why the row was rejected) and ``original_dateTime`` (the raw
        ``dateTime`` value when the recorded failure is the dateTime check,
        otherwise ``None``).

    Raises:
        KeyError: if one of the expected columns is missing from ``df``.
    """

    def is_string(value):
        return isinstance(value, str)

    def is_non_negative_integer(value):
        # bool is deliberately excluded: pd.api.types.is_integer(True) is False.
        if pd.api.types.is_integer(value):
            return value >= 0
        # A single missing quantity makes pandas load the whole column as
        # float, so whole-number floats such as 5.0 must still count as
        # integers. NaN and inf are rejected because is_integer() is False.
        if pd.api.types.is_float(value):
            return float(value).is_integer() and value >= 0
        return False

    def is_number(value):
        return isinstance(value, (int, float))

    def is_non_negative_number(value):
        # NOTE(review): NaN passes this check because `NaN < 0` is False;
        # confirm whether a missing amount should be rejected instead.
        return isinstance(value, (int, float)) and not value < 0

    def is_string_or_missing(value):
        return isinstance(value, str) or pd.isna(value)

    # (column, predicate, comment recorded when the predicate fails)
    # NOTE(review): the quantity and amount comments say "positive" but zero
    # is accepted, matching the existing `< 0` checks; confirm the intent.
    checks = (
        ('orderId', is_string, "orderId is not a string"),
        ('productId', is_string, "productId is not a string"),
        ('currency', is_string, "currency is not a string"),
        ('quantity', is_non_negative_integer,
         "quantity is not a positive integer"),
        ('shippingCost', is_number, "shippingCost is not a float or int"),
        ('amount', is_non_negative_number,
         "amount is not a positive float or int"),
        ('channel', is_string, "channel is not a string"),
        ('channelGroup', is_string, "channelGroup is not a string"),
        ('campaign', is_string_or_missing, "campaign is not a string or NaN"),
    )

    # Maps index label -> (comment, original dateTime) for the first failure
    # of each row. A dict keeps insertion order and gives O(1) lookups.
    failures = {}

    def add_bad_row(index, comment, original_dateTime=None):
        # setdefault keeps the first failure recorded for a row
        failures.setdefault(index, (comment, original_dateTime))

    for column, is_valid, comment in checks:
        for idx, value in df[column].items():
            if not is_valid(value):
                add_bad_row(idx, comment)

    # Parse dateTime into a separate Series so the caller's frame keeps its
    # raw values; overwriting df in place would lose them on a second run.
    raw_dateTime = df['dateTime']
    parsed_dateTime = pd.to_datetime(
        raw_dateTime, format="%Y-%m-%dT%H:%M:%SZ", errors='coerce'
    )
    for idx, value in parsed_dateTime.items():
        if pd.isna(value):
            add_bad_row(
                idx,
                "dateTime is not in the correct format",
                raw_dateTime.at[idx],
            )

    # New frame with dateTime replaced by its parsed form (column position is
    # preserved by assign); df itself is left untouched.
    parsed_df = df.assign(dateTime=parsed_dateTime)

    bad_indices = list(failures)

    # Failing rows, annotated with the reason and the raw dateTime value
    bad_rows = parsed_df.loc[bad_indices].copy()
    bad_rows['comment'] = [comment for comment, _ in failures.values()]
    bad_rows['original_dateTime'] = [raw for _, raw in failures.values()]

    # Everything that was never flagged
    cleaned_rows = parsed_df.drop(bad_indices)

    return cleaned_rows, bad_rows

# Partition the previously loaded orders_df into valid and rejected rows
cleaned_orders, bad_orders = validate_orders(orders_df)

# Write each partition to its CSV file (cleaned rows first, then rejected
# rows), leaving the DataFrame index out of the output
for _frame, _csv_path in (
    (cleaned_orders, '../processed_data/cleaned_data/cleaned_orders.csv'),
    (bad_orders, '../processed_data/bad_data/bad_orders.csv'),
):
    _frame.to_csv(_csv_path, index=False)


