In [None]:
import pandas as pd

# Load the dataset
file_path = "../data/clean/Global_Super_Store.csv"  # Adjust this to your actual file path
data = pd.read_csv(file_path, delimiter=';', encoding='utf-8')

# Print the column names to confirm they match expectations
print("Columns in the dataset:", data.columns.tolist())

# Clean up column names to remove leading/trailing spaces
data.columns = data.columns.str.strip()

# 1. Prepare the Customers table (unique by CustomerID)
customers = data[['CustomerID', 'CustomerName', 'Segment']].drop_duplicates(subset='CustomerID')
customers.to_csv('unique_customers.csv', index=False)

# 2. Prepare the CustomerLocations table (unique by CustomerID and location details)
customer_locations = data[['CustomerID', 'City', 'State', 'Country', 'PostalCode']].drop_duplicates()
customer_locations.to_csv('customer_locations.csv', index=False)

# 3. Prepare the Orders table (unique by OrderID)
orders = data[['OrderID', 'OrderDate', 'ShippingDate', 'ShippingMode', 'CustomerID', 'Sales', 'Profit', 'Discount']].drop_duplicates(subset='OrderID')
orders.to_csv('unique_orders.csv', index=False)


# 5. Prepare the OrderDetails table (retain all rows)
order_details = data[['Row ID', 'OrderID', 'ProductID', 'Quantity', 'Shipping Cost', 'OrderPriority']].rename(
    columns={'Row ID': 'RowID', 'Shipping Cost': 'ShippingCost'}
)
order_details.to_csv('order_details.csv', index=False)

print("Data preprocessing complete. Files generated:")
print("1. unique_customers.csv")
print("2. customer_locations.csv")
print("3. unique_orders.csv")
print("4. unique_products.csv")
print("5. order_details.csv")


Columns in the dataset: ['Row ID', 'OrderID', 'OrderDate', 'ShippingDate', 'ShippingMode', 'CustomerID', 'CustomerName', 'Segment', 'City', 'State', 'Country', 'PostalCode', 'Market', 'Region', 'ProductID', 'Category', 'SubCategory', 'ProductName', 'Price', 'Quantity', 'Discount', 'Profit', 'Shipping Cost', 'OrderPriority', 'Sales']
Data preprocessing complete. Files generated:
1. unique_customers.csv
2. customer_locations.csv
3. unique_orders.csv
4. unique_products.csv
5. order_details.csv


In [26]:
# 4. Problems with Products table - Prepare the Products table (unique by ProductID)
products = data[['ProductID', 'Category', 'SubCategory', 'ProductName', 'Price']].drop_duplicates(subset='ProductID')
products.to_csv('unique_products.csv', index=False)


In [13]:
print("Columns in the dataset:", data.columns.tolist())

Columns in the dataset: ['Row ID', 'OrderID', 'OrderDate', 'ShippingDate', 'ShippingMode', 'CustomerID', 'CustomerName', 'Segment', 'City', 'State', 'Country', 'PostalCode', 'Market', 'Region', 'ProductID', 'Category', 'SubCategory', 'ProductName', 'Price', 'Quantity', 'Discount', 'Profit', 'Shipping Cost', 'OrderPriority', 'Sales']


In [16]:
required_columns = ['ProductID', 'Category', 'SubCategory', 'ProductName', 'Price']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    print(f"Missing columns for Products table: {missing_columns}")
    exit()

In [17]:
products = data[['ProductID', 'Category', 'SubCategory', 'ProductName', 'Price']].drop_duplicates(subset='ProductID')
if products.empty:
    print("Products DataFrame is empty after processing. Verify input data.")
    print(data[['ProductID', 'Category', 'SubCategory', 'ProductName', 'Price']].dropna().head())  # Check non-empty rows
    exit()

In [19]:
products.shape

(10292, 5)

In [20]:
print(products.isnull().sum())

ProductID      0
Category       0
SubCategory    0
ProductName    0
Price          0
dtype: int64


In [21]:
print(products['ProductID'].is_unique)

True


In [22]:
missing_product_ids = products[products['ProductID'].isnull()]
print(missing_product_ids)

Empty DataFrame
Columns: [ProductID, Category, SubCategory, ProductName, Price]
Index: []


In [23]:
import pandas as pd

# Load the generated CSV file
file_path = 'unique_products.csv'
try:
    data = pd.read_csv(file_path)
    print(f"File loaded successfully with shape: {data.shape}")
    
    # Validate row consistency
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        header_cols = len(lines[0].strip().split(','))
        for i, line in enumerate(lines[1:], start=2):  # Start from the second line
            if len(line.strip().split(',')) != header_cols:
                print(f"Inconsistent column count at line {i}: {line.strip()}")
except Exception as e:
    print(f"Error loading or validating the file: {e}")


File loaded successfully with shape: (10292, 5)
Inconsistent column count at line 3: OFF-FA-10000490,Office Supplies,Fasteners,"OIC Binder Clips, Mini, 1/4"" Capacity, Black1.24",1.0
Inconsistent column count at line 18: OFF-FA-10001332,Office Supplies,Fasteners,"Acco Banker's Clasps, 5 3/4""-Long2.88",1.0
Inconsistent column count at line 45: OFF-PA-10002615,Office Supplies,Paper,"Ampad Gold Fibre Wirebound Steno Books, 6"" x 9"", Gregg Ruled4.41",1.0
Inconsistent column count at line 46: OFF-PA-10001622,Office Supplies,Paper,"Ampad Poly Cover Wirebound Steno Book, 6"" x 9"" Assorted Colors, Gregg Ruled4.54",1.0
Inconsistent column count at line 49: OFF-BI-10000343,Office Supplies,Binders,"Pressboard Covers with Storage Hooks, 9 1/2"" x 11"", Light Blue4.91",1.0
Inconsistent column count at line 67: OFF-PA-10001838,Office Supplies,Paper,"Adams Telephone Message Book W/Dividers/Space For Phone Numbers, 5 1/4""X8 1/2"", 300/Messages5.88",1.0
Inconsistent column count at line 73: OFF-ST-

In [24]:
# Reload the DataFrame
products = pd.read_csv('unique_products.csv')

# Drop rows with NaN or mismatched column lengths
products = products.dropna()  # Drop rows with missing values
products.to_csv('unique_products_cleaned.csv', index=False)
print("Cleaned file saved as unique_products_cleaned.csv")

Cleaned file saved as unique_products_cleaned.csv


In [25]:
import csv

# File path to the CSV file
file_path = 'unique_products.csv'

# Validate CSV file row consistency
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        header = next(reader)  # Get the header row
        header_length = len(header)

        # Check each row for consistency
        for i, row in enumerate(reader, start=2):  # Start at row 2 (after header)
            if len(row) != header_length:
                print(f"Inconsistent column count at line {i}: {row}")
except Exception as e:
    print(f"Error reading file: {e}")
