In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw dataset. encoding_errors="ignore" makes the reader skip bytes
# it cannot decode instead of raising, so such characters are silently lost.
df = pd.read_csv(
    "DataCoSupplyChainDataset.csv",
    encoding_errors="ignore",
)

Products Table

In [4]:
# Product table: one row per distinct (id, category, name, price) combination
# present in the raw frame, with the two key columns given shorter names.
# (Note carried over from the original cell: these columns have no missing values.)
products = (
    df[["Product Card Id", "Category Name", "Product Name", "Product Price"]]
    .drop_duplicates()
    .rename(columns={"Product Card Id": "ProductID", "Category Name": "Category"})
)

# The many raw categories are collapsed into 3 groups:
# Home Appliances, Electronics, Clothing.
# Keys are the target groups; values are the keywords searched for in the raw
# category label.
# NOTE(review): 'Cameras ' and 'CDs ' end with a space - presumably mirroring
# the raw labels; confirm before trimming them.
categories_mapping = {
    "Home Appliances": [
        "Garden",
    ],
    "Electronics": [
        "Electronics",
        "Consumer Electronics",
        "Cameras ",
        "Computers",
        "DVDs",
        "CDs ",
    ],
    "Clothing": [
        "Women's Apparel",
        "Girls' Apparel",
        "Women's Clothing",
        "Men's Clothing",
        "Children's Clothing",
        "Gold Apparel",
    ],
}

def map_category(item, mapping_dict):
    """Return the first group in ``mapping_dict`` whose keyword occurs in ``item``.

    Matching is a case-insensitive substring test. Groups are tried in the
    dictionary's iteration order and the first hit wins.

    Parameters
    ----------
    item : str
        Raw category label. Any non-string value (for example the float NaN
        that pandas uses for a missing cell) is treated as unmatched.
    mapping_dict : dict[str, list[str]]
        Maps each target group name to the keywords that select it.

    Returns
    -------
    str or None
        The matching group name, or None when no keyword matches.
    """
    if not isinstance(item, str):
        # Non-text labels have no .lower(); report them as unmatched rather
        # than raising AttributeError in the middle of an .apply().
        return None

    lowered = item.lower()  # lower-case once instead of once per keyword
    for category, keywords in mapping_dict.items():
        if any(keyword.lower() in lowered for keyword in keywords):
            return category
    return None  # no group matched

# Swap every raw category label for its group (None when nothing matches).
products['Category'] = products['Category'].apply(map_category, args=(categories_mapping,))

# Drop products that fall into none of the 3 groups, then renumber rows from 0.
products = products.dropna(subset=['Category']).reset_index(drop=True)

# Persist the table (the default integer index is written as the first column).
products.to_csv("Product Table.csv")

Orders Table

In [5]:
# Orders table: one row per distinct combination of the selected columns,
# renamed to friendlier headers.
# (Note carried over from the original cell: these columns have no missing values.)
orders = (
    df[[
        'Order Id',
        'Product Card Id',
        'order date (DateOrders)',
        'shipping date (DateOrders)',
        'Days for shipping (real)',
        'Days for shipment (scheduled)',
        'Late_delivery_risk',
        'Shipping Mode',
        'Order Item Discount Rate',
    ]]
    .drop_duplicates()
    .rename(columns={
        "Order Id": "OrderID",
        "Product Card Id": "ProductID",
        "order date (DateOrders)": "Order Date",
        "shipping date (DateOrders)": "Shipping Date",
        "Days for shipping (real)": "Actual days for shipping",
        "Days for shipment (scheduled)": "Scheduled days for shipping",
        "Late_delivery_risk": "Late Delivery Risk",
        "Order Item Discount Rate": "Discount Percentage",
    })
)

# Synthetic columns. The global NumPy RNG is seeded so a re-run of this cell
# reproduces the same values.
np.random.seed(42)
orders["CustomerID"] = np.random.randint(0, 100000, size = len(orders))
# OrderID is not unique here (rows are order/product pairs), so independent
# per-row draws would give one order several customers. Reuse the first row's
# draw for the whole order; the number of RNG draws is unchanged, so the
# values generated below are the same as before.
orders["CustomerID"] = orders.groupby("OrderID")["CustomerID"].transform("first")
orders["Returns"] = np.random.choice([0, 1], size = len(orders), p = [0.9, 0.1]) # 10% of products are returned

# Keep only orders for products that survived the category filter.
# isin() accepts the Series directly, so no list conversion is needed.
productIDs = products['ProductID']
orders = orders[orders['ProductID'].isin(productIDs)].reset_index(drop = True)

# Persist the table (the default integer index is written as the first column).
orders.to_csv("Orders Table.csv")

Suppliers Table

In [None]:
# Suppliers table: one row per row of `products`, each tagged with a randomly
# drawn supplier.
# .copy() makes this an independent frame, so adding a column below does not
# raise SettingWithCopyWarning.
suppliers = products[["ProductID"]].copy()
# IDs are integers in [0, 25) drawn from the global NumPy RNG, so the values
# depend on the RNG state left behind by previously executed cells.
suppliers["SupplierID"] = np.random.randint(0, 25, size = len(suppliers))

# Persist the table (the default integer index is written as the first column).
suppliers.to_csv("Suppliers Table.csv")